123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250 |
- from instaloader import Instaloader, Profile
- import traceback
- import copy
- import operator
- import dataset
- import pandas as pd
- import networkx as nx
- #import pysftp
- import codecs
- import pyvis
- import sys
- import pickle
- import os
- import searchconsole
- from pyvis.network import Network
- import jieba
# Scratch storage: an in-memory SQLite database. checkig() inserts rows with
# the columns q, rid and url into the 'tmp' table; gen_pic() queries them back.
db = dataset.connect('sqlite:///:memory:')
table = db['tmp']

# Base name for the generated artifacts: the pickled graph is written to a file
# called `pname` and the rendered page to `pname + '.html'`.
# Values used on earlier runs: cont, damanwoo, drama, news, www, ipromise,
# sports, rumor, korea.
pname = 'hhh.rb'

# Running identifier stored in the 'rid' column of every inserted row.
rid = 0
def get_css(path='jared/data/css.txt'):
    """Return the text of a UTF-8 stylesheet with its lines joined by spaces.

    Every line keeps its own line terminator; a single space is inserted
    between consecutive lines, so each line after the first starts with one
    extra space in the result.

    Args:
        path: Location of the CSS file. Defaults to the path that used to be
            hard-coded here, resolved against the current working directory.

    Returns:
        The joined file content, or an empty string for an empty file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager guarantees the handle is closed even when reading
    # or decoding fails part-way through.
    with codecs.open(path, 'r', 'utf-8') as fr:
        return ' '.join(fr.readlines())
def modify_file(fname):
    """Write a patched copy of an HTML file, named with a ``mod_`` prefix.

    The input is copied line by line and extra markup is appended after three
    marker lines:

    * after a line with ``<body>`` in its first 10 characters, an opening
      ``<div id="google">`` is added;
    * after a line with ``<style type="text`` in its first 22 characters, the
      extra CSS (currently empty) is added, surrounded by newlines;
    * after a line with ``<div id = "mynetwork"`` in its first 30 characters,
      a closing ``</div>`` is added.

    Args:
        fname: Path of the UTF-8 HTML file to read. The result is written to
            the same directory as ``"mod_" + <file name>``.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    with codecs.open(fname, 'r', 'utf-8') as fr:
        lines = fr.readlines()

    # No extra stylesheet is injected at present; the style hook below still
    # emits its two surrounding newlines so the output layout is unchanged.
    css = ''

    parts = []
    for line in lines:
        parts.append(line)
        if '<body>' in line[0:10]:
            parts.append('\n<div id="google">\n')
        elif '<style type="text' in line[0:22]:
            parts.append("\n" + css + "\n")
        elif '<div id = "mynetwork"' in line[0:30]:
            parts.append('\n</div>\n')

    # Prefix the file name only. Prefixing the whole path would turn
    # "out/page.html" into "mod_out/page.html", i.e. a different directory.
    folder, base = os.path.split(fname)
    out_path = os.path.join(folder, "mod_" + base)
    with codecs.open(out_path, 'w', 'utf-8') as fw:
        fw.write(''.join(parts))
def checkig(pgnum, site_url='https://innews.com.tw/',
            start_date='2022-01-01', end_date='2022-04-16', min_queries=0):
    """Load Search Console (page, query) rows into the scratch table.

    Authenticates with the credential files in the ``keys`` directory next to
    this script, fetches the report for ``site_url`` over the given date
    range, groups the queries by page, prints that mapping, and inserts one
    row per space-separated query token into the module-level ``table`` with
    the columns ``q`` (token), ``rid`` and ``url`` (page).

    Args:
        pgnum: Currently unused. An earlier revision used it to filter pages
            by a path fragment; it is kept so existing calls keep working.
        site_url: Search Console property to query.
        start_date: First day of the report range, ``YYYY-MM-DD``.
        end_date: Last day of the report range, ``YYYY-MM-DD``.
        min_queries: Pages with fewer queries than this are skipped. The
            default of 0 skips nothing, matching the previous behaviour.

    Side effects:
        Advances the module-level ``rid`` counter, writes to ``table``,
        commits ``db`` and prints the page-to-queries mapping.
    """
    global rid

    keysdir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'keys')
    account = searchconsole.authenticate(
        client_config=os.path.join(keysdir, 'client_secret.json'),
        credentials=os.path.join(keysdir, 'credentials.json'),
    )
    webproperty = account[site_url]
    report = (
        webproperty.query
        .range(start_date, end_date)
        .dimension('page', 'query')
        .get()
    )

    # Group queries by page; with the dimensions requested above, element 0
    # of a row is used as the page and element 1 as the query.
    urlq = {}
    for row in report.rows:
        urlq.setdefault(row[0], []).append(row[1])
    print(urlq)

    for url, queries in urlq.items():
        if len(queries) < min_queries:
            continue
        for query in queries:
            for token in query.split(' '):
                table.insert({'q': token, 'rid': rid, 'url': url})
            # NOTE(review): the counter is advanced once per query, so all
            # tokens of one query share a rid. The file this was recovered
            # from had lost its indentation; confirm this was not meant to be
            # once per token.
            rid += 1
    db.commit()
def gen_pic():
    """Build a token graph from the scratch table and render it as HTML.

    Selects the tokens longer than two characters that occur in at most three
    rows of ``tmp``, walks the matching rows ordered by token, and links a
    token to the ``rid`` of each repeated row. The graph is pickled to a file
    named ``pname``, stripped of self-loops and isolated nodes, rendered with
    pyvis to ``pname + '.html'``, and that page is then post-processed by
    ``modify_file``.

    Returns:
        An empty list. Nothing is ever collected into the result; the return
        value is kept only so existing callers see the same type.
    """
    G = nx.Graph()

    cursor = db.query(
        'select q,rid,url from tmp where q in (select distinct q from (select q,count(url) from tmp where length(q)> 2 group by q having count(url) <= 3) as tbl1 ) order by q'
    )

    prev = ''
    for c in cursor:
        if c['q'] != prev:
            # First row of a new token: remember it, add nothing yet.
            prev = c['q']
        else:
            # NOTE(review): the edge is added only for the second and later
            # rows of a token. The file this was recovered from had lost its
            # indentation; confirm the edge was not meant for every row.
            G.add_edge(c['q'], c['rid'], weight=3, width=3, borderwidth=3)

    # Snapshot the raw graph before pruning; the handle is closed on exit.
    with open(pname, "wb") as fh:
        pickle.dump(G, fh)

    # Materialise the self-loop list first so removal never runs against a
    # live view of the graph.
    G.remove_edges_from(list(nx.selfloop_edges(G)))
    G.remove_nodes_from(list(nx.isolates(G)))

    pyG = Network(height="600px", width="100%", bgcolor="#444444", font_color="white")
    pyG.from_nx(G)
    pyG.show(pname + '.html')
    modify_file(pname + '.html')

    # An SFTP upload of the modified page used to follow here (disabled). If
    # it is reinstated, read the credentials from the environment or a secrets
    # store instead of embedding them in the source.
    return []
# Pipeline: pull the Search Console data into the scratch table, then render
# the graph. Earlier runs passed keyword strings (investment, insurance,
# crowdfunding, far-infrared, credit card) or looped over the case ids
# 491, 31, 293, 278, 24, 594, 356, 307, 33 and 385 instead of '12'.
checkig('12')
gen_pic()

# JavaScript kept for reference: a click handler for the rendered network
# that copies the label of the clicked node to the clipboard.
#
#   network.on('click', function (properties) {
#       var ids = properties.nodes;
#       var clickedNodes = nodes.get(ids);
#       var copyText = clickedNodes[0].label;
#       var promise = navigator.clipboard.writeText(copyText);
#       // console.log('clicked nodes:', clickedNodes);
#   });
|