"""Render a keyword/URL co-occurrence network as an interactive HTML page.

The script reads ``(q, url)`` rows from the ``tmp`` table of the SQLite
database ``<pname>.db``, links every selected URL to the queries stored for
it, prunes the resulting graph, renders it with pyvis into ``<pname>.html``
and finally writes a post-processed copy named ``mod_<pname>.html``.

NOTE: importing this module connects to the database and runs ``gen_pic()``
immediately; that module-level behaviour is kept as in the original script.
"""
import codecs
import copy
import operator
import os
import pickle
import sys
import traceback

import dataset
import jieba
import networkx as nx
import pandas as pd
import pyvis
import searchconsole
from instaloader import Instaloader, Profile
from pyvis.network import Network
# import pysftp  # only needed by the (disabled) SFTP upload step

# Dataset name: selects the SQLite file (<pname>.db) and names the HTML output.
# Names used in earlier runs: cont, damanwoo, drama, news, www, ipromise,
# sports, rumor, korea.
pname = 'hhh.rb'
db = dataset.connect('sqlite:///' + pname + ".db")
table = db['tmp']
rid = 0

# Site origins removed from every URL (in this order) to get short node labels.
_URL_ORIGINS = (
    'https://www.hhh.com.tw',
    'https://hhh.com.tw',
    'https://m.hhh.com.tw',
)

# Nodes with more neighbours than this are dropped from the graph.
_MAX_NODE_DEGREE = 15

# Print a progress line every this many added edges.
_PROGRESS_EVERY = 1000


def get_css() -> str:
    """Return the contents of ``jared/data/css.txt``, lines joined by a space."""
    with codecs.open('jared/data/css.txt', 'r', 'utf-8') as reader:
        return ' '.join(reader.readlines())


def modify_file(fname: str) -> None:
    """Write a patched copy of the HTML file *fname* to ``"mod_" + fname``.

    Every input line is copied through unchanged.  Extra markup is inserted
    directly after three marker lines:

    * after the ``<body>`` line: an opening ``<div id="google">``;
    * after the ``<style type="text...`` line: the extra CSS (currently empty);
    * after the ``<div id = "mynetwork"`` line: a closing ``</div>``.

    A marker only counts when it occurs near the start of the line (within
    the first 10, 22 and 30 characters respectively).
    """
    with codecs.open(fname, 'r', 'utf-8') as reader:
        lines = reader.readlines()

    # CSS injection through get_css() is disabled for now; an empty string
    # still produces the surrounding blank lines, as before.
    css = ''

    pieces = []
    for line in lines:
        pieces.append(line)
        if '<body>' in line[0:10]:
            pieces.append('\n<div id="google">\n')
        elif '<style type="text' in line[0:22]:
            pieces.append("\n" + css + "\n")
        elif '<div id = "mynetwork"' in line[0:30]:
            pieces.append('\n</div>\n')

    with codecs.open("mod_" + fname, 'w', 'utf-8') as writer:
        writer.write(''.join(pieces))


def gen_pic() -> list:
    """Build, prune and render the URL/query graph for the current dataset.

    Steps:

    1. Select distinct ``(q, url)`` pairs for URLs that have between 5 and 8
       queries longer than two characters.
    2. Add one edge per (URL, query) pair for every URL that ends up with at
       least two queries.
    3. Drop nodes whose degree exceeds ``_MAX_NODE_DEGREE``, then self-loops,
       then isolated nodes.
    4. Render the graph to ``<pname>.html`` with pyvis and post-process it
       with ``modify_file``.

    Returns:
        An empty list.  The original script never filled its result list;
        the return value is kept for backward compatibility.
    """
    graph = nx.Graph()
    finallist = []

    # Earlier experiments selected rows per query (q) with various count(url)
    # thresholds; the current query selects per URL instead.
    cursor = db.query('select distinct q,url from tmp where url in (select distinct url from (select url,count(q) from tmp where length(q)> 2 group by url having count(q) >4 and count(q)<9 ) as tbl1) order by url')

    # Group the queries by their shortened URL.
    urldict = {}
    for row in cursor:
        url = row['url']
        for origin in _URL_ORIGINS:
            url = url.replace(origin, '')
        urldict.setdefault(url, []).append(row['q'])
    print(len(urldict))

    # NOTE(review): the original indentation was lost; the counter is assumed
    # to count added edges (it only drives the progress output below).
    cnt = 0
    for url, queries in urldict.items():
        if len(queries) < 2:
            continue
        for query in queries:
            graph.add_edge(url, query, weight=3, width=3, borderwidth=3)
            cnt += 1
            if cnt % _PROGRESS_EVERY == 0:
                print(cnt)

    # Pruning order matters: hubs are removed first (a self-loop still counts
    # towards the degree at this point), then self-loops, then whatever became
    # isolated as a result.
    hubs = [node for node in graph if graph.degree(node) > _MAX_NODE_DEGREE]
    graph.remove_nodes_from(hubs)
    graph.remove_edges_from(nx.selfloop_edges(graph))
    graph.remove_nodes_from(list(nx.isolates(graph)))

    net = Network(height="600px", width="100%", bgcolor="#444444", font_color="white")
    net.from_nx(graph)
    net.show(pname + '.html')
    modify_file(pname + '.html')

    # The SFTP upload of "mod_<pname>.html" that used to follow is disabled.
    # If it is re-enabled, load the credentials from the environment or a
    # secrets store instead of hard-coding them in this file.
    return finallist


gen_pic()

# JavaScript snippet kept for reference; it appears intended for the generated
# page, to copy the label of a clicked node to the clipboard:
#
# network.on( 'click', function(properties) {
#     var ids = properties.nodes;
#     var clickedNodes = nodes.get(ids);
#     var copyText = clickedNodes[0].label;
#     var promise = navigator.clipboard.writeText(copyText);
# //  console.log('clicked nodes:', clickedNodes);
# });