123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178 |
- from instaloader import Instaloader, Profile
- import traceback
- import copy
- import operator
- import dataset
- import pandas as pd
- import networkx as nx
- #import pysftp
- import codecs
- import pyvis
- import sys
- import pickle
- import os
- import searchconsole
- from pyvis.network import Network
- import jieba
# Name of the data set to process.  It selects the SQLite file
# ("<pname>.db") opened below and is also used by gen_pic() to name the
# generated HTML page ("<pname>.html").
# Alternative names that were left disabled here: cont, damanwoo, drama,
# news, www, ipromise, sports, rumor, korea.
pname = 'hhh.rb'

# Connection to the per-data-set SQLite database and its 'tmp' table.
db = dataset.connect('sqlite:///{}.db'.format(pname))
table = db['tmp']

rid = 0
def get_css(path='jared/data/css.txt'):
    """Return the content of a UTF-8 CSS file as one string.

    The file is read line by line and the lines (each still carrying its
    own line terminator) are joined with a single space.

    Args:
        path: Location of the CSS file.  Defaults to
            ``jared/data/css.txt``, the path this function always used.

    Returns:
        The joined file content; an empty string for an empty file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # codecs.open is kept (instead of the built-in open) so that line
    # endings are passed through untranslated; the ``with`` block makes
    # sure the handle is closed even if reading fails.
    with codecs.open(path, 'r', 'utf-8') as fr:
        return ' '.join(fr.readlines())
def modify_file(fname):
    """Write a post-processed copy of an HTML file next to the original.

    The copy is named ``mod_<basename>`` and is placed in the same
    directory as ``fname``.  Every input line is copied unchanged; in
    addition:

    * after a line with ``<body>`` within its first 10 characters, an
      opening ``<div id="google">`` is inserted;
    * after a line with ``<style type="text`` within its first 22
      characters, the extra CSS (currently empty) is inserted between
      two newlines;
    * after a line with ``<div id = "mynetwork"`` within its first 30
      characters, a closing ``</div>`` is inserted.

    Only the first matching rule is applied to a line.

    Args:
        fname: Path of the UTF-8 encoded HTML file to process.

    Returns:
        None.  The result is written to disk.

    Raises:
        OSError: If the input cannot be read or the output cannot be
            written.
    """
    # Extra CSS injected after the <style> tag.  The get_css() lookup is
    # disabled, so nothing but the surrounding newlines is added.
    css = ''

    with codecs.open(fname, 'r', 'utf-8') as fr:
        lines = fr.readlines()

    # Collect the pieces in a list and join once instead of growing a
    # string with += for every line.
    parts = []
    for line in lines:
        parts.append(line)
        if '<body>' in line[0:10]:
            parts.append('\n<div id="google">\n')
        elif '<style type="text' in line[0:22]:
            parts.append("\n" + css + "\n")
        elif '<div id = "mynetwork"' in line[0:30]:
            parts.append('\n</div>\n')

    # Prefix only the file name: "mod_" + fname would corrupt the path
    # whenever fname contains a directory component.
    directory, base = os.path.split(fname)
    out_path = os.path.join(directory, "mod_" + base)
    with codecs.open(out_path, 'w', 'utf-8') as fw:
        fw.write(''.join(parts))
def gen_pic(max_degree=15):
    """Render the URL/query graph stored in the ``tmp`` table as HTML.

    Steps:

    1. Select the distinct ``(q, url)`` pairs whose URL has more than 4
       and fewer than 9 queries longer than 2 characters.
    2. Strip the ``hhh.com.tw`` host prefixes from each URL and group the
       queries by the shortened URL.
    3. Add one edge ``url -- query`` for every query of a URL that has at
       least two queries.
    4. Drop nodes whose degree exceeds ``max_degree``, then self-loops,
       then nodes left without any edge.
    5. Render the graph with pyvis to ``<pname>.html`` and post-process
       that page with ``modify_file``.

    The number of grouped URLs, and the running edge count at every
    1000th edge, are printed as progress output.

    Args:
        max_degree: Nodes with a degree greater than this value are
            removed before rendering.  Defaults to 15, the previously
            hard-coded threshold.

    Returns:
        An empty list.  Nothing is ever added to the result; the value is
        kept only so existing callers see the same return type.
    """
    graph = nx.Graph()

    rows = db.query('select distinct q,url from tmp where url in (select distinct url from (select url,count(q) from tmp where length(q)> 2 group by url having count(q) >4 and count(q)<9 ) as tbl1) order by url')

    # Group the queries by URL path (site host prefixes removed).
    queries_by_url = {}
    for row in rows:
        url = row['url'].replace('https://www.hhh.com.tw', '')
        url = url.replace('https://hhh.com.tw', '')
        url = url.replace('https://m.hhh.com.tw', '')
        queries_by_url.setdefault(url, []).append(row['q'])
    print(len(queries_by_url))

    edge_count = 0
    for url, queries in queries_by_url.items():
        if len(queries) < 2:
            continue
        for query in queries:
            graph.add_edge(url, query, weight=3, width=3, borderwidth=3)
            edge_count += 1
            if edge_count % 1000 == 0:
                print(edge_count)

    # Collect the over-connected nodes first: nodes must not be removed
    # while the graph is being iterated.
    crowded = [node for node in graph if graph.degree(node) > max_degree]
    graph.remove_nodes_from(crowded)
    # Materialise the self-loop edges before removing them so the graph
    # is not mutated while the edge generator is still being consumed.
    graph.remove_edges_from(list(nx.selfloop_edges(graph)))
    graph.remove_nodes_from(list(nx.isolates(graph)))

    net = Network(height="600px", width="100%", bgcolor="#444444", font_color="white")
    net.from_nx(graph)
    html_name = pname + '.html'
    net.show(html_name)
    modify_file(html_name)

    # NOTE(review): a commented-out SFTP upload that embedded a plaintext
    # password used to follow here and was dropped.  Rotate that
    # credential, and read secrets from the environment if the upload is
    # ever reinstated.
    return []
# Module-level call: the graph is built and the HTML page is written as
# soon as this file is executed.
gen_pic()

# Disabled leftover call; `checkig` is not defined in the visible part of
# this file.  The argument '信用卡' means "credit card".
#r=checkig('信用卡')
#print(r)

# JavaScript note (not Python): a click handler that copies the label of
# the clicked node to the clipboard.  Presumably meant to be pasted into
# the generated HTML page -- TODO confirm.
# network.on( 'click', function(properties) {
# var ids = properties.nodes;
# var clickedNodes = nodes.get(ids);
# var copyText = clickedNodes[0].label;
# var promise = navigator.clipboard.writeText(copyText);
#// console.log('clicked nodes:', clickedNodes);
#});
|