123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178 |
- from instaloader import Instaloader, Profile
- import traceback
- import copy
- import operator
- import dataset
- import pandas as pd
- import networkx as nx
- import codecs
- import pyvis
- import sys
- import pickle
- import os
- import searchconsole
- from pyvis.network import Network
- import jieba
# Base name shared by the SQLite database file ("<pname>.db") and the
# generated HTML page ("<pname>.html").
pname = 'hhh.rb'

# Connection to the SQLite database named after `pname`; the path is relative
# to the current working directory.
db = dataset.connect('sqlite:///{}.db'.format(pname))

# Handle to the `tmp` table, the same table gen_pic() queries through `db`.
table = db['tmp']

# Not referenced by any code visible in this file.
rid = 0
def get_css():
    """Return the contents of ``jared/data/css.txt`` as a single string.

    The file is decoded as UTF-8. Its lines (each keeping its own line
    ending) are joined with a single space, so every line after the first
    is preceded by one extra space in the result.

    Returns:
        str: The joined file content; ``''`` when the file is empty.

    Raises:
        OSError: If the file cannot be opened. The path is relative to the
            current working directory.
    """
    # The context manager closes the handle even when reading or decoding
    # raises, which the previous explicit close() did not guarantee.
    with codecs.open('jared/data/css.txt', 'r', 'utf-8') as fr:
        return ' '.join(fr.readlines())
def modify_file(fname):
    """Write a post-processed copy of the HTML file *fname*.

    The input is read as UTF-8 and copied line by line. Extra markup is
    appended right after the first matching kind of line:

    * a line with ``<body>`` within its first 10 characters is followed by
      an opening ``<div id="google">``;
    * a line with ``<style type="text`` within its first 22 characters is
      followed by the contents of ``css`` surrounded by newlines;
    * a line with ``<div id = "mynetwork"`` within its first 30 characters
      is followed by a closing ``</div>``.

    The result is written next to the input, with ``mod_`` prefixed to the
    file name (``dir/page.html`` -> ``dir/mod_page.html``).

    Args:
        fname: Path of the HTML file to read.

    Returns:
        None. The only effect is the newly written ``mod_`` file.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    with codecs.open(fname, 'r', 'utf-8') as fr:
        lines = fr.readlines()

    # NOTE(review): css is always empty here, so only two newlines are
    # injected after the <style> line. get_css() may have been intended as
    # the source of this value -- confirm before changing.
    css = ''

    pieces = []
    for line in lines:
        pieces.append(line)
        # At most one rule applies per line; the first match wins.
        if '<body>' in line[0:10]:
            pieces.append('\n<div id="google">\n')
        elif '<style type="text' in line[0:22]:
            pieces.append("\n" + css + "\n")
        elif '<div id = "mynetwork"' in line[0:30]:
            pieces.append('\n</div>\n')

    # Prefix only the file name: prefixing the whole path would point into a
    # "mod_<dir>" directory whenever fname has a directory component.
    directory, base = os.path.split(fname)
    out_path = os.path.join(directory, "mod_" + base)
    with codecs.open(out_path, 'w', 'utf-8') as fw:
        fw.write(''.join(pieces))
def gen_pic():
    """Build a URL/query graph from the ``tmp`` table and render it as HTML.

    Steps, in order:

    1. Select distinct ``(q, url)`` rows from ``tmp``, restricted by the SQL
       to URLs having between 5 and 8 rows whose ``q`` is longer than two
       characters.
    2. Remove the three ``hhh.com.tw`` scheme/host variants from each URL and
       group the queries under the remaining string.
    3. For every group holding at least two queries, add one edge per query
       between the group key and the query.
    4. Drop nodes whose degree exceeds 15, then self-loop edges, then nodes
       left without any edge.
    5. Render the graph with pyvis into ``<pname>.html`` and pass that file
       to ``modify_file``.

    Progress is printed to stdout: the number of groups, then the running
    edge count at every multiple of 1000.

    Returns:
        list: Always an empty list; nothing is ever added to it.
    """
    graph = nx.Graph()

    rows = db.query(
        'select distinct q,url from tmp where url in '
        '(select distinct url from '
        '(select url,count(q) from tmp where length(q)> 2 '
        'group by url having count(q) >4 and count(q)<9 ) as tbl1) '
        'order by url'
    )

    # Group queries under the URL with its scheme/host variant removed.
    host_variants = (
        'https://www.hhh.com.tw',
        'https://hhh.com.tw',
        'https://m.hhh.com.tw',
    )
    queries_by_path = {}
    for row in rows:
        path = row['url']
        for variant in host_variants:
            path = path.replace(variant, '')
        queries_by_path.setdefault(path, []).append(row['q'])
    print(len(queries_by_path))

    # One edge per (path, query) pair; groups with a single query are skipped.
    edge_count = 0
    for path, queries in queries_by_path.items():
        if len(queries) < 2:
            continue
        for query in queries:
            graph.add_edge(path, query, weight=3, width=3, borderwidth=3)
            edge_count += 1
            if edge_count % 1000 == 0:
                print(edge_count)

    # Collect first, remove afterwards: the graph must not change while it
    # is being iterated.
    crowded = [node for node in graph if graph.degree(node) > 15]
    graph.remove_nodes_from(crowded)
    graph.remove_edges_from(nx.selfloop_edges(graph))
    graph.remove_nodes_from(list(nx.isolates(graph)))

    html_name = pname + '.html'
    net = Network(height="600px", width="100%", bgcolor="#444444", font_color="white")
    net.from_nx(graph)
    net.show(html_name)
    modify_file(html_name)
    return []
- gen_pic()
|