"""Build a keyword co-occurrence graph from Google Search Console queries.

Pulls page/query rows for one web property, tokenizes the queries, links
tokens that co-occur in the same query via a networkx graph, renders the
graph to HTML with pyvis, then post-processes the HTML file.

NOTE(review): this file was recovered from a whitespace-mangled source;
indentation inside ``checkig`` was reconstructed from context — confirm
against the original revision where marked below.
"""

from instaloader import Instaloader, Profile
import codecs
import copy
import datetime
import json
import operator
import os
import pickle
import random
import re
import sys
import threading
import traceback
import urllib

import dataset
import gspread
import jieba
import networkx as nx
import pandas as pd
import pyvis
import requests
import searchconsole
# import pysftp
from gspread_pandas import Spread, Client
from oauth2client.service_account import ServiceAccountCredentials
from pyvis.network import Network


def save_sheet(df, filename, tabname, startpos='A1'):
    """Write *df* into tab *tabname* of the Google spreadsheet *filename*.

    startpos: top-left cell (A1 notation) where the frame is written.
    Requires the service-account key file at c:\\keys\\spread2.json.
    """
    scope = ['https://spreadsheets.google.com/feeds',
             'https://www.googleapis.com/auth/drive']
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        'c:\\keys\\spread2.json', scope)
    # NOTE(review): the authorize() result was never used in the original;
    # kept for its side effect (credential validation) — confirm it is needed.
    gc = gspread.authorize(credentials)
    spread = Spread(filename, creds=credentials)
    spread.df_to_sheet(df, index=False, sheet=tabname,
                       start=startpos, replace=False)


# In-memory scratch database used to sort/group query tokens.
db = dataset.connect('sqlite:///:memory:')
table = db['tmp']

# Project name: used as the pickle filename and the output HTML basename.
pname = 'news'


def get_css():
    """Return extra CSS to inject into the generated HTML (currently none)."""
    return ''


def modify_file(fname):
    """Post-process a pyvis HTML file and write the result to ``mod_<fname>``.

    Injects a <div id="google"> after <body>, the CSS from get_css() after
    the <style> tag, and a closing </div> after the network container div.
    """
    with codecs.open(fname, 'r', 'utf-8') as fr:
        lines = fr.readlines()
    css = get_css()
    parts = []  # collect chunks, join once (avoids quadratic str +=)
    for l in lines:
        if '<body>' in l[0:10]:
            parts.append(l)
            parts.append('\n<div id="google">\n')
            continue
        if '<style type="text' in l[0:22]:
            parts.append(l)
            parts.append("\n" + css + "\n")
            continue
        if '<div id = "mynetwork"' in l[0:30]:
            parts.append(l)
            parts.append('\n</div>\n')
            continue
        parts.append(l)
    with codecs.open("mod_" + fname, 'w', 'utf-8') as fw:
        fw.write(''.join(parts))


def checkig(kw):
    """Build and render the query co-occurrence graph.

    kw: unused in the current implementation (kept for interface
        compatibility with the historical caller).
    Returns an empty list (``finallist`` is never populated).
    Side effects: authenticates against Search Console using key files in
    ./keys/, pickles the graph to ``pname``, writes ``pname + '.html'`` and
    ``'mod_' + pname + '.html'``.
    """
    global table
    global pname

    G = nx.Graph()
    finallist = []

    # Resolve the keys/ directory next to this script.
    fname = os.path.abspath(__file__)
    elmts = fname.split(os.path.sep)
    path2 = os.path.sep.join(elmts[0:-1])
    keysdir = path2 + os.path.sep + 'keys' + os.path.sep

    account = searchconsole.authenticate(
        client_config=keysdir + 'client_secret.json',
        credentials=keysdir + 'credentials.json')
    webproperty = account['https://hhh.com.tw/']
    report = (webproperty.query.range('2020-06-22', '2021-06-21')
              .dimension('page', 'query').limit(10000).get())

    # Group queries by page URL: urlq[page] -> [query, ...]
    urlq = {}
    for r in report.rows:
        if urlq.get(r[0]) is None:
            urlq[r[0]] = [r[1]]
        else:
            urlq[r[0]].append(r[1])

    # Tokenize each query and stage (token, row-id) pairs in the temp table.
    allrows = []
    rid = 0
    for k, v in urlq.items():
        if len(v) < 2:  # skip pages with fewer than two queries
            continue
        print(len(v))
        for q in v:
            # Original used jieba.cut(q); plain whitespace split kept as-is.
            elmts = q.split(' ')
            for elmt in elmts:
                table.insert({'q': elmt, 'rid': rid})
            rid += 1
            # NOTE(review): bug in original — ``r`` here is the stale loop
            # variable from the report.rows loop above, so every row is the
            # last report row. ``allrows`` is never consumed; behavior kept.
            allrows.append([r[0], r[1]])
    db.commit()

    # Walk tokens ordered by token text; the first token of each row-id
    # becomes the hub node, and later tokens of the same row-id are linked
    # to it (deduplicated via repdict).
    cursor = db.query('select q,rid from tmp order by q')
    prev = ''
    curnode = ''
    df = pd.DataFrame(columns=('rid', 'query'))
    repdict = {}
    idx = 0
    for c in cursor:
        if c['rid'] != prev:
            curnode = c['q']
            prev = c['rid']
        else:
            # NOTE(review): reconstructed indentation — the dedup check is
            # assumed to guard the df/graph updates; confirm against original.
            if repdict.get((curnode, c['q'])) is None:
                repdict[(curnode, c['q'])] = 1
                df.loc[idx] = [curnode, c['q']]
                idx += 1
                G.add_edge(curnode, c['q'], weight=3, width=3, borderwidth=3)

    pickle.dump(G, open(pname, "wb"))

    # Drop promotional/self-referential nodes, self-loops, and isolates.
    remove = []
    for n in G.nodes:
        if '承諾' in n:  # "promise" (zh)
            remove.append(n)
        if 'promise' in n:
            remove.append(n)
    G.remove_nodes_from(remove)
    G.remove_edges_from(nx.selfloop_edges(G))
    G.remove_nodes_from(list(nx.isolates(G)))

    G2 = G
    pyG = Network(height="600px", width="100%",
                  bgcolor="#444444", font_color="white")
    pyG.from_nx(G2)
    pyG.show(pname + '.html')
    modify_file(pname + '.html')
    return finallist


r = checkig('インソール')

# JavaScript snippet for the generated HTML — copies the clicked node's
# label to the clipboard:
# network.on('click', function(properties) {
#     var ids = properties.nodes;
#     var clickedNodes = nodes.get(ids);
#     var copyText = clickedNodes[0].label;
#     var promise = navigator.clipboard.writeText(copyText);
# });