123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258 |
- from instaloader import Instaloader, Profile
- import traceback
- import copy
- import operator
- import dataset
- import pandas as pd
- import networkx as nx
- import codecs
- import pyvis
- import sys
- import pickle
- import os
- import searchconsole
- from pyvis.network import Network
- import jieba
- import sys
- import codecs
- import traceback
- import requests
- import re
- import pandas as pd
- import random
- import urllib
- import dataset
- import json
- import gspread
- import datetime
- from gspread_pandas import Spread, Client
- from oauth2client.service_account import ServiceAccountCredentials
- import os
- import threading
def save_sheet(df, filename, tabname, startpos='A1'):
    """Write a DataFrame into one tab of a Google spreadsheet.

    Args:
        df: DataFrame handed to ``Spread.df_to_sheet``; its index is not
            written (``index=False``).
        filename: Spreadsheet identifier passed to ``Spread``.
        tabname: Name of the sheet (tab) to write into.
        startpos: Cell at which writing starts, ``'A1'`` by default.

    The service-account key file path and the OAuth scopes are hard-coded
    below.  ``replace=False`` is passed to ``df_to_sheet``.
    """
    key_file = 'c:\\keys\\spread2.json'
    oauth_scopes = [
        'https://spreadsheets.google.com/feeds',
        'https://www.googleapis.com/auth/drive',
    ]
    creds = ServiceAccountCredentials.from_json_keyfile_name(key_file, oauth_scopes)

    # The authorised gspread client is never used afterwards; the call is
    # kept because the original performed it.
    gspread.authorize(creds)

    sheet_writer = Spread(filename, creds=creds)
    sheet_writer.df_to_sheet(
        df,
        index=False,
        sheet=tabname,
        start=startpos,
        replace=False,
    )
# Scratch in-memory SQLite database; checkig() inserts one row per query term
# into its 'tmp' table and reads them back sorted.
db = dataset.connect('sqlite:///:memory:')
table = db['tmp']

# Base name for the pickled graph file and for the generated '<pname>.html'.
pname = 'news'
def get_css():
    """Return the CSS text that modify_file() injects; currently empty."""
    extra_css = ''
    return extra_css
def modify_file(fname, css=None):
    """Post-process a generated HTML file and save it with a ``mod_`` prefix.

    The file is read as UTF-8 and copied line by line.  Extra text is
    appended directly after three marker lines:

    * a line with ``<body>`` in its first 10 characters is followed by an
      opening ``<div id="google">``;
    * a line with ``<style type="text`` in its first 22 characters is
      followed by the CSS text;
    * a line with ``<div id = "mynetwork"`` in its first 30 characters is
      followed by a closing ``</div>``.

    Args:
        fname: Path of the HTML file to read.
        css: CSS text to inject.  When ``None`` (the default) the result of
            ``get_css()`` is used, as before.

    The result is written next to the input as ``mod_<basename>``.  The
    prefix is applied to the base name only, so a path with a directory part
    no longer yields a bogus ``mod_dir/...`` path.
    """
    if css is None:
        css = get_css()

    # codecs.open is kept (rather than the builtin open) so that line
    # endings pass through untouched, exactly as before.
    with codecs.open(fname, 'r', 'utf-8') as reader:
        lines = reader.readlines()

    pieces = []
    for line in lines:
        pieces.append(line)
        if '<body>' in line[0:10]:
            pieces.append('\n<div id="google">\n')
        elif '<style type="text' in line[0:22]:
            pieces.append("\n" + css + "\n")
        elif '<div id = "mynetwork"' in line[0:30]:
            pieces.append('\n</div>\n')

    folder, base = os.path.split(fname)
    out_path = os.path.join(folder, "mod_" + base)
    with codecs.open(out_path, 'w', 'utf-8') as writer:
        writer.write(''.join(pieces))
def checkig(kw, site_url='https://hhh.com.tw/', start_date='2020-06-22',
            end_date='2021-06-21', row_limit=10000):
    """Build and render a graph of search-query terms from Search Console.

    Steps:
      1. Authenticate against Search Console with the credential files in
         the ``keys`` directory next to this script and fetch
         (page, query) rows for *site_url*.
      2. Group queries per page, skip pages with fewer than two queries,
         split each remaining query on single spaces and insert every term
         into the module-level ``table`` together with a row id.
      3. Read the terms back sorted by term and add a graph edge whenever
         consecutive rows share the same row id.
      4. Pickle the graph to ``pname``, drop unwanted/self-loop/isolated
         nodes, render it with pyvis to ``pname + '.html'`` and post-process
         that file with ``modify_file``.

    Args:
        kw: Unused; kept so existing callers keep working.
        site_url: Search Console property to query.
        start_date: First day of the report range (``YYYY-MM-DD``).
        end_date: Last day of the report range (``YYYY-MM-DD``).
        row_limit: Maximum number of report rows to request.

    Returns:
        An empty list; the function never collects any results.
    """
    graph = nx.Graph()

    # Credential files live in a "keys" directory beside this script.
    keys_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'keys')
    account = searchconsole.authenticate(
        client_config=os.path.join(keys_dir, 'client_secret.json'),
        credentials=os.path.join(keys_dir, 'credentials.json'),
    )
    webproperty = account[site_url]
    report = (
        webproperty.query
        .range(start_date, end_date)
        .dimension('page', 'query')
        .limit(row_limit)
        .get()
    )

    # page URL -> list of queries reported for that page
    queries_by_page = {}
    for row in report.rows:
        queries_by_page.setdefault(row[0], []).append(row[1])

    # NOTE(review): the indentation of the original file was lost.  This
    # assumes the row id advances once per query (after all of its terms
    # have been inserted) - confirm against the intended grouping.
    rid = 0
    for queries in queries_by_page.values():
        if len(queries) < 2:
            continue
        print(len(queries))
        for query in queries:
            for term in query.split(' '):
                table.insert({'q': term, 'rid': rid})
            rid += 1
    db.commit()

    # NOTE(review): rows are ordered by term, yet edges are derived from
    # runs of equal row ids; 'order by rid' may have been intended - left
    # unchanged because it would alter the resulting graph.
    seen_pairs = set()
    prev_rid = ''
    anchor = ''
    for rec in db.query('select q,rid from tmp order by q'):
        if rec['rid'] != prev_rid:
            anchor = rec['q']
            prev_rid = rec['rid']
        elif (anchor, rec['q']) not in seen_pairs:
            seen_pairs.add((anchor, rec['q']))
            graph.add_edge(anchor, rec['q'], weight=3, width=3, borderwidth=3)

    # Snapshot of the unfiltered graph; 'with' guarantees the file is closed.
    with open(pname, "wb") as fh:
        pickle.dump(graph, fh)

    unwanted = [n for n in graph.nodes if '承諾' in n or 'promise' in n]
    graph.remove_nodes_from(unwanted)
    # Materialise the self-loop edges first so the graph is not mutated
    # while the generator producing them is still being consumed.
    graph.remove_edges_from(list(nx.selfloop_edges(graph)))
    graph.remove_nodes_from(list(nx.isolates(graph)))

    net = Network(height="600px", width="100%", bgcolor="#444444", font_color="white")
    net.from_nx(graph)
    net.show(pname + '.html')
    modify_file(pname + '.html')
    return []
# Run the whole pipeline when the script is executed and keep its return
# value in r.
r = checkig('インソール')
|