old_tree.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258
  1. from instaloader import Instaloader, Profile
  2. import traceback
  3. import copy
  4. import operator
  5. import dataset
  6. import pandas as pd
  7. import networkx as nx
  8. #import pysftp
  9. import codecs
  10. import pyvis
  11. import sys
  12. import pickle
  13. import os
  14. import searchconsole
  15. from pyvis.network import Network
  16. import jieba
  17. import sys
  18. import codecs
  19. import traceback
  20. import requests
  21. import re
  22. import pandas as pd
  23. import random
  24. import urllib
  25. import dataset
  26. import json
  27. import gspread
  28. import datetime
  29. from gspread_pandas import Spread, Client
  30. from oauth2client.service_account import ServiceAccountCredentials
  31. import os
  32. import threading
  33. def save_sheet(df,filename,tabname,startpos='A1'):
  34. scope = ['https://spreadsheets.google.com/feeds',
  35. 'https://www.googleapis.com/auth/drive']
  36. credentials = ServiceAccountCredentials.from_json_keyfile_name('c:\\keys\\spread2.json', scope)
  37. # credentials = ServiceAccountCredentials.from_json_keyfile_name('/var/keys/spread2.json', scope)
  38. gc = gspread.authorize(credentials)
  39. spread = Spread(filename,creds=credentials)
  40. spread.df_to_sheet(df, index=False, sheet=tabname, start=startpos, replace=False)
  41. db = dataset.connect('sqlite:///:memory:')
  42. table=db['tmp']
  43. #pname='cont'
  44. #pname='damanwoo'
  45. #pname='drama'
  46. pname='news'
  47. #pname='www'
  48. #pname='ipromise'
  49. #pname='sports'
  50. #pname='rumor'
  51. #pname='korea'
  52. def get_css():
  53. return ''
  54. # fr=codecs.open('jared/data/css.txt','r','utf-8')
  55. # lines=fr.readlines()
  56. # content=' '.join(lines)
  57. # fr.close()
  58. # return content
  59. def modify_file(fname):
  60. fr=codecs.open(fname,'r','utf-8')
  61. lines=fr.readlines()
  62. fr.close()
  63. css=get_css()
  64. content_output=''
  65. for l in lines:
  66. if '<body>' in l[0:10]:
  67. content_output+=l
  68. content_output+='\n<div id="google">\n'
  69. continue
  70. if '<style type="text' in l[0:22]:
  71. content_output+=l
  72. content_output+="\n"+css+"\n"
  73. continue
  74. if '<div id = "mynetwork"' in l[0:30]:
  75. content_output+=l
  76. content_output+='\n</div>\n'
  77. continue
  78. content_output+=l
  79. fw=codecs.open("mod_"+fname,'w','utf-8')
  80. fw.write(content_output)
  81. fw.close()
  82. def checkig(kw):
  83. global instl
  84. global table
  85. global pname
  86. lst=[]
  87. idx=0
  88. cntdict={}
  89. codelist={}
  90. G=None
  91. # if os.path.exists(pname):
  92. # G = pickle.load( open( pname, "rb" ) )
  93. # else:
  94. # G = nx.Graph()
  95. G = nx.Graph()
  96. finallist=[]
  97. idx=0
  98. flag_break=False
  99. # account = searchconsole.authenticate(client_config='c:/keys/client_secret.json',credentials='c:/keys/credentials.json')
  100. # webproperty = account['https://ipromise.com.tw/']
  101. # webproperty = account['https://'+pname+'.face8ook.org/']
  102. # webproperty = account['https://www.damanwoo.com/']
  103. fname=os.path.abspath(__file__)
  104. elmts=fname.split(os.path.sep)
  105. path2=os.path.sep.join(elmts[0:-1])
  106. keysdir=path2+os.path.sep+'keys'+os.path.sep
  107. account = searchconsole.authenticate(client_config=keysdir+'client_secret.json',credentials=keysdir+'credentials.json')
  108. # webproperty = account['https://ipromise.com.tw/']
  109. # webproperty = account['https://'+pname+'.face8ook.org/']
  110. # webproperty = account['https://www.damanwoo.com/']
  111. webproperty = account['https://hhh.com.tw/']
  112. # report=webproperty.query.range('2021-06-11', '2021-06-18').dimension('page','query').get()
  113. report=webproperty.query.range('2020-06-22', '2021-06-21').dimension('page','query').limit(10000).get()
  114. urlq={}
  115. for r in report.rows:
  116. if urlq.get(r[0]) is None:
  117. urlq[r[0]]=[r[1]]
  118. else:
  119. urlq[r[0]].append(r[1])
  120. allrows=[]
  121. rid=0
  122. for k,v in urlq.items():
  123. # if len(v)<35:
  124. if len(v)<2:
  125. continue
  126. print(len(v))
  127. for q in v:
  128. # elmts=jieba.cut(q)
  129. elmts=q.split(' ')
  130. for elmt in elmts:
  131. # print(elmt)
  132. table.insert({'q':elmt,'rid':rid})
  133. rid+=1
  134. allrows.append([r[0],r[1] ])
  135. db.commit()
  136. cursor=db.query('select q,rid from tmp order by q')
  137. prev=''
  138. curnode=''
  139. df = pd.DataFrame(columns=('rid','query'))
  140. repdict={}
  141. idx=0
  142. for c in cursor:
  143. if c['rid']!=prev:
  144. curnode=c['q']
  145. prev=c['rid']
  146. else:
  147. if repdict.get((curnode,c['q'])) is None:
  148. repdict[(curnode,c['q'])]=1
  149. # repdict[(c['q'],curnode)]=1
  150. df.loc[idx]=[curnode,c['q']]
  151. idx+=1
  152. G.add_edge(curnode,c['q'],weight=3,width=3,borderwidth=3)
  153. pickle.dump( G, open( pname, "wb" ) )
  154. # save_sheet(df,'BigGraph','nodes')
  155. # G2 = [G.subgraph(c).copy() for c in nx.connected_components(G)]
  156. # remove = [node for node,degree in dict(G.degree()).items() if degree <2]
  157. # G.remove_nodes_from(remove)
  158. remove=[]
  159. for n in G.nodes:
  160. if '承諾' in n:
  161. remove.append(n)
  162. if 'promise' in n:
  163. remove.append(n)
  164. G.remove_nodes_from(remove)
  165. G.remove_edges_from(nx.selfloop_edges(G))
  166. G.remove_nodes_from(list(nx.isolates(G)))
  167. # lst= [G.subgraph(c).copy() for c in nx.connected_components(G)]
  168. # lst=[]
  169. # for c in nx.connected_components(G):
  170. # cc=G.subgraph(c).copy()
  171. # if cc.number_of_nodes()>7:
  172. # lst.append(cc)
  173. # if nx.diameter(cc, e=None, usebounds=False)>1:
  174. # lst.append(cc)
  175. # G2=nx.compose_all(lst)
  176. G2=G
  177. # pyG = Network(height="750px", width="100%",bgcolor="#333333",font_color="white")
  178. pyG = Network(height="600px", width="100%",bgcolor="#444444",font_color="white")
  179. pyG.from_nx(G2)
  180. pyG.show(pname+'.html')
  181. modify_file(pname+'.html')
  182. # cnopts = pysftp.CnOpts()
  183. # cnopts.hostkeys = None
  184. # s = pysftp.Connection(host='www.choozmo.com', username='jared', password='sstc5202',cnopts=cnopts)
  185. # local_path = "mod_"+pname+".html"
  186. # remote_path = "/home/nginx/farmoutput/tags/"+"mod_"+pname+".html"
  187. # s.put(local_path, remote_path)
  188. return finallist
  189. #r=checkig('投資')
  190. #r=checkig('保險')
  191. #r=checkig('嘖嘖')
  192. #r=checkig('募資')
  193. #r=checkig('遠赤外線')
  194. r=checkig('インソール')
  195. #r=checkig('信用卡')
  196. #print(r)
  197. # network.on( 'click', function(properties) {
  198. # var ids = properties.nodes;
  199. # var clickedNodes = nodes.get(ids);
  200. # var copyText = clickedNodes[0].label;
  201. # var promise = navigator.clipboard.writeText(copyText);
  202. #// console.log('clicked nodes:', clickedNodes);
  203. #});