# hhh_tree.py
# Builds a keyword network graph (HTML) from Google Search Console query data
# for https://hhh.com.tw/ designer case pages.
  1. from instaloader import Instaloader, Profile
  2. import traceback
  3. import copy
  4. import operator
  5. import dataset
  6. import pandas as pd
  7. import networkx as nx
  8. #import pysftp
  9. import codecs
  10. import pyvis
  11. import sys
  12. import pickle
  13. import os
  14. import searchconsole
  15. from pyvis.network import Network
  16. import jieba
  17. db = dataset.connect('sqlite:///:memory:')
  18. table=db['tmp']
  19. #pname='cont'
  20. #pname='damanwoo'
  21. #pname='drama'
  22. #pname='news'
  23. #pname='www'
  24. #pname='ipromise'
  25. #pname='sports'
  26. pname='rumor'
  27. #pname='korea'
  28. rid=0
  29. def get_css():
  30. fr=codecs.open('jared/data/css.txt','r','utf-8')
  31. lines=fr.readlines()
  32. content=' '.join(lines)
  33. fr.close()
  34. return content
  35. def modify_file(fname):
  36. fr=codecs.open(fname,'r','utf-8')
  37. lines=fr.readlines()
  38. fr.close()
  39. # css=get_css()
  40. css=''
  41. content_output=''
  42. for l in lines:
  43. if '<body>' in l[0:10]:
  44. content_output+=l
  45. content_output+='\n<div id="google">\n'
  46. continue
  47. if '<style type="text' in l[0:22]:
  48. content_output+=l
  49. content_output+="\n"+css+"\n"
  50. continue
  51. if '<div id = "mynetwork"' in l[0:30]:
  52. content_output+=l
  53. content_output+='\n</div>\n'
  54. continue
  55. content_output+=l
  56. fw=codecs.open("mod_"+fname,'w','utf-8')
  57. fw.write(content_output)
  58. fw.close()
  59. def checkig(pgnum):
  60. global instl
  61. global table
  62. global pname
  63. global rid
  64. lst=[]
  65. cntdict={}
  66. codelist={}
  67. idx=0
  68. flag_break=False
  69. fname=os.path.abspath(__file__)
  70. elmts=fname.split(os.path.sep)
  71. path2=os.path.sep.join(elmts[0:-1])
  72. keysdir=path2+os.path.sep+'keys'+os.path.sep
  73. # account = searchconsole.authenticate(client_config='c:/keys/client_secret.json',credentials='c:/keys/credentials.json')
  74. account = searchconsole.authenticate(client_config=keysdir+'client_secret.json',credentials=keysdir+'credentials.json')
  75. # webproperty = account['https://ipromise.com.tw/']
  76. # webproperty = account['https://'+pname+'.face8ook.org/']
  77. # webproperty = account['https://www.damanwoo.com/']
  78. webproperty = account['https://hhh.com.tw/']
  79. # report=webproperty.query.range('2021-03-01', '2021-06-17').dimension('page','query').get()
  80. # report=webproperty.query.range('2021-06-01', '2021-06-17').dimension('page','query').get()
  81. # report=webproperty.query.range('2020-06-01', '2021-06-22').dimension('page','query').filter('page', '/designers/cases/(491|31|293|278|31|24|594|356|307|491|33|385)', 'equals').get()
  82. report=webproperty.query.range('2020-03-01', '2021-06-22').dimension('page','query').filter('page', '/designers/cases/'+pgnum, 'contains').get()
  83. urlq={}
  84. for r in report.rows:
  85. if urlq.get(r[0]) is None:
  86. urlq[r[0]]=[r[1]]
  87. else:
  88. urlq[r[0]].append(r[1])
  89. allrows=[]
  90. for k,v in urlq.items():
  91. # if len(v)<40:
  92. if len(v)<0:
  93. continue
  94. # print(k)
  95. for q in v:
  96. # elmts=jieba.cut(q)
  97. elmts=q.split(' ')
  98. for elmt in elmts:
  99. # print(elmt)
  100. table.insert({'q':elmt,'rid':rid,'url':k})
  101. rid+=1
  102. allrows.append([r[0],r[1] ])
  103. db.commit()
  104. # cursor=db.query('(select q from (select q,count(url) from tmp where length(q)> 2 group by q having count(url) <= 3) as tbl1 )')
  105. def gen_pic():
  106. G=None
  107. # if os.path.exists(pname):
  108. # G = pickle.load( open( pname, "rb" ) )
  109. # else:
  110. # G = nx.Graph()
  111. G = nx.Graph()
  112. finallist=[]
  113. # cursor=db.query('select q,rid,url from tmp where q in (select distinct q from (select q,count(url) from tmp where length(q)> 2 group by q having count(url) <= 3) as tbl1 ) order by q')
  114. # cursor=db.query('select q,rid,url from tmp where q in (select distinct q from (select q,count(url) from tmp where length(q)> 2 group by q having count(url) <= 3) as tbl1 ) order by q')
  115. # cursor=db.query('select q,rid,url from tmp where q in (select distinct q from (select q,count(url) from tmp where length(q)> 2 group by q having ) as tbl1 ) order by q')
  116. cursor=db.query('select q,rid,url from tmp where q in (select distinct q from (select q,count(url) from tmp where length(q)> 1 group by q ) as tbl1 ) order by q')
  117. riddict={}
  118. prev=''
  119. curnode=''
  120. cururl=''
  121. total_idx=0
  122. for c in cursor:
  123. if c['q']!=prev:
  124. cururl=c['url']
  125. prev=c['q']
  126. total_idx+=1
  127. # if total_idx >= 200:
  128. # break
  129. else:
  130. # G.add_edge(cururl,c['url'],weight=3,width=3,borderwidth=3)
  131. # G.add_edge(cururl[40:51],c['url'][40:51],weight=3,width=3,borderwidth=3)
  132. # G.add_edge(c['q'],cururl[40:51],weight=3,width=3,borderwidth=3)
  133. # G.add_edge(c['q'],c['url'][40:51],weight=3,width=3,borderwidth=3)
  134. query=c['q']
  135. rid=c['rid']
  136. if riddict.get(rid) is None:
  137. riddict[rid]={}
  138. riddict[rid][query]=1
  139. else:
  140. if riddict[rid].get(query) is not None:
  141. riddict[rid][query]=1
  142. else:
  143. riddict[rid][query]={}
  144. G.add_edge(c['q'],c['rid'],weight=3,width=3,borderwidth=3)
  145. # G.add_edge(c['q'],c['rid'],weight=3,width=3,borderwidth=3)
  146. pickle.dump( G, open( pname, "wb" ) )
  147. # G2 = [G.subgraph(c).copy() for c in nx.connected_components(G)]
  148. # remove = [node for node,degree in dict(G.degree()).items() if degree <2]
  149. # G.remove_nodes_from(remove)
  150. remove=[]
  151. # for n in G.nodes:
  152. # if '承諾' in n:
  153. # remove.append(n)
  154. # if 'promise' in n:
  155. # remove.append(n)
  156. # G.remove_nodes_from(remove)
  157. G.remove_edges_from(nx.selfloop_edges(G))
  158. G.remove_nodes_from(list(nx.isolates(G)))
  159. # lst= [G.subgraph(c).copy() for c in nx.connected_components(G)]
  160. # lst=[]
  161. # for c in nx.connected_components(G):
  162. # cc=G.subgraph(c).copy()
  163. # if cc.number_of_nodes()>7:
  164. # lst.append(cc)
  165. # if nx.diameter(cc, e=None, usebounds=False)>1:
  166. # lst.append(cc)
  167. # G2=nx.compose_all(lst)
  168. G2=G
  169. # pyG = Network(height="750px", width="100%",bgcolor="#333333",font_color="white")
  170. pyG = Network(height="600px", width="100%",bgcolor="#444444",font_color="white")
  171. pyG.from_nx(G2)
  172. pyG.show(pname+'.html')
  173. modify_file(pname+'.html')
  174. # cnopts = pysftp.CnOpts()
  175. # cnopts.hostkeys = None
  176. # s = pysftp.Connection(host='www.choozmo.com', username='jared', password='sstc5202',cnopts=cnopts)
  177. # local_path = "mod_"+pname+".html"
  178. # remote_path = "/home/nginx/farmoutput/tags/"+"mod_"+pname+".html"
  179. # s.put(local_path, remote_path)
  180. return finallist
  181. #r=checkig('投資')
  182. #r=checkig('保險')
  183. #r=checkig('嘖嘖')
  184. #r=checkig('募資')
  185. #r=checkig('遠赤外線')
  186. #lst=['491','31','293','278','24','594','356','307','491','33','385']
  187. lst=['491','31','293','278','24','594','356','307','491','33','385']
  188. for l in lst:
  189. r=checkig(l)
  190. gen_pic()
  191. #r=checkig('信用卡')
  192. #print(r)
  193. # network.on( 'click', function(properties) {
  194. # var ids = properties.nodes;
  195. # var clickedNodes = nodes.get(ids);
  196. # var copyText = clickedNodes[0].label;
  197. # var promise = navigator.clipboard.writeText(copyText);
  198. #// console.log('clicked nodes:', clickedNodes);
  199. #});