# hhh_showgraph.py

import codecs
import copy
import operator
import os
import pickle
import sys
import traceback
from collections import defaultdict

import dataset
import jieba
import networkx as nx
import pandas as pd
import pyvis
import searchconsole
from instaloader import Instaloader, Profile
from pyvis.network import Network
#import pysftp
  17. pname='hhh.rb'
  18. db = dataset.connect('sqlite:///'+pname+".db")
  19. table=db['tmp']
  20. #pname='cont'
  21. #pname='damanwoo'
  22. #pname='drama'
  23. #pname='news'
  24. #pname='www'
  25. #pname='ipromise'
  26. #pname='sports'
  27. #pname='rumor'
  28. #pname='korea'
  29. pname='hhh.rb'
  30. rid=0
  31. def get_css():
  32. fr=codecs.open('jared/data/css.txt','r','utf-8')
  33. lines=fr.readlines()
  34. content=' '.join(lines)
  35. fr.close()
  36. return content
  37. def modify_file(fname):
  38. fr=codecs.open(fname,'r','utf-8')
  39. lines=fr.readlines()
  40. fr.close()
  41. # css=get_css()
  42. css=''
  43. content_output=''
  44. for l in lines:
  45. if '<body>' in l[0:10]:
  46. content_output+=l
  47. content_output+='\n<div id="google">\n'
  48. continue
  49. if '<style type="text' in l[0:22]:
  50. content_output+=l
  51. content_output+="\n"+css+"\n"
  52. continue
  53. if '<div id = "mynetwork"' in l[0:30]:
  54. content_output+=l
  55. content_output+='\n</div>\n'
  56. continue
  57. content_output+=l
  58. fw=codecs.open("mod_"+fname,'w','utf-8')
  59. fw.write(content_output)
  60. fw.close()
  61. def gen_pic():
  62. G=None
  63. # if os.path.exists(pname):
  64. # G = pickle.load( open( pname, "rb" ) )
  65. # else:
  66. # G = nx.Graph()
  67. G = nx.Graph()
  68. finallist=[]
  69. # cursor=db.query('select q,rid,url from tmp where q in (select distinct q from (select q,count(url) from tmp where length(q)> 2 group by q having count(url) <= 3) as tbl1 ) order by q')
  70. # cursor=db.query('select q,rid,url from tmp where q in (select distinct q from (select q,count(url) from tmp where length(q)> 2 group by q having count(url) <= 3) as tbl1 ) order by q')
  71. # cursor=db.query('select q,rid,url from tmp where q in (select distinct q from (select q,count(url) from tmp where length(q)> 2 group by q having ) as tbl1 ) order by q')
  72. # cursor=db.query('select q,rid,url from tmp where q in (select distinct q from (select q,count(url) from tmp where length(q)> 2 group by q ) as tbl1 ) order by q')
  73. # cursor=db.query('select q,rid,url from tmp where q in (select distinct q from (select q,count(url) from tmp where length(q)> 2 group by q having count(url) >20) as tbl1 ) order by q')
  74. # cursor=db.query('select distinct q,url from tmp where url in (select distinct url from (select url,count(q) from tmp where length(q)> 2 group by url having count(q) >5 and count(q)<10 ) as tbl1) order by url')
  75. cursor=db.query('select distinct q,url from tmp where url in (select distinct url from (select url,count(q) from tmp where length(q)> 2 group by url having count(q) >4 and count(q)<9 ) as tbl1) order by url')
  76. urldict={}
  77. for c in cursor:
  78. url=c['url'].replace('https://www.hhh.com.tw','')
  79. url=url.replace('https://hhh.com.tw','')
  80. url=url.replace('https://m.hhh.com.tw','')
  81. q=c['q']
  82. if urldict.get(url) is None:
  83. urldict[url]=[q]
  84. else:
  85. urldict[url].append(q)
  86. print(len(urldict.keys()))
  87. cnt=0
  88. for k,v in urldict.items():
  89. if len(v)>=2:
  90. for itm in v:
  91. G.add_edge(k,itm,weight=3,width=3,borderwidth=3)
  92. cnt+=1
  93. if cnt%1000 == 0:
  94. print(cnt)
  95. # G.add_edge(c['q'],c['rid'],weight=3,width=3,borderwidth=3)
  96. # pickle.dump( G, open( pname, "wb" ) )
  97. # G2 = [G.subgraph(c).copy() for c in nx.connected_components(G)]
  98. # remove = [node for node,degree in dict(G.degree()).items() if degree <2]
  99. # G.remove_nodes_from(remove)
  100. remove=[]
  101. # for n in G.nodes:
  102. # if '承諾' in n:
  103. # remove.append(n)
  104. # if 'promise' in n:
  105. # remove.append(n)
  106. # G.remove_nodes_from(remove)
  107. to_remove=[]
  108. for n in G:
  109. dg=G.degree(n)
  110. if dg > 15:
  111. to_remove.append(n)
  112. G.remove_nodes_from(to_remove)
  113. G.remove_edges_from(nx.selfloop_edges(G))
  114. G.remove_nodes_from(list(nx.isolates(G)))
  115. # lst= [G.subgraph(c).copy() for c in nx.connected_components(G)]
  116. # lst=[]
  117. # for c in nx.connected_components(G):
  118. # cc=G.subgraph(c).copy()
  119. # if cc.number_of_nodes()>7:
  120. # lst.append(cc)
  121. # if nx.diameter(cc, e=None, usebounds=False)>1:
  122. # lst.append(cc)
  123. # G2=nx.compose_all(lst)
  124. G2=G
  125. # pyG = Network(height="750px", width="100%",bgcolor="#333333",font_color="white")
  126. pyG = Network(height="600px", width="100%",bgcolor="#444444",font_color="white")
  127. pyG.from_nx(G2)
  128. pyG.show(pname+'.html')
  129. modify_file(pname+'.html')
  130. # cnopts = pysftp.CnOpts()
  131. # cnopts.hostkeys = None
  132. # s = pysftp.Connection(host='www.choozmo.com', username='jared', password='sstc5202',cnopts=cnopts)
  133. # local_path = "mod_"+pname+".html"
  134. # remote_path = "/home/nginx/farmoutput/tags/"+"mod_"+pname+".html"
  135. # s.put(local_path, remote_path)
  136. return finallist
  137. gen_pic()
  138. #r=checkig('信用卡')
  139. #print(r)
  140. # network.on( 'click', function(properties) {
  141. # var ids = properties.nodes;
  142. # var clickedNodes = nodes.get(ids);
  143. # var copyText = clickedNodes[0].label;
  144. # var promise = navigator.clipboard.writeText(copyText);
  145. #// console.log('clicked nodes:', clickedNodes);
  146. #});