# hhh_tree2.py
  1. #from instaloader import Instaloader, Profile
  2. import traceback
  3. import copy
  4. import operator
  5. import dataset
  6. import pandas as pd
  7. import networkx as nx
  8. #import pysftp
  9. import codecs
  10. import pyvis
  11. import sys
  12. import pickle
  13. import os
  14. import searchconsole
  15. from pyvis.network import Network
  16. import jieba
  17. pname='hhh.rb'
  18. db = dataset.connect('sqlite:///'+pname+".db")
  19. def destroy_db():
  20. global db
  21. try:
  22. db.query('drop table tmp')
  23. except:
  24. traceback.print_exc()
  25. table=db['tmp']
  26. #pname='cont'
  27. #pname='damanwoo'
  28. #pname='drama'
  29. #pname='news'
  30. #pname='www'
  31. #pname='ipromise'
  32. #pname='sports'
  33. #pname='rumor'
  34. #pname='korea'
  35. rid=0
  36. def get_css():
  37. fr=codecs.open('jared/data/css.txt','r','utf-8')
  38. lines=fr.readlines()
  39. content=' '.join(lines)
  40. fr.close()
  41. return content
  42. def modify_file(fname):
  43. fr=codecs.open(fname,'r','utf-8')
  44. lines=fr.readlines()
  45. fr.close()
  46. # css=get_css()
  47. css=''
  48. content_output=''
  49. for l in lines:
  50. if '<body>' in l[0:10]:
  51. content_output+=l
  52. content_output+='\n<div id="google">\n'
  53. continue
  54. if '<style type="text' in l[0:22]:
  55. content_output+=l
  56. content_output+="\n"+css+"\n"
  57. continue
  58. if '<div id = "mynetwork"' in l[0:30]:
  59. content_output+=l
  60. content_output+='\n</div>\n'
  61. continue
  62. content_output+=l
  63. fw=codecs.open("mod_"+fname,'w','utf-8')
  64. fw.write(content_output)
  65. fw.close()
def checkig(pgnum):
    """Pull Search Console (page, query) rows and load them into `tmp`.

    Authenticates against Google Search Console with hard-coded key
    paths, fetches every (page, query) pair for a fixed date range,
    splits each query into whitespace-separated words, and inserts one
    row per word into the module-level `tmp` table.

    Args:
        pgnum: designer-case page number; only used by the commented-out
            filtered queries below — the active query ignores it.

    NOTE(review): indentation was reconstructed from a mangled paste;
    confirm `rid+=1` really belongs at the per-page level (gen_pic's
    "count(q) >= 50 per rid" threshold only makes sense that way).
    """
    global instl
    global table
    global pname
    global rid
    # Scratch locals left over from earlier revisions; unused below.
    lst=[]
    cntdict={}
    codelist={}
    idx=0
    flag_break=False
    # Directory of this script, used to build the ./keys credential path.
    fname=os.path.abspath(__file__)
    elmts=fname.split(os.path.sep)
    path2=os.path.sep.join(elmts[0:-1])
    keysdir=path2+os.path.sep+'keys'+os.path.sep
    # account = searchconsole.authenticate(client_config='c:/keys/client_secret.json',credentials='c:/keys/credentials.json')
    # account = searchconsole.authenticate(client_config='c:/keys/client_secret.json',credentials='c:/keys/credentials.json')
    #account = searchconsole.authenticate(client_config='c:/keys/client_secret_162277274609-v1fsq5iscscl7e2ta4a8tc0og5tehl44.apps.googleusercontent.com.json',serialize='out.json')
    # Hard-coded Windows paths: OAuth client secret + previously
    # serialized token ('out.json'); keysdir above is currently unused.
    account = searchconsole.authenticate(client_config='c:/keys/client_secret_162277274609-v1fsq5iscscl7e2ta4a8tc0og5tehl44.apps.googleusercontent.com.json',credentials='c:/keys/out.json')
    #account.redirect_uri = 'https://localhost'
    #http://localhost:8080
    # account = searchconsole.authenticate(client_config=keysdir+'client_secret copy.json',credentials=keysdir+'credentials copy.json')
    print(account.webproperties)
    # sys.exit()
    # webproperty = account['https://ipromise.com.tw/']
    # webproperty = account['https://'+pname+'.face8ook.org/']
    # webproperty = account['https://www.damanwoo.com/']
    # webproperty = account['https://hhh.com.tw/']
    webproperty = account['https://innews.com.tw/']
    # report=webproperty.query.range('2021-03-01', '2021-06-17').dimension('page','query').get()
    # report=webproperty.query.range('2021-06-01', '2021-06-17').dimension('page','query').get()
    # report=webproperty.query.range('2020-06-01', '2021-06-22').dimension('page','query').filter('page', '/designers/cases/(491|31|293|278|31|24|594|356|307|491|33|385)', 'equals').get()
    # report=webproperty.query.range('2020-03-01', '2021-06-22').dimension('page','query').filter('page', '/designers/cases/'+pgnum, 'contains').get()
    report=webproperty.query.range('2022-04-01', '2022-04-16').dimension('page','query').get()
    # Group query strings by page URL: r[0] is the page, r[1] the query.
    urlq={}
    for r in report.rows:
        if urlq.get(r[0]) is None:
            urlq[r[0]]=[r[1]]
        else:
            urlq[r[0]].append(r[1])
    # print(urlq)
    allrows=[]
    for k,v in urlq.items():
        for q in v:
            elmts=q.split(' ')
            for elmt in elmts:
                # One row per query word; all words of this page share rid.
                table.insert({'q':elmt,'rid':rid,'url':k})
        rid+=1
        # NOTE(review): `r` here is the leaked loop variable from the
        # report.rows loop above, so this appends the *last* report row
        # every iteration — looks like a latent bug; allrows is unused.
        allrows.append([r[0],r[1] ])
    db.commit()
  115. def gen_pic():
  116. global db
  117. G=None
  118. # if os.path.exists(pname):
  119. # G = pickle.load( open( pname, "rb" ) )
  120. # else:
  121. # G = nx.Graph()
  122. G = nx.Graph()
  123. finallist=[]
  124. # cursor=db.query('select q,rid,url from tmp where q in (select distinct q from (select q,count(url) from tmp where length(q)> 2 group by q having count(url) <= 3) as tbl1 ) order by q')
  125. # cursor=db.query('select q,rid,url from tmp where q in (select distinct q from (select q,count(url) from tmp where length(q)> 2 group by q having count(url) <= 3) as tbl1 ) order by q')
  126. # cursor=db.query('select q,rid,url from tmp where q in (select distinct q from (select q,count(url) from tmp where length(q)> 2 group by q having ) as tbl1 ) order by q')
  127. # cursor=db.query('select q,rid,url from tmp where q in (select distinct q from (select q,count(url) from tmp where length(q)> 2 group by q ) as tbl1 ) order by q')
  128. # cursor=db.query('select q,rid,url from tmp where q in (select distinct q from (select q,count(url) from tmp where length(q)> 2 group by q having count(url) >=3) as tbl1 ) order by q')
  129. # cursor=db.query('select q,rid,url from tmp where q in (select distinct q from (select q,count(url) from tmp where length(q)> 2 group by q having count(url) >=3) as tbl1 ) order by q')
  130. # cursor=db.query('select q,rid,url from tmp where rid in (select distinct rid from (select rid,count(q) from tmp where length(q)> 2 group by rid having count(q) >=15) as tbl1 ) order by q')
  131. cursor=db.query('select distinct q,rid from tmp where rid in (select distinct rid from (select rid,count(q) from tmp where length(q)> 2 group by rid having count(q) >=50) as tbl1 ) order by q')
  132. riddict={}
  133. prev=''
  134. curnode=''
  135. cururl=''
  136. total_idx=0
  137. cnt=0
  138. for c in cursor:
  139. print(str(c['rid'])+":"+c['q'])
  140. G.add_edge(c['q'],str(c['rid']),weight=3,width=3,borderwidth=3)
  141. cnt+=1
  142. print(cnt)
  143. # pickle.dump( G, open( pname, "wb" ) )
  144. remove=[]
  145. G.remove_edges_from(nx.selfloop_edges(G))
  146. G.remove_nodes_from(list(nx.isolates(G)))
  147. G2=G
  148. pyG = Network(height="600px", width="100%",bgcolor="#444444",font_color="white")
  149. pyG.from_nx(G2)
  150. pyG.show(pname+'.html')
  151. # modify_file(pname+'.html')
  152. return finallist
  153. # cursor=db.query('(select q from (select q,count(url) from tmp where length(q)> 2 group by q having count(url) <= 3) as tbl1 )')
  154. #destroy_db()
  155. #checkig('12')
  156. gen_pic()