# gsc_tree.py — build a query/page graph from Google Search Console data.
  1. import traceback
  2. import dataset
  3. import codecs
  4. import sys
  5. import pickle
  6. import os
  7. import searchconsole
  8. import pandas as pd
  9. import networkx as nx
  10. #import pysftp
  11. import codecs
  12. import pyvis
  13. import sys
  14. import pickle
  15. import os
  16. import searchconsole
  17. from pyvis.network import Network
  18. import jieba
  19. #db = dataset.connect('mysql://choozmo:pAssw0rd@127.0.0.1:3306/hhh?charset=utf8mb4')
  20. #db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/hhh?charset=utf8mb4')
  21. #db.begin()
  22. db = dataset.connect('sqlite:///:memory:')
  23. table=db['tmp']
  24. #table=db['gsc_page_query_year']
  25. #pname='korea'
  26. rid=0
  27. def checkig():
  28. global instl
  29. global table
  30. global pname
  31. global rid
  32. lst=[]
  33. cntdict={}
  34. codelist={}
  35. idx=0
  36. flag_break=False
  37. fname=os.path.abspath(__file__)
  38. elmts=fname.split(os.path.sep)
  39. path2=os.path.sep.join(elmts[0:-1])
  40. keysdir=path2+os.path.sep+'../keys'+os.path.sep
  41. account = searchconsole.authenticate(client_config='c:/keys/client_secret.json',credentials='c:/keys/credentials.json')
  42. # account = searchconsole.authenticate(client_config='C:\\gitlab\\kw_tools\\kw_tools\\hhh\\keys\\client_secret.json',credentials='C:\\gitlab\\kw_tools\\kw_tools\\hhh\\keys\\credentials.json')
  43. G = nx.Graph()
  44. # webproperty = account['https://ipromise.com.tw/']
  45. # webproperty = account['sc-domain:face8ook.org']
  46. # webproperty = account['sc-domain:hhh.com.tw']
  47. # webproperty = account['sc-domain:hhh.com.tw']
  48. # webproperty = account['https://www.damanwoo.com/']
  49. webproperty = account['https://innews.com.tw/']
  50. # report=webproperty.query.range('2021-03-01', '2021-06-17').dimension('page','query').get()
  51. # report=webproperty.query.range('2021-06-01', '2021-06-17').dimension('page','query').get()
  52. # report=webproperty.query.range('2020-06-01', '2021-06-22').dimension('page','query').filter('page', '/designers/cases/(491|31|293|278|31|24|594|356|307|491|33|385)', 'equals').get()
  53. # report=webproperty.query.range('2020-03-01', '2021-06-22').dimension('page','query').filter('page', '/designers/cases/'+pgnum, 'contains').get()
  54. # report=webproperty.query.range('2020-03-01', '2021-06-22').dimension('page','query').filter('page', '/designers/cases/'+pgnum, 'contains').get()
  55. report=webproperty.query.range('2022-01-01', '2022-04-16').dimension('page','query').get()
  56. result=[]
  57. rdict={}
  58. total_idx=0
  59. for r in report.rows:
  60. if 'hhh.com.tw/designers/cases/' not in r[0]:
  61. continue
  62. if rdict.get(r[0]) is None:
  63. total_idx+=1
  64. rid=total_idx
  65. rdict[r[0]]=rid
  66. else:
  67. rid=rdict[r[0]]
  68. entry={'page':r[0],'query':r[1],'rid':rid}
  69. result.append(entry)
  70. print('list done')
  71. for r in result:
  72. table.insert(r)
  73. db.commit()
  74. print('db done')
  75. # cursor=db.query('select query as q,page as url,rid from tmp where rid in (select rid from (select rid,count(*) from tmp group by rid having count(*) > 2 and count(*) < 6) as tbl1) order by rid ')
  76. cursor=db.query('select query as q,page as url,rid from tmp order by rid ')
  77. riddict={}
  78. prev=''
  79. curnode=''
  80. cururl=''
  81. total_idx=0
  82. for c in cursor:
  83. G.add_edge(c['q'],c['rid'],weight=3,width=3,borderwidth=3)
  84. remove=[]
  85. G.remove_edges_from(nx.selfloop_edges(G))
  86. G2=G
  87. pyG = Network(height="600px", width="100%",bgcolor="#444444",font_color="white")
  88. pyG.from_nx(G2)
  89. pyG.show('news.html')
  90. r=checkig()