# gtrend_newwire.py
  1. from seleniumwire import webdriver
  2. import time
  3. from selenium.webdriver.support.ui import WebDriverWait
  4. from selenium.webdriver.common.by import By
  5. from selenium.webdriver.support import expected_conditions as EC
  6. from selenium.webdriver.common.keys import Keys
  7. from seleniumwire.utils import decode
  8. import sys
  9. import json
  10. import dataset
  11. import os
  12. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gtrend2?charset=utf8mb4')
  13. table_logs=db['gtrend_logs']
  14. table=db['topics']
  15. singles={}
  16. def init_webdriver():
  17. # os.system('taskkill /f /im chrome.exe')
  18. options = webdriver.ChromeOptions()
  19. options.add_argument("--disable-blink-features=AutomationControlled")
  20. options.add_argument('--ignore-certificate-errors')
  21. options.add_experimental_option("excludeSwitches", ["enable-automation"])
  22. options.add_experimental_option("useAutomationExtension", False)
  23. # options.debugger_address = "127.0.0.1:" + '8888'
  24. # options.add_argument("--no-sandbox")
  25. # options.add_argument("--headless")
  26. # options.add_argument("--incognito")
  27. options.add_argument("--disable-gpu")
  28. options.add_argument("--disable-dev-shm-usage")
  29. options.add_argument("user-data-dir=C:\\Users\\jared\\AppData\\Local\\Google\\Chrome\\User Data\\")
  30. # options.add_argument('--profile-directory=Profile 7')
  31. # options.add_argument('--profile-directory=Profile 47')
  32. options.add_argument('--profile-directory=Default')
  33. # options.add_argument('--profile-directory=Profile 64')
  34. # options.add_argument('--profile-directory=Profile 101')
  35. driver = webdriver.Chrome(
  36. options=options
  37. )
  38. driver.set_window_size(1400,1000)
  39. driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
  40. return driver
  41. def interceptor(request):
  42. global sessionid
  43. global singles
  44. for request in driver.requests:
  45. if 'relatedsearches' in request.url:
  46. if request.response is not None:
  47. rows=[]
  48. if request.response.body is None:
  49. continue
  50. # print(request.response.body)
  51. body = decode(request.response.body,'gzip')
  52. bd=body.decode()
  53. print(body)
  54. bd=bd.replace(r")]}\',\n",'')
  55. bd=bd.replace(r")]}',",'')
  56. bd=bd.encode().decode('unicode-escape')
  57. js=json.loads(bd)
  58. print(js)
  59. rlist=js['default']['rankedList']
  60. for r in rlist:
  61. kws=r['rankedKeyword']
  62. for k in kws:
  63. if k.get('topic') is not None:
  64. if singles.get((sessionid,k['topic']['title'])) is None:
  65. singles[(sessionid,k['topic']['title'])]=1
  66. rows.append({'sessionid':sessionid,'mid':k['topic']['mid'],'query':k['topic']['title'],'type':k['topic']['type'],'value':k['value']})
  67. # table.insert()
  68. print(k['topic'])
  69. print(k['value'])
  70. if k.get('query') is not None:
  71. # table.insert({'query':k['query'],'value':k['value']})
  72. if singles.get((sessionid,k['query'])) is None:
  73. singles[(sessionid,k['query'])]=1
  74. rows.append({'sessionid':sessionid,'query':k['query'],'value':k['value']})
  75. print(k['query'])
  76. print(k['value'])
  77. table.insert_many(rows)
  78. # sys.exit()
  79. # print(bd)
  80. #sessionid='20231014-關鍵字'
  81. #sessionid='20231018-ChoozMo'
  82. #sessionid='20231024-AI'
  83. #sessionid='20231124-HHH'
  84. #sessionid='20231201-HHH'
  85. sessionid='20240119-HHH'
  86. cursor=db.query('select distinct sessionid,query from topics ')
  87. for c in cursor:
  88. singles[(c['sessionid'],c['query'])]=1
  89. driver=init_webdriver()
  90. driver.request_interceptor = interceptor
  91. #driver.get('https://google.com.tw/')
  92. #driver.get('https://trends.google.com.tw/')
  93. #time.sleep(9999)
  94. #driver.get('https://trends.google.com.tw/trends/explore?geo=TW&hl=zh-TW')
  95. #driver.get('https://trends.google.com.tw/trends/')
  96. #time.sleep(3)
  97. #elmt = driver.find_element(By.XPATH, "//textarea[@type='search']")
  98. #time.sleep(1)
  99. #elmt.send_keys('家具')
  100. #elmt.send_keys(Keys.ENTER)
  101. #time.sleep(5)
  102. driver.get('https://trends.google.com.tw/trends/explore?date=now%207-d&geo=TW&hl=zh-TW')
  103. time.sleep(5)
  104. #kw_list=['風水','小坪數','老宅','購屋','買房',]
  105. #kw_list=['鍋','洗衣機','冷氣','除濕機','烘碗機','床墊']
  106. kw_list=['/m/01c979','/g/122rvzch','/g/1q6jh4d9s','/m/0c_jw','/m/0d4wf','/m/0bl2jb','/g/11sr9_h44g','/m/06ht1','/m/03gfsp','/m/06wqb','/g/121kx11r','/m/02cwm','/m/02rfdq','/m/01j2bj','/g/11sr9_mdk7']
  107. #%2Fm%2F01748f
  108. #%2Fm%2F02vkqh8
  109. #'室內裝修'
  110. #%2Fm%2F02z51p
  111. #%2Fm%2F0m8q5
  112. #%2Fm%2F04vct9
  113. #kw_list=['建材']
  114. #kw_list=['/m/0mkz']
  115. #kw_list=['nvidia']
  116. #kw_list=['沙發']
  117. #房價
  118. #kw_list=['系統櫃']
  119. for kw in kw_list:
  120. try:
  121. table_logs.insert({'kw':kw,'sessionid':sessionid})
  122. except:
  123. print('dup')
  124. print(kw)
  125. #elmt = driver.find_element(By.XPATH, "//div[@jsname='E470yf']//input[@aria-label='搜尋']")
  126. elmt = driver.find_element(By.XPATH, "//input[@aria-label='新增搜尋字詞']")
  127. elmt.clear()
  128. for i in range(20):
  129. elmt.send_keys(Keys.BACK_SPACE)
  130. elmt.send_keys(kw)
  131. elmt.send_keys(Keys.ENTER)
  132. time.sleep(11)
  133. time.sleep(9999)
  134. #kw_list=['/g/11j7ys83vr','/g/1yqccwk9n']
  135. #,'/m/019dx1']
  136. #kw_list=['/m/01c979','/g/122rvzch','/g/1q6jh4d9s']
  137. #kw_list=['/m/0c_jw','/m/0d4wf','/m/0bl2jb']
  138. #kw_list=['/g/11sr9_h44g','/m/06ht1','/m/03gfsp']
  139. #kw_list=['/m/06wqb','/g/121kx11r','/m/02cwm']
  140. #kw_list=['/m/02rfdq','/m/01j2bj','/g/11sr9_mdk7']
  141. print(elmt)
  142. time.sleep(1)
  143. #elmt.send_keys(Keys.ENTER)
  144. #elmt.clear()
  145. #ais=['/m/0mkz','/g/11rsc2xsp1']
  146. # 電商'/m/02m96'
  147. #elmt.send_keys('/m/0fy6m3')
  148. #elmt.send_keys('/m/077mq')
  149. #elmt.send_keys(Keys.ENTER)
  150. time.sleep(5)
  151. #time.sleep(9999)
  152. #https://trends.google.com.tw/trends/api/widgetdata/relatedsearches?hl=zh-TW&tz=-480&req=%7B%22restriction%22:%7B%22geo%22:%7B%22country%22:%22TW%22%7D,%22time%22:%222023-10-13T06%5C%5C:10%5C%5C:54+2023-10-14T06%5C%5C:10%5C%5C:54%22,%22originalTimeRangeForExploreUrl%22:%22now+1-d%22,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22%E5%8B%95%E7%89%A9%22%7D%5D%7D%7D,%22keywordType%22:%22QUERY%22,%22metric%22:%5B%22TOP%22,%22RISING%22%5D,%22trendinessSettings%22:%7B%22compareTime%22:%222023-10-12T06%5C%5C:10%5C%5C:54+2023-10-13T06%5C%5C:10%5C%5C:54%22%7D,%22requestOptions%22:%7B%22property%22:%22%22,%22backend%22:%22CM%22,%22category%22:0%7D,%22language%22:%22zh%22,%22userCountryCode%22:%22TW%22,%22userConfig%22:%7B%22userType%22:%22USER_TYPE_LEGIT_USER%22%7D%7D&token=APP6_UEAAAAAZSuCbrHsaUiytOcIA80ZR-ChhKV3nwvA
  153. #driver.get('https://trends.google.com.tw/trends/explore?q=%E5%8F%B0%E7%A9%8D%E9%9B%BB%E9%81%8B%E5%8B%95%E6%9C%83&date=now%201-d&geo=TW&hl=zh-TW')
  154. #time.sleep(9999)