# gtrendtest.py
  1. #import urllib.request
  2. import urllib
  3. import requests
  4. import traceback
  5. from bs4 import BeautifulSoup
  6. import json
  7. import os
  8. import time
  9. import sys
  10. import random
  11. from seleniumwire import webdriver
  12. from selenium.webdriver.common.by import By
  13. from selenium.webdriver.support.ui import WebDriverWait, Select
  14. from selenium.webdriver.support import expected_conditions as EC
  15. from selenium.webdriver.common.keys import Keys
  16. from selenium.webdriver.remote.webdriver import WebDriver
  17. import dataset
  18. import docker
  19. import brotli
  20. import gzip
  21. import datetime
  22. import redis
  23. import argparse
  24. #from fp.fp import FreeProxy
# Not read by any code visible in this file; presumably a local-vs-remote
# run switch -- TODO confirm before relying on it.
localrun=False
# Region code placed in the `geo=` query parameter of the Google Trends URL
# and stored in the 'geo' field of every row built by SelGTrend.getpage.
geo='TW'
  27. def send(driver, cmd, params={}):
  28. resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
  29. url = driver.command_executor._url + resource
  30. body = json.dumps({'cmd': cmd, 'params': params})
  31. response = driver.command_executor._request('POST', url, body)
  32. # if response['status']:
  33. # raise Exception(response.get('value'))
  34. return response.get('value')
  35. def add_script(driver, script):
  36. send(driver, "Page.addScriptToEvaluateOnNewDocument", {"source": script})
  37. def set_viewport_size(driver, width, height):
  38. window_size = driver.execute_script("""
  39. return [window.outerWidth - window.innerWidth + arguments[0],
  40. window.outerHeight - window.innerHeight + arguments[1]];
  41. """, width, height)
  42. driver.set_window_size(*window_size)
  43. #docker run -d -p 4445:4444 --name p4445 --add-host=host.docker.internal:172.17.0.1 -v /dev/shm:/dev/shm selenium/standalone-chrome
  44. def init_webdriver():
  45. options = webdriver.ChromeOptions()
  46. options.add_argument('--ignore-certificate-errors')
  47. options.add_argument("--no-sandbox")
  48. # options.add_argument("--headless")h
  49. options.add_argument("--disable-gpu")
  50. options.add_argument("--disable-dev-shm-usage")
  51. driver = webdriver.Chrome(
  52. options=options
  53. )
  54. driver.set_window_size(1400,1000)
  55. return driver
  56. class SelGTrend:
  57. def __init__(self):
  58. self.texts = []
  59. self.links = []
  60. self.results = []
  61. self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
  62. self.headers = {'User-Agent': self.user_agent}
  63. def search(self, key):
  64. self.key = "+".join(key.split(" "))
  65. return self.getpage(self.key)
  66. def getpage(self,query):
  67. global geo
  68. driver=None
  69. result=[]
  70. import urllib.parse
  71. safe_string = urllib.parse.quote_plus(query)
  72. self.url = 'https://trends.google.com/trends/explore?date=now%207-d&geo='+geo+'&q='+safe_string
  73. try:
  74. print(self.url)
  75. driver=init_webdriver()
  76. # driver.add_script('const setProperty = () => { Object.defineProperty(navigator, "webdriver", { get: () => false, }); }; setProperty();')
  77. driver.get(self.url)
  78. time.sleep(3)
  79. driver.refresh()
  80. time.sleep(4)
  81. ub = driver.find_element_by_css_selector('body')
  82. for i in range(9):
  83. ub.send_keys(Keys.PAGE_DOWN)
  84. time.sleep(0.5)
  85. # time.sleep(4)
  86. # driver.save_screenshot("/tmp/screenshot.png")
  87. for request in driver.requests:
  88. print(request.url[0:60])
  89. if request.response:
  90. if 'relatedsearches?' in request.url :
  91. print('*** parsing js:')
  92. resp=request.response.body
  93. data=None
  94. try:
  95. data = gzip.decompress(resp)
  96. except:
  97. traceback.print_exc()
  98. data=resp
  99. jstext=data.decode('utf-8')
  100. print(jstext)
  101. jsobj=json.loads(jstext[6:])
  102. print(jsobj)
  103. try:
  104. kws=jsobj['default']['rankedList'][0]['rankedKeyword']
  105. for kw in kws:
  106. if kw['hasData']:
  107. try:
  108. result.append({'kw':query,'query':kw['query'],'value':kw['value'],'expand':0,'geo':geo,'dt':datetime.datetime.now()})
  109. # print({'kw':query,'query':kw['query'],'value':kw['value'],'expand':0,'geo':geo,'dt':datetime.datetime.now()})
  110. # trend_table.insert({'kw':query,'query':kw['query'],'value':kw['value'],'expand':0,'geo':geo,'dt':datetime.datetime.now()})
  111. except:
  112. traceback.print_exc()
  113. val=int (kw['value'])
  114. # if val >=150:
  115. # print(kw['query'])
  116. # print(kw['value'])
  117. kws=jsobj['default']['rankedList'][1]['rankedKeyword']
  118. for kw in kws:
  119. try:
  120. result.append({'kw':query,'query':kw['query'],'value':kw['value'],'expand':0,'geo':geo,'dt':datetime.datetime.now()})
  121. # print({'kw':query,'query':kw['query'],'value':kw['value'],'expand':0,'geo':geo,'dt':datetime.datetime.now()})
  122. except:
  123. traceback.print_exc()
  124. val=int (kw['value'])
  125. # if val >=150:
  126. ## print(kw['query'])
  127. # print(kw['value'])
  128. except:
  129. traceback.print_exc()
  130. # print(jsobj['default']['rankedList'])
  131. # resultobj=parsing_js(jstext)
  132. # print("before",datetime.now())
  133. # save_js_to_db(resultobj,area_num,keyword)
  134. # print("after",datetime.now())
  135. # time.sleep(9999)
  136. except Exception as e:
  137. traceback.print_exc()
  138. print(e)
  139. pass
  140. driver.quit()
  141. return result
  142. # driver.quit()
  143. def result(self):
  144. return self.results
  145. def gettext(self):
  146. return self.texts
  147. def getlinks(self):
  148. return self.links
  149. def clear(self):
  150. self.texts = []
  151. self.links = []
  152. self.results = []
  153. def save_to_db(json):
  154. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gtrends?charset=utf8mb4')
  155. table = db['trend_table']
  156. for j in json:
  157. table.insert(j)
# Script body: runs on import/execution of this file.
# Fetch the related keywords for a single hard-coded term, store the
# resulting rows through save_to_db, then print them.
sgtrend=SelGTrend()
data=sgtrend.search('稅')
save_to_db(data)
print(data)