# gtrendtest_jsraw.py
  1. #import urllib.request
  2. import urllib
  3. import requests
  4. import traceback
  5. from bs4 import BeautifulSoup
  6. import json
  7. import os
  8. import time
  9. import sys
  10. import random
  11. from seleniumwire import webdriver
  12. from selenium.webdriver.common.by import By
  13. from selenium.webdriver.support.ui import WebDriverWait, Select
  14. from selenium.webdriver.support import expected_conditions as EC
  15. from selenium.webdriver.common.keys import Keys
  16. from selenium.webdriver.remote.webdriver import WebDriver
  17. import dataset
  18. import docker
  19. import brotli
  20. import gzip
  21. import datetime
  22. import redis
  23. import argparse
  24. #from fp.fp import FreeProxy
  25. localrun=False
  26. geo='TW'
  27. def send(driver, cmd, params={}):
  28. resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
  29. url = driver.command_executor._url + resource
  30. body = json.dumps({'cmd': cmd, 'params': params})
  31. response = driver.command_executor._request('POST', url, body)
  32. # if response['status']:
  33. # raise Exception(response.get('value'))
  34. return response.get('value')
  35. def add_script(driver, script):
  36. send(driver, "Page.addScriptToEvaluateOnNewDocument", {"source": script})
  37. def set_viewport_size(driver, width, height):
  38. window_size = driver.execute_script("""
  39. return [window.outerWidth - window.innerWidth + arguments[0],
  40. window.outerHeight - window.innerHeight + arguments[1]];
  41. """, width, height)
  42. driver.set_window_size(*window_size)
  43. #docker run -d -p 4445:4444 --name p4445 --add-host=host.docker.internal:172.17.0.1 -v /dev/shm:/dev/shm selenium/standalone-chrome
  44. def init_webdriver():
  45. options = webdriver.ChromeOptions()
  46. options.add_argument('--ignore-certificate-errors')
  47. options.add_argument("--no-sandbox")
  48. options.add_argument("--headless")
  49. options.add_argument("--disable-gpu")
  50. options.add_argument("--disable-dev-shm-usage")
  51. driver = webdriver.Chrome(
  52. options=options
  53. )
  54. driver.set_window_size(1400,1000)
  55. return driver
  56. class SelGTrend:
  57. def __init__(self):
  58. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gtrends?charset=utf8mb4')
  59. self.table=db['gtrend_jsraw']
  60. self.yt=False
  61. self.texts = []
  62. self.links = []
  63. self.results = []
  64. self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
  65. self.headers = {'User-Agent': self.user_agent}
  66. def search(self, key):
  67. self.original=key
  68. self.key = "+".join(key.split(" "))
  69. return self.getpage(self.key)
  70. def getpage(self,query):
  71. global geo
  72. driver=None
  73. result=[]
  74. import urllib.parse
  75. safe_string = urllib.parse.quote_plus(query)
  76. if self.yt:
  77. self.url = 'https://trends.google.com/trends/explore?date=now%207-d&geo='+geo+'&gprop=youtube&q='+safe_string
  78. else:
  79. self.url = 'https://trends.google.com/trends/explore?date=now%207-d&geo='+geo+'&q='+safe_string
  80. try:
  81. print(self.url)
  82. driver=init_webdriver()
  83. # driver.add_script('const setProperty = () => { Object.defineProperty(navigator, "webdriver", { get: () => false, }); }; setProperty();')
  84. driver.get(self.url)
  85. time.sleep(3)
  86. driver.refresh()
  87. time.sleep(4)
  88. ub = driver.find_element_by_css_selector('body')
  89. for i in range(9):
  90. ub.send_keys(Keys.PAGE_DOWN)
  91. time.sleep(0.5)
  92. # time.sleep(4)
  93. # driver.save_screenshot("/tmp/screenshot.png")
  94. for request in driver.requests:
  95. print(request.url[0:60])
  96. if request.response:
  97. if 'relatedsearches?' in request.url :
  98. print('*** parsing js:')
  99. resp=request.response.body
  100. data=None
  101. try:
  102. data = gzip.decompress(resp)
  103. except:
  104. traceback.print_exc()
  105. data=resp
  106. jstext=data.decode('utf-8')
  107. print(jstext)
  108. jsobj=json.loads(jstext[6:])
  109. jsobj=jsobj['default']['rankedList']
  110. self.table.insert({'kw':self.original,'dt':datetime.datetime.now(),'json':json.dumps(jsobj, ensure_ascii=False).encode('utf8')})
  111. print(jsobj)
  112. except Exception as e:
  113. traceback.print_exc()
  114. print(e)
  115. pass
  116. driver.quit()
  117. return result
  118. # driver.quit()
  119. def result(self):
  120. return self.results
  121. def gettext(self):
  122. return self.texts
  123. def getlinks(self):
  124. return self.links
  125. def clear(self):
  126. self.texts = []
  127. self.links = []
  128. self.results = []
  129. #sgtrend=SelGTrend()
  130. #data=sgtrend.search('居家')
  131. #data=sgtrend.search('7-11 當機')