gtrendtest_jsraw.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
  1. #import urllib.request
  2. import urllib
  3. import requests
  4. import traceback
  5. from bs4 import BeautifulSoup
  6. import json
  7. import os
  8. import time
  9. import sys
  10. import random
  11. from seleniumwire import webdriver
  12. from selenium.webdriver.common.by import By
  13. from selenium.webdriver.support.ui import WebDriverWait, Select
  14. from selenium.webdriver.support import expected_conditions as EC
  15. from selenium.webdriver.common.keys import Keys
  16. from selenium.webdriver.remote.webdriver import WebDriver
  17. import dataset
  18. import docker
  19. import brotli
  20. import gzip
  21. import datetime
  22. import redis
  23. import argparse
  24. #from fp.fp import FreeProxy
  25. localrun=False
  26. geo='TW'
  27. def send(driver, cmd, params={}):
  28. resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
  29. url = driver.command_executor._url + resource
  30. body = json.dumps({'cmd': cmd, 'params': params})
  31. response = driver.command_executor._request('POST', url, body)
  32. # if response['status']:
  33. # raise Exception(response.get('value'))
  34. return response.get('value')
  35. def add_script(driver, script):
  36. send(driver, "Page.addScriptToEvaluateOnNewDocument", {"source": script})
  37. def set_viewport_size(driver, width, height):
  38. window_size = driver.execute_script("""
  39. return [window.outerWidth - window.innerWidth + arguments[0],
  40. window.outerHeight - window.innerHeight + arguments[1]];
  41. """, width, height)
  42. driver.set_window_size(*window_size)
  43. #docker run -d -p 4445:4444 --name p4445 --add-host=host.docker.internal:172.17.0.1 -v /dev/shm:/dev/shm selenium/standalone-chrome
  44. def init_webdriver():
  45. options = webdriver.ChromeOptions()
  46. options.add_argument('--ignore-certificate-errors')
  47. options.add_argument("--no-sandbox")
  48. # options.add_argument("--headless")h
  49. options.add_argument("--disable-gpu")
  50. options.add_argument("--disable-dev-shm-usage")
  51. driver = webdriver.Chrome(
  52. options=options
  53. )
  54. driver.set_window_size(1400,1000)
  55. return driver
  56. class SelGTrend:
  57. def __init__(self):
  58. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gtrends?charset=utf8mb4')
  59. self.table=db['gtrend_jsraw']
  60. self.yt=False
  61. self.texts = []
  62. self.links = []
  63. self.results = []
  64. self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
  65. self.headers = {'User-Agent': self.user_agent}
  66. def search(self, key):
  67. self.key = "+".join(key.split(" "))
  68. return self.getpage(self.key)
  69. def getpage(self,query):
  70. global geo
  71. driver=None
  72. result=[]
  73. import urllib.parse
  74. safe_string = urllib.parse.quote_plus(query)
  75. if self.yt:
  76. self.url = 'https://trends.google.com/trends/explore?date=now%207-d&geo='+geo+'&gprop=youtube&q='+safe_string
  77. else:
  78. self.url = 'https://trends.google.com/trends/explore?date=now%207-d&geo='+geo+'&q='+safe_string
  79. try:
  80. print(self.url)
  81. driver=init_webdriver()
  82. # driver.add_script('const setProperty = () => { Object.defineProperty(navigator, "webdriver", { get: () => false, }); }; setProperty();')
  83. driver.get(self.url)
  84. time.sleep(3)
  85. driver.refresh()
  86. time.sleep(4)
  87. ub = driver.find_element_by_css_selector('body')
  88. for i in range(9):
  89. ub.send_keys(Keys.PAGE_DOWN)
  90. time.sleep(0.5)
  91. # time.sleep(4)
  92. # driver.save_screenshot("/tmp/screenshot.png")
  93. for request in driver.requests:
  94. print(request.url[0:60])
  95. if request.response:
  96. if 'relatedsearches?' in request.url :
  97. print('*** parsing js:')
  98. resp=request.response.body
  99. data=None
  100. try:
  101. data = gzip.decompress(resp)
  102. except:
  103. traceback.print_exc()
  104. data=resp
  105. jstext=data.decode('utf-8')
  106. print(jstext)
  107. jsobj=json.loads(jstext[6:])
  108. jsobj=jsobj['default']['rankedList']
  109. self.table.insert({'kw':self.key,'dt':datetime.datetime.now(),'json':json.dumps(jsobj, ensure_ascii=False).encode('utf8')})
  110. print(jsobj)
  111. except Exception as e:
  112. traceback.print_exc()
  113. print(e)
  114. pass
  115. driver.quit()
  116. return result
  117. # driver.quit()
  118. def result(self):
  119. return self.results
  120. def gettext(self):
  121. return self.texts
  122. def getlinks(self):
  123. return self.links
  124. def clear(self):
  125. self.texts = []
  126. self.links = []
  127. self.results = []
  128. sgtrend=SelGTrend()
  129. #data=sgtrend.search('居家')
  130. data=sgtrend.search('7-11 當機')