# tredning_search.py (4.6 KB)
  1. #import urllib.request
  2. import urllib
  3. import requests
  4. import traceback
  5. from bs4 import BeautifulSoup
  6. import json
  7. import os
  8. import time
  9. import sys
  10. import random
  11. from seleniumwire import webdriver
  12. from selenium.webdriver.common.by import By
  13. from selenium.webdriver.support.ui import WebDriverWait, Select
  14. from selenium.webdriver.support import expected_conditions as EC
  15. from selenium.webdriver.common.keys import Keys
  16. from selenium.webdriver.remote.webdriver import WebDriver
  17. import dataset
  18. import docker
  19. import datetime
  20. import gzip
  21. #from fp.fp import FreeProxy
  22. def send(driver, cmd, params={}):
  23. resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
  24. url = driver.command_executor._url + resource
  25. body = json.dumps({'cmd': cmd, 'params': params})
  26. response = driver.command_executor._request('POST', url, body)
  27. # if response['status']:
  28. # raise Exception(response.get('value'))
  29. return response.get('value')
  30. def add_script(driver, script):
  31. send(driver, "Page.addScriptToEvaluateOnNewDocument", {"source": script})
  32. def set_viewport_size(driver, width, height):
  33. window_size = driver.execute_script("""
  34. return [window.outerWidth - window.innerWidth + arguments[0],
  35. window.outerHeight - window.innerHeight + arguments[1]];
  36. """, width, height)
  37. driver.set_window_size(*window_size)
  38. def init_webdriver():
  39. # client = docker.from_env()
  40. # ls=client.containers.list()
  41. # print(ls)
  42. # ls[0].restart()
  43. # time.sleep(11)
  44. options = webdriver.ChromeOptions()
  45. options.add_argument("--no-sandbox")
  46. options.add_argument("--disable-dev-shm-usage")
  47. options.add_argument("--headless")
  48. options.add_argument("--incognito")
  49. driver = webdriver.Chrome(options=options)
  50. # driver = webdriver.Remote(
  51. # command_executor='http://127.0.0.1:4444/wd/hub',
  52. # desired_capabilities=options.to_capabilities())
  53. return driver
  54. class SelGTrend:
  55. def __init__(self):
  56. self.texts = []
  57. self.links = []
  58. self.results = []
  59. self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
  60. self.headers = {'User-Agent': self.user_agent}
  61. # self.proxy = FreeProxy().get()
  62. def search(self, key):
  63. self.key = "+".join(key.split(" "))
  64. self.getpage()
  65. def getpage(self, geo):
  66. result=[]
  67. self.url = 'https://trends.google.com/trends/trendingsearches/daily?geo='+geo
  68. try:
  69. print(self.url)
  70. driver=init_webdriver()
  71. # driver.add_script('const setProperty = () => { Object.defineProperty(navigator, "webdriver", { get: () => false, }); }; setProperty();')
  72. driver.get(self.url)
  73. time.sleep(5)
  74. for request in driver.requests:
  75. print(request.url[0:60])
  76. if request.response:
  77. if 'dailytrends?' in request.url :
  78. print('*** parsing js:')
  79. resp=request.response.body
  80. data=None
  81. try:
  82. data = gzip.decompress(resp)
  83. except:
  84. traceback.print_exc()
  85. data=resp
  86. jstext=data.decode('utf-8')
  87. # print(jstext)
  88. jsobj=json.loads(jstext[6:])
  89. return jsobj
  90. # print(jsobj)
  91. except Exception as e:
  92. traceback.print_exc()
  93. print(e)
  94. pass
  95. # driver.quit()
  96. return result
  97. # driver.quit()
  98. def result(self):
  99. return self.results
  100. def gettext(self):
  101. return self.texts
  102. def getlinks(self):
  103. return self.links
  104. def clear(self):
  105. self.texts = []
  106. self.links = []
  107. self.results = []
  108. def save_to_db(js):
  109. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gtrends?charset=utf8mb4')
  110. table = db['trending_search_json']
  111. js=js['default']['trendingSearchesDays'][0]['trendingSearches']
  112. try:
  113. table.insert({'dt':datetime.datetime.now(),'json':json.dumps(js, ensure_ascii=False).encode('utf8')})
  114. except:
  115. print('dup')
  116. # for j in json:
  117. # print(j['title'])
  118. ## print(j['formattedTraffic'])
  119. # print(j['relatedQueries'])
  120. # if j.get('source') is not None:
  121. # print(j['source'])
  122. ## print(json.dumps(j['image']))
  123. # print(j['snippet'])
  124. # print(j)
  125. # table.insert(j)
  126. geo='TW'
  127. sgtrend=SelGTrend()
  128. result=sgtrend.getpage(geo)
  129. #print(result)
  130. save_to_db(result)
  131. #time.sleep(9999)