tredning_search.py

#import urllib.request
import urllib
import requests
import traceback
from bs4 import BeautifulSoup
import json
import os
import time
import sys
import random
from seleniumwire import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.remote.webdriver import WebDriver
import dataset
import docker
import datetime
import gzip
#from fp.fp import FreeProxy


def send(driver, cmd, params={}):
    """Send a raw Chrome DevTools Protocol command and return its result."""
    resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
    url = driver.command_executor._url + resource
    body = json.dumps({'cmd': cmd, 'params': params})
    response = driver.command_executor._request('POST', url, body)
    # if response['status']:
    #     raise Exception(response.get('value'))
    return response.get('value')


def add_script(driver, script):
    """Register a script to run before every new document loads."""
    send(driver, "Page.addScriptToEvaluateOnNewDocument", {"source": script})


def set_viewport_size(driver, width, height):
    """Resize the browser window so the inner viewport matches width x height."""
    window_size = driver.execute_script("""
        return [window.outerWidth - window.innerWidth + arguments[0],
                window.outerHeight - window.innerHeight + arguments[1]];
        """, width, height)
    driver.set_window_size(*window_size)


def init_webdriver():
    # client = docker.from_env()
    # ls = client.containers.list()
    # print(ls)
    # ls[0].restart()
    # time.sleep(11)
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(options=options)
    # driver = webdriver.Remote(
    #     command_executor='http://127.0.0.1:4444/wd/hub',
    #     desired_capabilities=options.to_capabilities())
    return driver
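
# Example use of the helpers above (a sketch only; nothing in this script calls
# them directly, and the injected JS mirrors the commented-out
# navigator.webdriver override inside getpage() below):
# driver = init_webdriver()
# set_viewport_size(driver, 1280, 720)
# add_script(driver, 'Object.defineProperty(navigator, "webdriver", {get: () => false});')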


class SelGTrend:
    def __init__(self):
        self.texts = []
        self.links = []
        self.results = []
        self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
        self.headers = {'User-Agent': self.user_agent}
        # self.proxy = FreeProxy().get()

    def search(self, key, geo):
        self.key = "+".join(key.split(" "))
        return self.getpage(geo)

    def getpage(self, geo):
        result = []
        self.url = 'https://trends.google.com/trends/trendingsearches/daily?geo=' + geo
        driver = None
        try:
            print(self.url)
            driver = init_webdriver()
            # add_script(driver, 'const setProperty = () => { Object.defineProperty(navigator, "webdriver", { get: () => false, }); }; setProperty();')
            driver.get(self.url)
            time.sleep(5)
            # selenium-wire records every network request the page made;
            # pick out the XHR that carries the daily-trends JSON payload.
            for request in driver.requests:
                print(request.url[0:60])
                if request.response and 'dailytrends?' in request.url:
                    print('*** parsing js:')
                    resp = request.response.body
                    data = None
                    try:
                        data = gzip.decompress(resp)
                    except Exception:
                        traceback.print_exc()
                        data = resp  # body was not gzip-compressed
                    jstext = data.decode('utf-8')
                    # print(jstext)
                    # drop the anti-JSON-hijacking prefix (first 6 characters) before parsing
                    jsobj = json.loads(jstext[6:])
                    return jsobj
        except Exception as e:
            traceback.print_exc()
            print(e)
        finally:
            if driver is not None:
                driver.quit()
        return result

    def result(self):
        return self.results

    def gettext(self):
        return self.texts

    def getlinks(self):
        return self.links

    def clear(self):
        self.texts = []
        self.links = []
        self.results = []


def save_to_db(js):
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gtrends?charset=utf8mb4')
    table = db['trending_search_json']
    # keep only the list of trending searches for the most recent day
    js = js['default']['trendingSearchesDays'][0]['trendingSearches']
    table.insert({'dt': datetime.datetime.now(), 'json': json.dumps(js, ensure_ascii=False).encode('utf8')})
    # for j in js:
    #     print(j['title'])
    #     # print(j['formattedTraffic'])
    #     print(j['relatedQueries'])
    #     if j.get('source') is not None:
    #         print(j['source'])
    #     # print(json.dumps(j['image']))
    #     print(j['snippet'])
    #     print(j)
    #     table.insert(j)


geo = 'TW'
sgtrend = SelGTrend()
result = sgtrend.getpage(geo)
#print(result)
save_to_db(result)
#time.sleep(9999)
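
# Minimal multi-region sketch (commented out so the script's behavior is
# unchanged; the geo codes are illustrative examples, not values taken from this file):
# for g in ['TW', 'US', 'JP']:
#     trends = SelGTrend().getpage(g)
#     if trends:
#         save_to_db(trends)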