"""Scrape Google Trends "related searches" for a keyword and store them.

The script opens the Trends *explore* page in Chrome through selenium-wire,
scans the captured network requests for ``relatedsearches?`` responses,
turns the ranked keywords found there into row dicts and inserts them into
the ``trend_table`` table.
"""
import argparse
import datetime
import gzip
import json
import os
import random
import sys
import time
import traceback
import urllib
import urllib.parse
import zlib

import brotli
import dataset
import docker
import redis
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait, Select
from seleniumwire import webdriver

localrun = False
geo = 'TW'

# NOTE(review): the fallback URL embeds credentials in source control; prefer
# supplying GTRENDS_DB_URL from the environment and rotating this password.
DB_URL = os.environ.get(
    'GTRENDS_DB_URL',
    'mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gtrends?charset=utf8mb4',
)


def send(driver, cmd, params=None):
    """POST a Chromium command to the driver's session and return its value.

    Args:
        driver: WebDriver whose ``session_id`` and ``command_executor`` are used.
        cmd: Command name, e.g. ``"Page.addScriptToEvaluateOnNewDocument"``.
        params: Optional dict of command parameters; defaults to an empty dict.

    Returns:
        The ``'value'`` entry of the executor's response.
    """
    # A fresh dict per call: a ``{}`` default would be shared between calls.
    params = {} if params is None else params
    resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
    # Relies on private attributes of the remote connection object.
    url = driver.command_executor._url + resource
    body = json.dumps({'cmd': cmd, 'params': params})
    response = driver.command_executor._request('POST', url, body)
    return response.get('value')


def add_script(driver, script):
    """Register *script* via ``Page.addScriptToEvaluateOnNewDocument``."""
    send(driver, "Page.addScriptToEvaluateOnNewDocument", {"source": script})


def set_viewport_size(driver, width, height):
    """Resize the window so that the inner viewport is *width* x *height*.

    The browser reports how much larger the outer window is than the inner
    viewport; that difference is added to the requested size.
    """
    window_size = driver.execute_script("""
        return [window.outerWidth - window.innerWidth + arguments[0],
          window.outerHeight - window.innerHeight + arguments[1]];
        """, width, height)
    driver.set_window_size(*window_size)


# docker run -d -p 4445:4444 --name p4445 --add-host=host.docker.internal:172.17.0.1 -v /dev/shm:/dev/shm selenium/standalone-chrome
def init_webdriver():
    """Create a selenium-wire Chrome driver with a 1400x1000 window."""
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument("--no-sandbox")
    # options.add_argument("--headless")  # enable to run without a display
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)
    driver.set_window_size(1400, 1000)
    return driver


def _decode_body(body):
    """Return a captured response body as text, un-gzipping it when possible."""
    try:
        body = gzip.decompress(body)
    except (OSError, EOFError, zlib.error):
        # Not (valid) gzip data: use the bytes as they were captured.
        pass
    return body.decode('utf-8')


def _make_row(query, kw, geo_code):
    """Build one table row from a ranked-keyword entry."""
    return {
        'kw': query,
        'query': kw['query'],
        'value': kw['value'],
        'expand': 0,
        'geo': geo_code,
        'dt': datetime.datetime.now(),
    }


def _parse_related_searches(jstext, query, geo_code):
    """Extract rows from the text of a ``relatedsearches`` response.

    Entries of ``rankedList[0]`` are kept only when their ``hasData`` flag is
    truthy; every entry of ``rankedList[1]`` is kept. Rows collected before a
    structural error are still returned.

    Raises:
        ValueError: if no JSON object can be found/decoded in *jstext*.
    """
    # The payload is preceded by a short non-JSON prefix; skip to the object.
    start = jstext.find('{')
    if start < 0:
        raise ValueError('no JSON object in relatedsearches response')
    jsobj = json.loads(jstext[start:])
    print(jsobj)

    rows = []
    try:
        ranked = jsobj['default']['rankedList']
        for kw in ranked[0]['rankedKeyword']:
            if kw.get('hasData'):
                rows.append(_make_row(query, kw, geo_code))
        for kw in ranked[1]['rankedKeyword']:
            rows.append(_make_row(query, kw, geo_code))
    except (KeyError, IndexError, TypeError):
        # Unexpected response shape: report it and keep what was parsed.
        traceback.print_exc()
    return rows


class SelGTrend:
    """Fetch related-search keywords from Google Trends through a browser."""

    def __init__(self):
        self.texts = []
        self.links = []
        self.results = []
        self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
        self.headers = {'User-Agent': self.user_agent}

    def search(self, key):
        """Replace spaces in *key* with ``+`` and return ``getpage`` rows."""
        # NOTE(review): getpage() percent-encodes this '+' as '%2B', so the
        # page receives a literal plus sign rather than a space - confirm
        # that this is the intended query form.
        self.key = "+".join(key.split(" "))
        return self.getpage(self.key)

    def getpage(self, query):
        """Load the explore page for *query* and collect related searches.

        Args:
            query: Search term; it is URL-quoted before being put in the URL.

        Returns:
            A list of row dicts with keys ``kw``, ``query``, ``value``,
            ``expand``, ``geo`` and ``dt``. Failures are printed and result
            in a shorter (possibly empty) list rather than an exception.
        """
        driver = None
        result = []
        safe_string = urllib.parse.quote_plus(query)
        self.url = 'https://trends.google.com/trends/explore?date=now%207-d&geo=' + geo + '&q=' + safe_string
        try:
            print(self.url)
            driver = init_webdriver()
            driver.get(self.url)
            time.sleep(3)
            driver.refresh()
            time.sleep(4)
            # find_element(By...) works on Selenium 3 and 4; the older
            # find_element_by_* helpers were removed in Selenium 4.
            page_body = driver.find_element(By.CSS_SELECTOR, 'body')
            # Page down repeatedly before the captured requests are read.
            for _ in range(9):
                page_body.send_keys(Keys.PAGE_DOWN)
                time.sleep(0.5)

            for request in driver.requests:
                print(request.url[0:60])
                if not request.response or 'relatedsearches?' not in request.url:
                    continue
                print('*** parsing js:')
                try:
                    jstext = _decode_body(request.response.body)
                    print(jstext)
                    result.extend(_parse_related_searches(jstext, query, geo))
                except ValueError:
                    # Undecodable or non-JSON body (UnicodeDecodeError and
                    # JSONDecodeError are both ValueErrors): skip only this
                    # response and keep scanning the rest.
                    traceback.print_exc()
        except Exception:
            # Best-effort scrape: report the failure, return what we have.
            traceback.print_exc()
        finally:
            # driver stays None when init_webdriver() itself failed.
            if driver is not None:
                driver.quit()
        return result

    def result(self):
        return self.results

    def gettext(self):
        return self.texts

    def getlinks(self):
        return self.links

    def clear(self):
        self.texts = []
        self.links = []
        self.results = []


def save_to_db(json):
    """Insert every row of *json* into ``trend_table``.

    Args:
        json: Iterable of row dicts. The name shadows the ``json`` module
            inside this function; it is kept so keyword callers still work.
    """
    db = dataset.connect(DB_URL)
    table = db['trend_table']
    for row in json:
        table.insert(row)


# Runs on import as well as on direct execution, as the script always has.
sgtrend = SelGTrend()
data = sgtrend.search('稅')
save_to_db(data)
print(data)