|
@@ -0,0 +1,197 @@
|
|
|
+#import urllib.request
|
|
|
+import urllib
|
|
|
+import requests
|
|
|
+import traceback
|
|
|
+from bs4 import BeautifulSoup
|
|
|
+import json
|
|
|
+import os
|
|
|
+import time
|
|
|
+import sys
|
|
|
+import random
|
|
|
+from seleniumwire import webdriver
|
|
|
+from selenium.webdriver.common.by import By
|
|
|
+from selenium.webdriver.support.ui import WebDriverWait, Select
|
|
|
+from selenium.webdriver.support import expected_conditions as EC
|
|
|
+from selenium.webdriver.common.keys import Keys
|
|
|
+from selenium.webdriver.remote.webdriver import WebDriver
|
|
|
+import dataset
|
|
|
+import docker
|
|
|
+import brotli
|
|
|
+import gzip
|
|
|
+import datetime
|
|
|
+import redis
|
|
|
+import argparse
|
|
|
+
|
|
|
+#from fp.fp import FreeProxy
|
|
|
+
|
|
|
# Flag for local runs; it is not read anywhere in the visible code.
localrun = False

# Region code inserted as the `geo` query parameter of the Google Trends URL.
geo = 'TW'
|
|
|
+
|
|
|
+
|
|
|
def send(driver, cmd, params=None):
    """Send a raw Chrome DevTools command through the driver's remote connection.

    The command is POSTed as JSON (``{'cmd': cmd, 'params': params}``) to the
    session's ``/chromium/send_command_and_get_result`` endpoint.

    Args:
        driver: WebDriver-like object exposing ``session_id`` and a
            ``command_executor`` that has ``_url`` and ``_request``.
        cmd: DevTools command name, e.g.
            ``"Page.addScriptToEvaluateOnNewDocument"``.
        params: Optional dict of command parameters. An empty dict is sent
            when omitted.

    Returns:
        The ``'value'`` entry of the response, or ``None`` when it is absent.
        The response status is not inspected.
    """
    # Build a fresh dict per call instead of sharing one mutable default.
    if params is None:
        params = {}

    resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
    # NOTE: `_url` and `_request` are private members of the command executor.
    url = driver.command_executor._url + resource
    body = json.dumps({'cmd': cmd, 'params': params})
    response = driver.command_executor._request('POST', url, body)
    return response.get('value')
|
|
|
+
|
|
|
def add_script(driver, script):
    """Issue ``Page.addScriptToEvaluateOnNewDocument`` with *script* as its source.

    Args:
        driver: WebDriver-like object accepted by ``send``.
        script: JavaScript source text passed as the command's ``source``.
    """
    command = "Page.addScriptToEvaluateOnNewDocument"
    payload = {"source": script}
    send(driver, command, payload)
|
|
|
+
|
|
|
def set_viewport_size(driver, width, height):
    """Resize the window so that the inner viewport measures *width* x *height*.

    Args:
        driver: WebDriver-like object with ``execute_script`` and
            ``set_window_size``.
        width: Desired viewport width, passed to the script as ``arguments[0]``.
        height: Desired viewport height, passed as ``arguments[1]``.
    """
    # (outer - inner) is the size of the browser chrome; adding it to the
    # requested viewport size yields the outer window size to set.
    measure_js = """
        return [window.outerWidth - window.innerWidth + arguments[0],
          window.outerHeight - window.innerHeight + arguments[1]];
        """
    outer_size = driver.execute_script(measure_js, width, height)
    driver.set_window_size(*outer_size)
|
|
|
+
|
|
|
+#docker run -d -p 4445:4444 --name p4445 --add-host=host.docker.internal:172.17.0.1 -v /dev/shm:/dev/shm selenium/standalone-chrome
|
|
|
def init_webdriver():
    """Create a Chrome driver with the scraper's flags and a 1400x1000 window.

    Returns:
        The driver built by ``webdriver.Chrome``.
    """
    # Applied in this order; "--headless" is currently not enabled.
    chrome_flags = (
        '--ignore-certificate-errors',
        "--no-sandbox",
        "--disable-gpu",
        "--disable-dev-shm-usage",
    )

    options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)

    browser = webdriver.Chrome(options=options)
    browser.set_window_size(1400, 1000)
    return browser
|
|
|
+
|
|
|
+
|
|
|
class SelGTrend:
    """Fetch Google Trends related-search keywords through a browser session.

    ``search`` opens the Trends "explore" page for a keyword (last 7 days,
    region taken from the module-level ``geo``), scans the requests captured
    by the driver for ``relatedsearches?`` responses and returns their ranked
    keywords as row dicts.
    """

    def __init__(self):
        # Only ``clear`` resets these lists; nothing in this class fills them.
        self.texts = []
        self.links = []
        self.results = []
        # Stored on the instance; ``getpage`` does not apply them to the browser.
        self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
        self.headers = {'User-Agent': self.user_agent}

    def search(self, key):
        """Look up related searches for *key* and return the collected rows.

        Spaces in *key* are replaced by ``+`` before the lookup.
        """
        # NOTE(review): ``getpage`` runs ``quote_plus`` on this value, which
        # encodes the ``+`` as ``%2B`` -- confirm that is the intended query.
        self.key = "+".join(key.split(" "))
        return self.getpage(self.key)

    def getpage(self, query):
        """Load the Trends page for *query* and collect related-search rows.

        Args:
            query: Keyword to look up; it is URL-quoted before use.

        Returns:
            A list of dicts with the keys ``kw``, ``query``, ``value``,
            ``expand``, ``geo`` and ``dt``. Errors are printed rather than
            raised, so the list holds whatever was collected before a failure
            (possibly nothing).
        """
        # The file-level ``import urllib`` does not load the ``parse`` submodule.
        import urllib.parse

        rows = []
        driver = None
        safe_string = urllib.parse.quote_plus(query)
        self.url = 'https://trends.google.com/trends/explore?date=now%207-d&geo=' + geo + '&q=' + safe_string

        try:
            print(self.url)
            driver = init_webdriver()
            driver.get(self.url)
            # Fixed waits around a reload; presumably to let the page finish
            # issuing its data requests.
            time.sleep(3)
            driver.refresh()
            time.sleep(4)

            # ``find_element_by_css_selector`` no longer exists in Selenium 4.3+.
            page_body = driver.find_element(By.CSS_SELECTOR, 'body')
            for _ in range(9):
                page_body.send_keys(Keys.PAGE_DOWN)
                time.sleep(0.5)

            for request in driver.requests:
                print(request.url[0:60])
                if not request.response or 'relatedsearches?' not in request.url:
                    continue
                print('*** parsing js:')
                jstext = self._decode_body(request.response.body).decode('utf-8')
                print(jstext)
                jsobj = self._parse_payload(jstext)
                print(jsobj)
                rows.extend(self._collect_rows(jsobj, query, geo))
        except Exception as e:
            # Best-effort scrape: report the failure and return what we have.
            traceback.print_exc()
            print(e)
        finally:
            # The driver is still None when ``init_webdriver`` itself failed.
            if driver is not None:
                driver.quit()
        return rows

    @staticmethod
    def _decode_body(body):
        """Return *body* gunzipped when it starts with the gzip magic bytes."""
        # NOTE(review): only gzip is handled here; other content encodings
        # (e.g. brotli) are passed through unchanged.
        if body[:2] != b'\x1f\x8b':
            return body
        try:
            return gzip.decompress(body)
        except Exception:
            # Deliberate best-effort: fall back to the raw bytes.
            traceback.print_exc()
            return body

    @staticmethod
    def _parse_payload(text):
        """Parse the JSON object in *text*, skipping any prefix before ``{``."""
        start = text.find('{')
        if start < 0:
            raise ValueError('no JSON object found in relatedsearches response')
        return json.loads(text[start:])

    @staticmethod
    def _make_row(kw, query, geo):
        """Build one result row from a ranked-keyword entry."""
        return {
            'kw': query,
            'query': kw['query'],
            'value': kw['value'],
            'expand': 0,
            'geo': geo,
            'dt': datetime.datetime.now(),
        }

    @staticmethod
    def _collect_rows(payload, query, geo):
        """Turn the first two ranked lists of *payload* into row dicts.

        Entries of the first list are kept only when ``hasData`` is truthy;
        every entry of the second list is kept. A malformed payload is
        reported and ends collection early, returning the rows built so far.
        """
        rows = []
        try:
            ranked = payload['default']['rankedList']
            for kw in ranked[0]['rankedKeyword']:
                if not kw['hasData']:
                    continue
                try:
                    rows.append(SelGTrend._make_row(kw, query, geo))
                except KeyError:
                    traceback.print_exc()
            for kw in ranked[1]['rankedKeyword']:
                try:
                    rows.append(SelGTrend._make_row(kw, query, geo))
                except KeyError:
                    traceback.print_exc()
        except (KeyError, IndexError, TypeError):
            traceback.print_exc()
        return rows

    def result(self):
        """Return the ``results`` list (not filled by ``search``/``getpage``)."""
        return self.results

    def gettext(self):
        """Return the ``texts`` list (not filled by ``search``/``getpage``)."""
        return self.texts

    def getlinks(self):
        """Return the ``links`` list (not filled by ``search``/``getpage``)."""
        return self.links

    def clear(self):
        """Reset ``texts``, ``links`` and ``results`` to empty lists."""
        self.texts = []
        self.links = []
        self.results = []
|
|
|
+
|
|
|
+
|
|
|
def save_to_db(json, db_url=None):
    """Insert trend rows into the ``trend_table`` table.

    Args:
        json: Iterable of row dicts to insert (``None`` is treated as empty).
            The name shadows the ``json`` module inside this function; it is
            kept so existing keyword callers keep working.
        db_url: Optional database URL. When omitted, the ``GTRENDS_DB_URL``
            environment variable is used, then the built-in default.
    """
    rows = list(json or [])
    if not rows:
        # Nothing to store: do not open a database connection at all.
        return

    if db_url is None:
        # NOTE(security): the fallback URL embeds credentials in source and is
        # kept only for backward compatibility; prefer GTRENDS_DB_URL or db_url.
        db_url = os.environ.get(
            'GTRENDS_DB_URL',
            'mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gtrends?charset=utf8mb4',
        )

    db = dataset.connect(db_url)
    # One batched insert instead of one statement per row.
    db['trend_table'].insert_many(rows)
|
|
|
+
|
|
|
+
|
|
|
# Script body (runs on import as well): look up related searches for the
# keyword '稅', store the rows in the database, then print them.
sgtrend = SelGTrend()
data = sgtrend.search('稅')

save_to_db(data)
print(data)
|