| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197 | 
							- #import urllib.request
 
- import urllib
 
- import requests
 
- import traceback
 
- from bs4 import BeautifulSoup
 
- import json
 
- import os
 
- import time
 
- import sys
 
- import random
 
- from seleniumwire import webdriver
 
- from selenium.webdriver.common.by import By
 
- from selenium.webdriver.support.ui import WebDriverWait, Select
 
- from selenium.webdriver.support import expected_conditions as EC
 
- from selenium.webdriver.common.keys import Keys
 
- from selenium.webdriver.remote.webdriver import WebDriver
 
- import dataset
 
- import docker
 
- import brotli
 
- import gzip
 
- import datetime
 
- import redis
 
- import argparse
 
- #from fp.fp import FreeProxy
 
- localrun=False
 
- geo='TW'
 
def send(driver, cmd, params=None):
    """Post a raw command to the driver's ``send_command_and_get_result`` endpoint.

    The request goes straight through the Selenium command executor to the
    chromium-specific endpoint of the current session, bypassing the regular
    WebDriver command table.

    Args:
        driver: A WebDriver whose ``command_executor`` exposes the private
            ``_url`` attribute and ``_request`` method.
        cmd: Name of the command to run, e.g.
            ``"Page.addScriptToEvaluateOnNewDocument"``.
        params: Optional dict of command parameters. Defaults to an empty
            dict; a fresh one is created per call so no state is shared
            between invocations.

    Returns:
        The ``'value'`` entry of the response, or ``None`` when it is absent.
    """
    if params is None:
        params = {}
    resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
    url = driver.command_executor._url + resource
    body = json.dumps({'cmd': cmd, 'params': params})
    response = driver.command_executor._request('POST', url, body)
    # NOTE: the response status is deliberately not checked here; callers only
    # receive whatever 'value' the endpoint returned.
    return response.get('value')
 
def add_script(driver, script):
    """Ask the browser to evaluate ``script`` on each new document.

    Issues the ``Page.addScriptToEvaluateOnNewDocument`` command through
    ``send`` with the script text as its ``source`` parameter.
    """
    command_params = {"source": script}
    send(driver, "Page.addScriptToEvaluateOnNewDocument", command_params)
 
def set_viewport_size(driver, width, height):
    """Resize the browser window so that its inner viewport is width x height.

    The page is asked how much larger the outer window is than the inner
    viewport; that difference is added to the requested size and the result
    is handed to ``driver.set_window_size``.
    """
    # JavaScript returning [outer width, outer height] needed for the
    # requested inner size (arguments[0] = width, arguments[1] = height).
    outer_size_script = """
        return [window.outerWidth - window.innerWidth + arguments[0],
          window.outerHeight - window.innerHeight + arguments[1]];
        """
    outer_size = driver.execute_script(outer_size_script, width, height)
    driver.set_window_size(*outer_size)
 
- #docker run -d -p 4445:4444  --name p4445  --add-host=host.docker.internal:172.17.0.1  -v /dev/shm:/dev/shm   selenium/standalone-chrome
 
def init_webdriver():
    """Create a Chrome driver with the scraper's standard options.

    Returns:
        A ``webdriver.Chrome`` instance whose window is sized 1400x1000.
    """
    # Headless mode is intentionally not enabled here.
    chrome_flags = (
        '--ignore-certificate-errors',
        "--no-sandbox",
        "--disable-gpu",
        "--disable-dev-shm-usage",
    )
    options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)

    browser = webdriver.Chrome(options=options)
    browser.set_window_size(1400, 1000)
    return browser
 
class SelGTrend:
    """Collect Google Trends related-search keywords with a Selenium browser.

    ``search`` / ``getpage`` open the Trends "explore" page for a keyword,
    scroll it, and then read the captured ``relatedsearches`` responses from
    the driver's recorded requests. The ``texts``, ``links`` and ``results``
    attributes are only initialised and cleared by this class; nothing in it
    fills them.
    """

    # Seconds to wait after the initial page load, before refreshing.
    PAGE_LOAD_WAIT = 3
    # Seconds to wait after the refresh, before scrolling.
    REFRESH_WAIT = 4
    # Number of PAGE_DOWN key presses sent to the page body.
    SCROLL_STEPS = 9
    # Seconds to wait after each PAGE_DOWN key press.
    SCROLL_PAUSE = 0.5

    def __init__(self):
        self.texts = []
        self.links = []
        self.results = []
        self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
        self.headers = {'User-Agent': self.user_agent}

    def search(self, key):
        """Replace spaces in ``key`` with ``+`` and fetch its related searches.

        The joined keyword is stored in ``self.key``.
        NOTE(review): ``getpage`` URL-quotes the value again, so the ``+``
        reaches the site as a literal plus sign - confirm this is intended.

        Returns:
            The list produced by ``getpage``.
        """
        self.key = "+".join(key.split(" "))
        return self.getpage(self.key)

    def getpage(self, query):
        """Load the Trends explore page for ``query`` and harvest keywords.

        Args:
            query: Keyword to look up; it is URL-quoted before being placed
                in the ``q`` parameter. The region comes from the
                module-level ``geo`` setting.

        Returns:
            A list of dicts with the keys ``kw``, ``query``, ``value``,
            ``expand``, ``geo`` and ``dt``. The list is empty (or partial)
            when the browser could not be started or a step failed; failures
            are printed as tracebacks rather than raised.
        """
        import urllib.parse

        result = []
        safe_string = urllib.parse.quote_plus(query)
        self.url = ('https://trends.google.com/trends/explore?date=now%207-d&geo='
                    + geo + '&q=' + safe_string)
        driver = None
        try:
            print(self.url)
            driver = init_webdriver()
            driver.get(self.url)
            time.sleep(self.PAGE_LOAD_WAIT)
            driver.refresh()
            time.sleep(self.REFRESH_WAIT)
            # find_element(By..., ...) works on both Selenium 3 and 4, unlike
            # the removed find_element_by_css_selector helper.
            page_body = driver.find_element(By.CSS_SELECTOR, 'body')
            for _ in range(self.SCROLL_STEPS):
                page_body.send_keys(Keys.PAGE_DOWN)
                time.sleep(self.SCROLL_PAUSE)

            for request in driver.requests:
                print(request.url[0:60])
                if not request.response or 'relatedsearches?' not in request.url:
                    continue
                print('*** parsing js:')
                jstext = self._decode_body(request.response.body)
                print(jstext)
                # The first 6 characters are skipped before JSON parsing -
                # presumably a non-JSON guard prefix on the response; confirm.
                jsobj = json.loads(jstext[6:])
                print(jsobj)
                self._collect_keywords(jsobj, query, result)
        except Exception:
            # Best-effort scrape: report the failure and return what we have.
            traceback.print_exc()
        finally:
            # Always release the browser, but only if it was actually created.
            if driver is not None:
                driver.quit()
        return result

    @staticmethod
    def _decode_body(raw):
        """Return the response body as text, gunzipping it when needed."""
        import zlib

        # Only attempt decompression when the gzip magic number is present,
        # so plain bodies do not produce a spurious traceback.
        if raw[:2] == b'\x1f\x8b':
            try:
                raw = gzip.decompress(raw)
            except (OSError, EOFError, zlib.error):
                traceback.print_exc()
        return raw.decode('utf-8')

    @staticmethod
    def _collect_keywords(jsobj, query, result):
        """Append one row per keyword of the first two ranked lists to ``result``.

        Entries of the first list are kept only when their ``hasData`` flag
        is truthy; entries of the second list are always kept. Rows already
        appended are retained if a later entry or list is malformed.
        """
        try:
            ranked_lists = jsobj['default']['rankedList']
            for index, require_data in ((0, True), (1, False)):
                for kw in ranked_lists[index]['rankedKeyword']:
                    try:
                        if require_data and not kw['hasData']:
                            continue
                        result.append({
                            'kw': query,
                            'query': kw['query'],
                            'value': kw['value'],
                            'expand': 0,
                            'geo': geo,
                            'dt': datetime.datetime.now(),
                        })
                    except (KeyError, TypeError):
                        # Skip a malformed entry but keep processing the rest.
                        traceback.print_exc()
        except (KeyError, IndexError, TypeError):
            traceback.print_exc()

    def result(self):
        """Return the ``results`` list."""
        return self.results

    def gettext(self):
        """Return the ``texts`` list."""
        return self.texts

    def getlinks(self):
        """Return the ``links`` list."""
        return self.links

    def clear(self):
        """Reset ``texts``, ``links`` and ``results`` to empty lists."""
        self.texts = []
        self.links = []
        self.results = []
 
def save_to_db(json):
    """Insert each row of ``json`` into the ``trend_table`` database table.

    Args:
        json: Iterable of row dicts (for example the list returned by
            ``SelGTrend.getpage``). Note that inside this function the
            parameter name shadows the ``json`` module. ``None`` or an empty
            iterable is a no-op and does not open a connection.

    Returns:
        None.
    """
    rows = list(json) if json is not None else []
    if not rows:
        return

    # SECURITY NOTE(review): the connection URL embeds credentials in source
    # code; consider moving them to configuration or the environment.
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gtrends?charset=utf8mb4')
    try:
        table = db['trend_table']
        for row in rows:
            table.insert(row)
    finally:
        # Release the connection even when an insert fails.
        db.close()
 
# Script entry: scrape the related searches for one keyword, store the rows
# in the database, then echo them to stdout.
# NOTE: this runs at import time as well as when executed directly.
sgtrend = SelGTrend()
data = sgtrend.search('稅')
save_to_db(data)
print(data)
 
 
  |