123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161 |
- #import urllib.request
- import urllib
- import requests
- import traceback
- from bs4 import BeautifulSoup
- import json
- import os
- import time
- import sys
- import random
- from seleniumwire import webdriver
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait, Select
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.webdriver.common.keys import Keys
- from selenium.webdriver.remote.webdriver import WebDriver
- import dataset
- import docker
- import datetime
- import gzip
- #from fp.fp import FreeProxy
def send(driver, cmd, params=None):
    """Send a Chromium command through the driver's command executor.

    The command is POSTed as JSON (``{'cmd': ..., 'params': ...}``) to the
    session's ``/chromium/send_command_and_get_result`` endpoint.

    Args:
        driver: Object exposing ``session_id`` and a ``command_executor``
            that provides ``_url`` and ``_request(method, url, body)``.
        cmd: Command name, e.g. ``"Page.addScriptToEvaluateOnNewDocument"``.
        params: Mapping of command parameters.  ``None`` (the default) is
            sent as an empty mapping.

    Returns:
        The ``'value'`` entry of the executor's response, or ``None`` when
        the response has no such entry.
    """
    # A fresh dict per call: a ``{}`` default would be shared by every call.
    if params is None:
        params = {}
    resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
    url = driver.command_executor._url + resource
    body = json.dumps({'cmd': cmd, 'params': params})
    response = driver.command_executor._request('POST', url, body)
    # The response status is deliberately not checked; only the value is used.
    return response.get('value')
def add_script(driver, script):
    """Issue ``Page.addScriptToEvaluateOnNewDocument`` with *script* as source.

    Args:
        driver: Driver object forwarded unchanged to ``send``.
        script: JavaScript source text placed under the ``"source"`` key.
    """
    payload = {"source": script}
    send(driver, "Page.addScriptToEvaluateOnNewDocument", payload)
def set_viewport_size(driver, width, height):
    """Resize the window so that its inner (viewport) size is width x height.

    The page is asked for the difference between the window's outer and
    inner dimensions; that difference is added to the requested viewport
    size and the sum is applied with ``driver.set_window_size``.

    Args:
        driver: Object providing ``execute_script`` and ``set_window_size``.
        width: Desired viewport width.
        height: Desired viewport height.
    """
    measure_js = """
        return [window.outerWidth - window.innerWidth + arguments[0],
          window.outerHeight - window.innerHeight + arguments[1]];
        """
    outer_size = driver.execute_script(measure_js, width, height)
    driver.set_window_size(*outer_size)
def init_webdriver():
    """Create and return a Chrome driver configured by a fixed set of flags.

    The flags are applied in order: ``--no-sandbox``,
    ``--disable-dev-shm-usage``, ``--headless`` and ``--incognito``.

    Returns:
        The ``webdriver.Chrome`` instance built from those options.
    """
    # NOTE: earlier revisions kept (commented out) a variant that restarted a
    # docker container and connected through webdriver.Remote at
    # http://127.0.0.1:4444/wd/hub instead of launching Chrome locally.
    chrome_flags = (
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--headless",
        "--incognito",
    )
    options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)
    return webdriver.Chrome(options=options)
class SelGTrend:
    """Fetch Google Trends daily trending searches from captured browser traffic.

    ``getpage`` opens the trending-searches page in a browser created by
    ``init_webdriver`` and returns the parsed JSON body of the first captured
    request whose URL contains ``dailytrends?``.

    The ``texts``, ``links`` and ``results`` lists start empty and are exposed
    through accessor methods; no method of this class fills them.
    """

    # Number of leading characters dropped from the response text before it
    # is parsed as JSON.
    # NOTE(review): presumably Google's ")]}'," anti-hijacking prefix plus the
    # following newline -- confirm against a live response.
    _JSON_PREFIX_LEN = 6

    # First two bytes of every gzip stream; used to decide whether the
    # captured body still needs decompressing.
    _GZIP_MAGIC = b"\x1f\x8b"

    def __init__(self):
        self.texts = []
        self.links = []
        self.results = []
        self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
        self.headers = {'User-Agent': self.user_agent}

    def search(self, key, geo='TW'):
        """Record *key* (spaces replaced by ``+``) and fetch the page for *geo*.

        ``getpage`` requires a region code; the original call passed none and
        therefore always raised ``TypeError``.  The return value of
        ``getpage`` is discarded here, and ``getpage`` does not read
        ``self.key``.

        Args:
            key: Search phrase; stored on ``self.key`` with ``+`` separators.
            geo: Region code forwarded to ``getpage``.
        """
        self.key = "+".join(key.split(" "))
        self.getpage(geo)

    @staticmethod
    def _decode_payload(body):
        """Turn a captured ``dailytrends`` response body (bytes) into JSON data."""
        # Decompress only bodies that actually are gzip streams, instead of
        # attempting it blindly and dumping a traceback for plain bodies.
        if body[:2] == SelGTrend._GZIP_MAGIC:
            body = gzip.decompress(body)
        text = body.decode('utf-8')
        return json.loads(text[SelGTrend._JSON_PREFIX_LEN:])

    def getpage(self, geo):
        """Load the daily trending-searches page for *geo* and return its data.

        Args:
            geo: Region code appended to the page URL (e.g. ``'TW'``).

        Returns:
            The parsed JSON of the first captured ``dailytrends?`` response,
            or an empty list when no such response was captured or any error
            occurred (errors are printed, not raised).
        """
        self.url = 'https://trends.google.com/trends/trendingsearches/daily?geo=' + geo
        print(self.url)
        driver = None
        try:
            driver = init_webdriver()
            driver.get(self.url)
            # Fixed pause so the page can issue its background requests
            # before the captured traffic is inspected.
            time.sleep(5)
            for request in driver.requests:
                print(request.url[0:60])
                if request.response and 'dailytrends?' in request.url:
                    print('*** parsing js:')
                    return self._decode_payload(request.response.body)
        except Exception as e:
            # Best-effort scrape: report the failure and fall through to the
            # empty result rather than propagating.
            traceback.print_exc()
            print(e)
        finally:
            # Always shut the browser down; otherwise every call leaves a
            # Chrome process behind.
            if driver is not None:
                try:
                    driver.quit()
                except Exception:
                    # A failed shutdown must not mask the scrape result.
                    traceback.print_exc()
        return []

    def result(self):
        """Return the ``results`` list."""
        return self.results

    def gettext(self):
        """Return the ``texts`` list."""
        return self.texts

    def getlinks(self):
        """Return the ``links`` list."""
        return self.links

    def clear(self):
        """Reset ``texts``, ``links`` and ``results`` to new empty lists."""
        self.texts = []
        self.links = []
        self.results = []
# NOTE(review): this connection string embeds a user name and password and is
# committed to source.  It is kept as the default so existing callers keep
# working; pass ``db_url`` explicitly and rotate the credential when possible.
_DEFAULT_DB_URL = 'mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gtrends?charset=utf8mb4'


def save_to_db(js, db_url=_DEFAULT_DB_URL):
    """Insert the first day's trending searches as one JSON row.

    The list found at ``js['default']['trendingSearchesDays'][0]
    ['trendingSearches']`` is serialised with ``json.dumps`` (non-ASCII kept),
    UTF-8 encoded, and inserted into the ``trending_search_json`` table
    together with the current local timestamp.

    Args:
        js: Parsed ``dailytrends`` payload containing the keys named above.
        db_url: Connection URL handed to ``dataset.connect``.

    Raises:
        KeyError, IndexError, TypeError: If *js* lacks the expected structure.
            This is raised before any database connection is opened.
    """
    searches = js['default']['trendingSearchesDays'][0]['trendingSearches']
    row = {
        'dt': datetime.datetime.now(),
        'json': json.dumps(searches, ensure_ascii=False).encode('utf8'),
    }

    db = dataset.connect(db_url)
    table = db['trending_search_json']
    try:
        table.insert(row)
    except Exception as exc:
        # Presumably a duplicate-row rejection (hence the original 'dup'
        # message) -- the actual error is printed as well so that other
        # failures are not mistaken for one.
        print('dup')
        print(exc)
# Script entry: scrape today's trending searches for one region and store them.
geo = 'TW'
sgtrend = SelGTrend()
result = sgtrend.getpage(geo)

# getpage() returns an empty list when nothing was captured or an error was
# printed; save_to_db() would then fail with an opaque TypeError while
# indexing the payload, so stop here with a clear message and a non-zero
# exit status instead.
if not result:
    sys.exit('no dailytrends payload captured for geo=' + geo)

save_to_db(result)
|