Jared 3 years ago
parent
commit
5bd6318d80
5 changed files with 601 additions and 0 deletions
  1. 197 0
      gtrends/gtrendtest.py
  2. 157 0
      gtrends/gtrendtest_jsraw.py
  3. 51 0
      gtrends/process_gtrend.py
  4. 42 0
      gtrends/process_trends.py
  5. 154 0
      gtrends/tredning_search.py

+ 197 - 0
gtrends/gtrendtest.py

@@ -0,0 +1,197 @@
+#import urllib.request
+import urllib
+import requests
+import traceback
+from bs4 import BeautifulSoup
+import json
+import os
+import time
+import sys
+import random
+from seleniumwire import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait, Select
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.remote.webdriver import WebDriver
+import dataset
+import docker
+import brotli
+import gzip
+import datetime
+import redis
+import argparse
+
+#from fp.fp import FreeProxy
+
+# Module-level settings.
+# `localrun` is not referenced by any other code shown in this file.
+localrun=False
+# Region code; SelGTrend.getpage puts it in the URL's `geo=` parameter and
+# stores it in every result row.
+geo='TW'
+
+
+def send(driver, cmd, params=None):
+    """Send a raw Chromium command through the driver's command executor.
+
+    POSTs the JSON body ``{'cmd': cmd, 'params': params}`` to the session's
+    ``/chromium/send_command_and_get_result`` endpoint.
+
+    Args:
+        driver: object exposing ``session_id`` and ``command_executor``.
+        cmd: command name.
+        params: dict of command parameters; ``None`` means an empty dict
+            (avoids sharing one mutable default dict between calls).
+
+    Returns:
+        The ``'value'`` entry of the response, or ``None`` if it is absent.
+    """
+    if params is None:
+        params = {}
+    resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
+    # NOTE: relies on the private `_url` / `_request` members of the executor.
+    url = driver.command_executor._url + resource
+    body = json.dumps({'cmd': cmd, 'params': params})
+    response = driver.command_executor._request('POST', url, body)
+    # The response 'status' field is deliberately not checked here.
+    return response.get('value')
+
+def add_script(driver, script):
+    """Issue ``Page.addScriptToEvaluateOnNewDocument`` with `script` as its source."""
+    command_params = {"source": script}
+    send(driver, "Page.addScriptToEvaluateOnNewDocument", command_params)
+
+def set_viewport_size(driver, width, height):
+    """Resize the window so that its inner viewport becomes width x height.
+
+    An in-page script adds the current difference between the outer and inner
+    window size to the requested dimensions; the resulting pair is handed to
+    ``driver.set_window_size``.
+    """
+    outer_size_js = """
+        return [window.outerWidth - window.innerWidth + arguments[0],
+          window.outerHeight - window.innerHeight + arguments[1]];
+        """
+    target_size = driver.execute_script(outer_size_js, width, height)
+    driver.set_window_size(*target_size)
+
+#docker run -d -p 4445:4444  --name p4445  --add-host=host.docker.internal:172.17.0.1  -v /dev/shm:/dev/shm   selenium/standalone-chrome
+def init_webdriver():
+    """Create and return a selenium-wire Chrome driver with a 1400x1000 window.
+
+    The driver is started locally via ``webdriver.Chrome``; the docker command
+    in the comment above is not used by this function.
+    """
+    options = webdriver.ChromeOptions()
+    options.add_argument('--ignore-certificate-errors')
+    options.add_argument("--no-sandbox")
+    # Headless mode is currently disabled.
+#    options.add_argument("--headless")
+    options.add_argument("--disable-gpu")
+    options.add_argument("--disable-dev-shm-usage")
+    driver = webdriver.Chrome(
+        options=options
+    )
+    driver.set_window_size(1400,1000)
+
+    return driver
+
+
+class SelGTrend:
+    """Collect related-search keywords from the Google Trends explore page.
+
+    A browser session created by ``init_webdriver`` loads the explore page;
+    the captured responses whose URL contains ``relatedsearches?`` are parsed
+    into row dicts.
+    """
+
+    def __init__(self):
+        self.texts = []
+        self.links = []
+        self.results = []
+        self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
+        self.headers = {'User-Agent': self.user_agent}
+
+    def search(self, key):
+        """Replace spaces in `key` with '+' and return ``getpage`` for it."""
+        self.key = "+".join(key.split(" "))
+        return self.getpage(self.key)
+
+    def _make_row(self, query, kw):
+        """Build the row dict for one ranked-keyword entry `kw`."""
+        return {'kw':query,'query':kw['query'],'value':kw['value'],'expand':0,'geo':geo,'dt':datetime.datetime.now()}
+
+    def _collect_rows(self, query, keywords, result, require_data):
+        """Append one row per entry of `keywords` to `result`.
+
+        With `require_data` set, entries whose 'hasData' is falsy are skipped.
+        An entry that cannot be turned into a row is reported and skipped.
+        """
+        for kw in keywords:
+            if require_data and not kw['hasData']:
+                continue
+            try:
+                result.append(self._make_row(query, kw))
+            except Exception:
+                traceback.print_exc()
+
+    def getpage(self,query):
+        """Load the explore page for `query` and return the parsed rows.
+
+        Returns:
+            A list of dicts with keys 'kw', 'query', 'value', 'expand',
+            'geo' and 'dt'. Errors are printed, not raised; whatever was
+            collected before the error is returned.
+        """
+        import urllib.parse
+        driver=None
+        result=[]
+        safe_string = urllib.parse.quote_plus(query)
+        self.url = 'https://trends.google.com/trends/explore?date=now%207-d&geo='+geo+'&q='+safe_string
+
+        try:
+            print(self.url)
+            driver=init_webdriver()
+            driver.get(self.url)
+            time.sleep(3)
+            driver.refresh()
+            time.sleep(4)
+
+            # find_element(By...) replaces the removed find_element_by_* API.
+            ub = driver.find_element(By.CSS_SELECTOR, 'body')
+            # Page down repeatedly; presumably this triggers the lazily
+            # loaded related-search widgets - TODO confirm.
+            for i in range(9):
+                ub.send_keys(Keys.PAGE_DOWN)
+                time.sleep(0.5)
+
+            for request in driver.requests:
+                print(request.url[0:60])
+                if not request.response or 'relatedsearches?' not in request.url:
+                    continue
+                print('*** parsing js:')
+                resp=request.response.body
+                try:
+                    data = gzip.decompress(resp)
+                except Exception:
+                    # Not gzip data: fall back to the raw body.
+                    traceback.print_exc()
+                    data=resp
+
+                jstext=data.decode('utf-8')
+                print(jstext)
+
+                # The first 6 characters are skipped before JSON parsing
+                # (presumably a non-JSON response prefix - confirm).
+                jsobj=json.loads(jstext[6:])
+                print(jsobj)
+                try:
+                    ranked=jsobj['default']['rankedList']
+                    # Only the first list is filtered on 'hasData'.
+                    self._collect_rows(query, ranked[0]['rankedKeyword'], result, True)
+                    self._collect_rows(query, ranked[1]['rankedKeyword'], result, False)
+                except Exception:
+                    traceback.print_exc()
+        except Exception as e:
+            traceback.print_exc()
+            print(e)
+        finally:
+            # driver stays None when init_webdriver() itself failed.
+            if driver is not None:
+                driver.quit()
+        return result
+
+    def result(self):
+        return self.results
+
+    def gettext(self):
+        return self.texts
+
+    def getlinks(self):
+        return self.links
+
+    def clear(self):
+        """Reset the texts, links and results lists."""
+        self.texts = []
+        self.links = []
+        self.results = []
+
+
+def save_to_db(json):
+    """Insert every row of `json` into the ``trend_table`` table.
+
+    NOTE(review): the parameter name shadows the `json` module inside this
+    function, and the connection string embeds credentials.
+    """
+    connection = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gtrends?charset=utf8mb4')
+    trend_table = connection['trend_table']
+    for row in json:
+        trend_table.insert(row)
+
+
+# Script body, executed at import time: look up the related searches for the
+# keyword '稅', store the returned rows in the database, then print them.
+sgtrend=SelGTrend()
+data=sgtrend.search('稅')
+save_to_db(data)
+print(data)

+ 157 - 0
gtrends/gtrendtest_jsraw.py

@@ -0,0 +1,157 @@
+#import urllib.request
+import urllib
+import requests
+import traceback
+from bs4 import BeautifulSoup
+import json
+import os
+import time
+import sys
+import random
+from seleniumwire import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait, Select
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.remote.webdriver import WebDriver
+import dataset
+import docker
+import brotli
+import gzip
+import datetime
+import redis
+import argparse
+
+#from fp.fp import FreeProxy
+
+# Module-level settings.
+# `localrun` is not referenced by any other code shown in this file.
+localrun=False
+# Region code; SelGTrend.getpage puts it in the URL's `geo=` parameter.
+geo='TW'
+
+
+def send(driver, cmd, params=None):
+    """Send a raw Chromium command through the driver's command executor.
+
+    POSTs the JSON body ``{'cmd': cmd, 'params': params}`` to the session's
+    ``/chromium/send_command_and_get_result`` endpoint.
+
+    Args:
+        driver: object exposing ``session_id`` and ``command_executor``.
+        cmd: command name.
+        params: dict of command parameters; ``None`` means an empty dict
+            (avoids sharing one mutable default dict between calls).
+
+    Returns:
+        The ``'value'`` entry of the response, or ``None`` if it is absent.
+    """
+    if params is None:
+        params = {}
+    resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
+    # NOTE: relies on the private `_url` / `_request` members of the executor.
+    url = driver.command_executor._url + resource
+    body = json.dumps({'cmd': cmd, 'params': params})
+    response = driver.command_executor._request('POST', url, body)
+    # The response 'status' field is deliberately not checked here.
+    return response.get('value')
+
+def add_script(driver, script):
+    """Issue ``Page.addScriptToEvaluateOnNewDocument`` with `script` as its source."""
+    command_params = {"source": script}
+    send(driver, "Page.addScriptToEvaluateOnNewDocument", command_params)
+
+def set_viewport_size(driver, width, height):
+    """Resize the window so that its inner viewport becomes width x height.
+
+    An in-page script adds the current difference between the outer and inner
+    window size to the requested dimensions; the resulting pair is handed to
+    ``driver.set_window_size``.
+    """
+    outer_size_js = """
+        return [window.outerWidth - window.innerWidth + arguments[0],
+          window.outerHeight - window.innerHeight + arguments[1]];
+        """
+    target_size = driver.execute_script(outer_size_js, width, height)
+    driver.set_window_size(*target_size)
+
+#docker run -d -p 4445:4444  --name p4445  --add-host=host.docker.internal:172.17.0.1  -v /dev/shm:/dev/shm   selenium/standalone-chrome
+def init_webdriver():
+    """Create and return a selenium-wire Chrome driver with a 1400x1000 window.
+
+    The driver is started locally via ``webdriver.Chrome``; the docker command
+    in the comment above is not used by this function.
+    """
+    options = webdriver.ChromeOptions()
+    options.add_argument('--ignore-certificate-errors')
+    options.add_argument("--no-sandbox")
+    # Headless mode is currently disabled.
+#    options.add_argument("--headless")
+    options.add_argument("--disable-gpu")
+    options.add_argument("--disable-dev-shm-usage")
+    driver = webdriver.Chrome(
+        options=options
+    )
+    driver.set_window_size(1400,1000)
+
+    return driver
+
+
+class SelGTrend:
+    """Store the raw related-search JSON of a Google Trends explore page.
+
+    A browser session created by ``init_webdriver`` loads the explore page;
+    the ``rankedList`` of every captured response whose URL contains
+    ``relatedsearches?`` is inserted into the ``gtrend_jsraw`` table.
+    """
+
+    def __init__(self):
+        # NOTE(review): the connection string embeds credentials.
+        db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gtrends?charset=utf8mb4')
+        self.table=db['gtrend_jsraw']
+        # When true, getpage adds `gprop=youtube` to the explore URL.
+        self.yt=False
+        self.texts = []
+        self.links = []
+        self.results = []
+        self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
+        self.headers = {'User-Agent': self.user_agent}
+
+    def search(self, key):
+        """Replace spaces in `key` with '+' and return ``getpage`` for it."""
+        self.key = "+".join(key.split(" "))
+        return self.getpage(self.key)
+
+    def getpage(self,query):
+        """Load the explore page for `query` and store its related-search JSON.
+
+        Each matching response is inserted as a row with 'kw', 'dt' and
+        'json' (UTF-8 encoded JSON text of the ``rankedList``).
+
+        Returns:
+            Always an empty list; nothing is ever appended to it. Errors are
+            printed, not raised.
+        """
+        import urllib.parse
+        driver=None
+        result=[]
+        safe_string = urllib.parse.quote_plus(query)
+        if self.yt:
+            self.url = 'https://trends.google.com/trends/explore?date=now%207-d&geo='+geo+'&gprop=youtube&q='+safe_string
+        else:
+            self.url = 'https://trends.google.com/trends/explore?date=now%207-d&geo='+geo+'&q='+safe_string
+
+        try:
+            print(self.url)
+            driver=init_webdriver()
+            driver.get(self.url)
+            time.sleep(3)
+            driver.refresh()
+            time.sleep(4)
+
+            # find_element(By...) replaces the removed find_element_by_* API.
+            ub = driver.find_element(By.CSS_SELECTOR, 'body')
+            # Page down repeatedly; presumably this triggers the lazily
+            # loaded related-search widgets - TODO confirm.
+            for i in range(9):
+                ub.send_keys(Keys.PAGE_DOWN)
+                time.sleep(0.5)
+
+            for request in driver.requests:
+                print(request.url[0:60])
+                if not request.response or 'relatedsearches?' not in request.url:
+                    continue
+                print('*** parsing js:')
+                resp=request.response.body
+                try:
+                    data = gzip.decompress(resp)
+                except Exception:
+                    # Not gzip data: fall back to the raw body.
+                    traceback.print_exc()
+                    data=resp
+
+                jstext=data.decode('utf-8')
+                print(jstext)
+
+                # The first 6 characters are skipped before JSON parsing
+                # (presumably a non-JSON response prefix - confirm).
+                jsobj=json.loads(jstext[6:])
+                jsobj=jsobj['default']['rankedList']
+                # `query` equals self.key when called through search(), and
+                # unlike self.key it also exists on a direct getpage() call.
+                self.table.insert({'kw':query,'dt':datetime.datetime.now(),'json':json.dumps(jsobj, ensure_ascii=False).encode('utf8')})
+
+                print(jsobj)
+        except Exception as e:
+            traceback.print_exc()
+            print(e)
+        finally:
+            # driver stays None when init_webdriver() itself failed.
+            if driver is not None:
+                driver.quit()
+        return result
+
+    def result(self):
+        return self.results
+
+    def gettext(self):
+        return self.texts
+
+    def getlinks(self):
+        return self.links
+
+    def clear(self):
+        """Reset the texts, links and results lists."""
+        self.texts = []
+        self.links = []
+        self.results = []
+
+
+# Script body, executed at import time: fetch and store the related-search
+# JSON for one keyword. `data` receives whatever search() returns.
+sgtrend=SelGTrend()
+#data=sgtrend.search('居家')
+data=sgtrend.search('7-11 當機')

+ 51 - 0
gtrends/process_gtrend.py

@@ -0,0 +1,51 @@
+#import urllib.request
+import urllib
+import requests
+import traceback
+from bs4 import BeautifulSoup
+import json
+import os
+import time
+import sys
+import random
+from seleniumwire import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait, Select
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.remote.webdriver import WebDriver
+import dataset
+import docker
+import datetime
+import gzip
+
+# Collect up to `max_queries` distinct related queries for `fullkw` from the
+# newest rows of gtrend_jsraw and print them.
+alldict={}
+fullkw='7-11+當機'
+max_queries=5
+# NOTE(review): the connection string embeds credentials.
+db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gtrends?charset=utf8mb4')
+# The keyword is passed as a bind parameter instead of being concatenated
+# into the SQL text.
+cursor=db.query('SELECT * FROM gtrends.gtrend_jsraw where kw=:kw  order by id desc limit 5', kw=fullkw)
+for c in cursor:
+    # Stop every loop level once the cap is reached; a single inner `break`
+    # would let each later list add one more entry.
+    if len(alldict)>=max_queries:
+        break
+    jsobj=json.loads(c['json'])
+    for j in jsobj:
+        if len(alldict)>=max_queries:
+            break
+        for kw in j['rankedKeyword']:
+            if 'query' in kw:
+                print(kw['query'])
+                alldict[kw['query']]=1
+                if len(alldict)>=max_queries:
+                    break
+print(alldict)
+#        break
+#        print(j['title']['query'])
+#        for a in j['articles']:
+#            print(a['title'])
+#            if a.get('image')!= None:
+#                print(a['image'])
+#                print(a['image']['imageUrl'])
+##                print(a['image']['newsUrl'])
+#        for r in j['relatedQueries']:
+#            print("-->" +r['query'])
+
+
+

+ 42 - 0
gtrends/process_trends.py

@@ -0,0 +1,42 @@
+#import urllib.request
+import urllib
+import requests
+import traceback
+from bs4 import BeautifulSoup
+import json
+import os
+import time
+import sys
+import random
+from seleniumwire import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait, Select
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.remote.webdriver import WebDriver
+import dataset
+import docker
+import datetime
+import gzip
+
+
+# Print the newest stored trending-search snapshot: each search title, its
+# articles (with image and news URLs when an image is present) and its
+# related queries.
+db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gtrends?charset=utf8mb4')
+cursor=db.query('SELECT * FROM gtrends.trending_search_json order by id desc limit 1')
+for c in cursor:
+    js=c['json']
+    jsobj=json.loads(js)
+    for j in jsobj:
+        print(j['title']['query'])
+        for a in j['articles']:
+            print(a['title'])
+            image=a.get('image')
+            if image is not None:
+                print(a['image']['imageUrl'])
+                print(a['image']['newsUrl'])
+        for r in j['relatedQueries']:
+            print("-->" +r['query'])
+
+
+

+ 154 - 0
gtrends/tredning_search.py

@@ -0,0 +1,154 @@
+#import urllib.request
+import urllib
+import requests
+import traceback
+from bs4 import BeautifulSoup
+import json
+import os
+import time
+import sys
+import random
+from seleniumwire import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait, Select
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.remote.webdriver import WebDriver
+import dataset
+import docker
+import datetime
+import gzip
+#from fp.fp import FreeProxy
+
+
+def send(driver, cmd, params=None):
+    """Send a raw Chromium command through the driver's command executor.
+
+    POSTs the JSON body ``{'cmd': cmd, 'params': params}`` to the session's
+    ``/chromium/send_command_and_get_result`` endpoint.
+
+    Args:
+        driver: object exposing ``session_id`` and ``command_executor``.
+        cmd: command name.
+        params: dict of command parameters; ``None`` means an empty dict
+            (avoids sharing one mutable default dict between calls).
+
+    Returns:
+        The ``'value'`` entry of the response, or ``None`` if it is absent.
+    """
+    if params is None:
+        params = {}
+    resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
+    # NOTE: relies on the private `_url` / `_request` members of the executor.
+    url = driver.command_executor._url + resource
+    body = json.dumps({'cmd': cmd, 'params': params})
+    response = driver.command_executor._request('POST', url, body)
+    # The response 'status' field is deliberately not checked here.
+    return response.get('value')
+
+def add_script(driver, script):
+    """Issue ``Page.addScriptToEvaluateOnNewDocument`` with `script` as its source."""
+    command_params = {"source": script}
+    send(driver, "Page.addScriptToEvaluateOnNewDocument", command_params)
+
+def set_viewport_size(driver, width, height):
+    """Resize the window so that its inner viewport becomes width x height.
+
+    An in-page script adds the current difference between the outer and inner
+    window size to the requested dimensions; the resulting pair is handed to
+    ``driver.set_window_size``.
+    """
+    outer_size_js = """
+        return [window.outerWidth - window.innerWidth + arguments[0],
+          window.outerHeight - window.innerHeight + arguments[1]];
+        """
+    target_size = driver.execute_script(outer_size_js, width, height)
+    driver.set_window_size(*target_size)
+
+
+def init_webdriver():
+    """Create and return a local selenium-wire Chrome driver with default options."""
+    # Disabled alternative: restart the first docker container before use.
+#    client = docker.from_env()
+#    ls=client.containers.list()
+#    print(ls)
+#    ls[0].restart()
+#    time.sleep(11)
+
+    options = webdriver.ChromeOptions()
+    driver = webdriver.Chrome(options=options)
+
+    # Disabled alternative: connect to a remote Selenium hub instead.
+#    driver = webdriver.Remote(
+#    command_executor='http://127.0.0.1:4444/wd/hub',
+#    desired_capabilities=options.to_capabilities())
+    return driver
+
+
+class SelGTrend:
+    """Fetch the daily trending searches JSON from Google Trends.
+
+    A browser session created by ``init_webdriver`` loads the daily trending
+    page; the first captured response whose URL contains ``dailytrends?`` is
+    parsed and returned.
+    """
+
+    def __init__(self):
+        self.texts = []
+        self.links = []
+        self.results = []
+        self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
+        self.headers = {'User-Agent': self.user_agent}
+
+    def search(self, key, geo='TW'):
+        """Record `key` (spaces replaced by '+') and return ``getpage(geo)``.
+
+        `geo` is forwarded to ``getpage``, which requires it; calling
+        ``getpage()`` without it raised TypeError. The trending page itself
+        does not depend on `key`.
+        """
+        self.key = "+".join(key.split(" "))
+        return self.getpage(geo)
+
+    def getpage(self, geo):
+        """Load the daily trending page for region `geo`.
+
+        Returns:
+            The parsed JSON object of the first ``dailytrends?`` response, or
+            an empty list when none was captured or an error occurred.
+            Errors are printed, not raised.
+        """
+        driver=None
+        result=[]
+        self.url = 'https://trends.google.com/trends/trendingsearches/daily?geo='+geo
+        try:
+            print(self.url)
+            driver=init_webdriver()
+            driver.get(self.url)
+            time.sleep(5)
+
+            for request in driver.requests:
+                print(request.url[0:60])
+                if not request.response or 'dailytrends?' not in request.url:
+                    continue
+                print('*** parsing js:')
+                resp=request.response.body
+                try:
+                    data = gzip.decompress(resp)
+                except Exception:
+                    # Not gzip data: fall back to the raw body.
+                    traceback.print_exc()
+                    data=resp
+
+                jstext=data.decode('utf-8')
+                # The first 6 characters are skipped before JSON parsing
+                # (presumably a non-JSON response prefix - confirm).
+                return json.loads(jstext[6:])
+        except Exception as e:
+            traceback.print_exc()
+            print(e)
+        finally:
+            # Always close the browser, including on the early return above;
+            # driver stays None when init_webdriver() itself failed.
+            if driver is not None:
+                driver.quit()
+        return result
+
+    def result(self):
+        return self.results
+
+    def gettext(self):
+        return self.texts
+
+    def getlinks(self):
+        return self.links
+
+    def clear(self):
+        """Reset the texts, links and results lists."""
+        self.texts = []
+        self.links = []
+        self.results = []
+
+
+def save_to_db(js):
+    """Store the trending searches of a dailytrends payload as one row.
+
+    Takes ``js['default']['trendingSearchesDays'][0]['trendingSearches']`` and
+    inserts it, JSON-encoded as UTF-8 bytes, with the current timestamp into
+    the ``trending_search_json`` table.
+
+    An empty or missing payload (``getpage`` returns ``[]`` on failure) is
+    skipped with a message instead of failing on the indexing below.
+    """
+    if not js:
+        print('no trending search data to save')
+        return
+    # NOTE(review): the connection string embeds credentials.
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gtrends?charset=utf8mb4')
+    table = db['trending_search_json']
+    js=js['default']['trendingSearchesDays'][0]['trendingSearches']
+    table.insert({'dt':datetime.datetime.now(),'json':json.dumps(js, ensure_ascii=False).encode('utf8')})
+    
+#    for j in json:
+#        print(j['title'])
+##        print(j['formattedTraffic'])
+#        print(j['relatedQueries'])
+#        if j.get('source') is not None:
+#            print(j['source'])
+##        print(json.dumps(j['image']))
+#        print(j['snippet'])
+
+#        print(j)
+#        table.insert(j)
+
+
+# Script body, executed at import time: fetch the daily trending searches for
+# region `geo` and store them. getpage() returns an empty list on failure.
+geo='TW'
+sgtrend=SelGTrend()
+result=sgtrend.getpage(geo)
+#print(result)
+save_to_db(result)
+#time.sleep(9999)