noodles 3 years ago
parent commit
e84a806a32
1 changed files with 245 additions and 0 deletions
swire_shop_review.py  +245 -0

@@ -0,0 +1,245 @@
+# -*- coding: utf-8 -*-
+#from selenium import webdriver
+from seleniumwire import webdriver
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.common.by import By
+import selenium
+import traceback
+from bs4 import BeautifulSoup
+
+from utility import database_access as DA
+from utility.parseutils import *
+from utility.connect import *
+
+from datetime import datetime
+from requests import session
+import pandas as pd
+import dataset
+import time
+import json
+import re
+import sys, os
+import socket
+import brotli
+import pickle
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+import urllib.parse
+chrome_window=False
+globalkw=None
+proxyport=8787
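+# Runtime configuration (inferred from usage below): chrome_window toggles a
+# local Chrome window versus a Remote driver routed through selenium-wire's
+# proxy on proxyport; both port values may be overridden from argv in main().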
+
+db_columns = ['author_id','author_page','author_name', 'author_image', 'author_review_count',
+              'review_time', 'review_content', 'review_image',
+              'store_review_time','store_review']
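+# NOTE: the column order here must match the order in which parsing_js()
+# appends fields to tmp.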
+
+def write_to_file(jsobj,fname):
+    with open(fname, 'wb') as handle:
+        pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)
+    # import codecs
+    # fw=codecs.open(fname,'w','utf-8')
+    # fw.write(str(jsobj))
+    # fw.close()
+
+
+def build_cache(db):
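+    # Cache every existing (fid, author_id) pair so already-stored reviews
+    # can be skipped on insert.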
+    global reviews_table
+    id_dict={}
+    cursor = db.query('SELECT fid, author_id FROM google_poi.{};'.format(reviews_table.name))
+
+    for c in cursor:
+        key = '{}_{}'.format(c['fid'],c['author_id'])
+        id_dict[key]=1
+    return id_dict
+
+
+def browser_start(port):
+    global proxyport
+    global chrome_window
+    print('proxy port:', proxyport)
+    options = webdriver.ChromeOptions()
+    if chrome_window:
+        browser = webdriver.Chrome(
+            desired_capabilities=options.to_capabilities()
+        )
+    else:
+        chrome_options = webdriver.ChromeOptions()
+        chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport))  # Specify your Kubernetes service-name here
+        chrome_options.add_argument('--ignore-certificate-errors')
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        browser = webdriver.Remote(
+            command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
+            desired_capabilities=chrome_options.to_capabilities(),
+            seleniumwire_options={'addr': '0.0.0.0', 'port': proxyport, 'auto_config': False}
+        )
+        browser.set_window_size(1400,1000)
+    return browser
+
+
+def get_next_job(db):
+    result = db.query('select * from swire_store_list ORDER BY RAND() limit 1')
+    url_pd = pd.DataFrame([dict(i) for i in result])
+    url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
+
+    # Drop stores whose reviews have already been processed.
+    remove = db.query('select fid from review_process')
+    remove_pd = pd.DataFrame([dict(i) for i in remove])
+    remove_fid_list = remove_pd['fid'].to_list() if len(remove_pd) else []
+
+    url_pd = url_pd[~url_pd['fid'].isin(remove_fid_list)]
+
+    return url_pd
+
+
+def parsing_js(resp):
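+    # resp is the body of Google Maps' listentitiesreviews XHR; the first five
+    # characters are an anti-JSON-hijacking prefix that must be stripped before
+    # json.loads(). The nested indices below appear to have been
+    # reverse-engineered from that payload.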
+    jsobj = json.loads(resp[5:])
+    result = []
+    for i in range(len(jsobj[2])):
+        tmp = []
+        # author_id, author_page, author_name, author_image, author_review_count
+        tmp += [jsobj[2][i][6], jsobj[2][i][0][0], jsobj[2][i][0][1], jsobj[2][i][0][2], jsobj[2][i][12][1][1]]
+        # review_time, review_content
+        tmp += [jsobj[2][i][1], jsobj[2][i][3]]
+
+        # image
+        image = []
+        if jsobj[2][i][14]:
+            for j in range(len(jsobj[2][i][14])):
+                image += [jsobj[2][i][14][j][6][0]]
+        tmp += [image]
+
+        # store reply
+        if jsobj[2][i][9]:
+            tmp += [jsobj[2][i][9][0], jsobj[2][i][9][1]]
+        else:
+            tmp += ['', '']
+
+        tmp_dict = dict(zip(db_columns, tmp))
+        tmp_dict['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")
+        result.append(tmp_dict)
+
+    # write_to_file(orig,'debug.pickle')
+    return result
+
+
+def save_js_to_db(jsobj, fid):
+    global reviews_table
+    global iddict
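+    # Skip reviews already present in the cache built by build_cache().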
+    for r in jsobj:
+        r['fid'] = fid
+        key = '{}_{}'.format(r['fid'], r['author_id'])
+        if iddict.get(key) is not None:
+            continue
+        try:
+            # review_image is a list; stringify it for storage in a text column
+            r['review_image'] = str(r['review_image'])
+            reviews_table.insert(r)
+        except Exception:
+            traceback.print_exc()
+
+
+def process_web_request(db, driver, fid):
+    # Give outstanding XHRs time to finish, then scan the captured traffic.
+    time.sleep(3)
+    for request in driver.requests:
+        if request.response:
+            # print(request.url)
+            if 'listentitiesreviews?' in request.url:
+                print('parsing js:')
+                print(request.url)
+                # Responses come back Brotli-compressed; decompress before decoding.
+                resp = brotli.decompress(request.response.body)
+                jstext = resp.decode('utf-8')
+                result = parsing_js(jstext)
+
+                save_js_to_db(result, fid)
+                time.sleep(1)
+
+
+def page_down_(driver, xpath_css, time_):
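+    # Click the scrollable reviews pane, then send PAGE_DOWN repeatedly so
+    # lazily loaded reviews keep streaming in.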
+    elmts = driver.find_elements_by_xpath(xpath_css)
+    print(elmts)
+    if len(elmts)>1:
+        elmt=elmts[1]
+    else:
+        elmt=elmts[0]
+    actions = ActionChains(driver)
+    actions.move_to_element(elmt).click().perform()
+    for i in range(time_):
+        try:
+            actions = ActionChains(driver)
+            actions.send_keys(Keys.PAGE_DOWN).perform()
+        except Exception:
+            traceback.print_exc()
+        time.sleep(0.5)
+
+
+def get_reviews(driver, reviews_cnt):
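+    # Open the "More reviews" pane, then scroll far enough to trigger loading
+    # of the full review list.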
+    wait = WebDriverWait(driver, 30)
+    more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
+    wait.until(
+        EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
+    )
+    element = driver.find_element_by_css_selector(more_reviews_css)
+    driver.implicitly_wait(10)
+    ActionChains(driver).move_to_element(element).click(element).perform()
+    time.sleep(0.5)
+    reviews_cnt = int(reviews_cnt)
+    if reviews_cnt > 10:
+        # scale scroll count to the review total (// 3 assumes a few reviews
+        # load per PAGE_DOWN)
+        page_down_count = reviews_cnt // 3
+        page_down_(driver, '//div[@class="PPCwl"]', page_down_count)
+
+
+def main():
+    global chrome_window
+    global store_list_table
+    global reviews_table
+    global proxyport
+    global iddict
+
+    localip=socket.gethostbyname(socket.gethostname())
+
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
+    store_list_table = db['swire_store_list']
+    reviews_table = db['reviews_table']
+    # build_cache() reads the reviews_table global, so it must run after the
+    # table handles are assigned.
+    iddict = build_cache(db)
+
+    port=4444
+    if len(sys.argv) == 3 :
+        port=int(sys.argv[1])
+        proxyport=int(sys.argv[2])
+    if not chrome_window:
+        print('restart docker pw{}'.format(port))
+#        os.system('sudo docker container restart p'+str(port))
+        os.system('sudo docker container restart pw'+str(port))
+
+        time.sleep(10)
+
+    print('driver start...')
+    driver = browser_start(port)
+
+    job = get_next_job(db)
+
+    for _, row in job.iterrows():
+        item_url = row['item_url']
+        reviews_cnt = row['reviews_cnt']
+        fid = row['fid']
+
+        print(reviews_cnt, item_url)
+        driver.get(item_url)
+        time.sleep(0.5)
+        shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
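+        # Only stores exposing a "More reviews" control have reviews to crawl.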
+        tmp_value = shop_soup.find('span', {'jsaction':'pane.rating.moreReviews'})
+
+        if tmp_value:
+            get_reviews(driver, reviews_cnt)
+            process_web_request(db, driver, fid)
+            print(driver.current_url)
+
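+        # Mark this store as processed so get_next_job() will skip it next time.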
+        db['review_process'].insert({'fid':fid, 'dt':datetime.now()})
+
+if __name__ == '__main__':
+    main()