|
@@ -0,0 +1,245 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+#from selenium import webdriver
|
|
|
+from seleniumwire import webdriver
|
|
|
+from selenium.webdriver.common.action_chains import ActionChains
|
|
|
+from selenium.webdriver.common.keys import Keys
|
|
|
+from selenium.webdriver.support import expected_conditions as EC
|
|
|
+from selenium.webdriver.support.wait import WebDriverWait
|
|
|
+from selenium.webdriver.common.by import By
|
|
|
+import selenium
|
|
|
+import traceback
|
|
|
+from bs4 import BeautifulSoup
|
|
|
+
|
|
|
+from utility import database_access as DA
|
|
|
+from utility.parseutils import *
|
|
|
+from utility.connect import *
|
|
|
+
|
|
|
+from datetime import datetime
|
|
|
+from requests import session
|
|
|
+import pandas as pd
|
|
|
+import dataset
|
|
|
+import time
|
|
|
+import json
|
|
|
+import re
|
|
|
+import sys, os
|
|
|
+import socket
|
|
|
+import brotli
|
|
|
+import pickle
|
|
|
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|
|
+import urllib.parse
|
|
|
# True → drive a local, visible Chrome; False → use the dockerized remote
# grid routed through the selenium-wire proxy (see brower_start()).
chrome_window=False
# Unused in this file — presumably a keyword filter set elsewhere; TODO confirm.
globalkw=None
# Default selenium-wire proxy port; overridden by sys.argv[2] in main().
proxyport=8787

# Column names in the exact positional order that parsing_js() collects its
# fields — the two must be kept in sync.
db_columns = ['author_id','author_page','author_name', 'author_image', 'author_review_count',
              'review_time', 'review_content', 'review_image',
              'store_review_time','store_review']
|
|
|
+
|
|
|
def write_to_file(jsobj,fname):
    """Serialize *jsobj* to the file *fname* using pickle.

    Used ad hoc for debugging (see the commented-out call in parsing_js).
    """
    with open(fname, 'wb') as fh:
        pickle.dump(jsobj, fh, protocol=pickle.HIGHEST_PROTOCOL)
|
|
|
+
|
|
|
+
|
|
|
def build_cache(db):
    """Build an in-memory index of reviews already stored in the DB.

    Returns a dict whose keys are '<fid>_<author_id>' (value 1); it is used
    by save_js_to_db() as an O(1) membership test to skip duplicate inserts.

    Reads the module-global `reviews_table`. main() assigns it a dataset
    table object, so resolve the actual table name via its `.name`
    attribute; a plain table-name string is accepted as well.
    """
    global reviews_table
    # BUG FIX: the original interpolated `reviews_table` directly into the
    # SQL, which produces an object repr (not a table name) when main()
    # assigns it a dataset Table.
    table_name = getattr(reviews_table, 'name', reviews_table)
    id_dict = {}
    cursor = db.query('SELECT fid, author_id FROM google_poi.{};'.format(table_name))
    for c in cursor:
        id_dict['{}_{}'.format(c['fid'], c['author_id'])] = 1
    return id_dict
|
|
|
+
|
|
|
+
|
|
|
def brower_start(port):
    """Create and return a selenium-wire Chrome session.

    With the module-global `chrome_window` True, a local visible Chrome is
    launched; otherwise the function connects to a remote grid at
    127.0.0.1:<port> whose traffic is routed through the selenium-wire
    proxy on host.docker.internal:<proxyport>.
    """
    global proxyport
    global chrome_window
    print(proxyport)
    options = webdriver.ChromeOptions()
    if chrome_window:
        browser = webdriver.Chrome(
            desired_capabilities=options.to_capabilities()
        )
    else:
        remote_opts = webdriver.ChromeOptions()
        # Route browser traffic through the selenium-wire proxy so that
        # network requests can be inspected (see process_web_request()).
        remote_opts.add_argument('--proxy-server=host.docker.internal:' + str(proxyport))
        remote_opts.add_argument('--ignore-certificate-errors')
        remote_opts.add_argument("--no-sandbox")
        remote_opts.add_argument("--disable-dev-shm-usage")
        browser = webdriver.Remote(
            command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
            desired_capabilities=remote_opts.to_capabilities(),
            seleniumwire_options={'addr': '0.0.0.0', 'port': proxyport, 'auto_config': False}
        )
    browser.set_window_size(1400, 1000)
    return browser
|
|
|
+
|
|
|
+
|
|
|
def get_next_job(db):
    """Pick one random store from swire_store_list that is not yet processed.

    Returns a DataFrame holding the store row plus a synthesized Google
    Maps `item_url` built from its fid. Rows whose fid already appears in
    the review_process table are filtered out, so the result may be empty.
    """
    result = db.query('select * from swire_store_list ORDER BY RAND() limit 1')
    url_pd = pd.DataFrame([dict(i) for i in result])
    if url_pd.empty:
        # Nothing left to crawl.
        return url_pd
    url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))

    # BUG FIX: the original rebuilt url_pd from `result` (instead of
    # reading `remove`), which both discarded the item_url column and made
    # every candidate filter itself out.
    remove = db.query('select fid from review_process')
    remove_fid_list = [dict(i)['fid'] for i in remove]

    url_pd = url_pd[~url_pd['fid'].isin(remove_fid_list)]

    return url_pd
|
|
|
+
|
|
|
+
|
|
|
def parsing_js(resp):
    """Parse a Google Maps 'listentitiesreviews' payload into review dicts.

    *resp* is the raw response text; its first five characters are an
    anti-JSON-hijacking prefix and are stripped before decoding. Returns a
    list of dicts keyed by the module-global `db_columns`, each stamped
    with a `crawler_date` string.
    """
    payload = json.loads(resp[5:])
    parsed = []
    for entry in payload[2]:
        # Positional fields: author id / page / name / image, review count,
        # then review time and content. Indices assumed from observed
        # payloads — TODO confirm against a live response.
        fields = [entry[6], entry[0][0], entry[0][1], entry[0][2], entry[12][1][1]]
        fields += [entry[1], entry[3]]

        # Attached review photos, when present.
        photos = []
        if entry[14]:
            for photo in entry[14]:
                photos.append(photo[6][0])
        fields.append(photos)

        # Store (owner) reply: time and text, or empty strings when absent.
        if entry[9]:
            fields += [entry[9][0], entry[9][1]]
        else:
            fields += ['', '']

        row = {col: fields[idx] for idx, col in enumerate(db_columns)}
        row['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")
        parsed.append(row)

    # write_to_file(orig,'debug.pickle')
    return parsed
|
|
|
+
|
|
|
+
|
|
|
def save_js_to_db(jsobj, fid):
    """Insert parsed review dicts into the reviews table, skipping duplicates.

    Each dict gets the store's *fid* attached; a review is skipped when
    '<fid>_<author_id>' is already present in the module-global `iddict`
    cache built by build_cache(). Insert failures are logged and do not
    abort the rest of the batch.
    """
    global reviews_table
    global iddict
    for r in jsobj:
        r['fid'] = fid
        key = '{}_{}'.format(r['fid'], r['author_id'])
        if iddict.get(key) is not None:
            continue  # already crawled on a previous run
        try:
            # The photo list must be stringified for the DB column.
            r['review_image'] = str(r['review_image'])
            reviews_table.insert(r)
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallows
            # KeyboardInterrupt/SystemExit. Best-effort: log and continue.
            traceback.print_exc()
|
|
|
+
|
|
|
+
|
|
|
def process_web_request(db, driver, fid):
    """Scan selenium-wire's captured requests for review payloads and save them.

    Waits briefly for outstanding XHRs, then parses every
    'listentitiesreviews' response seen by the driver and persists the
    reviews for *fid* via save_js_to_db(). `db` is currently unused but
    kept for interface stability.
    """
    # Give pending review XHRs time to complete before inspecting them
    # (the original slept 0.8s + 3s; merged here).
    time.sleep(3.8)
    for request in driver.requests:
        if request.response and 'listentitiesreviews?' in request.url:
            print('parsing js:')
            print(request.url)
            # Responses are brotli-compressed JSON.
            resp = brotli.decompress(request.response.body)
            jstext = resp.decode('utf-8')
            result = parsing_js(jstext)
            # BUG FIX: the original passed the undefined name `resultobj`
            # here, which raised NameError on every matching response.
            save_js_to_db(result, fid)
            time.sleep(1)
|
|
|
+
|
|
|
+
|
|
|
def page_down_(driver, xpath_css, time_):
    """Click the element matched by *xpath_css* and send PAGE_DOWN *time_* times.

    When more than one element matches, the second is clicked — presumably
    the first match is not the scrollable reviews pane; TODO confirm.
    """
    elmts = driver.find_elements_by_xpath(xpath_css)
    print(elmts)
    target = elmts[1] if len(elmts) > 1 else elmts[0]
    ActionChains(driver).move_to_element(target).click().perform()
    for _ in range(time_):
        try:
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
        except:
            traceback.print_exc()
        time.sleep(0.5)
|
|
|
+
|
|
|
+
|
|
|
def get_reviews(driver, reviews_cnt):
    """Open the "more reviews" pane and scroll enough to load *reviews_cnt* reviews."""
    wait = WebDriverWait(driver, 30)
    more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css)))
    button = driver.find_element_by_css_selector(more_reviews_css)
    driver.implicitly_wait(10)
    ActionChains(driver).move_to_element(button).click(button).perform()
    time.sleep(0.5)

    total = int(reviews_cnt)
    if total > 10:
        # // 3 — presumably roughly three reviews load per PAGE_DOWN;
        # TODO confirm the ratio.
        page_down_(driver, '//div[@class="PPCwl"]', total // 3)
|
|
|
+
|
|
|
+
|
|
|
def main():
    """Crawl Google Maps reviews for randomly chosen pending stores.

    Usage: script.py [selenium_port proxy_port]

    Connects to MySQL, restarts the selenium docker container when running
    headless, then for each pending store opens its Maps page, expands the
    reviews pane, captures the review XHRs, and marks the store processed.
    """
    global chrome_window
    global store_list_table
    global reviews_table
    global proxyport
    global iddict

    # SECURITY: credentials are hard-coded in the DSN; move them to
    # environment variables or a config file.
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    store_list_table = db['swire_store_list']
    reviews_table = db['reviews_table']
    # BUG FIX: build_cache() reads the module-global `reviews_table`, so it
    # must run after the assignment above — the original called it first,
    # raising NameError.
    iddict = build_cache(db)

    port = 4444
    if len(sys.argv) == 3:
        port = int(sys.argv[1])
        proxyport = int(sys.argv[2])
    if not chrome_window:
        # Restart the selenium container so the session starts clean.
        print('restart docker pw{}'.format(port))
        os.system('sudo docker container restart pw' + str(port))
        time.sleep(10)

    print('drvier start...')
    driver = brower_start(port)

    job = get_next_job(db)

    for row, group in job.iterrows():
        item_url = group['item_url']
        reviews_cnt = group['reviews_cnt']
        fid = group['fid']

        print(reviews_cnt, item_url)
        driver.get(item_url)
        time.sleep(0.5)
        shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
        # Only stores that expose a "more reviews" control have reviews.
        tmp_value = shop_soup.find('span', {'jsaction': 'pane.rating.moreReviews'})

        if tmp_value:
            get_reviews(driver, reviews_cnt)
            process_web_request(db, driver, fid)
            print(driver.current_url)

        # Mark this store processed so get_next_job() skips it next run.
        db['review_process'].insert({'fid': fid, 'dt': datetime.now()})


if __name__ == '__main__':
    main()
|