# -*- coding: utf-8 -*-
# from selenium import webdriver
from seleniumwire import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import selenium
import traceback
from bs4 import BeautifulSoup

from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *

from datetime import datetime
from requests import session
import pandas as pd
import dataset
import time
import json
import re
import sys, os
import socket
import brotli
import pickle
import urllib.parse

chrome_window = False
globalkw = None
proxyport = 8787

# Column order must match the positional fields extracted in parsing_js().
db_columns = ['author_id', 'author_page', 'author_name', 'author_image',
              'author_review_count', 'review_time', 'review_content',
              'review_image', 'store_review_time', 'store_review']


def write_to_file(jsobj, fname):
    # Pickle an arbitrary object to disk, mainly for offline debugging.
    with open(fname, 'wb') as handle:
        pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # import codecs
    # fw = codecs.open(fname, 'w', 'utf-8')
    # fw.write(str(jsobj))
    # fw.close()


def build_cache(db):
    # Load every (fid, author_id) pair already stored so that duplicate
    # reviews can be skipped cheaply in save_js_to_db().
    global reviews_table
    id_dict = {}
    cursor = db.query('SELECT fid, author_id FROM google_poi.reviews_table;')
    for c in cursor:
        key = '{}_{}'.format(c['fid'], c['author_id'])
        id_dict[key] = 1
    return id_dict


def brower_start(port):
    global proxyport
    global chrome_window
    print(proxyport)
    options = webdriver.ChromeOptions()
    if chrome_window:
        # Local, headed Chrome for interactive debugging.
        browser = webdriver.Chrome(
            desired_capabilities=options.to_capabilities()
        )
    else:
        # Remote Chrome in Docker, with traffic routed through the
        # selenium-wire proxy so the review XHRs can be captured.
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--proxy-server=host.docker.internal:' + str(proxyport))  # Specify your Kubernetes service-name here
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        browser = webdriver.Remote(
            command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
            desired_capabilities=chrome_options.to_capabilities(),
            seleniumwire_options={'addr': '0.0.0.0', 'port': proxyport, 'auto_config': False}
        )
    browser.set_window_size(1400, 1000)
    return browser


def get_next_job(db):
    # Pick one random store, build its Google Maps URL from the fid,
    # then drop any store that has already been processed.
    result = db.query('select * from swire_store_list ORDER BY RAND() limit 1')
    url_pd = pd.DataFrame([dict(i) for i in result])
    url_pd['item_url'] = url_pd['fid'].apply(
        lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))

    remove = db.query('select fid from review_process')
    remove = pd.DataFrame([dict(i) for i in remove])
    remove_fid_list = remove['fid'].to_list()

    url_pd = url_pd[~url_pd['fid'].isin(remove_fid_list)]
    return url_pd


def parsing_js(resp):
    # The response body carries a five-character anti-JSON-hijacking
    # prefix; strip it before decoding.
    jsobj = json.loads(resp[5:])

    result = []
    for i in range(len(jsobj[2])):
        tmp = [jsobj[2][i][6], jsobj[2][i][0][0], jsobj[2][i][0][1],
               jsobj[2][i][0][2], jsobj[2][i][12][1][1]]
        tmp += [jsobj[2][i][1], jsobj[2][i][3]]

        # review images
        image = []
        if jsobj[2][i][14]:
            for j in range(len(jsobj[2][i][14])):
                image += [jsobj[2][i][14][j][6][0]]
        tmp += [image]

        # store reply (time and text); empty if the store never replied
        if jsobj[2][i][9]:
            tmp += [jsobj[2][i][9][0], jsobj[2][i][9][1]]
        else:
            tmp += ['', '']

        tmp_dict = {}
        for k in range(len(db_columns)):
            tmp_dict[db_columns[k]] = tmp[k]
        tmp_dict['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")
        result.append(tmp_dict)

    # write_to_file(orig, 'debug.pickle')
    return result
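
# ---------------------------------------------------------------------------
# parsing_js() above walks an undocumented, purely positional JSON structure,
# so any layout change on Google's side surfaces as an IndexError/TypeError.
# The helper below is a minimal defensive-lookup sketch added for illustration
# only (it is NOT called anywhere in this crawler); it shows one way the bare
# jsobj[2][i][...] chains could be hardened if the payload layout proves
# unstable.
# ---------------------------------------------------------------------------
def safe_get(obj, *indices, default=None):
    """Walk nested lists/dicts by index, returning `default` on any miss."""
    for idx in indices:
        try:
            obj = obj[idx]
        except (IndexError, KeyError, TypeError):
            return default
    return obj

# Example: safe_get(jsobj, 2, 0, 12, 1, 1) mirrors jsobj[2][0][12][1][1]
# but yields None instead of raising when a review lacks that field.
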
def save_js_to_db(jsobj, fid):
    # Insert parsed reviews, skipping any (fid, author_id) pair that is
    # already cached from a previous crawl.
    global reviews_table
    global iddict
    for r in jsobj:
        r['fid'] = fid
        key = '{}_{}'.format(r['fid'], r['author_id'])
        if iddict.get(key) is not None:
            continue
        try:
            r['review_image'] = str(r['review_image'])
            reviews_table.insert(r)
        except Exception:
            traceback.print_exc()


def process_web_request(db, driver, fid):
    # Give the page a moment to finish issuing its review XHRs, then scan
    # the captured requests for the review endpoint.
    time.sleep(3)
    for request in driver.requests:
        if request.response:
            if 'listentitiesreviews?' in request.url:
                print('parsing js:')
                print(request.url)
                # The payload is brotli-compressed; decompress before decoding.
                resp = brotli.decompress(request.response.body)
                jstext = resp.decode('utf-8')
                result = parsing_js(jstext)
                save_js_to_db(result, fid)
                time.sleep(1)


def page_down_(driver, xpath_css, time_):
    # Click into the reviews pane, then send PAGE_DOWN `time_` times so
    # that more reviews lazy-load.
    elmts = driver.find_elements_by_xpath(xpath_css)
    print(elmts)
    if len(elmts) > 1:
        elmt = elmts[1]
    else:
        elmt = elmts[0]

    actions = ActionChains(driver)
    actions.move_to_element(elmt).click().perform()
    for i in range(time_):
        try:
            actions = ActionChains(driver)
            actions.send_keys(Keys.PAGE_DOWN).perform()
        except Exception:
            traceback.print_exc()
        time.sleep(0.5)


def get_reviews(driver, reviews_cnt):
    # Open the "more reviews" pane, then page down far enough to load the
    # reviews (the count // 3 heuristic assumes ~3 reviews per PAGE_DOWN).
    wait = WebDriverWait(driver, 30)
    more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
    wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
    )
    element = driver.find_element_by_css_selector(more_reviews_css)
    driver.implicitly_wait(10)
    ActionChains(driver).move_to_element(element).click(element).perform()
    time.sleep(0.5)

    reviews_cnt = int(reviews_cnt)
    if reviews_cnt > 10:
        page_down_count = reviews_cnt // 3
        page_down_(driver, '//div[@class="PPCwl"]', page_down_count)


def main():
    global chrome_window
    global store_list_table
    global reviews_table
    global proxyport
    global iddict

    localip = socket.gethostbyname(socket.gethostname())

    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    store_list_table = db['swire_store_list']
    reviews_table = db['reviews_table']
    iddict = build_cache(db)

    port = 4444
    if len(sys.argv) == 3:
        port = int(sys.argv[1])
        proxyport = int(sys.argv[2])
    if not chrome_window:
        print('restart docker pw{}'.format(port))
        os.system('sudo docker container restart pw' + str(port))
        time.sleep(10)

    print('driver start...')
    driver = brower_start(port)

    job = get_next_job(db)
    for row, group in job.iterrows():
        try:
            item_url = group['item_url']
            reviews_cnt = group['reviews_cnt']
            fid = group['fid']
            print(reviews_cnt, item_url)

            driver.get(item_url)
            time.sleep(0.5)

            shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
            tmp_value = shop_soup.find('span', {'jsaction': 'pane.rating.moreReviews'})
            if tmp_value:
                get_reviews(driver, reviews_cnt)
                process_web_request(db, driver, fid)
                print(driver.current_url)

            # Mark this store as processed regardless of review count.
            db['review_process'].insert({'fid': fid, 'dt': datetime.now()})
        except Exception:
            traceback.print_exc()


if __name__ == '__main__':
    main()
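
# ---------------------------------------------------------------------------
# Usage sketch (ports below are the script defaults; the script name is a
# placeholder, and the hub address, container name, and proxy port depend on
# your Selenium/Docker deployment):
#
#   python review_crawler.py              # Selenium hub on :4444, proxy on :8787
#   python review_crawler.py 4445 8788    # argv[1] = hub port, argv[2] = selenium-wire proxy port
#
# main() expects a Remote WebDriver hub at http://127.0.0.1:<port>/wd/hub
# inside a docker container named 'pw<port>', which it restarts before
# crawling.
# ---------------------------------------------------------------------------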