# -*- coding: utf-8 -*-
"""Google Maps POI review crawler.

Picks random unprocessed shops from MySQL, opens each place page with a
selenium-wire Chrome driver, scrolls the review pane to trigger the
``listentitiesreviews`` XHR, then parses and stores the captured payload.
"""
#from selenium import webdriver
from seleniumwire import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
import selenium
import gzip
import traceback
from bs4 import BeautifulSoup
from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *
from datetime import datetime
from requests import session
import pandas as pd
import dataset
import time
import json
import re
import sys, os
import socket
import brotli
import pickle
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import urllib.parse

# True: run a local headless Chrome; False: use a dockerised Remote driver
# reached through host.docker.internal and the selenium-wire proxy port.
chrome_window = True
globalkw = None
proxyport = 8787

# Column order must match the order values are appended in parsing_js().
db_columns = ['author_id', 'author_page', 'author_name', 'author_image',
              'author_review_count', 'review_time', 'review_content',
              'review_image', 'store_review_time', 'store_review']


def write_to_file(jsobj, fname):
    """Pickle *jsobj* to *fname* (debug helper)."""
    with open(fname, 'wb') as handle:
        pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)


def build_cache(db):
    """Return a ``{'<fid>_<author_id>': 1}`` dict of reviews already stored.

    Used by save_js_to_db() to skip duplicate inserts.
    """
    global reviews_table
    id_dict = {}
    cursor = db.query('SELECT fid, author_id FROM google_poi.reviews_table;')
    for c in cursor:
        key = '{}_{}'.format(c['fid'], c['author_id'])
        id_dict[key] = 1
    return id_dict


def brower_start(port):
    """Start and return a selenium-wire Chrome driver.

    With ``chrome_window`` True a local headless Chrome is launched;
    otherwise a Remote driver at ``127.0.0.1:<port>/wd/hub`` is used,
    proxied through ``host.docker.internal:<proxyport>``.
    """
    global proxyport
    global chrome_window
    print(proxyport)
    if chrome_window:
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument("--no-sandbox")
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--disable-dev-shm-usage")
        browser = webdriver.Chrome(options=options)
    else:
        chrome_options = webdriver.ChromeOptions()
        # Specify your Kubernetes service-name here
        chrome_options.add_argument('--proxy-server=host.docker.internal:' + str(proxyport))
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        browser = webdriver.Remote(
            command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
            desired_capabilities=chrome_options.to_capabilities(),
            seleniumwire_options={'addr': '0.0.0.0', 'port': proxyport, 'auto_config': False}
        )
    browser.set_window_size(1400, 1000)
    return browser


def get_next_job(db):
    """Fetch up to five random not-yet-processed shops joined to place ids.

    Returns a DataFrame with an added ``item_url`` Google Maps place URL
    column built from each row's ``place_id``.
    """
    result = {}
    sql = '''select t1.name, t1.ludocid, t1.fid, t1.user_ratings_total, t2.place_id from (select * from shop_list3 where ludocid is NOT NULL and user_ratings_total is NOT NULL and fid not in (select fid from review_process ) ORDER BY RAND() limit 5 ) as t1 join google_poi.swire_store_list as t2 on t1.fid = t2.fid'''
    result = db.query(sql)
    url_pd = pd.DataFrame([dict(i) for i in result])
    url_pd['item_url'] = url_pd['place_id'].apply(
        lambda x: 'https://www.google.com/maps/place/?q=place_id:{}'.format(x))
    return url_pd


def parsing_js(resp):
    """Parse a ``listentitiesreviews`` response body into review dicts.

    *resp* is the decoded response text; the first 5 characters are
    Google's anti-JSON prefix and are stripped before ``json.loads``.
    Returns a list of dicts keyed by ``db_columns`` plus ``crawler_date``.
    """
    jsobj = json.loads(resp[5:])
    result = []
    for review in jsobj[2]:
        tmp = [review[6],          # author_id
               review[0][0],       # author_page
               review[0][1],       # author_name
               review[0][2],       # author_image
               review[12][1][1]]   # author_review_count
        tmp += [review[1], review[3]]  # review_time, review_content
        # Review images (may be absent).
        image = []
        if review[14]:
            for item in review[14]:
                image += [item[6][0]]
        tmp += [image]
        # Store owner reply: (time, text), empty strings when absent.
        if review[9]:
            tmp += [review[9][0], review[9][1]]
        else:
            tmp += ['', '']
        # NOTE: the original reused loop variable `i` for both the review
        # loop and the column loop (a shadowing hazard); zip avoids that.
        tmp_dict = dict(zip(db_columns, tmp))
        tmp_dict['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")
        result.append(tmp_dict)
    # write_to_file(orig,'debug.pickle')
    return result


def save_js_to_db(jsobj, fid):
    """Insert parsed reviews for *fid* into ``reviews_table``.

    Skips (fid, author_id) pairs already present in the ``iddict`` cache;
    insert failures are logged and ignored (best-effort batch).
    """
    global reviews_table
    global iddict
    for r in jsobj:
        r['fid'] = fid
        key = '{}_{}'.format(r['fid'], r['author_id'])
        if iddict.get(key) is not None:
            continue
        try:
            r['review_image'] = str(r['review_image'])
            reviews_table.insert(r)
        except Exception:
            traceback.print_exc()


def process_web_request(driver, fid):
    """Scan captured traffic for the review XHR, decode, parse and persist.

    Returns 1 when a ``listentitiesreviews`` response was processed,
    0 otherwise.  Clears ``driver.requests`` in both cases.
    """
    time.sleep(3)
    print("ppppppppp&**********************")
    for request in driver.requests:
        if request.response:
            if 'listentitiesreviews?' in request.url:
                print('parsing js:')
                print(request.url)
                resp = request.response.body
                # BUG FIX: headers.get() may return None; `'gzip' in None`
                # raised TypeError on responses without Content-Encoding.
                encoding = request.response.headers.get('Content-Encoding') or ''
                if 'gzip' in encoding:
                    resp = gzip.decompress(request.response.body)
                if 'br' in encoding:
                    resp = brotli.decompress(request.response.body)
                jstext = resp.decode('utf-8')
                result = parsing_js(jstext)
                save_js_to_db(result, fid)
                time.sleep(1)
                del driver.requests
                return 1
    del driver.requests
    return 0


def page_down_(driver, xpath_css, time_):
    """Focus the scrollable element matched by *xpath_css* and press
    PAGE_DOWN *time_* times to lazy-load more reviews."""
    elmts = driver.find_elements_by_xpath(xpath_css)
    print(elmts)
    # Prefer the second match when present (first can be a decoy element).
    if len(elmts) > 1:
        elmt = elmts[1]
    else:
        elmt = elmts[0]
    actions = ActionChains(driver)
    actions.move_to_element(elmt).click().perform()
    for i in range(time_):
        try:
            actions = ActionChains(driver)
            actions.send_keys(Keys.PAGE_DOWN).perform()
        except Exception:
            traceback.print_exc()
        time.sleep(0.5)


def get_reviews(driver, reviews_cnt):
    """Open the 'more reviews' pane and scroll enough to load ~reviews_cnt."""
    wait = WebDriverWait(driver, 30)
    more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css)))
    element = driver.find_element_by_css_selector(more_reviews_css)
    driver.implicitly_wait(10)
    ActionChains(driver).move_to_element(element).click(element).perform()
    time.sleep(0.5)
    reviews_cnt = int(reviews_cnt)
    if reviews_cnt > 10:
        # Roughly 3 reviews load per PAGE_DOWN.
        page_down_count = reviews_cnt // 3
        page_down_(driver, '//div[@class="PPCwl"]', page_down_count)


def main():
    """Entry point: connect to DB, start the driver, crawl each job row
    with up to three attempts, and record processed fids."""
    global chrome_window
    global store_list_table
    global reviews_table
    global proxyport
    global iddict

    localip = socket.gethostbyname(socket.gethostname())
    # SECURITY NOTE(review): credentials are hard-coded in the DSN; move
    # them to environment variables / a secrets store.
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    store_list_table = db['swire_store_list']
    reviews_table = db['reviews_table']
    iddict = build_cache(db)

    port = 4444
    if len(sys.argv) == 3:
        port = int(sys.argv[1])
        proxyport = int(sys.argv[2])

    if not chrome_window:
        print('restart docker pw{}'.format(port))
        # os.system('sudo docker container restart p'+str(port))
        os.system('sudo docker container restart pw' + str(port))
        time.sleep(10)

    print('drvier start...')
    driver = brower_start(port)
    job = get_next_job(db)
    for row, group in job.iterrows():
        try:
            item_url = group['item_url']
            reviews_cnt = group['user_ratings_total']
            fid = group['fid']
            print(reviews_cnt, item_url)
            for i in range(3):
                print('reviews try...{}'.format(i))
                print("reviews try.....{}".format(datetime.now()))
                driver.get(item_url)
                time.sleep(0.5)
                get_reviews(driver, reviews_cnt)
                status = process_web_request(driver, fid)
                if status:
                    db['review_process'].insert({'fid': fid, 'dt': datetime.now()})
                    break
        except Exception:
            traceback.print_exc()


if __name__ == '__main__':
    main()