123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278 |
- # -*- coding: utf-8 -*-
- #from selenium import webdriver
- from seleniumwire import webdriver
- from selenium.webdriver.common.action_chains import ActionChains
- from selenium.webdriver.common.keys import Keys
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.webdriver.support.wait import WebDriverWait
- from selenium.webdriver.common.by import By
- import selenium
- import gzip
- import traceback
- from bs4 import BeautifulSoup
- from utility import database_access as DA
- from utility.parseutils import *
- from utility.connect import *
- from datetime import datetime
- from requests import session
- import pandas as pd
- import dataset
- import time
- import json
- import re
- import gzip
- import sys, os
- import socket
- import brotli
- import pickle
- from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
- import urllib.parse
# When True, launch a local headless Chrome; when False, connect to a remote
# Selenium hub through a docker-hosted proxy (see brower_start / main).
chrome_window=True
#chrome_window=False
# Not read anywhere in this file; presumably set by other tooling — TODO confirm.
globalkw=None
# Default selenium-wire proxy port; overridden by sys.argv[2] in main().
proxyport=8787
# Column names for reviews_table rows. Order must match the order values are
# appended to `tmp` in parsing_js — do not reorder independently.
db_columns = ['author_id','author_page','author_name', 'author_image', 'author_review_count',
              'review_time', 'review_content', 'review_image',
              'store_review_time','store_review']
def write_to_file(jsobj, fname):
    """Serialize *jsobj* to *fname* with the highest available pickle protocol."""
    with open(fname, 'wb') as sink:
        pickle.dump(jsobj, sink, protocol=pickle.HIGHEST_PROTOCOL)
def build_cache(db):
    """Return a dedup cache of reviews already stored in the database.

    Queries google_poi.reviews_table and maps every '<fid>_<author_id>'
    key to 1, so save_js_to_db can cheaply skip reviews crawled before.
    The original declared `global reviews_table` but never used it; removed.
    """
    cursor = db.query('SELECT fid, author_id FROM google_poi.reviews_table;')
    return {'{}_{}'.format(row['fid'], row['author_id']): 1 for row in cursor}
def brower_start(port):
    """Create and return a selenium-wire Chrome driver.

    With chrome_window=True a local headless Chrome is launched directly;
    otherwise a Remote driver is created against
    http://127.0.0.1:<port>/wd/hub with selenium-wire's proxy listening on
    0.0.0.0:<proxyport>.

    NOTE(review): the name looks like a typo for "browser_start"; kept
    because main() calls it under this spelling.
    """
    global proxyport
    global chrome_window
    print(proxyport)
    options = webdriver.ChromeOptions()
    if chrome_window:
        # Local headless Chrome with flags commonly needed inside containers.
#        browser = webdriver.Chrome(
##            desired_capabilities=options.to_capabilities()
#        )
        options.add_argument('--ignore-certificate-errors')
        options.add_argument("--no-sandbox")
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--disable-dev-shm-usage")
        browser = webdriver.Chrome(
            options=options
#            ,seleniumwire_options={'disable_encoding': True}
#            desired_capabilities=options.to_capabilities()
        )
        browser.set_window_size(1400,1000)
    else:
        # Remote Chrome behind a proxy; host.docker.internal reaches the
        # selenium-wire proxy running on the docker host.
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport))  # Specify your Kubernetes service-name here
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        browser = webdriver.Remote(
            command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
            desired_capabilities=chrome_options.to_capabilities(),
            seleniumwire_options={'addr':'0.0.0.0','port':proxyport,'auto_config': False}
        )
        browser.set_window_size(1400,1000)
    return browser
def get_next_job(db):
    """Pick up to 5 not-yet-processed shops at random as a DataFrame.

    Selects shops from shop_list3 that have a ludocid and a rating count,
    skips fids already recorded in review_process, joins swire_store_list
    for the place_id, and adds an 'item_url' column with the Google Maps
    place URL to crawl. (Removed the dead `result = {}` initializer.)
    """
    sql = '''select t1.name, t1.ludocid, t1.fid, t1.user_ratings_total, t2.place_id from
        (select * from shop_list3 where ludocid is NOT NULL and user_ratings_total is NOT NULL and
        fid not in (select fid from review_process ) ORDER BY RAND() limit 5 )
        as t1 join google_poi.swire_store_list as t2 on t1.fid = t2.fid'''
    result = db.query(sql)
    url_pd = pd.DataFrame([dict(i) for i in result])
    url_pd['item_url'] = url_pd['place_id'].apply(
        lambda x: 'https://www.google.com/maps/place/?q=place_id:{}'.format(x))
    return url_pd
def parsing_js(resp):
    """Parse one 'listentitiesreviews' response into review row dicts.

    *resp* is the raw response text; the leading 5 characters (Google's
    anti-XSSI prefix) are stripped before json.loads. Returns a list of
    dicts keyed by the module-level db_columns plus 'crawler_date'.

    Fixes: the inner column loop reused the outer loop variable `i`
    (confusing shadowing) — replaced with dict(zip(...)); the crawl
    timestamp is now computed once so all rows of a batch agree.
    """
    jsobj = json.loads(resp[5:])
    crawl_ts = datetime.today().strftime("%Y/%m/%d %H:%M")
    result = []
    for review in jsobj[2]:
        # Positional layout of Google's review array — TODO confirm indices
        # against a live payload if Google changes the schema.
        row = [review[6], review[0][0], review[0][1], review[0][2],
               review[12][1][1]]
        row += [review[1], review[3]]
        # Review photos: index 14 is falsy when the review has no images.
        images = []
        if review[14]:
            for photo in review[14]:
                images.append(photo[6][0])
        row.append(images)
        # Owner's reply: (time, text) pair, or empty strings when absent.
        if review[9]:
            row += [review[9][0], review[9][1]]
        else:
            row += ['', '']
        rec = dict(zip(db_columns, row))
        rec['crawler_date'] = crawl_ts
        result.append(rec)
    return result
def save_js_to_db(jsobj, fid):
    """Insert parsed review rows for store *fid*, skipping known duplicates.

    Relies on module globals: reviews_table (dataset table handle) and
    iddict (the '<fid>_<author_id>' cache built by build_cache).

    Fixes: the bare `except:` also swallowed KeyboardInterrupt/SystemExit;
    narrowed to `except Exception` while keeping the best-effort semantics.
    """
    global reviews_table
    global iddict
    for r in jsobj:
        r['fid'] = fid
        key = '{}_{}'.format(r['fid'], r['author_id'])
        if iddict.get(key) is not None:
            continue  # already stored by a previous run
        try:
            # dataset can't store a Python list column; stringify the URLs.
            r['review_image'] = str(r['review_image'])
            reviews_table.insert(r)
        except Exception:
            # Best-effort: log the failure and keep inserting the rest.
            traceback.print_exc()
def process_web_request(driver, fid):
    """Find the captured 'listentitiesreviews' XHR, parse it, store rows.

    Scans driver.requests (selenium-wire capture), decompresses the first
    matching response (gzip or brotli), parses it with parsing_js and saves
    via save_js_to_db. Returns 1 on success, 0 if no payload was captured.
    Clears driver.requests in both cases.

    Fixes: headers.get('Content-Encoding') returns None when the header is
    absent, and `'gzip' in None` raised TypeError — default to ''.
    """
    time.sleep(3)  # give pending XHRs time to complete
    print("ppppppppp&**********************")
    for request in driver.requests:
        if request.response and 'listentitiesreviews?' in request.url:
            print('parsing js:')
            print(request.url)
            resp = request.response.body
            encoding = request.response.headers.get('Content-Encoding') or ''
            if 'gzip' in encoding:
                resp = gzip.decompress(request.response.body)
            if 'br' in encoding:
                resp = brotli.decompress(request.response.body)
            jstext = resp.decode('utf-8')
            result = parsing_js(jstext)
            save_js_to_db(result, fid)
            time.sleep(1)
            # Drop captured traffic so the next page starts clean.
            del driver.requests
            return 1
    del driver.requests
    return 0
def page_down_(driver, xpath_css, time_):
    """Click the element at *xpath_css*, then press PAGE_DOWN *time_* times.

    Prefers the second match when more than one element matches (the first
    is presumably a hidden duplicate in the Maps DOM — TODO confirm).

    Fixes: returns early instead of raising IndexError when nothing matches;
    narrowed the bare `except:` to `except Exception`.
    """
    elmts = driver.find_elements_by_xpath(xpath_css)
    print(elmts)
    if not elmts:
        return  # nothing to scroll
    elmt = elmts[1] if len(elmts) > 1 else elmts[0]
    actions = ActionChains(driver)
    actions.move_to_element(elmt).click().perform()
    for _ in range(time_):
        try:
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
        except Exception:
            traceback.print_exc()
        time.sleep(0.5)
def get_reviews(driver, reviews_cnt):
    """Open the reviews pane and page down far enough to load all reviews."""
    more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
    # Wait (up to 30s) for the "more reviews" button, then click it.
    WebDriverWait(driver, 30).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
    )
    element = driver.find_element_by_css_selector(more_reviews_css)
    driver.implicitly_wait(10)
    ActionChains(driver).move_to_element(element).click(element).perform()
    time.sleep(0.5)

    total = int(reviews_cnt)
    if total > 10:
        # Roughly three reviews load per PAGE_DOWN press.
        page_down_(driver, '//div[@class="PPCwl"]', total // 3)
def main():
    """Crawl Google Maps reviews for a random batch of pending shops.

    Connects to MySQL, builds the dedup cache, (optionally) restarts the
    remote-driver docker container, then for each job retries up to 3 times:
    load the place page, scroll the reviews pane, harvest the captured XHRs
    and mark the fid done in review_process.
    """
    global chrome_window
    global store_list_table
    global reviews_table
    global proxyport
    global iddict
    # localip=socket.gethostbyname(socket.gethostname())
    # NOTE(review): hard-coded DB credentials in source — move to env/config.
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    store_list_table = db['swire_store_list']
    reviews_table = db['reviews_table']
    iddict=build_cache(db)

    # Optional CLI overrides: <hub port> <proxy port>.
    port=4444
    if len(sys.argv) == 3 :
        port=int(sys.argv[1])
        proxyport=int(sys.argv[2])
    if not chrome_window:
        # Remote mode: bounce the per-port browser container to a clean state.
        print('restart docker pw{}'.format(port))
        # os.system('sudo docker container restart p'+str(port))
        os.system('sudo docker container restart pw'+str(port))
        time.sleep(10)
    print('drvier start...')
    driver = brower_start(port)
    job = get_next_job(db)
    for row, group in job.iterrows():
        try:
            item_url = group['item_url']
            reviews_cnt = group['user_ratings_total']
            fid = group['fid']
            print(reviews_cnt, item_url)
            # Retry up to 3 times; stop as soon as a payload was captured.
            for i in range(3):
                print('reviews try...{}'.format(i))
                print("reviews try.....{}".format(datetime.now()))
                driver.get(item_url)
                time.sleep(0.5)
                get_reviews(driver, reviews_cnt)
                status = process_web_request(driver, fid)
                if status:
                    # Record success so this fid is excluded from future jobs.
                    db['review_process'].insert({'fid':fid, 'dt':datetime.now()})
                    break
        except:
            # Keep crawling the remaining jobs even if one shop fails.
            traceback.print_exc()
# Script entry point.
if __name__ == '__main__':
    main()
|