# -*- coding: utf-8 -*-
from seleniumwire import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import traceback
from bs4 import BeautifulSoup
import gzip
from utility.parseutils import *
from utility.connect import *
from datetime import datetime
import pandas as pd
import dataset
import time
import json
import sys
import os
import socket
import brotli
import pickle

chrome_window = False
globalkw = None
proxyport = 8787


def write_to_file(jsobj, fname):
    # Serialize a parsed result to disk for offline debugging.
    with open(fname, 'wb') as handle:
        pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)


def build_cache(db):
    # Cache every fid already stored in shop_list4 so rows are not re-inserted.
    id_dict = {}
    cursor = db.query('SELECT fid FROM google_poi.shop_list4;')
    for c in cursor:
        id_dict['{}'.format(c['fid'])] = 1
    return id_dict


def browser_start(port):
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--no-sandbox')
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--disable-dev-shm-usage')
    browser = webdriver.Chrome(options=options)
    browser.set_window_size(1400, 1000)
    # browser = webdriver.Remote(
    #     command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
    #     desired_capabilities=options.to_capabilities()
    # )
    return browser


def get_next_job(db):
    # 'HKS須重爬店家.csv' lists the stores that must be re-crawled; rows
    # already marked done in progress_list are filtered out, then 500 are
    # sampled for this run. '分店編號' is the branch-ID column of the CSV.
    location_list = pd.read_csv('HKS須重爬店家.csv')
    result = db.query('SELECT * FROM progress_list WHERE check_ = 1')
    url_pd = pd.DataFrame([dict(i) for i in result])
    location_list = location_list[~location_list['分店編號'].isin(url_pd['id_'].to_list())]
    location_list = location_list.sample(500)
    return location_list
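# NOTE (assumption): `intro_list`, `week_list`, and `blank_check` come from the
# wildcard import of utility.parseutils and are not defined in this file. A
# minimal sketch of the shapes the parsers below rely on, with hypothetical
# values:
#
#   intro_list = {'服務選項': ('services', 'service'), ...}  # UI label -> (output column, item key)
#   week_list = {'星期一': 0, '星期二': 1, ...}              # weekday label -> 0-based day index
#   def blank_check(s): return s.strip() if s else ''       # tolerate empty status strings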
def parsing_js(resp):
    # Google's endpoints prepend an anti-XSSI prefix ()]}'\n) to the JSON
    # payload; strip the first five characters before parsing.
    txt = json.loads(resp[5:])
    output = {}

    output['name'] = txt[6][11]
    output['adress_name'] = txt[6][18]
    output['fid'] = txt[6][10]

    if txt[6][4]:
        output['rating'] = str(txt[6][4][7]) if txt[6][4][7] else None
        output['user_ratings_total'] = str(txt[6][4][8]) if txt[6][4][8] else None
        if txt[6][4][2]:
            # Price level is encoded as a run of '$' characters.
            output['price_level'] = str(sum(1 for i in txt[6][4][2] if i == '$'))
        else:
            output['price_level'] = None
    else:
        output['rating'] = None
        output['user_ratings_total'] = None
        output['price_level'] = None

    if txt[6][37][0]:
        output['lon'] = txt[6][37][0][0][8][0][1]
        output['lat'] = txt[6][37][0][0][8][0][2]
    else:
        output['lon'] = None
        output['lat'] = None

    output['tel'] = txt[6][178][0][0] if txt[6][178] else ''
    output['category'] = txt[6][13][0] if txt[6][13] else ''

    try:
        location = txt[6][183][2][2][0]
        if location:
            location_s = location.split(' ')
            output['city'], output['area'] = location_s[-1], location_s[-2]
        else:
            output['city'], output['area'] = '', ''
    except Exception:
        output['city'], output['area'] = '', ''

    if txt[6][100]:
        for item in txt[6][100][1]:
            name = item[1]
            if name not in intro_list.keys():
                continue
            name_map = intro_list[name]
            c = 0
            detail = []
            for t in item[2]:
                value = t[1]
                if t[3] == 1:
                    # '不提供' means "not offered".
                    detail += [{'id': c, name_map[1]: '不提供' + str(value)}]
                else:
                    detail += [{'id': c, name_map[1]: value}]
                c += 1
            output[name_map[0]] = str(detail)

    # Columns without data still need an empty-list placeholder.
    for key in intro_list:
        if intro_list[key][0] not in output.keys():
            output[intro_list[key][0]] = '[]'

    if txt[6][34]:
        output = time_parsing_js(txt[6][34], output)
    else:
        output['open_now'] = 'False'
        output['periods'] = ''
        output['weekday_text'] = ''
        output['time_status'] = ''

    output['header_image'] = txt[6][72][0][0][6][0] if txt[6][72] else ''

    if txt[6][126]:
        output['google_url'] = txt[6][126][4]
        ludocid_str = [i for i in txt[6][126][4].split('&') if i.find('ludocid') != -1]
        if len(ludocid_str) != 0:
            output['ludocid'] = ludocid_str[0].split('=')[-1]
    else:
        output['google_url'] = ''

    # write_to_file(txt, 'debug.pickle')
    return output


def time_parsing_js(time_json, output):
    weekday_text = []
    periods = []

    if time_json is None or time_json[1] is None:
        output['open_now'] = 'False'
        output['periods'] = ''
        output['weekday_text'] = ''
        output['time_status'] = ''
        return output

    for time_ in time_json[1]:
        week = time_[0]
        weekday_text += ['{}: {}'.format(week, ', '.join(time_[1]))]

        for t in time_[1]:
            if t == '24 小時營業':  # "Open 24 hours"
                periods += [{
                    "open": {"day": week_list[week], "time": '0000'},
                    "close": {"day": week_list[week], "time": ''}
                }]
            elif t == '休息':  # "Closed"
                periods += [{
                    "open": {"day": week_list[week], "time": ''},
                    "close": {"day": week_list[week], "time": ''}
                }]
            else:
                start, end = t.split('–')
                end_hour, _ = end.split(':')
                start_hour, _ = start.split(':')

                # Compare hours numerically; comparing the strings would
                # misorder values such as '9' and '10'.
                if int(end_hour) < int(start_hour):
                    end_day = week_list[week] + 1  # closes after midnight
                else:
                    end_day = week_list[week]

                periods += [{
                    "open": {"day": week_list[week], "time": start.replace(':', '')},
                    "close": {"day": end_day, "time": end.replace(':', '')}
                }]

    output['periods'] = str(periods)
    output['weekday_text'] = str(weekday_text)
    output['time_status'] = blank_check(time_json[4][4].split('⋅')[0])

    # '永久停業' = "permanently closed"; '暫時關閉' and '暫停營業' = "temporarily closed".
    if output['time_status'].find('永久停業') != -1 or \
       output['time_status'].find('暫時關閉') != -1 or \
       output['time_status'].find('暫停營業') != -1:
        output['open_now'] = 'False'
    else:
        output['open_now'] = 'True'
    return output


def save_js_to_db(jsobj, fid):
    global shop_table
    global iddict
    # Insert only stores whose fid is not already cached.
    if iddict.get(fid) is None:
        try:
            shop_table.insert(jsobj)
        except Exception:
            traceback.print_exc()


def decode_body(response):
    # Decompress a captured response body according to its Content-Encoding
    # header (defaulting to '' so a missing header does not raise).
    body = response.body
    encoding = response.headers.get('Content-Encoding', '')
    if 'gzip' in encoding:
        body = gzip.decompress(body)
    elif 'br' in encoding:
        body = brotli.decompress(body)
    return body.decode('utf-8')


def process_web_request_start(driver):
    time.sleep(3)
    print("start&**********************")
    for request in driver.requests:
        if request.response:
            if 'place?' in request.url:
                print(request.url)
                jstext = decode_body(request.response)
                output = parsing_js(jstext)
                time.sleep(1)
                del driver.requests
                return output
    del driver.requests
    return 0
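# The five-character slice in the parsers above strips Google's anti-XSSI
# prefix. A tiny sanity check of that assumption on a fabricated payload:
#
#   sample = ")]}'\n[null, 1]"
#   json.loads(sample[5:])   # -> [None, 1]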
def reviews_parsing_js(resp):
    columns_name = ['id', 'author_page', 'author_name', 'profile_photo_url',
                    'author_review_count', 'created_at', 'text', 'photos',
                    'rating', 'store_review_time', 'store_review']
    jsobj = json.loads(resp[5:])
    result = []
    for i in range(len(jsobj[2])):
        tmp = []
        tmp += [jsobj[2][i][6], jsobj[2][i][0][0], jsobj[2][i][0][1],
                jsobj[2][i][0][2], jsobj[2][i][12][1][1]]
        tmp += [jsobj[2][i][1], jsobj[2][i][3]]

        # review photos
        image = []
        if jsobj[2][i][14]:
            for j in range(len(jsobj[2][i][14])):
                image += [jsobj[2][i][14][j][6][0]]
        tmp += [image]

        # rating
        tmp += [jsobj[2][i][4]]

        # store reply (time, text)
        if jsobj[2][i][9]:
            tmp += [jsobj[2][i][9][0], jsobj[2][i][9][1]]
        else:
            tmp += ['', '']

        result.append(list(map(lambda x, y: {x: y}, columns_name, tmp)))
    return result


def process_web_request_reviews(driver, output):
    time.sleep(3)
    print("reviews&**********************")
    for request in driver.requests:
        if request.response:
            if 'listentitiesreviews?' in request.url:
                print(request.url)
                jstext = decode_body(request.response)
                result = reviews_parsing_js(jstext)
                output['reviews'] = str(result)
                time.sleep(1)
                del driver.requests
                return output
    del driver.requests
    return 0


def photos_parsing_js(resp):
    def image_url_change_size(url):
        # Street View thumbnails are left untouched; other photo URLs are
        # rewritten to request a 600px version.
        if url.find('streetviewpixels') != -1:
            return url
        url_split = url.split('=')
        return url_split[0] + '=s600-' + '-'.join(url_split[-1].split('-')[-2:])

    jsobj = json.loads(resp[5:])
    menu = []
    all_photos = []

    photo_category_map = {}
    for row in jsobj[12][0]:
        photo_category_map[row[0]] = row[2]

    if photo_category_map[jsobj[13][0]] == '全部':  # the "All" tab
        for img in jsobj[0]:
            all_photos += [image_url_change_size(img[6][0])]
    elif photo_category_map[jsobj[13][0]] == '菜單':  # the "Menu" tab
        for img in jsobj[0]:
            menu += [image_url_change_size(img[6][0])]

    return list(set(menu)), list(set(all_photos))
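# image_url_change_size swaps the size token in a Google photo URL for s600.
# Illustrative input/output on a hypothetical URL:
#   in:  https://lh3.googleusercontent.com/p/AF1Qip...=w408-h306-k-no
#   out: https://lh3.googleusercontent.com/p/AF1Qip...=s600-k-no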
def process_web_request_photo(driver, output, fid):
    try:
        driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='0']")
        photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
        tab_dict = {}
        for tab_index in [0, 1, 2]:
            selector = photo_soup.select("button[data-tab-index='{}']".format(tab_index))
            if len(selector) != 0:
                photo_name = selector[0].text
                # Only the '菜單' ("Menu") and '全部' ("All") tabs are crawled.
                if photo_name == '菜單' or photo_name == '全部':
                    tab_dict[photo_name] = tab_index
    except Exception:
        tab_dict = {}
    print(tab_dict)

    menu_list = []
    all_list = []
    for tab_ in tab_dict:
        tab_index = tab_dict[tab_]
        print(tab_index)

        wait = WebDriverWait(driver, 60)
        wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
        )
        element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
        ActionChains(driver).move_to_element(element).click(element).perform()
        time.sleep(1)

        print("photo&**********************")
        for request in driver.requests:
            if request.response:
                if 'photo?' in request.url:
                    # Keep only photo responses that belong to this place:
                    # the first half of the fid appears in the request URL.
                    front, _ = fid.split(':')
                    if request.url.find(front) != -1:
                        print(request.url)
                        jstext = decode_body(request.response)
                        menu, all_photos = photos_parsing_js(jstext)
                        menu_list += menu
                        all_list += all_photos
        del driver.requests

    output['shop_photo'] = str(all_list[:5])
    output['menu_photo'] = str(menu_list[:5])
    del driver.requests
    return output
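# Invocation sketch (assumed deployment): an optional Selenium port and proxy
# port can be passed on the command line, and a dockerised browser container
# named pw<port> is restarted before crawling, e.g.
#   python3 this_script.py 4444 8787
# The script also expects 'HKS須重爬店家.csv' next to it and access to the
# MySQL instance configured in main() below.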
def main():
    global chrome_window
    global shop_table
    global proxyport
    global iddict

    localip = socket.gethostbyname(socket.gethostname())

    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    # store_list_table = db['swire_store_list']
    shop_table = db['shop_list4']
    progress_table = db['progress_list']

    iddict = build_cache(db)
    # print("iddict...{}".format(datetime.now()))

    port = 4444
    if len(sys.argv) == 3:
        port = int(sys.argv[1])
        proxyport = int(sys.argv[2])

    if not chrome_window:
        print('restart docker pw{}'.format(port))
        os.system('sudo docker container restart pw' + str(port))
        time.sleep(5)

    print('driver start...')
    driver = browser_start(port)
    job = get_next_job(db)

    for row, group in job.iterrows():
        try:
            print(row)
            # '分店' = branch name, '地址' = address.
            keyword = group['分店'] + group['地址']
            item_url = 'https://www.google.com/maps/place/?q={}'.format(keyword)
            print(item_url)

            # shop info
            print('parsing shop info....')
            output = 0
            for i in range(5):
                print('shop info try...{}'.format(i))
                print("shop info try...{}".format(datetime.now()))
                driver.get(item_url)
                time.sleep(3)

                element = driver.find_elements(By.CSS_SELECTOR, 'div[role="article"]')
                if len(element) != 0:
                    # The query landed on a result list; open the first hit.
                    item_url = element[0].find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
                    print(item_url)
                    driver.get(item_url)
                    time.sleep(3)

                    wait = WebDriverWait(driver, 10)
                    wait.until(EC.element_to_be_clickable((By.ID, 'sb_cb50')))
                    element = driver.find_element(By.ID, 'sb_cb50')
                    driver.implicitly_wait(9)
                    ActionChains(driver).move_to_element(element).click(element).perform()
                    time.sleep(1)
                    driver.back()
                    if driver.current_url == item_url:
                        continue

                print(driver.current_url)
                output = process_web_request_start(driver)
                if output != 0:
                    break

            if output == 0:
                # Every attempt failed to capture a place? response; skip.
                continue

            # reviews
            print('parsing reviews....')
            print("parsing reviews.....{}".format(datetime.now()))
            if not output['user_ratings_total']:
                output['reviews'] = ''
            else:
                for i in range(3):
                    print('reviews try...{}'.format(i))
                    print("reviews try.....{}".format(datetime.now()))

                    wait = WebDriverWait(driver, 30)
                    more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
                    wait.until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
                    )
                    element = driver.find_element(By.CSS_SELECTOR, more_reviews_css)
                    driver.implicitly_wait(10)
                    ActionChains(driver).move_to_element(element).click(element).perform()
                    time.sleep(0.5)

                    output_ = process_web_request_reviews(driver, output)
                    if output_ != 0:
                        output = output_
                        break
                    else:
                        driver.get(item_url)
                        time.sleep(0.5)

            # photos
            print('parsing photo....')
            if output['header_image'] != '':
                for i in range(3):
                    print('photo try...{}'.format(i))
                    print("photo try......{}".format(datetime.now()))
                    driver.get(item_url)
                    time.sleep(0.5)
                    print(driver.current_url)
                    try:
                        wait = WebDriverWait(driver, 30)
                        # '{}的相片' = "photos of {name}".
                        photo_css = "div[aria-label='{}的相片']".format(output['name'])
                        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, photo_css)))
                        element = driver.find_element(By.CSS_SELECTOR, photo_css)
                        ActionChains(driver).move_to_element(element).click(element).perform()
                        output = process_web_request_photo(driver, output, output['fid'])
                        break
                    except Exception:
                        pass
            else:
                output['shop_photo'] = '[]'
                output['menu_photo'] = '[]'

            output['item_url'] = item_url
            output['keyword'] = keyword

            if output['google_url'] == '':
                query_name = output['adress_name'].replace('(', '').replace(')', '').replace(' ', '')
                output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)

            output['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")
            print(output)

            save_js_to_db(output, output['fid'])
            print("save_js_to_db......{}".format(datetime.now()))
            progress_table.insert({'id_': group['分店編號'], 'name': output['name'],
                                   'fid': output['fid'], 'check_': 1})
        except TimeoutException:
            traceback.print_exc()
            continue
        except Exception:
            traceback.print_exc()
            progress_table.insert({'id_': group['分店編號'], 'name': output['name'],
                                   'fid': output['fid'], 'check_': 0})


if __name__ == '__main__':
    main()