@@ -0,0 +1,546 @@
+# -*- coding: utf-8 -*-
+#from selenium import webdriver
+from seleniumwire import webdriver
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.common.by import By
+import traceback
+from bs4 import BeautifulSoup
+
+from utility.parseutils import *
+from utility.connect import *
+
+from datetime import datetime
+import pandas as pd
+import dataset
+import time
+import json
+import sys, os
+import brotli
+import pickle
+
+chrome_window = False
+globalkw = None
+proxyport = 8787
+
+db_columns = ['id', 'author_page', 'author_name', 'profile_photo_url', 'author_review_count',
+              'created_at', 'text', 'photos', 'store_review_time', 'store_review']
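+
+# Overview: this script drives Chrome through selenium-wire, opens a Google
+# Maps place page, and rebuilds shop info, reviews and photos from the
+# intercepted place? / listentitiesreviews? / photo? XHR payloads.
+# intro_list, week_list and blank_check are expected to come from the
+# wildcard imports above (utility.parseutils / utility.connect).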
+
+
+def write_to_file(jsobj, fname):
+    # pickle a parsed payload to disk for offline inspection
+    with open(fname, 'wb') as handle:
+        pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)
+    # import codecs
+    # fw = codecs.open(fname, 'w', 'utf-8')
+    # fw.write(str(jsobj))
+    # fw.close()
+
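+
+# build_cache: one-off dedupe index. Loads every (fid, author_id) pair already
+# stored in google_poi.reviews_table so previously crawled reviews can be
+# skipped.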
+def build_cache(db):
+    id_dict = {}
+    cursor = db.query('SELECT fid, author_id FROM google_poi.reviews_table;')
+
+    for c in cursor:
+        key = '{}_{}'.format(c['fid'], c['author_id'])
+        id_dict[key] = 1
+    return id_dict
+
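+
+# browser_start: returns a selenium-wire driver. With chrome_window=True a
+# local, visible Chrome is used for debugging; otherwise it attaches to a
+# remote grid at http://127.0.0.1:<port>/wd/hub and routes traffic through
+# the selenium-wire proxy on proxyport so response bodies can be captured.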
+def browser_start(port):
+    global proxyport
+    global chrome_window
+    print(proxyport)
+    options = webdriver.ChromeOptions()
+    if chrome_window:
+        browser = webdriver.Chrome(
+            desired_capabilities=options.to_capabilities()
+        )
+    else:
+        chrome_options = webdriver.ChromeOptions()
+        chrome_options.add_argument('--proxy-server=host.docker.internal:' + str(proxyport))  # specify your Kubernetes service name here if applicable
+        chrome_options.add_argument('--ignore-certificate-errors')
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        browser = webdriver.Remote(
+            command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
+            desired_capabilities=chrome_options.to_capabilities(),
+            seleniumwire_options={'addr': '0.0.0.0', 'port': proxyport, 'auto_config': False}
+        )
+    browser.set_window_size(1400, 1000)
+    return browser
+
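+
+# get_next_job: pulls two random rows from error_list2 (which appears to act
+# as a retry queue); each row carries the item_url, name, num and keyword of
+# one place to crawl.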
+def get_next_job(db):
+    result = db.query('select * from error_list2 ORDER BY RAND() limit 2')
+    url_pd = pd.DataFrame([dict(i) for i in result])
+
+    # url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
+
+    # remove = db.query('select fid from review_process')
+    # remove = pd.DataFrame([dict(i) for i in remove])
+    # remove_fid_list = remove['fid'].to_list()
+
+    # url_pd = url_pd[~url_pd['fid'].isin(remove_fid_list)]
+
+    return url_pd
+
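+
+# parsing_js: decodes a place? payload. Google prefixes the JSON with the
+# `)]}'` anti-hijacking guard, hence the resp[5:] slice. txt[6] is the place
+# entity; the numeric indices below (4 = rating block, 13 = category,
+# 34 = opening hours, 37 = geometry, 72 = header image, 100 = amenities,
+# 178 = phone) were reverse-engineered from observed responses and may shift.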
+def parsing_js(resp, db_name):
+    txt = json.loads(resp[5:])  # drop the `)]}'` guard prefix
+
+    output = {}
+
+    if txt[6][11] != db_name:
+        return 0
+    output['name'] = txt[6][11]
+    output['adress_name'] = txt[6][18]  # (sic) matches the existing DB column
+
+    if txt[6][4]:
+        if txt[6][4][7]:
+            output['rating'] = str(txt[6][4][7])
+        else:
+            output['rating'] = ''
+
+        if txt[6][4][8]:
+            output['user_ratings_total'] = str(txt[6][4][8])
+        else:
+            output['user_ratings_total'] = ''
+
+        if txt[6][4][2]:
+            # price level is encoded as a run of '$' characters
+            output['price_level'] = str(txt[6][4][2].count('$'))
+        else:
+            output['price_level'] = ''
+    else:
+        output['rating'] = ''
+        output['user_ratings_total'] = ''
+        output['price_level'] = ''
+
+    if txt[6][37][0]:
+        output['lon'] = txt[6][37][0][0][8][0][1]
+        output['lat'] = txt[6][37][0][0][8][0][2]
+    else:
+        output['lon'] = None
+        output['lat'] = None
+
+    if txt[6][178]:
+        output['tel'] = txt[6][178][0][0]
+    else:
+        output['tel'] = ''
+
+    if txt[6][13]:
+        output['category'] = txt[6][13][0]
+    else:
+        output['category'] = ''
+
+    try:
+        location = txt[6][183][2][2][0]
+        if location:
+            location_s = location.split(' ')
+            output['city'], output['area'] = location_s[-1], location_s[-2]
+        else:
+            output['city'], output['area'] = '', ''
+    except Exception:
+        output['city'], output['area'] = '', ''
+
+    if txt[6][100]:
+        # amenity sections; intro_list maps Google's section names to
+        # (db_column, value_label) pairs
+        for item in txt[6][100][1]:
+            name = item[1]
+            if name not in intro_list: continue
+            name_map = intro_list[name]
+            c = 0
+            detail = []
+            for t in item[2]:
+                value = t[1]
+                if t[3] == 1:
+                    # t[3] == 1 flags a negated attribute; '不提供' = "not offered"
+                    detail += [{'id': c, name_map[1]: '不提供' + str(value)}]
+                else:
+                    detail += [{'id': c, name_map[1]: value}]
+                c += 1
+            output[name_map[0]] = str(detail)
+
+    for key in intro_list:
+        if intro_list[key][0] not in output.keys():
+            output[intro_list[key][0]] = '[]'
+
+    if txt[6][34]:
+        output = time_parsing_js(txt[6][34], output)
+    else:
+        output['open_now'] = 'False'
+        output['periods'] = ''
+        output['weekday_text'] = ''
+        output['time_status'] = ''
+
+    if txt[6][72]:
+        output['header_image'] = txt[6][72][0][0][6][0]
+    else:
+        output['header_image'] = ''
+
+    print(output)
+    # write_to_file(orig, 'debug.pickle')
+    return output
+
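+
+# time_parsing_js: converts the localized opening-hours block into a
+# Places-style `periods` list plus `weekday_text`. The source strings are
+# Chinese: '24 小時營業' = "open 24 hours", '休息' = "closed", and ranges such
+# as '11:00–21:30' are split on an en dash. week_list (from parseutils) maps
+# weekday names to day numbers.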
+def time_parsing_js(time_json, output):
+    weekday_text = []
+    periods = []
+
+    for time_ in time_json[1]:
+        week = time_[0]
+        weekday_text += ['{}: {}'.format(week, ', '.join(time_[1]))]
+
+        for t in time_[1]:
+            if t == '24 小時營業':  # "open 24 hours"
+                periods += [{
+                    "open": {
+                        "day": week_list[week],
+                        "time": '0000'
+                    },
+                    "close": {
+                        "day": week_list[week],
+                        "time": ''
+                    }
+                }]
+            elif t == '休息':  # "closed"
+                periods += [{
+                    "open": {
+                        "day": week_list[week],
+                        "time": ''
+                    },
+                    "close": {
+                        "day": week_list[week],
+                        "time": ''
+                    }
+                }]
+            else:
+                start, end = t.split('–')  # en dash, e.g. '11:00–21:30'
+                end_hour = end.split(':')[0]
+                start_hour = start.split(':')[0]
+
+                # a closing hour numerically before the opening hour means the
+                # range crosses midnight, so the close falls on the next day
+                if int(end_hour) < int(start_hour):
+                    end_day = week_list[week] + 1
+                else:
+                    end_day = week_list[week]
+
+                periods += [{
+                    "open": {
+                        "day": week_list[week],
+                        "time": start.replace(':', '')
+                    },
+                    "close": {
+                        "day": end_day,
+                        "time": end.replace(':', '')
+                    }
+                }]
+
+    output['periods'] = str(periods)
+    output['weekday_text'] = str(weekday_text)
+    output['time_status'] = blank_check(time_json[4][4].split('⋅')[0])
+
+    # '永久停業' = permanently closed; '暫時關閉' / '暫停營業' = temporarily closed
+    if output['time_status'].find('永久停業') != -1 or \
+       output['time_status'].find('暫時關閉') != -1 or \
+       output['time_status'].find('暫停營業') != -1:
+        output['open_now'] = 'False'
+    else:
+        output['open_now'] = 'True'
+
+    return output
+
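+
+# save_js_to_db: inserts parsed reviews one by one, skipping any
+# (fid, author_id) key already present in the iddict cache. Not called from
+# main() in this version; reviews are stringified into the shop row instead.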
+def save_js_to_db(jsobj, fid):
+    global reviews_table
+    global iddict
+    for r in jsobj:
+        r['fid'] = fid
+        key = '{}_{}'.format(r['fid'], r['author_id'])
+        if iddict.get(key) is not None:
+            continue
+        try:
+            r['review_image'] = str(r['review_image'])
+            reviews_table.insert(r)
+        except Exception:
+            traceback.print_exc()
+
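+
+# process_web_request_start: lets the page fire its XHRs, then scans the
+# requests captured by selenium-wire for the place? call and hands its
+# brotli-compressed body to parsing_js. Returns 0 if no payload matches.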
+def process_web_request_start(driver, db_name):
+    time.sleep(5)
+
+    print("start&**********************")
+    for request in driver.requests:
+        if request.response:
+            if 'place?' in request.url:
+                print('parsing js:')
+                print(request.url)
+                resp = brotli.decompress(request.response.body)
+                jstext = resp.decode('utf-8')
+                output = parsing_js(jstext, db_name)
+                time.sleep(1)
+                return output
+    return 0
+
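+
+# reviews_parsing_js: decodes a listentitiesreviews? payload. jsobj[2] is the
+# review list; per review, the indices used below are 0 = author block,
+# 1 = time, 3 = text, 4 = rating, 9 = store reply, 12 = author stats,
+# 14 = attached images.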
+def reviews_parsing_js(resp):
+    columns_name = ['author_id', 'author_page', 'author_name', 'author_image', 'author_review_count',
+                    'review_time', 'review_content', 'review_image',
+                    'rating', 'store_review_time', 'store_review']
+    jsobj = json.loads(resp[5:])
+    result = []
+    for i in range(len(jsobj[2])):
+        tmp = []
+        # author id / page / name / avatar / review count
+        tmp += [jsobj[2][i][6], jsobj[2][i][0][0], jsobj[2][i][0][1], jsobj[2][i][0][2], jsobj[2][i][12][1][1]]
+        # review time and text
+        tmp += [jsobj[2][i][1], jsobj[2][i][3]]
+
+        # attached images
+        image = []
+        if jsobj[2][i][14]:
+            for j in range(len(jsobj[2][i][14])):
+                image += [jsobj[2][i][14][j][6][0]]
+        tmp += [image]
+
+        # rating
+        tmp += [jsobj[2][i][4]]
+
+        # store reply (time, text)
+        if jsobj[2][i][9]:
+            tmp += [jsobj[2][i][9][0], jsobj[2][i][9][1]]
+        else:
+            tmp += ['', '']
+
+        # one flat dict per review, keyed by column name
+        # (save_js_to_db expects r['author_id'] etc.)
+        result.append(dict(zip(columns_name, tmp)))
+
+    return result
+
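+
+# process_web_request_reviews: once the "more reviews" pane is open, collects
+# every captured listentitiesreviews? response and stores the parsed result
+# (stringified) under output['reviews'].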
+def process_web_request_reviews(driver, output):
+    time.sleep(3.8)
+
+    print("reviews&**********************")
+    for request in driver.requests:
+        if request.response:
+            if 'listentitiesreviews?' in request.url:
+                print('parsing js:')
+                print(request.url)
+                resp = brotli.decompress(request.response.body)
+                jstext = resp.decode('utf-8')
+                result = reviews_parsing_js(jstext)
+                output['reviews'] = str(result)
+                time.sleep(1)
+
+    return output
+
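+
+# photos_parsing_js: decodes a photo? payload into image URLs, rewriting each
+# URL's size token to s600. jsobj[10] appears to distinguish the tabs:
+# 0 for the "all photos" tab, anything else for the menu tab.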
+def photos_parsing_js(resp, c):
+    def image_url_change_size(url):
+        # rewrite the size token so the photo is fetched at 600px
+        url_split = url.split('=')
+        new_url = url_split[0] + '=s600-' + '-'.join(url_split[-1].split('-')[-2::])
+        return new_url
+
+    jsobj = json.loads(resp[5:])
+    # write_to_file(jsobj, 'tmp/debug_{}.pickle'.format(c))
+
+    menu = []
+    all_photos = []  # avoids shadowing the builtin all()
+
+    if jsobj[10] == 0:
+        for img in jsobj[0]:
+            all_photos += [image_url_change_size(img[6][0])]
+    else:
+        for img in jsobj[0]:
+            menu += [image_url_change_size(img[6][0])]
+
+    return menu, all_photos
+
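+
+# process_web_request_photo: locates the photo tab buttons (菜單 = "menu",
+# 全部 = "all"), clicks each so its photo? XHRs fire, then aggregates the
+# captured payloads into output['shop_photo'] and output['menu_photo'].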
+def process_web_request_photo(driver, output):
+    try:
+        driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='0']")
+        photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
+        tab_dict = {}
+        for tab_index in [0, 1, 2]:
+            selector = photo_soup.select("button[data-tab-index='{}']".format(tab_index))
+            if len(selector) != 0:
+                photo_name = selector[0].text
+                if photo_name == '菜單':  # "menu"
+                    tab_dict[photo_name] = tab_index
+                elif photo_name == '全部':  # "all"
+                    tab_dict[photo_name] = tab_index
+    except Exception:
+        tab_dict = {}
+    print(tab_dict)
+
+    # click each tab so its photo? XHRs get captured
+    for tab_ in tab_dict:
+        tab_index = tab_dict[tab_]
+        print(tab_index)
+        wait = WebDriverWait(driver, 60)
+        wait.until(
+            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
+        )
+        element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
+        ActionChains(driver).move_to_element(element).click(element).perform()
+        time.sleep(2)
+
+    print("photo&**********************")
+    menu_list = []
+    all_list = []
+    c = 0  # photo? payload counter, used for the debug dump filename
+    for request in driver.requests:
+        if request.response:
+            if 'photo?' in request.url:
+                print('parsing js:')
+                print(request.url)
+                resp = brotli.decompress(request.response.body)
+                jstext = resp.decode('utf-8')
+                menu, all_photos = photos_parsing_js(jstext, c)
+                menu_list += menu
+                all_list += all_photos
+                c += 1
+
+    output['shop_photo'] = str(all_list)
+    output['menu_photo'] = str(menu_list)
+
+    return output
+
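+
+# main: optional argv overrides (<selenium port> <proxy port>), restarts the
+# paired docker container, then for each job: load the place page, parse shop
+# info (up to 5 tries), expand and parse reviews (3 tries), open the photo
+# viewer (3 tries), and upsert the assembled row into shop_list4.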
+def main():
+    global chrome_window
+    global store_list_table
+    global reviews_table
+    global proxyport
+    global iddict
+
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
+    store_list_table = db['swire_store_list']
+    shop_table = db['shop_list4']
+
+    iddict = build_cache(db)
+
+    port = 4444
+    if len(sys.argv) == 3:
+        port = int(sys.argv[1])
+        proxyport = int(sys.argv[2])
+    if not chrome_window:
+        print('restart docker pw{}'.format(port))
+        # os.system('sudo docker container restart p' + str(port))
+        os.system('sudo docker container restart pw' + str(port))
+        time.sleep(10)
+
+    print('driver start...')
+    driver = browser_start(port)
+
+    job = get_next_job(db)
+
+    for row, group in job.iterrows():
+        try:
+            item_url = group['item_url']
+            name = group['name']
+            num = group['num']
+            keyword = group['keyword']
+
+            if name:
+                db_name = name
+            else:
+                db_name = num
+
+            print(name, num, keyword, db_name)
+            print(item_url)
+
+            # shop info
+            print('parsing shop info....')
+            output = 0
+            for i in range(5):
+                print('shop info try...{}'.format(i))
+                driver.get(item_url)
+                time.sleep(2)
+
+                element = driver.find_element(By.ID, 'searchbox-searchbutton')
+                driver.implicitly_wait(10)
+                ActionChains(driver).move_to_element(element).click(element).perform()
+                time.sleep(5)
+                driver.back()
+
+                if driver.current_url == item_url: continue
+                print(driver.current_url)
+                output = process_web_request_start(driver, db_name)
+                if output != 0: break
+            if output == 0:
+                # no matching place? payload after five tries; skip this job
+                continue
+
+            # reviews
+            print('parsing reviews....')
+            if output['user_ratings_total'] == '':
+                output['reviews'] = ''
+            else:
+                for i in range(3):
+                    print('reviews try...{}'.format(i))
+                    try:
+                        wait = WebDriverWait(driver, 30)
+                        more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
+                        wait.until(
+                            EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
+                        )
+                        element = driver.find_element(By.CSS_SELECTOR, more_reviews_css)
+                        driver.implicitly_wait(10)
+                        ActionChains(driver).move_to_element(element).click(element).perform()
+                        time.sleep(0.5)
+
+                        output = process_web_request_reviews(driver, output)
+                        break
+                    except Exception:
+                        driver.get(item_url)
+                        time.sleep(0.5)
+
+            # photos
+            print('parsing photo....')
+            if output['header_image'] != '':
+                for i in range(3):
+                    print('photo try...{}'.format(i))
+                    driver.get(item_url)
+                    time.sleep(0.5)
+                    print(driver.current_url)
+                    try:
+                        wait = WebDriverWait(driver, 30)
+                        # '{}的相片' = "photos of {}"
+                        wait.until(
+                            EC.element_to_be_clickable((By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name'])))
+                        )
+                        element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
+                        ActionChains(driver).move_to_element(element).click(element).perform()
+
+                        output = process_web_request_photo(driver, output)
+                        break
+                    except Exception:
+                        pass
+            else:
+                output['shop_photo'] = '[]'
+                output['menu_photo'] = '[]'
+
+            print(output)
+            query_name = output['adress_name'].replace('(', '').replace(')', '').replace(' ', '')
+            output['item_url'] = item_url
+            output['keyword'] = keyword
+            output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
+            output['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")
+
+            # upsert keyed on item_url so a re-crawl updates the existing row
+            # (dataset's insert() does not take a key list)
+            shop_table.upsert(output, ['item_url'])
+        except Exception:
+            traceback.print_exc()
+
+
+if __name__ == '__main__':
+    main()