123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607 |
- # -*- coding: utf-8 -*-
- #from selenium import webdriver
- #from tkinter.tix import TEXT
- from seleniumwire import webdriver
- from selenium.webdriver.common.action_chains import ActionChains
- from selenium.webdriver.common.keys import Keys
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.webdriver.support.wait import WebDriverWait
- from selenium.webdriver.common.by import By
- from selenium.common.exceptions import TimeoutException
- from selenium.common.exceptions import WebDriverException
- import selenium
- import traceback
- from bs4 import BeautifulSoup
- import gzip
- from utility import database_access as DA
- from utility.parseutils import *
- from utility.connect import *
- import redis
- from datetime import datetime
- from requests import session
- import pandas as pd
- import dataset
- import time
- import json
- import re
- import sys, os
- import socket
- import brotli
- import pickle
- from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
- import urllib.parse
# When False, main() restarts the `pw<port>` docker container before crawling.
chrome_window = False
# Placeholder global; nothing in the visible code reads or writes it.
globalkw = None
# Default proxy port; main() replaces it with sys.argv[2] when two CLI
# arguments are supplied.
proxyport = 8787
def write_to_file(jsobj, fname):
    """Serialize ``jsobj`` with pickle (highest protocol) into the file ``fname``.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(fname, 'wb') as sink:
        sink.write(pickle.dumps(jsobj, protocol=pickle.HIGHEST_PROTOCOL))
def build_cache(db):
    """Build a look-up of the shop ids already stored in ``google_poi.shop_list4``.

    Args:
        db: object exposing ``query(sql)`` that yields rows indexable by
            column name.

    Returns:
        dict mapping each row's ``fid`` (formatted as a string) to ``1``.
    """
    rows = db.query('SELECT fid FROM google_poi.shop_list4;')
    return {'{}'.format(row['fid']): 1 for row in rows}
def brower_start(port):
    """Start a headless Chrome driver and return it.

    ``port`` is kept in the signature for callers, but the local-Chrome
    start-up below does not use it (only the previously commented-out
    remote-driver variant did).
    """
    chrome_flags = (
        '--ignore-certificate-errors',
        "--no-sandbox",
        "--headless",
        "--disable-gpu",
        "--disable-dev-shm-usage",
    )
    options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)

    browser = webdriver.Chrome(options=options)
    browser.set_window_size(1400, 1000)
    return browser
def get_next_job(db, csv_path='HKS須重爬店家.csv', sample_size=500):
    """Pick a random batch of shops that still need to be crawled.

    Shops are read from ``csv_path``; every shop whose ``分店編號`` already
    appears in ``progress_list`` with ``check_ = 1`` is dropped, and a random
    sample of the remainder is returned.

    Args:
        db: object exposing ``query(sql)`` that yields mapping-like rows
            containing an ``id_`` column.
        csv_path: CSV file listing the shops; must contain a ``分店編號`` column.
        sample_size: maximum number of rows to return.

    Returns:
        pandas.DataFrame with at most ``sample_size`` rows (possibly empty).
    """
    location_list = pd.read_csv(csv_path)

    # Collect finished ids directly: building a DataFrame from zero rows has
    # no `id_` column and used to raise KeyError on a fresh progress table.
    finished_ids = [
        dict(row)['id_']
        for row in db.query('SELECT * FROM progress_list WHERE check_ = 1')
    ]
    pending = location_list[~location_list['分店編號'].isin(finished_ids)]

    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # which happened as soon as fewer than `sample_size` shops were left.
    return pending.sample(min(sample_size, len(pending)))
def parsing_js(resp):
    """Convert the text of a Google Maps ``place?`` response into a flat dict.

    Args:
        resp: response text; the first five characters are skipped before the
            remainder is decoded as JSON (presumably the anti-JSON-hijacking
            prefix -- TODO confirm).

    Returns:
        dict with the keys ``name``, ``adress_name``, ``fid``, ``rating``,
        ``user_ratings_total``, ``price_level``, ``lon``, ``lat``, ``tel``,
        ``category``, ``city``, ``area``, one key per ``intro_list`` entry,
        ``open_now``, ``periods``, ``weekday_text``, ``time_status``,
        ``header_image`` and ``google_url``; ``ludocid`` is only present when
        the google url carries a ``ludocid`` query parameter.

    Raises:
        IndexError / TypeError: when the payload does not have the positional
            layout indexed below (only the city/area lookup is guarded).
    """
    txt = json.loads(resp[5::])
    place = txt[6]  # every field below lives under this entry
    output = {}
    output['name'] = place[11]
    output['adress_name'] = place[18]
    output['fid'] = place[10]

    rating_info = place[4]
    if rating_info:
        output['rating'] = str(rating_info[7]) if rating_info[7] else None
        output['user_ratings_total'] = str(rating_info[8]) if rating_info[8] else None
        if rating_info[2]:
            # price level is the number of '$' symbols in the price marker
            output['price_level'] = str(sum(1 for ch in rating_info[2] if ch == '$'))
        else:
            output['price_level'] = None
    else:
        output['rating'] = None
        output['user_ratings_total'] = None
        output['price_level'] = None

    if place[37][0]:
        coords = place[37][0][0][8][0]
        output['lon'] = coords[1]
        output['lat'] = coords[2]
    else:
        output['lon'] = None
        output['lat'] = None

    output['tel'] = place[178][0][0] if place[178] else ''
    output['category'] = place[13][0] if place[13] else ''

    try:
        location = place[183][2][2][0]
        if location:
            location_s = location.split(' ')
            output['city'], output['area'] = location_s[-1], location_s[-2]
        else:
            output['city'], output['area'] = '', ''
    except (IndexError, KeyError, TypeError, AttributeError):
        # Missing or differently shaped location data is expected; anything
        # else (e.g. KeyboardInterrupt) must not be swallowed here.
        output['city'], output['area'] = '', ''

    if place[100]:
        for item in place[100][1]:
            name = item[1]
            if name not in intro_list:
                continue
            # name_map[0] is the output column, name_map[1] the per-entry key
            name_map = intro_list[name]
            detail = []
            for idx, entry in enumerate(item[2]):
                value = entry[1]
                if entry[3] == 1:
                    detail.append({'id': idx, name_map[1]: '不提供' + str(value)})
                else:
                    detail.append({'id': idx, name_map[1]: value})
            output[name_map[0]] = str(detail)

    # Every intro column must exist, even when the payload did not mention it.
    for key in intro_list:
        if intro_list[key][0] not in output:
            output[intro_list[key][0]] = '[]'

    if place[34]:
        output = time_parsing_js(place[34], output)
    else:
        output['open_now'] = 'False'
        output['periods'] = ''
        output['weekday_text'] = ''
        output['time_status'] = ''

    output['header_image'] = place[72][0][0][6][0] if place[72] else ''

    if place[126]:
        output['google_url'] = place[126][4]
        ludocid_str = [part for part in place[126][4].split('&') if 'ludocid' in part]
        if ludocid_str:
            output['ludocid'] = ludocid_str[0].split('=')[-1]
    else:
        output['google_url'] = ''
    return output
def time_parsing_js(time_json, output):
    """Fill the opening-hours fields of ``output`` from the hours payload.

    Writes ``periods`` and ``weekday_text`` (both ``str()`` of a list),
    ``time_status`` and ``open_now`` (the strings ``'True'`` / ``'False'``).

    Args:
        time_json: hours section of the place payload, or ``None``.
            ``time_json[1]`` holds ``[weekday, [time text, ...]]`` entries and
            ``time_json[4][4]`` the status text.
        output: dict to update in place.

    Returns:
        The same ``output`` dict.

    Raises:
        KeyError: a weekday label is not a key of ``week_list``.
        ValueError: a time range is not of the form ``start–end`` with
            ``hour:minute`` on both sides.
    """
    if time_json is None or time_json[1] is None:
        output['open_now'] = 'False'
        output['periods'] = ''
        output['weekday_text'] = ''
        output['time_status'] = ''
        return output

    weekday_text = []
    periods = []
    for time_ in time_json[1]:
        week = time_[0]
        weekday_text.append('{}: {}'.format(week, ', '.join(time_[1])))

        for t in time_[1]:
            day = week_list[week]
            if t == '24 小時營業':
                open_time, close_day, close_time = '0000', day, ''
            elif t == '休息':
                open_time, close_day, close_time = '', day, ''
            else:
                start, end = t.split('–')
                end_hour, end_min = end.split(':')
                start_hour, start_min = start.split(':')
                try:
                    # Compare hours numerically: as strings '10' < '9', which
                    # wrongly marked "9:00–10:00" as closing the next day.
                    past_midnight = int(end_hour) < int(start_hour)
                except ValueError:
                    # Non-numeric hour text: keep the previous lexical rule.
                    past_midnight = end_hour < start_hour
                # NOTE(review): day + 1 is not wrapped around the end of the
                # week; confirm the value range of week_list before changing.
                close_day = day + 1 if past_midnight else day
                open_time = start.replace(':', '')
                close_time = end.replace(':', '')
            periods.append({
                "open": {"day": day, "time": open_time},
                "close": {"day": close_day, "time": close_time},
            })

    output['periods'] = str(periods)
    output['weekday_text'] = str(weekday_text)
    output['time_status'] = blank_check(time_json[4][4].split('⋅')[0])

    # Permanently closed / temporarily closed / suspended all count as closed.
    closed_markers = ('永久停業', '暫時關閉', '暫停營業')
    if any(marker in output['time_status'] for marker in closed_markers):
        output['open_now'] = 'False'
    else:
        output['open_now'] = 'True'
    return output
def save_js_to_db(jsobj, fid):
    """Insert ``jsobj`` into the global ``shop_table`` unless ``fid`` is cached.

    The global ``iddict`` holds the fids that are already stored. After a
    successful insert the fid is added to it, so a shop met twice in one run
    is only written once. Insert errors are printed and otherwise ignored
    (best effort); the fid is not cached in that case.

    Args:
        jsobj: row dict to insert.
        fid: shop id used as the cache key.
    """
    global shop_table
    global iddict
    if iddict.get(fid) is not None:
        return
    try:
        shop_table.insert(jsobj)
    except Exception:
        traceback.print_exc()
    else:
        iddict[fid] = 1
def process_web_request_start(driver):
    """Parse the first captured ``place?`` response of ``driver``.

    Waits three seconds, then scans ``driver.requests`` for the first request
    that has a response and whose URL contains ``place?``; its body is
    decompressed according to ``Content-Encoding``, decoded as UTF-8 and
    handed to ``parsing_js``. The captured requests are cleared before
    returning.

    Returns:
        The dict produced by ``parsing_js``, or ``0`` when no matching
        response was captured.
    """
    time.sleep(3)
    print("start&**********************")
    for request in driver.requests:
        if not request.response:
            continue
        if 'place?' not in request.url:
            continue
        print(request.url)

        # The header can be absent; `'gzip' in None` used to raise TypeError.
        encoding = request.response.headers.get('Content-Encoding') or ''
        resp = request.response.body
        if 'gzip' in encoding:
            resp = gzip.decompress(request.response.body)
        if 'br' in encoding:
            resp = brotli.decompress(request.response.body)
        jstext = resp.decode('utf-8')
        output = parsing_js(jstext)
        time.sleep(1)
        del driver.requests
        return output
    del driver.requests
    return 0
def reviews_parsing_js(resp):
    """Convert the text of a ``listentitiesreviews?`` response into review rows.

    Args:
        resp: response text; the first five characters are skipped before the
            remainder is decoded as JSON.

    Returns:
        list with one entry per review. Each entry is a list of single-key
        dicts, in the order ``id``, ``author_page``, ``author_name``,
        ``profile_photo_url``, ``author_review_count``, ``created_at``,
        ``text``, ``photos`` (list of urls), ``rating``,
        ``store_review_time``, ``store_review`` (both ``''`` without a reply).
        An empty list is returned when the payload carries no review list.
    """
    columns_name = ['id', 'author_page', 'author_name', 'profile_photo_url', 'author_review_count',
                    'created_at', 'text', 'photos', 'rating', 'store_review_time', 'store_review']
    jsobj = json.loads(resp[5::])
    result = []
    # A null review list previously crashed on len(None).
    for review in jsobj[2] or []:
        author = review[0]
        photos = [photo[6][0] for photo in review[14]] if review[14] else []
        if review[9]:
            reply_time, reply_text = review[9][0], review[9][1]
        else:
            reply_time, reply_text = '', ''
        values = [
            review[6], author[0], author[1], author[2], review[12][1][1],
            review[1], review[3],
            photos,
            review[4],
            reply_time, reply_text,
        ]
        result.append([{column: value} for column, value in zip(columns_name, values)])
    return result
def process_web_request_reviews(driver, output):
    """Attach the reviews from the first captured reviews response to ``output``.

    Waits three seconds, then scans ``driver.requests`` for the first request
    that has a response and whose URL contains ``listentitiesreviews?``. Its
    body is decompressed according to ``Content-Encoding``, decoded as UTF-8,
    parsed with ``reviews_parsing_js`` and stored as ``output['reviews']``
    (``str()`` of the parsed list). The captured requests are cleared before
    returning.

    Returns:
        ``output`` when a reviews response was found, otherwise ``0``.
    """
    time.sleep(3)
    print("reviews&**********************")
    for request in driver.requests:
        if not request.response:
            continue
        if 'listentitiesreviews?' not in request.url:
            continue
        print(request.url)

        # The header can be absent; `'gzip' in None` used to raise TypeError.
        encoding = request.response.headers.get('Content-Encoding') or ''
        resp = request.response.body
        if 'gzip' in encoding:
            resp = gzip.decompress(request.response.body)
        if 'br' in encoding:
            resp = brotli.decompress(request.response.body)
        jstext = resp.decode('utf-8')
        result = reviews_parsing_js(jstext)
        output['reviews'] = str(result)
        time.sleep(1)
        del driver.requests
        return output
    del driver.requests
    return 0
def photos_parsing_js(resp):
    """Extract menu and general photo urls from the text of a ``photo?`` response.

    The payload names its photo category through ``jsobj[13][0]``, resolved
    with the id -> label table in ``jsobj[12][0]``. Photos of category
    ``'全部'`` go to the second list, photos of category ``'菜單'`` to the
    first; any other category yields two empty lists.

    Args:
        resp: response text; the first five characters are skipped before the
            remainder is decoded as JSON.

    Returns:
        ``(menu_urls, all_urls)`` -- two lists of resized urls, duplicates
        removed, in payload order.

    Raises:
        KeyError: the payload's category id is missing from its own table.
    """
    def image_url_change_size(url):
        # Street-view thumbnails are kept untouched; other urls get their
        # size directive replaced by s600, keeping the last two '-' options.
        if url.find('streetviewpixels') != -1:
            return url
        url_split = url.split('=')
        return url_split[0] + '=s600-' + '-'.join(url_split[-1].split('-')[-2::])

    jsobj = json.loads(resp[5::])
    menu = []
    all_photos = []
    photo_category_map = {row[0]: row[2] for row in jsobj[12][0]}
    category = photo_category_map[jsobj[13][0]]
    images = jsobj[0] or []  # tolerate a null photo list
    if category == '全部':
        all_photos = [image_url_change_size(img[6][0]) for img in images]
    elif category == '菜單':
        menu = [image_url_change_size(img[6][0]) for img in images]

    # dict.fromkeys de-duplicates while keeping order. list(set(...)) returned
    # an arbitrary order, so callers slicing the first few urls got a
    # different selection on every run.
    return list(dict.fromkeys(menu)), list(dict.fromkeys(all_photos))
-
def process_web_request_photo(driver, output, fid):
    """Collect shop and menu photo urls for the place currently shown.

    Steps:
      1. Read the first three photo tabs from the page and remember the index
         of the ``'菜單'`` (menu) and ``'全部'`` (all) tabs.
      2. Click each remembered tab so the page issues its ``photo?`` requests.
      3. Parse every captured ``photo?`` response whose URL contains the part
         of ``fid`` before the first ``:``.
      4. Store the first five urls of each kind in ``output['shop_photo']``
         and ``output['menu_photo']`` (``str()`` of a list) and clear the
         captured requests.

    Args:
        driver: selenium-wire driver positioned on the place's photo view.
        output: dict to update in place.
        fid: shop id of the form ``<front>:<rest>``.

    Returns:
        The updated ``output`` dict.
    """
    tab_dict = {}
    try:
        # Raises when the photo view has no tab buttons at all.
        driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='0']")
        photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
        for tab_index in [0, 1, 2]:
            selector = photo_soup.select("button[data-tab-index='{}']".format(tab_index))
            if len(selector) != 0:
                photo_name = selector[0].text
                if photo_name in ('菜單', '全部'):
                    tab_dict[photo_name] = tab_index
    except Exception:
        # No usable tabs: fall through and parse whatever was already captured.
        tab_dict = {}
    print(tab_dict)

    for tab_index in tab_dict.values():
        print(tab_index)
        tab_css = "button[data-tab-index='{}']".format(tab_index)
        wait = WebDriverWait(driver, 60)
        wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, tab_css))
        )
        element = driver.find_element(By.CSS_SELECTOR, tab_css)
        ActionChains(driver).move_to_element(element).click(element).perform()
        time.sleep(1)

    print("photo&**********************")
    menu_list = []
    all_list = []
    front = fid.split(':')[0]  # loop-invariant, so computed once
    for request in driver.requests:
        if not request.response:
            continue
        if 'photo?' not in request.url:
            continue
        if request.url.find(front) == -1:
            continue
        print(request.url)

        # The header can be absent; `'gzip' in None` used to raise TypeError.
        encoding = request.response.headers.get('Content-Encoding') or ''
        resp = request.response.body
        if 'gzip' in encoding:
            resp = gzip.decompress(request.response.body)
        if 'br' in encoding:
            resp = brotli.decompress(request.response.body)
        jstext = resp.decode('utf-8')
        menu_urls, all_urls = photos_parsing_js(jstext)
        menu_list += menu_urls
        all_list += all_urls

    output['shop_photo'] = str(all_list[:5])
    output['menu_photo'] = str(menu_list[:5])
    del driver.requests
    return output
-
def _crawl_shop_info(driver, item_url):
    """Load the place page (up to five tries) and parse its ``place?`` response.

    Returns ``(output, item_url)``: ``output`` is the dict from
    ``process_web_request_start`` or ``0`` when every try failed;
    ``item_url`` is the url finally used (a search-result link replaces the
    query url when the query lands on a result list).
    """
    output = 0
    for i in range(5):
        print('shop info try...{}'.format(i))
        print("shop info try...{}".format(datetime.now()))
        driver.get(item_url)
        time.sleep(3)
        element = driver.find_elements(By.CSS_SELECTOR, 'div[role="article"]')
        if len(element) != 0:
            item_url = element[0].find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
            print(item_url)
            driver.get(item_url)
            time.sleep(3)
        wait = WebDriverWait(driver, 10)
        wait.until(
            EC.element_to_be_clickable((By.ID, 'sb_cb50'))
        )
        element = driver.find_element(By.ID, 'sb_cb50')
        driver.implicitly_wait(9)
        ActionChains(driver).move_to_element(element).click(element).perform()
        time.sleep(1)
        driver.back()
        if driver.current_url == item_url:
            continue
        print(driver.current_url)
        output = process_web_request_start(driver)
        if output != 0:
            break
    return output, item_url


def _crawl_reviews(driver, output, item_url):
    """Open the reviews pane (up to three tries) and store ``output['reviews']``."""
    if not output['user_ratings_total']:
        output['reviews'] = ''
        return output
    more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
    for i in range(3):
        print('reviews try...{}'.format(i))
        print("reviews try.....{}".format(datetime.now()))
        wait = WebDriverWait(driver, 30)
        wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
        )
        element = driver.find_element(By.CSS_SELECTOR, more_reviews_css)
        driver.implicitly_wait(10)
        ActionChains(driver).move_to_element(element).click(element).perform()
        time.sleep(0.5)
        output_ = process_web_request_reviews(driver, output)
        if output_ != 0:
            output = output_
            break
        driver.get(item_url)
        time.sleep(0.5)
    # Keep the row schema stable when no reviews response was ever captured.
    output.setdefault('reviews', '')
    return output


def _crawl_photos(driver, output, item_url):
    """Open the photo view (up to three tries) and store the photo url fields."""
    if output['header_image'] != '':
        photo_css = "div[aria-label='{}的相片']".format(output['name'])
        for i in range(3):
            print('photo try...{}'.format(i))
            print("photo try......{}".format(datetime.now()))
            driver.get(item_url)
            time.sleep(0.5)
            print(driver.current_url)
            try:
                wait = WebDriverWait(driver, 30)
                wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, photo_css))
                )
                element = driver.find_element(By.CSS_SELECTOR, photo_css)
                ActionChains(driver).move_to_element(element).click(element).perform()
                output = process_web_request_photo(driver, output, output['fid'])
                break
            except Exception:
                # Best effort: report the failure and retry.
                traceback.print_exc()
    # Shops without a header image, or whose photo view never loaded, still
    # get both columns.
    output.setdefault('shop_photo', '[]')
    output.setdefault('menu_photo', '[]')
    return output


def _record_failure(progress_table, group, output):
    """Insert a ``check_ = 0`` progress row, tolerating a missing ``output``."""
    known = output if isinstance(output, dict) else {}
    progress_table.insert({'id_': group['分店編號'],
                           'name': known.get('name'),
                           'fid': known.get('fid'),
                           'check_': 0})


def main():
    """Crawl a batch of shops from Google Maps and store them in the database.

    Command line: ``<script> [port proxyport]`` -- both values are only read
    when exactly two arguments are given; ``port`` defaults to 4444.

    For every shop returned by ``get_next_job`` the shop info, reviews and
    photos are scraped, the row is saved through ``save_js_to_db`` and a
    ``progress_list`` row is written with ``check_ = 1``. A Selenium timeout
    skips the shop without a progress row; any other error writes a
    ``check_ = 0`` row.
    """
    global chrome_window
    global store_list_table
    global shop_table
    global proxyport
    global iddict
    # NOTE(review): database credentials are hard-coded in source; consider
    # moving them to configuration or the environment.
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    shop_table = db['shop_list4']
    progress_table = db['progress_list']
    iddict = build_cache(db)
    port = 4444
    if len(sys.argv) == 3:
        port = int(sys.argv[1])
        proxyport = int(sys.argv[2])
    if not chrome_window:
        print('restart docker pw{}'.format(port))
        os.system('sudo docker container restart pw' + str(port))
        time.sleep(5)
    print('drvier start...')
    driver = brower_start(port)
    try:
        job = get_next_job(db)
        for row, group in job.iterrows():
            # Reset per shop: the error handler must never see the previous
            # shop's data (or an unbound name) when this shop fails early.
            output = None
            try:
                print(row)
                keyword = group['分店'] + group['地址']
                # NOTE(review): keyword is not URL-encoded; '&' or '#' in a
                # name/address would truncate the query -- confirm before
                # changing, the retry check below compares against this url.
                item_url = 'https://www.google.com/maps/place/?q={}'.format(keyword)
                print(item_url)

                # shop info
                print('parsing shop info....')
                output, item_url = _crawl_shop_info(driver, item_url)
                print(output)
                if output == 0:
                    # Nothing could be parsed after all retries.
                    _record_failure(progress_table, group, output)
                    continue

                # reviews
                print('parsing reviews....')
                print("parsing reviews.....{}".format(datetime.now()))
                output = _crawl_reviews(driver, output, item_url)

                # photo
                print('parsing photo....')
                output = _crawl_photos(driver, output, item_url)

                output['item_url'] = item_url
                output['keyword'] = keyword
                if output['google_url'] == '':
                    query_name = output['adress_name'].replace('(', '').replace(')', '').replace(' ', '')
                    output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
                output['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")
                print(output)
                save_js_to_db(output, output['fid'])
                print("save_js_to_db......{}".format(datetime.now()))
                progress_table.insert({'id_': group['分店編號'],
                                       'name': output['name'],
                                       'fid': output['fid'],
                                       'check_': 1})
            except TimeoutException:
                traceback.print_exc()
                continue
            except Exception:
                # `except Exception` (not a bare except) lets Ctrl-C stop the run.
                traceback.print_exc()
                _record_failure(progress_table, group, output)
    finally:
        # Always release the browser, even when the batch aborts.
        driver.quit()
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()
|