@@ -1,13 +1,22 @@
+# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
-from utility.parseutils import element_list, intro_list, week_list, value_check, blank_check
+from utility import database_access as DA
+from utility.parseutils import *
+from utility.connect import *
import pandas as pd
import time
import json
+import re
+# import pyautogui as pag
def serive_create(profilepath):
option = webdriver.ChromeOptions()
@@ -37,12 +46,12 @@ def brower_start():
def keyin_keyword(driver, keyword):
button = driver.find_element_by_id("searchbox")
- driver.implicitly_wait(20)
+ driver.implicitly_wait(30)
element = driver.find_element_by_class_name("V0h1Ob-haAclf")
- driver.implicitly_wait(20)
+ driver.implicitly_wait(30)
@@ -67,7 +76,7 @@ def get_shop_info(driver, output, shop_soup):
output['area'] = location[-2]
- output['addr'] = shop_soup.find('button',{'data-item-id':'address'})['aria-label'].split(' ')[1]
+ output['addr'] = shop_soup.find('button',{'data-item-id':'address'})['aria-label'].replace('地址:', '')
output['tel'] = blank_check(shop_soup.find('button',{'data-tooltip':'複製電話號碼'})['aria-label'].split(':')[1])
print(output['addr'], output['tel'])
@@ -93,6 +102,12 @@ def get_intro_info(driver, output):
+ wait = WebDriverWait(driver, 30)
+ item_xpath = "div[aria-label='{}簡介']".format(output['name'])
+ wait.until(
+ EC.element_to_be_clickable((By.CSS_SELECTOR, item_xpath))
+ )
+ time.sleep(1)
intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
for key in intro_list:
@@ -114,7 +129,6 @@ def get_intro_info(driver, output):
output[intro_list[key][0]] = []
- time.sleep(2)
return output
@@ -181,7 +195,12 @@ def get_time_list(shop_soup, output):
def get_reviews(driver, output):
- element = driver.find_element_by_css_selector("button[jsaction='pane.reviewChart.moreReviews']")
+ wait = WebDriverWait(driver, 30)
+ more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
+ wait.until(
+ EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
+ )
+ element = driver.find_element_by_css_selector(more_reviews_css)
@@ -226,52 +245,151 @@ def get_reviews(driver, output):
return output
+# def get_photo(output, shop_soup):
+# shop_photo = {}
+# for i in shop_soup.find('div',{'aria-label':'{}的相片'.format(output['name'])}).find_all('button'):
+# try:
+# if i['aria-label'] == '街景服務和 360 度相片' or i['aria-label'] == '影片':
+# continue
+# shop_photo[i['aria-label']] = i.find('img')['src']
+# except:
+# pass
+# output['shop_photo'] = shop_photo
+# return output
+# Collect thumbnail image URLs from the photo gallery currently shown by `driver`.
+# Takes a Selenium WebDriver and returns a list of URL strings, at most one per
+# anchor with data-photo-index 0-4.
+# NOTE(review): indentation is not visible in this diff, so the nesting described
+# in the comments below is the most plausible reading -- confirm against the file.
+def find_photo_list(driver):
+ # Block for up to 30 s until the anchor with data-photo-index="5" is clickable.
+ wait = WebDriverWait(driver, 30)
+ wait.until(
+ EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[data-photo-index="5"]'))
+ )
+ # Send a DOWN key to each of the anchors 0-5; presumably this scrolls the
+ # gallery so the thumbnails get rendered -- TODO confirm.
+ for photo_id in range(6):
+ driver.find_element(By.CSS_SELECTOR,'a[data-photo-index="{}"]'.format(photo_id)).send_keys(Keys.DOWN)
+ # NOTE(review): unclear whether this pause runs per key press or once after the loop.
+ time.sleep(1)
+ # Parse a snapshot of the rendered page.
+ photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
+ photo_url = []
+ # Only anchors 0-4 are read here although anchors 0-5 were touched above.
+ for photo_id in range(5):
+ # Walk the <div> descendants of the first anchor matching this index.
+ for i in photo_soup.select('a[data-photo-index="{}"]'.format(photo_id))[0].find_all('div'):
+ # NOTE(review): i['style'] presumably raises KeyError for a <div> that has
+ # no style attribute -- confirm and guard if such <div>s can occur.
+ if i['style'].find('width') != -1:
+ sentence = i['style']
+ # Greedy match from 'https:' up to the last double quote in the style text.
+ photo = re.search(r'https:(.*)\"', sentence)
+ # group(0) still ends with the matched quote, which the replace strips.
+ # NOTE(review): photo is None when the style text holds no such URL.
+ photo_url += [photo.group(0).replace('\"','')]
+ break
+ return photo_url
+# Open the shop's photo gallery and store thumbnail URLs for two tabs in `output`.
+# output: dict that already has a 'name' key; driver: Selenium WebDriver.
+# Returns the same dict with 'shop_photo' and 'menu_photo' set to the lists
+# produced by find_photo_list.
+# NOTE(review): indentation is not visible in this diff; the nesting noted below
+# is inferred -- confirm against the file.
+def find_big_photo(output, driver):
+ # The aria-label is '<name>的相片' ("photos of <name>"); clicking it presumably
+ # opens the gallery view.
+ element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
+ ActionChains(driver).move_to_element(element).click(element).perform()
+ # Maps a tab index to the output key it fills; only tabs 0 and 2 are visited.
+ photo_map = {
+ 0: 'shop_photo',
+ 2: 'menu_photo'
+ }
+ for tab_index in [0, 2]:
+ # Wait up to 30 s for the tab button, click it, then harvest its thumbnails.
+ wait = WebDriverWait(driver, 30)
+ wait.until(
+ EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
+ )
+ element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
+ ActionChains(driver).move_to_element(element).click(element).perform()
+ photo_list = find_photo_list(driver)
+ output[photo_map[tab_index]] = photo_list
+ # Step back twice in the browser history, presumably to return to the shop page.
+ # NOTE(review): assumed to run once after both tabs rather than per tab, and
+ # it is unclear whether the sleep follows each back() call -- confirm.
+ for i in range(2):
+ driver.back()
+ time.sleep(1)
+ return output
+def get_url_list(driver):
+    """Return the place links found on the current search-result page.
+
+    Args:
+        driver: Selenium WebDriver that has just run a keyword search.
+
+    Returns:
+        A list of ``[href, aria_label]`` two-item lists, one per ``<a>`` tag
+        whose ``href`` contains ``maps/place``. Anchors missing either
+        attribute are skipped.
+
+    Raises:
+        selenium TimeoutException if the awaited element never becomes
+        clickable, or NoSuchElementException if one of the hard-coded result
+        rows is absent.
+    """
+    # Wait up to 10 s for the element at this hard-coded bottom-pane XPath.
+    wait = WebDriverWait(driver, 10)
+    wait.until(
+        EC.element_to_be_clickable((By.XPATH, '//*[@id="sGi9mc-m5SR9c-bottom-pane"]/div/div[1]/div/div/div/div[1]/div[2]/div[2]'))
+    )
+    # NOTE(review): going back one history step presumably switches to the
+    # side-pane result list used below -- confirm.
+    driver.back()
+    time.sleep(2)
+    # Send DOWN to every second row (div[5], div[7], ... div[41]) of the pane,
+    # presumably so that all result entries get rendered -- TODO confirm.
+    for i in range(5, 43, 2):
+        driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)
+    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
+    url_list = []
+    for anchor in url_soup.find_all('a'):
+        # Explicit attribute checks replace the former bare ``except: pass``,
+        # which also swallowed unrelated errors and KeyboardInterrupt.
+        href = anchor.get('href')
+        label = anchor.get('aria-label')
+        if href is None or label is None:
+            continue
+        if 'maps/place' in href:
+            url_list.append([href, label])
+    return url_list
+# Entry point: runs a keyword search on Google Maps, then scrapes each listed
+# shop page into a dict that is dumped to JSON and passed to the database helper.
+# NOTE(review): indentation is not visible in this diff; the comments below assume
+# the statements from driver.get(...) through break form the loop body -- confirm.
def main():
+ # The search coordinates come from one row of the CSV (row 0 after this change).
data = pd.read_csv('lat_long_location.csv', index_col = 0)
- tmp = data.iloc[10]
+ tmp = data.iloc[0]
+ # The inline notes on the next two lines are Chinese for "latitude" and
+ # "precision"; the latter looks like a typo for "longitude".
latitude = tmp['latitude'] #緯度
longitude = tmp['longitude'] #精度
+ # NOTE(review): url is not referenced again in the lines of main shown here.
url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude,longitude)
# driver = serive_create('Profile 1')
+ # MYSQL_CONFIG and DB_NAME presumably come from the star import of
+ # utility.connect -- verify.
+ db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
+ print('drvier start...')
driver = brower_start()
+ # The search keyword '咖啡' ("coffee") replaces the previous '燒烤' ("barbecue").
+ keyin_keyword(driver, '咖啡')
+ url_list = get_url_list(driver)
- keyin_keyword(driver, '燒烤')
+ # NOTE(review): result is appended to below, but none of the added lines
+ # writes it out; only the per-shop output is saved.
result = []
+ # try:
+ for item_url, name in url_list:
+ print(name, ': ' ,item_url)
+ driver.get(item_url)
+ # Wait up to 120 s for the element labelled "show this week's opening hours"
+ # and click it, presumably so the full schedule is present in the page
+ # source parsed below -- TODO confirm.
+ wait = WebDriverWait(driver, 120)
+ time_css = "span[aria-label='顯示本週營業時間']"
+ wait.until(
+ EC.element_to_be_clickable((By.CSS_SELECTOR, time_css))
+ )
+ element = driver.find_element_by_css_selector(time_css)
+ driver.implicitly_wait(30)
+ ActionChains(driver).move_to_element(element).click(element).perform()
+ time.sleep(1)
+ # Snapshot the page and seed the record with the shop title.
+ shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
+ output = {
+ 'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text)
+ }
+ print(output['name'])
+ # Each helper's return value is reassigned to output in turn.
+ output = get_shop_info(driver, output, shop_soup)
+ output = get_intro_info(driver, output)
+ output = get_time_list(shop_soup, output)
+ output = get_reviews(driver, output)
+ output = find_big_photo(output, driver)
+ # Parentheses are stripped from the name before building the search URL.
+ output_name = output['name'].replace('(','').replace(')', '')
+ output['google_url'] = 'https://www.google.com.tw/search?q={}+{}'.format(output_name, output['addr'])
+ time.sleep(2)
+ result += [output]
+ # One JSON file per shop, named after the aria-label of the result link.
+ # NOTE(review): name goes into the path unsanitised and open() gets no
+ # explicit encoding -- confirm this is safe for the labels encountered.
+ with open('result/20211207_{}.json'.format(name), 'w') as f:
+ json.dump(output, f)
+ DA.data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
+ # NOTE(review): if this break sits at loop-body level, only the first URL is
+ # processed -- looks like a debugging leftover; confirm.
+ break
+ # NOTE(review): the disabled handler below refers to id_, which only exists
+ # in the removed loop.
+ # except:
+ # shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
+ # print("error {}".format(id_))
+ # print(blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text))
- for id_ in range(1, 16):
- element = driver.find_element_by_xpath('//*[@id="sGi9mc-m5SR9c-bottom-pane"]/div/div[1]/div/div/div/div[1]/div[2]/div[{}]'.format(id_))
- driver.implicitly_wait(20)
- ActionChains(driver).move_to_element(element).click(element).perform()
- time_check = open_time(driver)
- if time_check == 1:
- shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
- output = {
- 'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text)
- }
- print(output['name'])
- output = get_shop_info(driver, output, shop_soup)
- # print('intro')
- output = get_intro_info(driver, output)
- time.sleep(2)
- # print('time')
- output = get_time_list(shop_soup, output)
- # print('reviews')
- output = get_reviews(driver, output)
- output_name = output['name'].replace('(','').replace(')', '')
- output['google_url'] = 'https://www.google.com.tw/search?q={}+{}'.format(output['name'],output['addr'])
- time.sleep(2)
- result += [output]
- with open('result/20211203.json', 'w') as f:
- json.dump(result, f)
- time.sleep(2)
+# Run the crawl only when this file is executed as a script.
if __name__ == '__main__':
- main()
+ main()