@@ -1,13 +1,22 @@
+# -*- coding: utf-8 -*-
 from selenium import webdriver
 from selenium.webdriver.common.action_chains import ActionChains
 from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.common.by import By
+
 from bs4 import BeautifulSoup

-from utility.parseutils import element_list, intro_list, week_list, value_check, blank_check
+from utility import database_access as DA
+from utility.parseutils import *
+from utility.connect import *

 import pandas as pd
 import time
 import json
+import re
+# import pyautogui as pag

 def serive_create(profilepath):
     option = webdriver.ChromeOptions()
@@ -37,12 +46,12 @@ def brower_start():

 def keyin_keyword(driver, keyword):
     button = driver.find_element_by_id("searchbox")
-    driver.implicitly_wait(20)
+    driver.implicitly_wait(30)
     ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
     time.sleep(3)

     element = driver.find_element_by_class_name("V0h1Ob-haAclf")
-    driver.implicitly_wait(20)
+    driver.implicitly_wait(30)
     ActionChains(driver).move_to_element(element).click(element).perform()


@@ -67,7 +76,7 @@ def get_shop_info(driver, output, shop_soup):
     output['area'] = location[-2]
     print(location)

-    output['addr'] = shop_soup.find('button',{'data-item-id':'address'})['aria-label'].split(' ')[1]
+    output['addr'] = shop_soup.find('button',{'data-item-id':'address'})['aria-label'].replace('地址:', '')
     output['tel'] = blank_check(shop_soup.find('button',{'data-tooltip':'複製電話號碼'})['aria-label'].split(':')[1])
     print(output['addr'], output['tel'])

@@ -93,6 +102,12 @@ def get_intro_info(driver, output):
     driver.implicitly_wait(20)
     ActionChains(driver).move_to_element(element).click(element).perform()

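+    # assumption: the intro panel carries aria-label "<店名>簡介"; wait for it to
+    # be clickable so the rendered page source actually contains the intro content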
+    wait = WebDriverWait(driver, 30)
+    item_css = "div[aria-label='{}簡介']".format(output['name'])
+    wait.until(
+        EC.element_to_be_clickable((By.CSS_SELECTOR, item_css))
+    )
+    time.sleep(1)
     intro_soup = BeautifulSoup(driver.page_source, 'html.parser')

     for key in intro_list:
@@ -114,7 +129,6 @@ def get_intro_info(driver, output):
         else:
             output[intro_list[key][0]] = []
     driver.back()
-    time.sleep(2)
     return output


@@ -181,7 +195,12 @@ def get_time_list(shop_soup, output):


 def get_reviews(driver, output):
-    element = driver.find_element_by_css_selector("button[jsaction='pane.reviewChart.moreReviews']")
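+    # the "more reviews" button's jsaction changed from pane.reviewChart.moreReviews
+    # to pane.rating.moreReviews, so wait for the new selector before clicking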
+    wait = WebDriverWait(driver, 30)
+    more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
+    wait.until(
+        EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
+    )
+    element = driver.find_element(By.CSS_SELECTOR, more_reviews_css)
     driver.implicitly_wait(20)
     ActionChains(driver).move_to_element(element).click(element).perform()
     time.sleep(2)
@@ -226,52 +245,151 @@ def get_reviews(driver, output):
     driver.back()
     return output

+# def get_photo(output, shop_soup):
+#     shop_photo = {}
+#     for i in shop_soup.find('div',{'aria-label':'{}的相片'.format(output['name'])}).find_all('button'):
+#         try:
+#             if i['aria-label'] == '街景服務和 360 度相片' or i['aria-label'] == '影片':
+#                 continue
+
+#             shop_photo[i['aria-label']] = i.find('img')['src']
+#         except:
+#             pass
+#     output['shop_photo'] = shop_photo
+#     return output
+
+
+def find_photo_list(driver):
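+    # nudge each of the first six thumbnails into view so the lazy-loaded images
+    # render, then scrape the photo URLs out of their inline style attributes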
+    wait = WebDriverWait(driver, 30)
+    wait.until(
+        EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[data-photo-index="5"]'))
+    )
+    for photo_id in range(6):
+        driver.find_element(By.CSS_SELECTOR, 'a[data-photo-index="{}"]'.format(photo_id)).send_keys(Keys.DOWN)
+        time.sleep(1)
+
+    photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
+    photo_url = []
+    for photo_id in range(5):
+        for i in photo_soup.select('a[data-photo-index="{}"]'.format(photo_id))[0].find_all('div'):
+            if i['style'].find('width') != -1:
+                sentence = i['style']
+                photo = re.search(r'https:(.*)\"', sentence)
+                photo_url += [photo.group(0).replace('\"', '')]
+                break
+    return photo_url
+
+
+def find_big_photo(output, driver):
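+    # open the shop photo viewer, then collect photo URLs from two tabs:
+    # data-tab-index 0 (shop photos) and 2 (menu photos)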
+    element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
+    ActionChains(driver).move_to_element(element).click(element).perform()
+
+    photo_map = {
+        0: 'shop_photo',
+        2: 'menu_photo'
+    }
+
+    for tab_index in [0, 2]:
+        wait = WebDriverWait(driver, 30)
+        wait.until(
+            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
+        )
+        element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
+        ActionChains(driver).move_to_element(element).click(element).perform()
+        photo_list = find_photo_list(driver)
+        output[photo_map[tab_index]] = photo_list
+
+    for i in range(2):
+        driver.back()
+        time.sleep(1)
+    return output
+
+def get_url_list(driver):
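+    # scroll the search-result pane so every entry renders, then collect
+    # [url, name] pairs for each link that points at a /maps/place page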
+    wait = WebDriverWait(driver, 10)
+    wait.until(
+        EC.element_to_be_clickable((By.XPATH, '//*[@id="sGi9mc-m5SR9c-bottom-pane"]/div/div[1]/div/div/div/div[1]/div[2]/div[2]'))
+    )
+    driver.back()
+
+    time.sleep(2)
+    for i in range(5, 43, 2):
+        driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)
+    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
+    url_list = []
+    for i in url_soup.find_all('a'):
+        try:
+            if i['href'].find('maps/place') != -1:
+                url_list += [[i['href'], i['aria-label']]]
+        except KeyError:
+            pass
+
+    return url_list


 def main():
     data = pd.read_csv('lat_long_location.csv', index_col = 0)
-    tmp = data.iloc[10]
+    tmp = data.iloc[0]
     latitude = tmp['latitude']  # latitude
     longitude = tmp['longitude']  # longitude

     url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude,longitude)
     # driver = serive_create('Profile 1')
+
+    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
+    print('driver start...')
     driver = brower_start()
     driver.get(url)
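+    # search for coffee shops (咖啡) around the start coordinates and gather
+    # the detail-page URL of every result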
+    keyin_keyword(driver, '咖啡')
+    url_list = get_url_list(driver)

-    keyin_keyword(driver, '燒烤')
     result = []
+    # try:
+    for item_url, name in url_list:
+        print(name, ': ', item_url)
+        driver.get(item_url)
+
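+        # wait (up to 120s) for the 「顯示本週營業時間」 ("show this week's hours")
+        # toggle and expand it so the opening hours end up in the page source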
+        wait = WebDriverWait(driver, 120)
+        time_css = "span[aria-label='顯示本週營業時間']"
+        wait.until(
+            EC.element_to_be_clickable((By.CSS_SELECTOR, time_css))
+        )
+        element = driver.find_element(By.CSS_SELECTOR, time_css)
+        driver.implicitly_wait(30)
+        ActionChains(driver).move_to_element(element).click(element).perform()
+
+        time.sleep(1)
+        shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
+
+        output = {
+            'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text)
+        }
+        print(output['name'])
+
+        output = get_shop_info(driver, output, shop_soup)
+
+        output = get_intro_info(driver, output)
+
+        output = get_time_list(shop_soup, output)
+
+        output = get_reviews(driver, output)
+
+        output = find_big_photo(output, driver)
+
+        output_name = output['name'].replace('(','').replace(')', '')
+        output['google_url'] = 'https://www.google.com.tw/search?q={}+{}'.format(output_name, output['addr'])
+        time.sleep(2)
+        result += [output]
+        with open('result/20211207_{}.json'.format(name), 'w') as f:
+            json.dump(output, f)
+
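+        # persist the record through the project's DB helper; MYSQL_CONFIG, DB_NAME
+        # and the SHOP_LIST_TABLE* constants presumably come from utility.connect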
+        DA.data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
+        break
+    # except:
+    #     shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
+    #     print("error {}".format(id_))
+    #     print(blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text))
+
-    for id_ in range(1, 16):
-
-        element = driver.find_element_by_xpath('//*[@id="sGi9mc-m5SR9c-bottom-pane"]/div/div[1]/div/div/div/div[1]/div[2]/div[{}]'.format(id_))
-        driver.implicitly_wait(20)
-        ActionChains(driver).move_to_element(element).click(element).perform()
-
-        time_check = open_time(driver)
-        if time_check == 1:
-            shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
-
-            output = {
-                'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text)
-            }
-            print(output['name'])
-            output = get_shop_info(driver, output, shop_soup)
-            # print('intro')
-            output = get_intro_info(driver, output)
-            time.sleep(2)
-            # print('time')
-            output = get_time_list(shop_soup, output)
-            # print('reviews')
-            output = get_reviews(driver, output)
-            output_name = output['name'].replace('(','').replace(')', '')
-            output['google_url'] = 'https://www.google.com.tw/search?q={}+{}'.format(output['name'],output['addr'])
-            time.sleep(2)
-            result += [output]
-
-            with open('result/20211203.json', 'w') as f:
-                json.dump(result, f)
-            time.sleep(2)

 if __name__ == '__main__':
-    main()
+    main()