from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from utility.parseutils import element_list, intro_list, week_list, value_check, blank_check
import pandas as pd
import time
import json


def service_create(profilepath):
    # Launch a local Chrome with an existing user profile so Google Maps
    # loads with that profile's cookies and language settings.
    option = webdriver.ChromeOptions()
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    option.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
    option.add_argument("profile-directory=" + profilepath)

    driver = webdriver.Chrome('./utility/chromedriver_20211103/chromedriver', options=option)
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    time.sleep(3)
    return driver


def browser_start():
    # Connect to a remote Selenium grid instead of a local chromedriver.
    options = webdriver.ChromeOptions()
    browser = webdriver.Remote(
        command_executor='http://192.53.174.202:4444/wd/hub',
        desired_capabilities=options.to_capabilities()
    )
    return browser


def keyin_keyword(driver, keyword):
    # Type the keyword into the Maps search box and submit it.
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)

    # Click the first result entry in the results panel.
    element = driver.find_element_by_class_name("V0h1Ob-haAclf")
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(element).click(element).perform()


def open_time(driver):
    # Open the shop detail pane unless the entry is a booking widget
    # ('預訂' means "reserve"); return 1 when the pane was opened.
    element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
    if element.text.find('預訂') == -1:
        driver.implicitly_wait(20)
        ActionChains(driver).move_to_element(element).click(element).perform()
        return 1
    else:
        return 0


def get_shop_info(driver, output, shop_soup):
    # The map URL looks like .../@<lat>,<lon>,<zoom>z..., so the first
    # segment after '@' is the latitude and the second the longitude.
    current_url_split = driver.current_url.split('@')[1].split(',')
    output['lon'] = current_url_split[1]
    output['lat'] = current_url_split[0]

    location = shop_soup.find('button', {'data-item-id': 'oloc'})['aria-label'].split(' ')
    output['city'] = location[-1]
    output['area'] = location[-2]
    print(location)

    output['addr'] = shop_soup.find('button', {'data-item-id': 'address'})['aria-label'].split(' ')[1]
    # '複製電話號碼' is the tooltip on the "copy phone number" button.
    output['tel'] = blank_check(shop_soup.find('button', {'data-tooltip': '複製電話號碼'})['aria-label'].split(':')[1])
    print(output['addr'], output['tel'])

    for key in element_list:
        element = element_list[key]
        if len(element) == 3:
            # Three items: (tag, attrs, attribute name) -> read an attribute.
            value = shop_soup.find(element[0], element[1])[element[2]]
        else:
            # Two items: (tag, attrs) -> read the tag's text.
            tmp_value = shop_soup.find(element[0], element[1])
            value = tmp_value.text if tmp_value else ''
        output[key] = value_check(key, value)

    return output


def get_intro_info(driver, output):
    # Open the "About" pane and collect the items marked with a check icon.
    element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[6]')
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(element).click(element).perform()

    intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
    for key in intro_list:
        elements = intro_soup.find('div', {'aria-label': key})
        if elements:
            element = elements.find_all('li', {'class': 'LQjNnc-p83tee-JNdkSc-ibnC6b'})
            count = 0
            tmp = []
            for ele in element:
                # Only items rendered with the check icon are offered amenities.
                if ele.find('img', {'src': "//www.gstatic.com/images/icons/material/system_gm/2x/check_black_18dp.png"}):
                    tmp += [{
                        'id': count,
                        intro_list[key][1]: blank_check(ele.text)
                    }]
                    count += 1
            output[intro_list[key][0]] = str(tmp)
        else:
            output[intro_list[key][0]] = str([])

    driver.back()
    time.sleep(2)
    return output
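# The get_time_list() function below serializes opening hours into a
# structure similar to the Google Places API `opening_hours.periods`
# format: each period holds an "open" and a "close" entry carrying a
# weekday index (looked up in week_list) and an 'HHMM' time string.
# A sketch of one element it is expected to emit (illustrative values):
#
#   {"open":  {"day": 1, "time": "1100"},
#    "close": {"day": 1, "time": "2130"}}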
def get_time_list(shop_soup, output):
    # '永久停業' = permanently closed, '暫時關閉' = temporarily closed.
    open_now = blank_check(shop_soup.find('span', {'class': 'LJKBpe-Tswv1b-hour-text'}).text.split('\xa0')[0])
    if open_now == '永久停業' or open_now == '暫時關閉':
        output['open_now'] = 'False'
    else:
        output['open_now'] = 'True'

    periods = []
    weekday_text = []
    for tr_ in shop_soup.find_all('tr'):
        if tr_.find('div').text.replace(' ', '') != '':
            week = tr_.find('div').text
            time_list = [blank_check(i.text) for i in tr_.find_all('li')]
            for time_ in time_list:
                if time_ == '24 小時營業':  # "open 24 hours"
                    periods += [{
                        "open": {"day": week_list[week], "time": '0000'},
                        "close": {"day": week_list[week], "time": ''}
                    }]
                elif time_ == '休息':  # "closed" that day
                    periods += [{
                        "open": {"day": week_list[week], "time": ''},
                        "close": {"day": week_list[week], "time": ''}
                    }]
                else:
                    start, end = time_.split('–')
                    end_hour, end_min = end.split(':')
                    start_hour, start_min = start.split(':')

                    # Closing after midnight rolls over to the next day;
                    # compare hours numerically, not as strings.
                    if int(end_hour) < int(start_hour):
                        end_day = week_list[week] + 1
                    else:
                        end_day = week_list[week]

                    periods += [{
                        "open": {"day": week_list[week], "time": start.replace(':', '')},
                        "close": {"day": end_day, "time": end.replace(':', '')}
                    }]
            weekday_text += ["{}: {}".format(week, ', '.join(time_list))]

    output['periods'] = str(periods)
    output['weekday_text'] = str(weekday_text)
    return output


def get_reviews(driver, output):
    # Open the full review pane.
    element = driver.find_element_by_css_selector("button[jsaction='pane.reviewChart.moreReviews']")
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(element).click(element).perform()
    time.sleep(2)

    # Expand every photo thumbnail and every truncated review
    # ('顯示更多' is the "show more" button).
    all_photo = driver.find_elements_by_class_name('ODSEW-ShBeI-xJzy8c-bF1uUb')
    for ap in all_photo:
        ap.click()
    all_review = driver.find_elements_by_css_selector('button[aria-label="顯示更多"]')
    for ap in all_review:
        ap.click()

    comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
    count = 0
    reviews = []
    for comment in comment_soup.find_all('div', {'class': 'ODSEW-ShBeI'}):
        comment_a_tag = comment.find_all('a')
        author_name = blank_check(comment_a_tag[1].find('div', class_='ODSEW-ShBeI-title').text)
        profile_photo_url = comment_a_tag[0].find('img')['src']
        # The rating aria-label reads like '5 顆星' ("5 stars").
        rating = blank_check(comment.find('span', {'role': 'img'})['aria-label'].replace('顆星', ''))
        text = comment.find('div', class_='ODSEW-ShBeI-ShBeI-content').text
        created_at = comment.find('span', class_='ODSEW-ShBeI-RgZmSc-date').text

        photos = []
        c = 0
        for i in comment.find_all('button', class_='ODSEW-ShBeI-xJzy8c'):
            # The photo URL is embedded in the button's inline background style.
            path = i['style'].split(';')[0].split('url')[1].replace('"', '').replace('(', '').replace(')', '')
            photos += [path]
            c += 1

        reviews += [{
            'id': comment.find('a')['href'].split('/')[5],
            'author_name': author_name,
            'profile_photo_url': profile_photo_url,
            'rating': int(rating),
            'text': text,
            'created_at': created_at,
            'photos': photos
        }]
        count += 1

    output['reviews'] = reviews
    driver.back()
    return output
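# Note: selectors such as 'ODSEW-ShBeI' and 'x3AX1-LfntMc-header-title-title'
# are machine-generated Google Maps class names that change between front-end
# releases, so expect to re-derive them whenever the scraper stops matching.
# One possible hardening step (a sketch using standard Selenium helpers) is
# to replace the implicitly_wait()/sleep() pairs with explicit waits:
#
#   from selenium.webdriver.support.ui import WebDriverWait
#   from selenium.webdriver.support import expected_conditions as EC
#   from selenium.webdriver.common.by import By
#
#   WebDriverWait(driver, 20).until(
#       EC.element_to_be_clickable((By.ID, 'searchbox')))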
def main():
    data = pd.read_csv('lat_long_location.csv', index_col=0)
    tmp = data.iloc[10]
    latitude = tmp['latitude']
    longitude = tmp['longitude']

    url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
    # driver = service_create('Profile 1')
    driver = browser_start()
    driver.get(url)
    keyin_keyword(driver, '燒烤')  # search keyword: "barbecue"

    result = []
    for id_ in range(1, 16):
        element = driver.find_element_by_xpath('//*[@id="sGi9mc-m5SR9c-bottom-pane"]/div/div[1]/div/div/div/div[1]/div[2]/div[{}]'.format(id_))
        driver.implicitly_wait(20)
        ActionChains(driver).move_to_element(element).click(element).perform()

        time_check = open_time(driver)
        if time_check == 1:
            shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
            output = {
                'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text)
            }
            print(output['name'])

            output = get_shop_info(driver, output, shop_soup)
            output = get_intro_info(driver, output)
            time.sleep(2)
            output = get_time_list(shop_soup, output)
            output = get_reviews(driver, output)

            # Strip parentheses from the name before building the search URL.
            output_name = output['name'].replace('(', '').replace(')', '')
            output['google_url'] = 'https://www.google.com.tw/search?q={}+{}'.format(output_name, output['addr'])
            time.sleep(2)

            result += [output]
            # Re-dump the accumulated results after every shop so a crash
            # partway through the loop does not lose earlier entries.
            with open('result/20211203.json', 'w') as f:
                json.dump(result, f)
            time.sleep(2)


if __name__ == '__main__':
    main()
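# The helpers imported from utility.parseutils are not part of this file.
# From the call sites above, the shapes this script assumes are roughly as
# follows (hypothetical illustrations, not the actual module contents):
#
#   element_list: dict mapping an output key to (tag, attrs) to read text,
#       or (tag, attrs, attribute) to read an attribute, e.g.
#       {'rating': ('span', {'class': '...'}, 'aria-label')}
#   intro_list:   dict mapping a pane aria-label to
#       (output column name, per-item key)
#   week_list:    dict mapping a Chinese weekday name to a day index,
#       e.g. {'星期一': 1}
#   value_check(key, value): normalizes a scraped value for the given key
#   blank_check(text): strips blanks/newlines from scraped text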