import json
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from utility.parseutils import element_list, intro_list, week_list, value_check, blank_check
def serive_create(profilepath):
    """Launch a local Chrome driver bound to an existing user profile.

    profilepath: name of the Chrome profile directory (e.g. "Profile 1").
    Returns the live webdriver instance after a short settling pause.
    """
    opts = webdriver.ChromeOptions()
    for flag in ('--disable-web-security', '--allow-running-insecure-content'):
        opts.add_argument(flag)
    opts.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
    opts.add_argument("profile-directory=" + profilepath)

    driver = webdriver.Chrome('./utility/chromedriver_20211103/chromedriver', options=opts)

    # Surface session details so the scrape could be re-attached by hand.
    session_id = driver.session_id
    executor_url = driver.command_executor._url
    print(session_id)
    print(executor_url)

    time.sleep(3)
    return driver
def brower_start():
    """Connect to the remote Selenium hub and return the browser handle."""
    chrome_opts = webdriver.ChromeOptions()
    return webdriver.Remote(
        command_executor='http://192.53.174.202:4444/wd/hub',
        desired_capabilities=chrome_opts.to_capabilities(),
    )
def keyin_keyword(driver, keyword):
    """Type *keyword* into the Maps search box, submit it, and open the
    result-list container in the left-hand pane.
    """
    # Bug fix: find_element_by_* helpers were removed in Selenium 4;
    # use the By locator API, which also works on Selenium 3.
    button = driver.find_element(By.ID, "searchbox")
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)

    element = driver.find_element(By.CLASS_NAME, "V0h1Ob-haAclf")
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(element).click(element).perform()
def open_time(driver):
    """Expand the opening-hours panel of the current shop pane.

    Returns 1 after clicking the panel open; returns 0 when the panel
    text contains '預訂' (a booking widget — nothing to expand).
    """
    # Bug fixes: the original located the exact same node twice for no
    # reason, and used the find_element_by_xpath API removed in Selenium 4.
    element = driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
    if element.text.find('預訂') == -1:
        driver.implicitly_wait(20)
        ActionChains(driver).move_to_element(element).click(element).perform()
        return 1
    return 0
def get_shop_info(driver, output, shop_soup):
    """Fill *output* with the basic shop fields scraped from the detail pane.

    Reads coordinates out of the map URL, then the location / address /
    phone buttons, then every extra field declared in element_list.
    Returns the mutated *output* dict.
    """
    # The Maps URL embeds "@lat,lng,zoom" — split the coordinate pair out.
    coords = driver.current_url.split('@')[1].split(',')
    output['lon'] = coords[1]
    output['lat'] = coords[0]

    location = shop_soup.find('button', {'data-item-id': 'oloc'})['aria-label'].split(' ')
    output['city'] = location[-1]
    output['area'] = location[-2]
    print(location)

    output['addr'] = shop_soup.find('button', {'data-item-id': 'address'})['aria-label'].split(' ')[1]
    output['tel'] = blank_check(
        shop_soup.find('button', {'data-tooltip': '複製電話號碼'})['aria-label'].split(':')[1])
    print(output['addr'], output['tel'])

    # element_list maps an output key to (tag, attrs[, attribute-name]):
    # 3-tuples read an HTML attribute, 2-tuples read the node text.
    for key, spec in element_list.items():
        if len(spec) == 3:
            value = shop_soup.find(spec[0], spec[1])[spec[2]]
        else:
            node = shop_soup.find(spec[0], spec[1])
            value = node.text if node else ''
        output[key] = value_check(key, value)

    return output
def get_intro_info(driver, output):
    """Open the "About" pane and scrape the checked intro attributes.

    For every section key in intro_list, collects the list items that
    carry a check-mark icon and stores them, stringified, under the
    mapped output column. Navigates back when done.
    """
    # find_element_by_xpath was removed in Selenium 4; use By locators.
    element = driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[6]')
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(element).click(element).perform()

    intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
    for key in intro_list:
        section = intro_soup.find('div', {'aria-label': key})
        if section:
            items = section.find_all('li', {'class': 'LQjNnc-p83tee-JNdkSc-ibnC6b'})
            tmp = []
            count = 0
            for item in items:
                # Only items flagged with the check icon are actually offered.
                if item.find('img', {'src': "//www.gstatic.com/images/icons/material/system_gm/2x/check_black_18dp.png"}):
                    tmp.append({
                        'id': count,
                        intro_list[key][1]: blank_check(item.text)
                    })
                    count += 1
            output[intro_list[key][0]] = str(tmp)
        else:
            # Bug fix: keep the column type consistent — the found-branch
            # stores a stringified list, so store '[]' here instead of a
            # raw list object.
            output[intro_list[key][0]] = str([])

    driver.back()
    time.sleep(2)
    return output
def get_time_list(shop_soup, output):
    """Parse the opening-hours table into Places-style period records.

    Sets output['open_now'] ('True'/'False' strings), output['periods']
    (stringified list of open/close records) and output['weekday_text']
    (stringified list of "day: times" strings). Returns *output*.
    """
    open_now = blank_check(
        shop_soup.find('span', {'class': 'LJKBpe-Tswv1b-hour-text'}).text.split('\xa0')[0])
    output['open_now'] = 'False' if open_now in ('永久停業', '暫時關閉') else 'True'

    periods = []
    weekday_text = []
    for tr_ in shop_soup.find_all('tr'):
        # Skip rows whose day cell is blank.
        if tr_.find('div').text.replace(' ', '') == '':
            continue
        week = tr_.find('div').text
        time_list = [blank_check(i.text) for i in tr_.find_all('li')]
        for time_ in time_list:
            if time_ == '24 小時營業':
                periods.append({
                    "open": {"day": week_list[week], "time": 0},
                    "close": {"day": week_list[week], "time": ''},
                })
            elif time_ == '休息':
                periods.append({
                    "open": {"day": week_list[week], "time": ''},
                    "close": {"day": week_list[week], "time": ''},
                })
            else:
                start, end = time_.split('–')
                start_hour = start.split(':')[0]
                end_hour = end.split(':')[0]
                # Bug fix: compare the hours numerically. The original
                # compared them as strings, so e.g. '9' < '10' is False
                # lexicographically and an over-midnight span such as
                # 10:00–09:00 was not rolled to the next day.
                if int(end_hour) < int(start_hour):
                    end_day = week_list[week] + 1
                else:
                    end_day = week_list[week]
                periods.append({
                    "open": {"day": week_list[week], "time": start.replace(':', '')},
                    "close": {"day": end_day, "time": end.replace(':', '')},
                })
        weekday_text.append("{}: {}".format(week, ', '.join(time_list)))

    output['periods'] = str(periods)
    output['weekday_text'] = str(weekday_text)
    return output
def get_reviews(driver, output):
    """Open the review pane and scrape every visible review.

    Stores the parsed review dicts in output['reviews'], navigates
    back, and returns *output*.
    """
    element = driver.find_element(By.CSS_SELECTOR, "button[jsaction='pane.reviewChart.moreReviews']")
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(element).click(element).perform()
    time.sleep(2)

    # Expand photo thumbnails and truncated review texts before parsing.
    for photo_button in driver.find_elements(By.CLASS_NAME, 'ODSEW-ShBeI-xJzy8c-bF1uUb'):
        photo_button.click()
    # Bug fix: the attribute selector was missing its closing ']',
    # which makes find_elements raise an invalid-selector error.
    for more_button in driver.find_elements(By.CSS_SELECTOR, 'button[aria-label="顯示更多"]'):
        more_button.click()

    comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
    reviews = []
    for comment in comment_soup.find_all('div', {'class': 'ODSEW-ShBeI'}):
        comment_a_tag = comment.find_all('a')
        author_name = blank_check(comment_a_tag[1].find('div', class_='ODSEW-ShBeI-title').text)
        profile_photo_url = comment_a_tag[0].find('img')['src']
        rating = blank_check(comment.find('span', {'role': 'img'})['aria-label'].replace('顆星', ''))
        text = comment.find('div', class_='ODSEW-ShBeI-ShBeI-content').text
        created_at = comment.find('span', class_='ODSEW-ShBeI-RgZmSc-date').text

        photos = []
        for btn in comment.find_all('button', class_='ODSEW-ShBeI-xJzy8c'):
            # style holds background-image: url("...") — strip to the bare URL.
            path = btn['style'].split(';')[0].split('url')[1].replace('\"', '').replace('(', '').replace(')', '')
            photos.append(path)

        reviews.append({
            'id': comment.find('a')['href'].split('/')[5],
            'author_name': author_name,
            'profile_photo_url': profile_photo_url,
            'rating': int(rating),
            'text': text,
            'created_at': created_at,
            'photos': photos,
        })

    output['reviews'] = reviews
    driver.back()
    return output
def main():
    """Scrape the first 15 '燒烤' (BBQ) hits around one CSV coordinate.

    Reads row 10 of lat_long_location.csv, searches Google Maps there,
    and dumps the accumulated shop records to result/20211203.json
    after every shop so a mid-run crash loses nothing.
    """
    data = pd.read_csv('lat_long_location.csv', index_col=0)
    tmp = data.iloc[10]
    latitude = tmp['latitude']    # latitude
    longitude = tmp['longitude']  # longitude
    url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)

    # driver = serive_create('Profile 1')
    driver = brower_start()
    driver.get(url)
    keyin_keyword(driver, '燒烤')

    result = []
    for id_ in range(1, 16):
        # find_element_by_xpath was removed in Selenium 4; use By locators.
        element = driver.find_element(
            By.XPATH,
            '//*[@id="sGi9mc-m5SR9c-bottom-pane"]/div/div[1]/div/div/div/div[1]/div[2]/div[{}]'.format(id_))
        driver.implicitly_wait(20)
        ActionChains(driver).move_to_element(element).click(element).perform()

        # Skip booking-widget entries (open_time returns 0 for those).
        time_check = open_time(driver)
        if time_check == 1:
            shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
            output = {
                'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text)
            }
            print(output['name'])

            output = get_shop_info(driver, output, shop_soup)
            output = get_intro_info(driver, output)
            time.sleep(2)
            output = get_time_list(shop_soup, output)
            output = get_reviews(driver, output)
            # Note: the original also built an unused `output_name` local
            # (name with parens stripped); it was dead code and is removed.
            output['google_url'] = 'https://www.google.com.tw/search?q={}+{}'.format(
                output['name'], output['addr'])
            time.sleep(2)
            result.append(output)

        # Re-dump after every iteration (assumed loop-level in the
        # indentation-mangled original) for crash resilience.
        with open('result/20211203.json', 'w') as f:
            json.dump(result, f)
        time.sleep(2)


if __name__ == '__main__':
    main()
|