# run.py

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from utility.parseutils import element_list, intro_list, week_list, value_check, blank_check
import pandas as pd
import time
import json
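
# Note: utility.parseutils is a project-local module that is not shown here.
# Based on how it is used below, it is assumed to provide:
#   element_list - dict mapping output keys to (tag, attrs[, attribute]) lookups
#   intro_list   - dict mapping an "About" section label to (output key, field name)
#   week_list    - dict mapping a weekday label to a day number
#   value_check(key, value) - normalizes a scraped value for the given key
#   blank_check(text)       - strips blanks/whitespace from scraped text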


def service_create(profilepath):
    # Launch a local Chrome with an existing user profile so Google Maps
    # is browsed with that profile's cookies and locale.
    option = webdriver.ChromeOptions()
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    option.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
    option.add_argument("profile-directory=" + profilepath)
    driver = webdriver.Chrome('./utility/chromedriver_20211103/chromedriver', options=option)

    # Print the session details so the running browser can be re-attached later.
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    time.sleep(3)
    return driver
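

# A minimal sketch of why session_id / executor_url are printed above: they
# let a new Remote driver re-attach to the already-running browser, a common
# Selenium 3 debugging trick. This helper is an assumption added for
# illustration; it is not called anywhere in the original script.
def attach_to_session(executor_url, session_id):
    original_execute = webdriver.Remote.execute

    def new_command_execute(self, command, params=None):
        if command == 'newSession':
            # Pretend a new session was created and reuse the existing one.
            return {'success': 0, 'sessionId': session_id, 'value': None}
        return original_execute(self, command, params)

    webdriver.Remote.execute = new_command_execute
    driver = webdriver.Remote(command_executor=executor_url, desired_capabilities={})
    webdriver.Remote.execute = original_execute
    return driver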


def browser_start():
    # Drive a remote Chrome through a Selenium server (Grid or standalone).
    options = webdriver.ChromeOptions()
    browser = webdriver.Remote(
        command_executor='http://192.53.174.202:4444/wd/hub',
        desired_capabilities=options.to_capabilities()
    )
    return browser
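
# The address above is assumed to point at a Selenium server (Grid or
# standalone), e.g. one started with:
#   docker run -d -p 4444:4444 --shm-size=2g selenium/standalone-chrome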


def keyin_keyword(driver, keyword):
    # Type the keyword into the Maps search box and submit it.
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)

    # Click through on the results pane (the class name is an obfuscated
    # Maps class and may change between Maps releases).
    element = driver.find_element_by_class_name("V0h1Ob-haAclf")
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(element).click(element).perform()


def open_time(driver):
    # Open the opening-hours block in the place pane. Some shops put a
    # "預訂" (reserve) button in this slot instead; skip those and report 0.
    element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
    if element.text.find('預訂') == -1:
        element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
        driver.implicitly_wait(20)
        ActionChains(driver).move_to_element(element).click(element).perform()
        return 1
    else:
        return 0


def get_shop_info(driver, output, shop_soup):
    # The Maps URL looks like .../@<lat>,<lon>,<zoom>; pull the coordinates out of it.
    current_url_split = driver.current_url.split('@')[1].split(',')
    output['lon'] = current_url_split[1]
    output['lat'] = current_url_split[0]

    location = shop_soup.find('button', {'data-item-id': 'oloc'})['aria-label'].split(' ')
    output['city'] = location[-1]
    output['area'] = location[-2]
    print(location)

    output['addr'] = shop_soup.find('button', {'data-item-id': 'address'})['aria-label'].split(' ')[1]
    # The "複製電話號碼" (copy phone number) tooltip marks the phone button.
    output['tel'] = blank_check(shop_soup.find('button', {'data-tooltip': '複製電話號碼'})['aria-label'].split(':')[1])
    print(output['addr'], output['tel'])

    # element_list maps each output key to a (tag, attrs[, attribute]) lookup.
    for key in element_list:
        element = element_list[key]
        if len(element) == 3:
            value = shop_soup.find(element[0], element[1])[element[2]]
        else:
            tmp_value = shop_soup.find(element[0], element[1])
            if tmp_value:
                value = tmp_value.text
            else:
                value = ''
        output[key] = value_check(key, value)
    return output


def get_intro_info(driver, output):
    # Open the "About" tab of the place pane and scrape its feature lists.
    element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[6]')
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(element).click(element).perform()
    intro_soup = BeautifulSoup(driver.page_source, 'html.parser')

    # intro_list maps a section's aria-label to (output key, item field name).
    for key in intro_list:
        elements = intro_soup.find('div', {'aria-label': key})
        if elements:
            element = elements.find_all('li', {'class': 'LQjNnc-p83tee-JNdkSc-ibnC6b'})
            count = 0
            tmp = []
            for ele in element:
                # Only keep items marked with the black check icon (i.e. offered).
                if ele.find('img', {'src': "//www.gstatic.com/images/icons/material/system_gm/2x/check_black_18dp.png"}):
                    tmp += [{
                        'id': count,
                        intro_list[key][1]: blank_check(ele.text)
                    }]
                    count += 1
            output[intro_list[key][0]] = str(tmp)
        else:
            output[intro_list[key][0]] = []
    driver.back()
    time.sleep(2)
    return output


def get_time_list(shop_soup, output):
    # "永久停業" = permanently closed, "暫時關閉" = temporarily closed.
    open_now = blank_check(shop_soup.find('span', {'class': 'LJKBpe-Tswv1b-hour-text'}).text.split('\xa0')[0])
    if open_now == '永久停業' or open_now == '暫時關閉':
        output['open_now'] = 'False'
    else:
        output['open_now'] = 'True'

    # Build Google-Places-style opening periods from the hours table.
    periods = []
    weekday_text = []
    for tr_ in shop_soup.find_all('tr'):
        if tr_.find('div').text.replace(' ', '') != '':
            week = tr_.find('div').text
            time_list = [blank_check(i.text) for i in tr_.find_all('li')]
            for time_ in time_list:
                if time_ == '24 小時營業':  # open 24 hours
                    periods += [{
                        "open": {
                            "day": week_list[week],
                            "time": '0000'
                        },
                        "close": {
                            "day": week_list[week],
                            "time": ''
                        }
                    }]
                elif time_ == '休息':  # closed that day
                    periods += [{
                        "open": {
                            "day": week_list[week],
                            "time": ''
                        },
                        "close": {
                            "day": week_list[week],
                            "time": ''
                        }
                    }]
                else:
                    start, end = time_.split('–')
                    end_hour, end_min = end.split(':')
                    start_hour, start_min = start.split(':')
                    # An interval that ends "earlier" than it starts crosses midnight.
                    if int(end_hour) < int(start_hour):
                        end_day = week_list[week] + 1
                    else:
                        end_day = week_list[week]
                    periods += [{
                        "open": {
                            "day": week_list[week],
                            "time": start.replace(':', '')
                        },
                        "close": {
                            "day": end_day,
                            "time": end.replace(':', '')
                        }
                    }]
            weekday_text += ["{}: {}".format(week, ', '.join(time_list))]
    output['periods'] = str(periods)
    output['weekday_text'] = str(weekday_text)
    return output
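
# For reference, one entry produced by get_time_list looks like (values illustrative):
#   {"open": {"day": 1, "time": "1100"}, "close": {"day": 1, "time": "2200"}}
# mirroring the opening_hours.periods structure of the Google Places API.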


def get_reviews(driver, output):
    # Open the reviews panel.
    element = driver.find_element_by_css_selector("button[jsaction='pane.reviewChart.moreReviews']")
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(element).click(element).perform()
    time.sleep(2)

    # Expand every photo thumbnail and every "顯示更多" (show more) link so
    # the full review text and images end up in the page source.
    all_photo = driver.find_elements_by_class_name('ODSEW-ShBeI-xJzy8c-bF1uUb')
    for ap in all_photo:
        ap.click()
    all_review = driver.find_elements_by_css_selector('button[aria-label="顯示更多"]')
    for ap in all_review:
        ap.click()

    comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
    reviews = []
    for comment in comment_soup.find_all('div', {'class': 'ODSEW-ShBeI'}):
        comment_a_tag = comment.find_all('a')
        author_name = blank_check(comment_a_tag[1].find('div', class_='ODSEW-ShBeI-title').text)
        profile_photo_url = comment_a_tag[0].find('img')['src']
        # The rating label reads like "5 顆星" ("5 stars"); strip the suffix.
        rating = blank_check(comment.find('span', {'role': 'img'})['aria-label'].replace('顆星', ''))
        text = comment.find('div', class_='ODSEW-ShBeI-ShBeI-content').text
        created_at = comment.find('span', class_='ODSEW-ShBeI-RgZmSc-date').text

        # Review photos are inline CSS background images; pull the URL out of
        # the style attribute.
        photos = []
        for i in comment.find_all('button', class_='ODSEW-ShBeI-xJzy8c'):
            path = i['style'].split(';')[0].split('url')[1].replace('\"', '').replace('(', '').replace(')', '')
            photos += [path]

        reviews += [{
            'id': comment.find('a')['href'].split('/')[5],
            'author_name': author_name,
            'profile_photo_url': profile_photo_url,
            'rating': int(rating),
            'text': text,
            'created_at': created_at,
            'photos': photos
        }]
    output['reviews'] = reviews
    driver.back()
    return output


def main():
    data = pd.read_csv('lat_long_location.csv', index_col=0)
    tmp = data.iloc[10]
    latitude = tmp['latitude']    # latitude
    longitude = tmp['longitude']  # longitude
    url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)

    # driver = service_create('Profile 1')
    driver = browser_start()
    driver.get(url)
    keyin_keyword(driver, '燒烤')  # search keyword: 燒烤 (barbecue)

    result = []
    for id_ in range(1, 16):
        # Walk the first 15 entries in the results list.
        element = driver.find_element_by_xpath('//*[@id="sGi9mc-m5SR9c-bottom-pane"]/div/div[1]/div/div/div/div[1]/div[2]/div[{}]'.format(id_))
        driver.implicitly_wait(20)
        ActionChains(driver).move_to_element(element).click(element).perform()

        time_check = open_time(driver)
        if time_check == 1:
            shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
            output = {
                'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text)
            }
            print(output['name'])

            output = get_shop_info(driver, output, shop_soup)
            output = get_intro_info(driver, output)
            time.sleep(2)
            output = get_time_list(shop_soup, output)
            output = get_reviews(driver, output)

            output_name = output['name'].replace('(', '').replace(')', '')  # currently unused
            output['google_url'] = 'https://www.google.com.tw/search?q={}+{}'.format(output['name'], output['addr'])
            time.sleep(2)

            result += [output]
            # Re-write the accumulated results after every shop.
            with open('result/20211203.json', 'w') as f:
                json.dump(result, f)
            time.sleep(2)


if __name__ == '__main__':
    main()
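
# Usage sketch (assumptions: a lat_long_location.csv with latitude/longitude
# columns, an existing result/ directory, and a reachable Selenium server):
#   python run.py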