noodles 3 år sedan
förälder
incheckning
9f66f3917e
1 ändrade filer med 573 tillägg och 0 borttagningar
  1. 573 0
      run2.py

+ 573 - 0
run2.py

@@ -0,0 +1,573 @@
+# -*- coding: utf-8 -*-
+from selenium import webdriver
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver.common.by import By
+
+from bs4 import BeautifulSoup
+
+from utility import database_access as DA
+from utility.parseutils import *
+from utility.connect import *
+
+from datetime import datetime
+import traceback
+import dataset
+import pandas as pd
+import time
+import json
+import re
+import sys
+# import pyautogui as pag
+
+def serive_create(profilepath):
+    """Start a local Chrome session that uses an existing Chrome profile.
+
+    Args:
+        profilepath: Profile directory name; it is passed to Chrome as
+            ``profile-directory=<profilepath>``.
+
+    Returns:
+        The started ``webdriver.Chrome`` instance.
+    """
+    chrome_flags = (
+        '--disable-web-security',
+        '--allow-running-insecure-content',
+        "--user-data-dir=C:\\Users\\user\\AppData\\Local\\Google\\Chrome\\User Data",
+        "profile-directory="+profilepath,
+    )
+    option = webdriver.ChromeOptions()
+    for flag in chrome_flags:
+        option.add_argument(flag)
+
+    driver = webdriver.Chrome('./utility/chromedriver_win32/chromedriver', options=option)
+
+    # Print the session id, then the command-executor URL of the new session.
+    print(driver.session_id)
+    print(driver.command_executor._url)
+    time.sleep(3)
+
+    return driver
+
+def brower_start(port):
+    """Open a Chrome session through a WebDriver hub on 127.0.0.1.
+
+    Args:
+        port: Port of the hub; the endpoint used is
+            ``http://127.0.0.1:<port>/wd/hub``.
+
+    Returns:
+        The ``webdriver.Remote`` instance connected to that hub.
+    """
+    hub_url = 'http://127.0.0.1:{}/wd/hub'.format(port)
+    capabilities = webdriver.ChromeOptions().to_capabilities()
+    return webdriver.Remote(
+        command_executor=hub_url,
+        desired_capabilities=capabilities,
+    )
+
+
+def keyin_keyword(driver, keyword):
+    """Type *keyword* into the element with id ``searchbox`` and press Enter.
+
+    The function sets the driver's implicit wait to 30 seconds and sleeps
+    3 seconds after submitting.
+    """
+    search_box = driver.find_element_by_id("searchbox")
+    driver.implicitly_wait(30)
+
+    typing = ActionChains(driver)
+    typing.move_to_element(search_box)
+    typing.send_keys(keyword)
+    typing.send_keys(Keys.RETURN)
+    typing.perform()
+
+    time.sleep(3)
+
+    # element = driver.find_element_by_class_name("V0h1Ob-haAclf")
+    # driver.implicitly_wait(30)
+    # ActionChains(driver).move_to_element(element).click(element).perform()
+
+
+def open_time(driver):
+    """Click a fixed row of the info pane unless its text mentions '預訂'.
+
+    NOTE(review): the row is addressed by a positional XPath; presumably it
+    is the opening-hours row -- confirm against the live page layout.
+
+    Args:
+        driver: Selenium WebDriver showing a place page.
+
+    Returns:
+        1 when the row was clicked, 0 when its text contains '預訂'.
+    """
+    row_xpath = '//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]'
+    element = driver.find_element_by_xpath(row_xpath)
+    if '預訂' in element.text:
+        return 0
+
+    # The element found above is reused; looking it up a second time with the
+    # same XPath only costs an extra WebDriver round trip.
+    driver.implicitly_wait(10)
+    ActionChains(driver).move_to_element(element).click(element).perform()
+    return 1
+
+
+def get_shop_info(driver, output, shop_soup):
+    """Fill *output* with coordinates, location, address, phone and extras.
+
+    Args:
+        driver: Selenium WebDriver; only ``current_url`` is read.
+        output: Dict to update in place (also returned).
+        shop_soup: BeautifulSoup tree of the place page.
+
+    Returns:
+        The same *output* dict with 'lon', 'lat', 'city', 'area', 'addr',
+        'tel' and one entry per key of ``element_list``.
+
+    Raises:
+        IndexError: If the current URL has no '@<lat>,<lon>' part.
+        TypeError: If the button with data-item-id 'oloc' is not on the page.
+    """
+    # The text after '@' in the URL is read as "<lat>,<lon>,...".
+    coordinates = driver.current_url.split('@')[1].split(',')
+    output['lon'] = coordinates[1]
+    output['lat'] = coordinates[0]
+
+    # The last two space-separated words of the label are stored as
+    # area and city.
+    location = shop_soup.find('button',{'data-item-id':'oloc'})['aria-label'].split(' ')
+    output['city'] = location[-1]
+    output['area'] = location[-2]
+
+    try:
+        output['addr'] = shop_soup.find('button',{'data-item-id':'address'})['aria-label'].replace('地址:', '')
+    except (TypeError, KeyError):
+        # Button missing (find() returned None) or it has no aria-label.
+        output['addr'] = ''
+
+    try:
+        output['tel'] = blank_check(shop_soup.find('button',{'data-tooltip':'複製電話號碼'})['aria-label'].split(':')[1])
+    except Exception:
+        # Best effort: a missing or oddly formatted phone button yields ''.
+        # `except Exception` (not a bare except) lets Ctrl-C still stop the run.
+        output['tel'] = ''
+    print(output['addr'], ', ' ,output['tel'])
+
+    for key in element_list:
+        try:
+            element = element_list[key]
+            if len(element) == 3:
+                # Three-item spec: (tag name, attrs, attribute to read).
+                value = shop_soup.find(element[0],element[1])[element[2]]
+            else:
+                # Otherwise the text of the matched tag is used, '' if absent.
+                tmp_value = shop_soup.find(element[0],element[1])
+                value = tmp_value.text if tmp_value else ''
+
+            output[key] = value_check(key, value)
+        except Exception:
+            # Any lookup/validation failure leaves the field empty.
+            output[key] = ''
+
+    return output
+
+
+def get_intro_info(driver, output):
+    """Open the "<name>簡介" panel and store its check-marked items.
+
+    For every key of ``intro_list`` the matching ``div[aria-label=<key>]`` is
+    searched; list items that contain the check-mark image are collected as
+    ``{'id': n, <field>: text}`` dicts. The stringified list is stored under
+    ``intro_list[key][0]``.
+
+    Args:
+        driver: Selenium WebDriver showing the place page.
+        output: Dict to update in place; ``output['name']`` must be set.
+
+    Returns:
+        The same *output* dict. On any failure every intro column is set to
+        the string '[]' and the traceback is printed.
+    """
+    check_icon = "//www.gstatic.com/images/icons/material/system_gm/1x/check_black_18dp.png"
+    try:
+        element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}簡介']".format(output['name']))
+        driver.implicitly_wait(5)
+        ActionChains(driver).move_to_element(element).click(element).perform()
+
+        page_down_(driver, '//*[@id="pane"]/div/div[1]', 3)
+
+        intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
+        for key in intro_list:
+            spec = intro_list[key]
+            section = intro_soup.find('div',{'aria-label':key})
+            if section is None:
+                output[spec[0]] = '[]'
+                continue
+
+            # Keep only the entries that show the check-mark icon.
+            checked = [
+                li for li in section.find_all('li',{'class':'LQjNnc-p83tee-JNdkSc-ibnC6b'})
+                if li.find('img',{'src':check_icon})
+            ]
+            items = [
+                {'id': idx, spec[1]: blank_check(li.text)}
+                for idx, li in enumerate(checked)
+            ]
+            print(str(items))
+            output[spec[0]] = str(items)
+        driver.back()
+        return output
+
+    except Exception:
+        # Best effort: report the failure instead of hiding it, then fall
+        # back to empty lists for every intro column.
+        traceback.print_exc()
+        for key in intro_list:
+            output[intro_list[key][0]] = '[]'
+        return output
+
+
+def get_time_list(shop_soup, output):
+    """Parse the weekly opening-hours table into *output*.
+
+    Sets ``output['open_now']`` ('True'/'False' as strings),
+    ``output['periods']`` (stringified list of open/close dicts) and
+    ``output['weekday_text']`` (stringified list of "<day>: <hours>" lines).
+    When the status text is '永久停業' or '暫時關閉' both lists stay empty.
+
+    Args:
+        shop_soup: BeautifulSoup tree of the place page with the hours
+            table expanded.
+        output: Dict to update in place (also returned).
+
+    Returns:
+        The same *output* dict.
+
+    Raises:
+        AttributeError: If the hours status span is not on the page.
+        ValueError: If an hours entry is not of the form 'H:MM–H:MM'.
+    """
+    periods = []
+    weekday_text = []
+
+    open_now = blank_check(shop_soup.find('span', {'class':'LJKBpe-Tswv1b-hour-text'}).text.split('\xa0')[0])
+    if open_now in ('永久停業', '暫時關閉'):
+        output['open_now'] = 'False'
+    else:
+        output['open_now'] = 'True'
+        for tr_ in shop_soup.find_all('tr'):
+            week = tr_.find('div').text
+            if week.replace(' ','') == '':
+                continue
+
+            time_list = [blank_check(i.text) for i in tr_.find_all('li')]
+            for time_ in time_list:
+                day = week_list[week]
+                if time_ == '24 小時營業':
+                    # Open all day: a string like every other time value
+                    # (the bare literal 0000 is the integer 0).
+                    opens_at, close_day, closes_at = '0000', day, ''
+                elif time_ == '休息':
+                    # Closed that day: empty open and close times.
+                    opens_at, close_day, closes_at = '', day, ''
+                else:
+                    start, end = time_.split('–')
+                    end_hour, _end_min = end.split(':')
+                    start_hour, _start_min = start.split(':')
+
+                    # Compare hours numerically: as strings '2' < '17' is
+                    # False, which would miss closing times past midnight.
+                    # NOTE(review): day + 1 is not wrapped at the end of the
+                    # week; the range of week_list values is not visible here.
+                    close_day = day + 1 if int(end_hour) < int(start_hour) else day
+                    opens_at = start.replace(':','')
+                    closes_at = end.replace(':','')
+
+                periods.append({
+                    "open":{
+                        "day": day,
+                        "time": opens_at
+                    },
+                    "close":{
+                        "day": close_day,
+                        "time": closes_at
+                    }
+                })
+            weekday_text.append("{}: {}".format(week, ', '.join(time_list)))
+    output['periods'] = str(periods)
+    output['weekday_text'] = str(weekday_text)
+
+    return output
+
+
+def get_reviews(driver, output):
+    """Open the review list, expand it and store the parsed reviews.
+
+    Clicks the "more reviews" button, pages down, clicks every review-photo
+    button and every '顯示更多' button, then parses each review block into a
+    dict. The stringified list is stored in ``output['reviews']`` and the
+    browser navigates back.
+
+    Args:
+        driver: Selenium WebDriver showing the place page.
+        output: Dict to update in place (also returned).
+
+    Returns:
+        The same *output* dict with 'reviews' set.
+
+    Raises:
+        selenium TimeoutException: If the "more reviews" button does not
+            become clickable within 30 seconds.
+    """
+    wait = WebDriverWait(driver, 30)
+    more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
+    wait.until(
+        EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
+    )
+    element = driver.find_element_by_css_selector(more_reviews_css)
+    driver.implicitly_wait(10)
+    ActionChains(driver).move_to_element(element).click(element).perform()
+    time.sleep(0.5)
+
+    page_down_(driver, '//div[@class="PPCwl"]',5)
+
+    # Expand photos and truncated texts. find_elements_* returns an empty
+    # list when nothing matches, so no pre-check of the page is needed
+    # (comparing a result list with 0 is always true anyway).
+    for photo_button in driver.find_elements_by_class_name('ODSEW-ShBeI-xJzy8c-bF1uUb'):
+        photo_button.click()
+    for more_button in driver.find_elements_by_css_selector('button[aria-label="顯示更多"]'):
+        more_button.click()
+
+    comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
+
+    reviews = []
+    for comment in comment_soup.find_all('div',{'class':'ODSEW-ShBeI'}):
+        comment_a_tag = comment.find_all('a')
+        author_name = blank_check(comment_a_tag[1].find('div', class_= 'ODSEW-ShBeI-title').text)
+        profile_photo_url = comment_a_tag[0].find('img')['src']
+        rating = blank_check(comment.find('span',{'role':'img'})['aria-label'].replace('顆星', ''))
+        text = comment.find('div', class_='ODSEW-ShBeI-ShBeI-content').text
+        created_at = comment.find('span', class_='ODSEW-ShBeI-RgZmSc-date').text
+
+        # The photo URL is taken from the first declaration of the inline
+        # style: the text after 'url', without quotes and parentheses.
+        photos = [
+            button['style'].split(';')[0].split('url')[1].replace('\"','').replace('(','').replace(')','')
+            for button in comment.find_all('button', class_='ODSEW-ShBeI-xJzy8c')
+        ]
+
+        reviews.append({
+            # Sixth '/'-separated part of the first link's href.
+            'id': comment.find('a')['href'].split('/')[5],
+            'author_name': author_name,
+            'profile_photo_url': profile_photo_url,
+            'rating': int(rating),
+            'text': text,
+            'created_at': created_at,
+            'photos': photos
+        })
+
+    output['reviews'] = str(reviews)
+    driver.back()
+    return output
+
+# def get_photo(output, shop_soup):
+#     shop_photo = {}
+#     for i in shop_soup.find('div',{'aria-label':'{}的相片'.format(output['name'])}).find_all('button'):
+#         try:
+#             if i['aria-label'] == '街景服務和 360 度相片' or i['aria-label'] == '影片':
+#                 continue
+                
+#             shop_photo[i['aria-label']] = i.find('img')['src']
+#         except:
+#             pass
+#     output['shop_photo'] = shop_photo
+#     return output
+
+
+def find_photo_list(driver):
+    """Collect photo URLs from the currently open photo tab.
+
+    Waits for the first photo link to be clickable, pages down to load more
+    tiles, then inspects the first six ``a.mWq4Rd-eEDwDf`` anchors and
+    extracts the https URL from the inline style of their loaded tile.
+
+    Args:
+        driver: Selenium WebDriver showing a photo tab.
+
+    Returns:
+        List of photo URL strings (possibly empty).
+    """
+    first_photo_xpath = '//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a'
+
+    time.sleep(0.5)
+    wait = WebDriverWait(driver, 60)
+    wait.until(
+        EC.element_to_be_clickable((By.XPATH, first_photo_xpath))
+    )
+    page_down_(driver, first_photo_xpath, 10)
+    photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
+
+    photo_url = []
+    # Only the first six anchors are inspected.
+    for anchor in photo_soup.find_all('a', class_='mWq4Rd-eEDwDf')[:6]:
+        tile = anchor.find('div', class_='mWq4Rd-HiaYvf-CNusmb-gevUs loaded')
+        if tile is None:
+            continue
+        # NOTE(review): the former `tile.find('width') != -1` test searched
+        # for a child tag named 'width' and was therefore always true; it is
+        # dropped. Tiles without a style or without an https URL are skipped
+        # instead of raising.
+        photo = re.search(r'https:(.*)\"', tile.get('style', ''))
+        if photo:
+            photo_url.append(photo.group(0).replace('\"',''))
+    return photo_url
+
+
+def find_big_photo(output, driver):
+    """Open the photo viewer and store URLs from the '全部' and '菜單' tabs.
+
+    ``output['shop_photo']`` and ``output['menu_photo']`` start as the string
+    '[]'. The first three tab buttons are checked by their text: '全部' feeds
+    'shop_photo' and '菜單' feeds 'menu_photo'. Each tab that is found is
+    clicked and its photo list is stored as a string.
+
+    Args:
+        output: Dict to update in place (also returned).
+        driver: Selenium WebDriver showing the place page.
+
+    Returns:
+        The same *output* dict.
+    """
+    photo_button_xpath = '//*[@id="pane"]/div/div[1]/div/div/div[1]/div[1]/button'
+    tab_css = "button[data-tab-index='{}']"
+    photo_map = {
+        '全部': 'shop_photo',
+        '菜單': 'menu_photo'
+    }
+
+    WebDriverWait(driver, 60).until(
+        EC.element_to_be_clickable((By.XPATH, photo_button_xpath))
+    )
+    photo_button = driver.find_element(By.XPATH, photo_button_xpath)
+    ActionChains(driver).move_to_element(photo_button).click(photo_button).perform()
+    for column in ('shop_photo', 'menu_photo'):
+        output[column] = '[]'
+
+    # The result is not used; the lookup raises when the tab with index 1 is
+    # absent.
+    driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='1']")
+    photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
+
+    # Map the wanted tab names to the index at which they appear.
+    tab_dict = {}
+    for tab_index in (0, 1, 2):
+        buttons = photo_soup.select(tab_css.format(tab_index))
+        if not buttons:
+            continue
+        photo_name = buttons[0].text
+        if photo_name in photo_map:
+            tab_dict[photo_name] = tab_index
+    print(tab_dict)
+
+    for tab_name, tab_index in tab_dict.items():
+        print(tab_index)
+        selector = tab_css.format(tab_index)
+        WebDriverWait(driver, 60).until(
+            EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
+        )
+        tab_button = driver.find_element(By.CSS_SELECTOR, selector)
+        ActionChains(driver).move_to_element(tab_button).click(tab_button).perform()
+        output[photo_map[tab_name]] = str(find_photo_list(driver))
+
+    return output
+
+
+def get_url_list(driver):
+    """Scroll the result list and collect the place links on the page.
+
+    Sends the DOWN key to every second result anchor (positions 5 to 41) and
+    then gathers every ``<a>`` whose href contains 'maps/place' and which
+    has an aria-label.
+
+    Args:
+        driver: Selenium WebDriver showing a search-result list.
+
+    Returns:
+        List of ``[href, aria_label]`` pairs.
+
+    Raises:
+        selenium NoSuchElementException: If one of the positional result
+            anchors is not on the page.
+    """
+    time.sleep(2)
+    for i in range(5, 43, 2):
+        driver.find_element(By.XPATH,'//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)
+    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
+
+    url_list = []
+    for anchor in url_soup.find_all('a'):
+        # Anchors lacking either attribute are skipped explicitly rather
+        # than through a catch-all except.
+        href = anchor.get('href')
+        label = anchor.get('aria-label')
+        if href is None or label is None:
+            continue
+        if 'maps/place' in href:
+            url_list.append([href, label])
+
+    return url_list
+
+
+def data_select_insert(db, table_name, table_col, data):
+    """Build an ``INSERT IGNORE`` statement from *data* and execute it.
+
+    Values are taken from *data* in the order of *table_col*; 'lon' and 'lat'
+    are converted to float. 'crawler_date' is never read from *data*: the
+    current local time ("%Y/%m/%d %H:%M") is appended as the last value.
+
+    NOTE(review): the statement is assembled from Python tuple reprs instead
+    of bind parameters, so quoting depends on ``repr`` and the values are not
+    escaped. The signature of ``DA.mysql_insert_data`` is not visible here,
+    so this is flagged rather than changed.
+
+    Args:
+        db: Connection handle passed through to ``DA.mysql_insert_data``.
+        table_name: Target table name.
+        table_col: Sequence of column names.
+        data: Mapping (or row) indexable by column name.
+    """
+    values = [
+        float(data[column]) if column in ('lon', 'lat') else data[column]
+        for column in table_col
+        if column != 'crawler_date'
+    ]
+    values.append(datetime.today().strftime("%Y/%m/%d %H:%M"))
+
+    # "('a', 'b')" -> "(a, b)": the column list without quotes.
+    column_clause = str(tuple(table_col)).replace('\'','')
+    insert_sql = """INSERT IGNORE INTO {}{} VALUES {}""".format(
+        table_name, column_clause, tuple(values)
+    )
+
+    DA.mysql_insert_data(db, insert_sql)
+
+
+def time_click(driver):
+    """Expand the weekly opening hours if possible and report the status.
+
+    Args:
+        driver: Selenium WebDriver showing a place page.
+
+    Returns:
+        '正常' when the "顯示本週營業時間" toggle was found and clicked; the
+        notice text when an ``img[aria-label='通知']`` is present; 'error'
+        when only a "查看更詳細的營業時間" button exists; '' when none of
+        these match or when any step fails.
+    """
+    shop_soup_tmp = BeautifulSoup(driver.page_source, 'html.parser')
+    time_css = "span[aria-label='顯示本週營業時間']"
+    try:
+        if shop_soup_tmp.select(time_css):
+            element = driver.find_element_by_css_selector(time_css)
+            driver.implicitly_wait(10)
+            ActionChains(driver).move_to_element(element).click(element).perform()
+            return '正常'
+
+        if shop_soup_tmp.select("img[aria-label='通知']"):
+            # The notice text is returned as the status.
+            return shop_soup_tmp.find('span',class_='LJKBpe-Tswv1b-text aSftqf').text
+
+        if shop_soup_tmp.select('button[aria-label*="查看更詳細的營業時間"]'):
+            return 'error'
+
+        return ''
+    except Exception:
+        # Best effort: any lookup/click failure is reported as "no status".
+        # `except Exception` keeps KeyboardInterrupt/SystemExit propagating.
+        return ''
+
+
+def get_not_cralwer_url(keyword):
+    """Return the ``shop_item_list`` rows of *keyword* that still need crawling.
+
+    Rows are dropped when their ``item_url`` is exactly 1000 or 600
+    characters long, or when the URL already appears in ``shop_list`` or
+    ``error_list`` for the same keyword.
+
+    Args:
+        keyword: Search keyword used to filter all three tables.
+
+    Returns:
+        A pandas DataFrame of the remaining rows with an extra
+        'item_url_length' column; an empty DataFrame when the keyword has no
+        rows at all.
+    """
+    # NOTE(review): the connection string, including credentials, is
+    # hard-coded; consider moving it to configuration.
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
+    table = db['shop_item_list']
+    url_list = list(table.find(keyword=keyword))
+    if not url_list:
+        # Nothing stored for this keyword: avoid indexing url_list[0].
+        print('have {} URL list'.format(0))
+        return pd.DataFrame()
+
+    # Bind the keyword as a parameter instead of formatting it into the SQL.
+    shop_item = [
+        i['item_url']
+        for i in db.query('SELECT item_url FROM shop_list where keyword=:keyword', keyword=keyword)
+    ]
+    error_item = [
+        i['item_url']
+        for i in db.query('SELECT item_url FROM error_list where keyword=:keyword', keyword=keyword)
+    ]
+
+    url_pd = pd.DataFrame(url_list, columns=url_list[0].keys())
+    url_pd['item_url_length'] = url_pd.item_url.apply(len)
+    # NOTE(review): lengths of exactly 1000 and 600 are excluded; presumably
+    # these mark URLs truncated by a column limit -- confirm.
+    url_pd = url_pd[(url_pd['item_url_length']!=1000) & (url_pd['item_url_length']!=600)]
+    url_pd = url_pd[~url_pd['item_url'].isin(shop_item)]
+    url_pd = url_pd[~url_pd['item_url'].isin(error_item)]
+
+    print('have {} URL list'.format(len(url_pd)))
+
+    return url_pd
+
+
+def serive_create_linux(profilepath):
+    """Start a headless, incognito Chrome session with a fixed user-data dir.
+
+    Args:
+        profilepath: Profile directory name; it is passed to Chrome as
+            ``profile-directory=<profilepath>``.
+
+    Returns:
+        The started ``webdriver.Chrome`` instance.
+    """
+    chrome_flags = (
+        '--headless',
+        '--no-sandbox',
+        '--disable-web-security',
+        '--allow-running-insecure-content',
+        '--incognito',
+        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0',
+        "--user-data-dir=/home/noodlesloves/.config/google-chrome/",
+        "profile-directory="+profilepath,
+    )
+    option = webdriver.ChromeOptions()
+    for flag in chrome_flags:
+        option.add_argument(flag)
+
+    driver = webdriver.Chrome('utility/chromedriver', options=option)
+
+    # Print the session id, then the command-executor URL of the new session.
+    print(driver.session_id)
+    print(driver.command_executor._url)
+
+    return driver
+
+
+def page_down_(driver, xpath_css, time_):
+    """Click an element and press PAGE_DOWN *time_* times.
+
+    The element clicked is the second match of *xpath_css* when more than
+    one element matches, otherwise the only match. A 0.5 second pause
+    follows every key press.
+
+    Args:
+        driver: Selenium WebDriver.
+        xpath_css: XPath expression selecting the element to click.
+        time_: Number of PAGE_DOWN key presses.
+
+    Raises:
+        NoSuchElementException: If *xpath_css* matches nothing.
+    """
+    elmts = driver.find_elements_by_xpath(xpath_css)
+    print(elmts)
+    if not elmts:
+        # Fail with a message that names the selector instead of an
+        # IndexError on an empty list.
+        raise NoSuchElementException('no element matches xpath: {}'.format(xpath_css))
+    elmt = elmts[1] if len(elmts) > 1 else elmts[0]
+
+    ActionChains(driver).move_to_element(elmt).click().perform()
+    for _ in range(time_):
+        try:
+            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
+        except Exception:
+            # A failed key press is reported but does not stop the paging.
+            traceback.print_exc()
+        time.sleep(0.5)
+
+
+def _record_error(db, row):
+    """Insert *row* into the ``error_list`` table."""
+    error_table_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
+    data_select_insert(db, 'error_list', error_table_col, row)
+
+
+def _crawl_shop(driver, db, row, keyword, index):
+    """Crawl one place page described by *row* and insert the result."""
+    name = row['name']
+    item_url = row['item_url']
+    print(index, name, ': ' ,item_url)
+
+    print('start...')
+    driver.get(item_url)
+    page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu']", 3)
+
+    time_status = time_click(driver)
+    if time_status == 'error':
+        # Opening hours cannot be expanded on this page: log it and move on.
+        _record_error(db, row)
+        return
+    time.sleep(0.5)
+    shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
+
+    output = {
+        'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text)
+    }
+    print(output['name'])
+
+    print('get_shop_info')
+    output = get_shop_info(driver, output, shop_soup)
+
+    print('get_intro_info')
+    if len(shop_soup.select("div[aria-label='{}簡介']".format(output['name']))) != 0:
+        output = get_intro_info(driver, output)
+    else:
+        for intro_key in intro_list:
+            output[intro_list[intro_key][0]] = '[]'
+
+    print('get_time_list')
+    if time_status == '正常':
+        output = get_time_list(shop_soup, output)
+    else:
+        # NOTE(review): a bool is stored here while get_time_list stores the
+        # strings 'True'/'False'; confirm which one the table expects.
+        output['open_now'] = False
+        output['periods'] = ''
+        output['weekday_text'] = ''
+
+    print('user_ratings_total')
+    if output['user_ratings_total'] == '':
+        output['reviews'] = ''
+    else:
+        output = get_reviews(driver, output)
+
+    print('find_big_photo')
+    output = find_big_photo(output, driver)
+
+    # Build a Google search URL from the name (parentheses removed) and the
+    # address, with all spaces stripped.
+    output_name = output['name'].replace('(','').replace(')', '')
+    query_name = '{}+{}'.format(output_name, output['addr'])
+    query_name = query_name.replace(' ','')
+    output['item_url'] = item_url
+    output['keyword'] = keyword
+    output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
+    data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
+
+
+def main():
+    """Crawl every pending place URL for a fixed list of keywords.
+
+    For each keyword a remote browser session is opened, every pending URL is
+    crawled and inserted, and the session is quit afterwards. A URL whose
+    crawl raises is written to ``error_list`` and the loop continues.
+
+    Command line: the second argument (``sys.argv[2]``) overrides the
+    WebDriver hub port (default 4448).
+    """
+    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
+
+    # NOTE(review): sys.argv[1] (a keyword) is accepted positionally, but it
+    # was always overwritten by the fixed keyword list below. That behavior
+    # is kept; decide whether the argument should replace the list.
+    port = 4448
+    if len(sys.argv) > 2:
+        port = int(sys.argv[2])
+
+    for keyword in ['鳳梨酥','蔥油餅','滷肉飯']:
+        url_pd = get_not_cralwer_url(keyword)
+        print('drvier start {}...'.format(keyword))
+        driver = brower_start(port)
+        try:
+            for index, row in url_pd.iterrows():
+                try:
+                    _crawl_shop(driver, db, row, keyword, index)
+                except Exception:
+                    # Keep the full traceback; the bare message alone rarely
+                    # identifies the failing step.
+                    traceback.print_exc()
+                    _record_error(db, row)
+                    time.sleep(1)
+        finally:
+            # One session is opened per keyword; quit it so sessions do not
+            # pile up on the hub.
+            driver.quit()
+   
+
+
+# Run the crawler only when this file is executed as a script.
+if __name__ == '__main__':
+    main()