noodles 3 years ago
parent
commit
e604aecd14
1 changed file with 93 additions and 69 deletions

run.py  +93 -69

@@ -14,6 +14,7 @@ from utility.parseutils import *
 from utility.connect import *
 
 from datetime import datetime
+import traceback
 import dataset
 import pandas as pd
 import time
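
The `traceback` import backs the broad `except:` handlers introduced throughout this commit: `traceback.print_exc()` logs the full stack trace before the crawler moves on, instead of failing silently. A minimal sketch of the pattern (`risky_step` is an illustrative placeholder):

    import traceback

    def risky_step():
        raise ValueError("boom")

    try:
        risky_step()
    except Exception:
        traceback.print_exc()  # full stack trace to stderr, then carry on
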
@@ -62,7 +63,7 @@ def open_time(driver):
     element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
     if element.text.find('預訂') == -1:
         element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
-        driver.implicitly_wait(20)
+        driver.implicitly_wait(10)
         ActionChains(driver).move_to_element(element).click(element).perform()
         return 1
     else:
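
Worth noting: `driver.implicitly_wait(n)` sets a global timeout for element lookups rather than pausing the script, so lowering it from 20 to 10 only shortens how long later `find_element` calls may block. To wait for this specific element to become clickable, an explicit wait is the usual tool. A sketch, assuming `driver` is the live WebDriver and reusing the XPath from this hunk:

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    # Block up to 10s for this one element instead of changing the global implicit wait.
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')))
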
@@ -77,41 +78,46 @@ def get_shop_info(driver, output, shop_soup):
     location = shop_soup.find('button',{'data-item-id':'oloc'})['aria-label'].split(' ')
     output['city'] = location[-1]
     output['area'] = location[-2]
-
-    output['addr'] = shop_soup.find('button',{'data-item-id':'address'})['aria-label'].replace('地址:', '')
-    output['tel'] = blank_check(shop_soup.find('button',{'data-tooltip':'複製電話號碼'})['aria-label'].split(':')[1])
+    
+    try:
+        output['addr'] = shop_soup.find('button',{'data-item-id':'address'})['aria-label'].replace('地址:', '')
+    except:
+        output['addr'] = ''
+        
+    try:
+        output['tel'] = blank_check(shop_soup.find('button',{'data-tooltip':'複製電話號碼'})['aria-label'].split(':')[1])
+    except:
+        output['tel'] = ''
     print(output['addr'], ', ' ,output['tel'])
 
     for key in element_list:
-        element = element_list[key]
-        if len(element) == 3:
-            value = shop_soup.find(element[0],element[1])[element[2]]
-            
-        else:
-            tmp_value = shop_soup.find(element[0],element[1])
-            if tmp_value:
-                value = tmp_value.text
+        try:
+            element = element_list[key]
+            if len(element) == 3:
+                value = shop_soup.find(element[0],element[1])[element[2]]
+
             else:
-                value = ''
+                tmp_value = shop_soup.find(element[0],element[1])
+                if tmp_value:
+                    value = tmp_value.text
+                else:
+                    value = ''
 
-        output[key] = value_check(key, value)
+            output[key] = value_check(key, value)
+        except:
+            output[key] = ''
 
     return output
 
 
 def get_intro_info(driver, output):
-    element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[6]')
-    driver.implicitly_wait(20)
+    # element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[6]')
+    element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}簡介']".format(output['name']))
+    driver.implicitly_wait(10)
     ActionChains(driver).move_to_element(element).click(element).perform()
     
-    for i in range(5, 35, 3):
-        try:
-            element = driver.find_element(By.XPATH,'//*[@id="pane"]/div/div[1]/div/div/div[2]/div[{}]'.format(i))
-            actions = ActionChains(driver)
-            actions.move_to_element(element).perform()
-        except:
-            break
-    
+    page_down_(driver, "//div[@class='siAUzd-neVct section-scrollbox cYB2Ge-oHo7ed cYB2Ge-ti6hGc']", 3)
+
     intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
     for key in intro_list:
         elements = intro_soup.find('div',{'aria-label':key})
@@ -205,10 +211,11 @@ def get_reviews(driver, output):
         EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
     )
     element = driver.find_element_by_css_selector(more_reviews_css)
-    driver.implicitly_wait(20)
+    driver.implicitly_wait(10)
     ActionChains(driver).move_to_element(element).click(element).perform()
-    time.sleep(1)
-
+    time.sleep(0.5)
+    
+    page_down_(driver, "//div[@class='siAUzd-neVct siAUzd-neVct-H9tDt']", 10)
     all_photo = driver.find_elements_by_class_name('ODSEW-ShBeI-xJzy8c-bF1uUb')
     for ap in all_photo:
         ap.click()
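
Here `page_down_` sends PAGE_DOWN into the reviews scrollbox so Maps lazy-loads more entries before the page source is parsed. Should keyboard events prove flaky, scrolling the container directly with JavaScript is an alternative (a sketch; `container` is assumed to be the same scrollbox WebElement):

    import time

    def scroll_container(driver, container, times, pause=0.5):
        # Scroll the overflow element itself (not the window) to trigger lazy loading.
        for _ in range(times):
            driver.execute_script(
                'arguments[0].scrollTop = arguments[0].scrollHeight;', container)
            time.sleep(pause)
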
@@ -264,31 +271,25 @@ def get_reviews(driver, output):
 
 
 def find_photo_list(driver):
-    time.sleep(2)
+    time.sleep(0.5)
     wait = WebDriverWait(driver, 60)
     wait.until(
         EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a'))
     )
-    count_list = []
-    for i in range(1, 6):
-        try:
-            element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[{}]/div/a'.format(i))
-            count_list += [element.get_attribute('data-photo-index')]
-            actions = ActionChains(driver)
-            actions.move_to_element(element).perform()
-        except:
-            break
-    time.sleep(1)
+    page_down_(driver,'//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a' , 10)
     photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
+
     photo_url = []
-    for photo_id in count_list:
-        for i in photo_soup.select('a[data-photo-index="{}"]'.format(photo_id))[0].find_all('div'):
-            if i['style'].find('width') != -1:
-                sentence = i['style']
+    count = 0
+    for i in photo_soup.find_all('a', class_='mWq4Rd-eEDwDf'):
+        if count > 5: break
+        a_url = i.find('div', class_='mWq4Rd-HiaYvf-CNusmb-gevUs loaded')
+        if a_url:
+            if a_url['style'].find('width') != -1:
+                sentence = a_url['style']
                 photo = re.search(r'https:(.*)\"', sentence)
-                print(sentence)
                 photo_url += [photo.group(0).replace('\"','')]
-                break
+        count += 1
     return photo_url
 
 
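The rewritten loop reads each thumbnail's inline `style`, keeps entries whose style declares a width, and extracts the photo URL with a regex. On a style string shaped like the ones Maps emits, the extraction behaves as follows (the sample string is illustrative):

    import re

    style = 'background-image:url("https://lh5.googleusercontent.com/p/example=w408-h306-k-no"); width: 408px;'
    match = re.search(r'https:(.*)\"', style)
    if match:
        print(match.group(0).replace('"', ''))
        # -> https://lh5.googleusercontent.com/p/example=w408-h306-k-no
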
@@ -305,11 +306,14 @@ def find_big_photo(output, driver):
     
     tab_dict = {}
     for tab_index in [0, 1, 2]:
-        photo_name = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)).text
-        if photo_name == '菜單':
-            tab_dict[photo_name] = tab_index
-        elif photo_name == '全部':
-            tab_dict[photo_name] = tab_index
+        try:
+            photo_name = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)).text
+            if photo_name == '菜單':
+                tab_dict[photo_name] = tab_index
+            elif photo_name == '全部':
+                tab_dict[photo_name] = tab_index
+        except:
+            traceback.print_exc()
     print(tab_dict)                            
     for tab_ in tab_dict:
         tab_index = tab_dict[tab_]
@@ -371,7 +375,7 @@ def time_click(driver):
     try:
         time_css =  "span[aria-label='顯示本週營業時間']"
         element = driver.find_element_by_css_selector(time_css)
-        driver.implicitly_wait(30)
+        driver.implicitly_wait(10)
         ActionChains(driver).move_to_element(element).click(element).perform()
         status = '正常'
 
@@ -392,7 +396,7 @@ def get_not_cralwer_url(keyword):
     
     url_pd = pd.DataFrame(url_list, columns=url_list[0].keys())
     url_pd['item_url_length'] = url_pd.item_url.apply(lambda x: len(x))
-    url_pd = url_pd[url_pd['item_url_length']!=1000]
+    url_pd = url_pd[(url_pd['item_url_length']!=1000) & (url_pd['item_url_length']!=600)]
     url_pd = url_pd[~url_pd['item_url'].isin(shop_item)]
     url_pd = url_pd[~url_pd['item_url'].isin(error_item)]
 
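The widened filter now also drops URLs whose recorded length is 600. In pandas each comparison must be parenthesized, since `&` binds tighter than `!=`; `isin` is an equivalent spelling. A tiny self-contained check:

    import pandas as pd

    df = pd.DataFrame({'item_url_length': [600, 750, 1000]})
    mask = (df['item_url_length'] != 1000) & (df['item_url_length'] != 600)
    print(df[mask])                                       # keeps only the 750 row
    print(df[~df['item_url_length'].isin([600, 1000])])   # same result
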
@@ -426,17 +430,37 @@ def serive_create_linux(profilepath):
 
     return driver
 
+
+def page_down_(driver, xpath_css, time_):
+    elmts = driver.find_elements_by_xpath(xpath_css)
+    print(elmts)
+    if len(elmts)>1:
+        elmt=elmts[1]
+    else:
+        elmt=elmts[0]
+        
+    actions = ActionChains(driver)
+    actions.move_to_element(elmt).click().perform()
+    for i in range(time_):
+        try:
+            actions = ActionChains(driver)
+            actions.send_keys(Keys.PAGE_DOWN).perform()
+        except:
+            traceback.print_exc()
+        time.sleep(0.5)
+
+
 def main():
     keyword = '咖啡'
     db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
     url_pd = get_not_cralwer_url(keyword)
 
     print('driver start...')
-    driver = brower_start()
+    # driver = brower_start()
 
     # driver = serive_create('Profile 1')
-    # profilepath = 'Profile 1'
-    # driver = serive_create_linux(profilepath)
+    profilepath = 'Profile 1'
+    driver = serive_create_linux(profilepath)
     
     for key, row in url_pd.iterrows():
         try:    
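
One robustness note on the new `page_down_` helper: `find_elements_by_xpath` returns an empty list when nothing matches, so `elmts[0]` can raise IndexError. A defensive variant with the same behaviour might look like this (a sketch, not the committed code):

    import time
    from selenium.webdriver.common.action_chains import ActionChains
    from selenium.webdriver.common.keys import Keys

    def page_down_safe(driver, xpath, presses, pause=0.5):
        elmts = driver.find_elements_by_xpath(xpath)
        if not elmts:
            return  # nothing matched; avoid IndexError on elmts[0]
        # Prefer the second match when the pane is duplicated, as page_down_ does.
        elmt = elmts[1] if len(elmts) > 1 else elmts[0]
        ActionChains(driver).move_to_element(elmt).click().perform()
        for _ in range(presses):
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
            time.sleep(pause)
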
@@ -444,30 +468,30 @@ def main():
             item_url = row['item_url']
             print(key, name, ': ' ,item_url)
 
-            driver.get(item_url)
-            for i in range(4, 26, 2):
-                element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[{}]'.format(i))
-                actions = ActionChains(driver)
-                actions.move_to_element(element).perform()
-                time.sleep(0.5)
             print('start...')
+            driver.get(item_url)
+            page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu']", 3)
+            
             time_status = time_click(driver)
-            time.sleep(0.5)
+            time.sleep(1)
             shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
 
             output = {
                 'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text)
             }
             print(output['name'])
-
+            print('get_shop_info')
             output = get_shop_info(driver, output, shop_soup)
-
+            print('get_intro_info')
             output = get_intro_info(driver, output)
-
+            print('get_time_list')
             output = get_time_list(shop_soup, output)
-
-            output = get_reviews(driver, output)
-
+            print('user_ratings_total')
+            if output['user_ratings_total'] == '':
+                output['reviews'] = ''
+            else:
+                output = get_reviews(driver, output)
+            print('find_big_photo')
             output = find_big_photo(output, driver)
 
             output_name = output['name'].replace('(','').replace(')', '')
@@ -482,8 +506,8 @@ def main():
             error_table_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
             data_select_insert(db, 'error_list', error_table_col, row)
             driver.close()
-            driver = brower_start()
-            # driver = serive_create_linux(profilepath)
+            # driver = brower_start()
+            driver = serive_create_linux(profilepath)
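
On the error path the crawler now rebuilds the profiled Linux driver rather than a plain browser. Note that `driver.close()` only closes the current window, while `quit()` ends the whole session, so a teardown helper along these lines may leak fewer Chrome processes (a sketch reusing the module's own `serive_create_linux`):

    def restart_driver(old_driver, profilepath):
        try:
            old_driver.quit()  # quit() ends the session; close() shuts one window
        except Exception:
            pass
        return serive_create_linux(profilepath)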