noodles 3 years ago
parent commit b5070a7013
2 changed files with 50 additions and 55 deletions
  1. run.py +49 −54
  2. utility/parseutils.py +1 −1

+ 49 - 54
run.py

@@ -381,64 +381,59 @@ def time_click(driver):
 
 
 def main():
-    data = pd.read_csv('lat_long_location.csv', index_col = 0)
-    tmp = data.iloc[0]
-    latitude = tmp['latitude'] # latitude
-    longitude = tmp['longitude'] # longitude
-
-    url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude,longitude)
     # driver = serive_create('Profile 1')
-
     db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
+    url_list = pd.read_csv('result/shop_item_list_20211210.csv', index_col=0)
     print('driver start...')
     driver = brower_start()
-    driver.get(url)
-    # keyin_keyword(driver, '咖啡')
-    # url_list = get_url_list(driver)
-    url_list = pd.read_csv('result/shop_item_list_20211210.csv', index_col=0)
-
-    # try:
-    for key, row in url_list.iterrows():
-        name = row['name']
-        item_url = row['item_url']
-        print(key, name, ': ' ,item_url)
-        driver.get(item_url)
-        for i in range(4, 26, 2):
-            element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[{}]'.format(i))
-            actions = ActionChains(driver)
-            actions.move_to_element(element).perform()
-        print('start...')
-        time_status = time_click(driver)
-        time.sleep(1)
-        shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
-
-        output = {
-            'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text)
-        }
-        print(output['name'])
-
-        output = get_shop_info(driver, output, shop_soup)
-
-        output = get_intro_info(driver, output)
-
-        output = get_time_list(shop_soup, output)
-
-        output = get_reviews(driver, output)
-
-        output = find_big_photo(output, driver)
-
-        output_name = output['name'].replace('(','').replace(')', '')
-        query_name = '{}+{}'.format(output_name, output['addr'])
-        query_name = query_name.replace(' ','')
-        output['query_name'] = query_name
-        output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
-        time.sleep(1)
-        data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
-
-    # except:
-        # shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
-        # print("error {}".format(id_))
-        # print(blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text))
+    
+    try:
+        for key, row in url_list.iterrows():
+            name = row['name']
+            item_url = row['item_url']
+            result = DA.mysql_select_data(db, 'select item_url from shop_list where item_url="{}"'.format(item_url))
+            if len(result) != 0: continue
+            print(key, name, ': ' ,item_url)
+
+            driver.get(item_url)
+            for i in range(4, 26, 2):
+                element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[{}]'.format(i))
+                actions = ActionChains(driver)
+                actions.move_to_element(element).perform()
+                time.sleep(0.5)
+            print('start...')
+            time_status = time_click(driver)
+            time.sleep(1)
+            shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
+
+            output = {
+                'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text)
+            }
+            print(output['name'])
+
+            output = get_shop_info(driver, output, shop_soup)
+
+            output = get_intro_info(driver, output)
+
+            output = get_time_list(shop_soup, output)
+
+            output = get_reviews(driver, output)
+
+            output = find_big_photo(output, driver)
+
+            output_name = output['name'].replace('(','').replace(')', '')
+            query_name = '{}+{}'.format(output_name, output['addr'])
+            query_name = query_name.replace(' ','')
+            output['item_url'] = item_url
+            output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
+            time.sleep(1)
+            data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
+
+    except:
+        error = pd.DataFrame([row])
+        error.to_csv('error.csv', mode='a', header = False)
+        driver.close()
+        driver = brower_start()
    
 
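Review note: the new loop skips shops that are already stored by selecting on item_url before crawling. The select is built with str.format, which breaks if a URL ever contains a quote character. A minimal sketch of the same check with a parameterized query, assuming db is a PyMySQL-style connection (already_crawled is a hypothetical helper; DA.mysql_select_data's internals are not shown in this commit):

def already_crawled(db, item_url):
    # Same duplicate check as the diff, but the driver escapes item_url
    # itself, so quotes in the URL cannot break the SQL.
    with db.cursor() as cursor:
        cursor.execute('SELECT item_url FROM shop_list WHERE item_url = %s',
                       (item_url,))
        return cursor.fetchone() is not None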
 
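The hover loop forces Google Maps' lazily loaded pane sections to render before the page source is handed to BeautifulSoup; the commit also adds a 0.5 s pause per hover. Note that find_element_by_xpath was removed in Selenium 4. A sketch of the same loop with the current locator API (XPath indexes and pause taken from the diff; expand_panels is a hypothetical name):

from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
import time

def expand_panels(driver):
    # Hover over every second child of the results pane so lazily rendered
    # sections load before the page source is parsed.
    for i in range(4, 26, 2):
        element = driver.find_element(
            By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[{}]'.format(i))
        ActionChains(driver).move_to_element(element).perform()
        time.sleep(0.5)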

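The new try/except wraps the whole loop, so the first failure logs the offending row to error.csv, restarts the browser, and main() then returns without visiting the remaining URLs. If the intent is to keep crawling after one bad shop page, the same pattern can sit inside the loop; a sketch under that assumption (catching Exception rather than a bare except, so Ctrl-C still interrupts the run; crawl_one_shop is a hypothetical wrapper for the per-shop body above):

for key, row in url_list.iterrows():
    try:
        crawl_one_shop(driver, db, row)  # hypothetical: the per-shop steps from the diff
    except Exception:
        # Log the failed row and recycle the browser, then continue with
        # the next shop instead of ending the run on the first error.
        pd.DataFrame([row]).to_csv('error.csv', mode='a', header=False)
        driver.close()
        driver = brower_start()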
+ 1 - 1
utility/parseutils.py

@@ -8,7 +8,7 @@ SHOP_LIST_TABLE_COL = ['name', 'lon', 'lat', 'city', 'area',
                       'specials', 'barrierlevels', 'items' ,
                       'open_now', 'periods', 'weekday_text','reviews',
                       'shop_photo','menu_photo',
-                      'google_url', 'query_name','crawler_date']
+                      'google_url', 'item_url', 'crawler_date']
 
 element_list = {
     'category': ['button', {'jsaction':'pane.rating.category'}],
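
SHOP_LIST_TABLE_COL now carries item_url instead of query_name, matching the output dict built in run.py (query_name is still computed there, but only feeds google_url). data_select_insert itself is not shown in this commit; a minimal sketch of how such a shared column list typically drives the insert, assuming a PyMySQL-style connection and that keys absent from output should become NULL (insert_row is a hypothetical stand-in):

def insert_row(db, table, columns, output):
    # Build one parameterized INSERT from the shared column list, so the
    # dict keys in run.py and the table schema stay in sync in one place.
    sql = 'INSERT INTO {} ({}) VALUES ({})'.format(
        table, ', '.join(columns), ', '.join(['%s'] * len(columns)))
    with db.cursor() as cursor:
        cursor.execute(sql, [output.get(col) for col in columns])
    db.commit()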