noodles před 3 roky
rodič
revize
02971c1dfe
Změněn 1 soubor: 58 přidání a 39 odebrání
  1. 58 39
      shop_item_list.py

+ 58 - 39
shop_item_list.py

@@ -5,7 +5,8 @@ from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.common.by import By
-
+import selenium
+import traceback
 from bs4 import BeautifulSoup
 
 from utility import database_access as DA
@@ -18,7 +19,7 @@ import dataset
 import time
 import json
 import re
-import sys
+import sys, os
 
 def brower_start(port):
     options = webdriver.ChromeOptions()
@@ -59,6 +60,14 @@ def get_url_list(driver):
     #         time.sleep(0.5)
     #     except:
     #         pass
+
+    # wait = WebDriverWait(driver, 30)
+    # try:
+    #     wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="ppdPk-Ej1Yeb-LgbsSe-tJiF1e"]')))
+    # except selenium.common.exceptions.TimeoutException:
+    #     traceback.print_exc()
+    #     return "EMPTY"
+
     page_down_(driver, '//div[@class="TFQHme"]', 8)
 
     url_soup = BeautifulSoup(driver.page_source, 'html.parser')
@@ -69,7 +78,7 @@ def get_url_list(driver):
                 url_list += [[i['href'], i['aria-label']]]
         except:
             pass
-    print(len(url_list))
+    # print(len(url_list))
     return url_list
 
 
@@ -85,8 +94,8 @@ def get_crawler_list(db):
     result = db.query('select keyword, count(*) from shop_item_list group by keyword')
     result = pd.DataFrame([i for i in result])
     result.columns = ['keyword', 'count']
-    results = results[results['count'] < 500]
-    keyword = results.sample(1).iloc[0]['keyword']
+    result = result[result['count'] < 100]
+    keyword = result.sample(1).iloc[0]['keyword']
     
     num=0
     cursor=db.query('select num from progress_list2 where kw = "'+keyword+'"')
@@ -106,9 +115,12 @@ def get_crawler_list(db):
 def main():
 #     data = pd.read_csv('lat_long_location.csv', index_col = 0)
 #     keyword = '麻辣火鍋'
-    port=4444
+    port=4447
     if len(sys.argv) > 1 :
-        port=sys.argv[1]
+        port=int(sys.argv[1])
+        print('restart docker p{}'.format(port))
+        os.system('sudo docker container restart p'+str(port))
+        time.sleep(8)
 #     if len(sys.argv) >2:
 #         port=int(sys.argv[2])
 
@@ -119,41 +131,48 @@ def main():
     table=db['shop_item_list']
     table2=db['progress_list2']
     
-    data, keyword = get_crawler_list(db)
+    keyword, data  = get_crawler_list(db)
     print( keyword, len(data))
     
     for row in data:
-#         try:
-        latitude = row['lat'] #緯度
-        longitude = row['lon'] #精度
-        table2.upsert({'kw':keyword,'num':r['num']},['kw'])
-
-        url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
-        driver.get(url)
-        keyin_keyword(driver, keyword)
-
-        for page in range(4):
-            print(keyword, row['loc'], latitude, longitude, page)
-            url_list = get_url_list(driver)
-
-            shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
-            for item in url_list:
-                try:
-                    table.insert({'name':item[1],'lon':longitude, 'lat':latitude, 'keyword':keyword, 'item_url':item[0],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
-                except:
-                    print('dup entry')
-
-#                     result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime("%Y/%m/%d %H:%M")]
-#                     insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
-#                                     .format('shop_item_list', str(tuple(shop_item_list_col)).replace('\'',''), tuple(result))
-
-#                     DA.mysql_insert_data(db, insert_sql)
-
-            if page < 2 :
-                element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
-                driver.implicitly_wait(30)
-                ActionChains(driver).move_to_element(element).click(element).perform() 
-#         except:
+        try:
+            latitude = row['lat'] #緯度
+            longitude = row['lon'] #精度
+            table2.upsert({'kw':keyword,'num':row['num']},['kw'])
+
+            url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
+            driver.get(url)
+            keyin_keyword(driver, keyword)
+            failcnt = 0
+            for page in range(4):
+                print(keyword, row['loc'], latitude, longitude, page)
+                url_list = get_url_list(driver)
+                # if url_list == 'EMPTY':
+                #     failcnt+=1
+                #     if failcnt >=2:
+                #         break
+                #     continue
+
+                duplicate = 0
+                shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
+                for item in url_list:
+                    try:
+                        table.insert({'name':item[1],'lon':longitude, 'lat':latitude, 'keyword':keyword, 'item_url':item[0],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
+                    except:
+                        duplicate += 1
+                print(len(url_list), duplicate)
+    #                     result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime("%Y/%m/%d %H:%M")]
+    #                     insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
+    #                                     .format('shop_item_list', str(tuple(shop_item_list_col)).replace('\'',''), tuple(result))
+
+    #                     DA.mysql_insert_data(db, insert_sql)
+
+                if page < 2 :
+                    element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
+                    driver.implicitly_wait(30)
+                    ActionChains(driver).move_to_element(element).click(element).perform() 
+        except:
+            pass
 #             error = pd.DataFrame([row])
 #             error.to_csv('error_shop_item_list.csv', mode='a', header = False)
             #driver.close()