Wunoodles 3 years ago
parent
commit
d235165ea0
1 changed file with 73 additions and 37 deletions
  1. shop_item_list.py  +73 -37

shop_item_list.py  +73 -37

@@ -14,6 +14,7 @@ from utility.connect import *
 
 from datetime import datetime
 import pandas as pd
+import dataset
 import time
 import json
 import re
@@ -58,7 +59,7 @@ def get_url_list(driver):
     #         time.sleep(0.5)
     #     except:
     #         pass
-    page_down_(driver, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]', 8)
+    page_down_(driver, '//div[@class="TFQHme"]', 8)
 
     url_soup = BeautifulSoup(driver.page_source, 'html.parser')
     url_list = []
@@ -79,47 +80,82 @@ def keyin_keyword(driver, keyword):
     time.sleep(3)
 
 
+def get_crawler_list(db):
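+    """Pick a keyword that still has fewer than 500 scraped rows and return it
+    together with the list of lat/lon locations left to crawl, resuming from the
+    location number saved in progress_list2 for that keyword."""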
+    
+    result = db.query('select keyword, count(*) from shop_item_list group by keyword')
+    result = pd.DataFrame([i for i in result])
+    result.columns = ['keyword', 'count']
+    result = result[result['count'] < 500]
+    keyword = result.sample(1).iloc[0]['keyword']
+    
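+    # resume point: read the last processed location number for this keyword from progress_list2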
+    num=0
+    cursor=db.query('select num from progress_list2 where kw = "'+keyword+'"')
+    for c in cursor:
+        num=c['num']
+        break
+
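+    # fetch the remaining locations (num, place name, lat, lon) starting from the resume point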
+    cursor=db.query('select * from lat_lon_loc where num >= '+str(num))
+    #    cursor=db.query('select * from lat_lon_loc')
+    lst=[]
+    for c in cursor:
+        lst.append({'num':c['num'],'loc':c['loc'],'lat':c['lat'],'lon':c['lon']})
+        
+    return keyword, lst
+    
+    
 def main():
-    data = pd.read_csv('lat_long_location.csv', index_col = 0)
-
-    keyword = '麻辣火鍋'
-    if len(sys.argv) >1:
-        keyword=sys.argv[1]
+#     data = pd.read_csv('lat_long_location.csv', index_col = 0)
+#     keyword = '麻辣火鍋'
     port=4444
-    if len(sys.argv) >2:
-        port=int(sys.argv[2])
+    if len(sys.argv) > 1:
+        port = int(sys.argv[1])
+#     if len(sys.argv) >2:
+#         port=int(sys.argv[2])
 
     print('driver start...')
     driver = brower_start(port)
-    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
-
-    for k, row in data.iterrows():
-        try:
-            latitude = row['latitude'] #緯度
-            longitude = row['longitude'] #精度
-            url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
-            driver.get(url)
-            keyin_keyword(driver, keyword)
-            
-            for page in range(4):
-                print(keyword, k, row['location'], latitude, longitude, page)
-                url_list = get_url_list(driver)
-                
-                shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
-                for item in url_list:
-                    result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime("%Y/%m/%d %H:%M")]
-                    insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
-                                    .format('shop_item_list', str(tuple(shop_item_list_col)).replace('\'',''), tuple(result))
-
-                    DA.mysql_insert_data(db, insert_sql)
-                
-                if page < 2 :
-                    element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
-                    driver.implicitly_wait(30)
-                    ActionChains(driver).move_to_element(element).click(element).perform() 
-        except:
-            error = pd.DataFrame([row])
-            error.to_csv('error_shop_item_list.csv', mode='a', header = False)
+#     db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
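+    # connect through the dataset library; the two table handles below wrap
+    # shop_item_list (scraped results) and progress_list2 (per-keyword crawl progress)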
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
+    table=db['shop_item_list']
+    table2=db['progress_list2']
+    
+    keyword, data = get_crawler_list(db)
+    print(keyword, len(data))
+    
+    for row in data:
+#         try:
+        latitude = row['lat']   # latitude
+        longitude = row['lon']  # longitude
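+        # record crawl progress: store the current location number under this keyword
+        # so the crawler can resume from here after a restart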
+        table2.upsert({'kw': keyword, 'num': row['num']}, ['kw'])
+
+        url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
+        driver.get(url)
+        keyin_keyword(driver, keyword)
+
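+        # walk through up to four pages of search results for this map position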
+        for page in range(4):
+            print(keyword, row['loc'], latitude, longitude, page)
+            url_list = get_url_list(driver)
+
+            shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
+            for item in url_list:
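+                # insert each shop row; a duplicate key raises an exception and is simply skipped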
+                try:
+                    table.insert({'name':item[1],'lon':longitude, 'lat':latitude, 'keyword':keyword, 'item_url':item[0],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
+                except:
+                    print('dup entry')
+
+#                     result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime("%Y/%m/%d %H:%M")]
+#                     insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
+#                                     .format('shop_item_list', str(tuple(shop_item_list_col)).replace('\'',''), tuple(result))
+
+#                     DA.mysql_insert_data(db, insert_sql)
+
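+            # advance to the next page of results by clicking the pagination element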
+            if page < 2 :
+                element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
+                driver.implicitly_wait(30)
+                ActionChains(driver).move_to_element(element).click(element).perform() 
+#         except:
+#             error = pd.DataFrame([row])
+#             error.to_csv('error_shop_item_list.csv', mode='a', header = False)
             #driver.close()
             #driver = brower_start()