noodles 3 anni fa
parent
commit
9752735f6e
1 file modificato con 51 aggiunte e 36 eliminazioni
  1. 51 36
      shop_item_list.py

+ 51 - 36
shop_item_list.py

@@ -89,32 +89,47 @@ def keyin_keyword(driver, keyword):
     time.sleep(3)
 
 
-def get_crawler_list(db):
+# def get_crawler_list(db):
     
-    result = db.query('select keyword, count(*) from shop_item_list group by keyword')
-    result = pd.DataFrame([i for i in result])
-    result.columns = ['keyword', 'count']
-    result = result[result['count'] < 100]
-    keyword = result.sample(1).iloc[0]['keyword']
+#     result = db.query('select keyword, count(*) from shop_item_list group by keyword')
+#     result = pd.DataFrame([i for i in result])
+#     result.columns = ['keyword', 'count']
+#     result = result[result['count'] < 100]
+#     keyword = result.sample(1).iloc[0]['keyword']
     
-    num=0
-    cursor=db.query('select num from progress_list2 where kw = "'+keyword+'"')
-    for c in cursor:
-        num=c['num']
-        break
-
-    cursor=db.query('select * from lat_lon_loc where num >= '+str(num))
-    #    cursor=db.query('select * from lat_lon_loc')
-    lst=[]
-    for c in cursor:
-        lst.append({'num':c['num'],'loc':c['loc'],'lat':c['lat'],'lon':c['lon']})
+#     num=0
+#     cursor=db.query('select num from progress_list2 where kw = "'+keyword+'"')
+#     for c in cursor:
+#         num=c['num']
+#         break
+
+#     cursor=db.query('select * from lat_lon_loc where num >= '+str(num))
+#     #    cursor=db.query('select * from lat_lon_loc')
+#     lst=[]
+#     for c in cursor:
+#         lst.append({'num':c['num'],'loc':c['loc'],'lat':c['lat'],'lon':c['lon']})
         
-    return keyword, lst
-    
+#     return keyword, lst
     
+
+def get_crawler_list(db):  # Return the first keyword (in ORDER BY keyword order) of shop_item_list that is not yet listed in progress_list2.
+    result = db.query('select distinct(keyword) from shop_item_list order by keyword')  # every distinct keyword, sorted by the database
+    result = pd.DataFrame([i for i in result])  # materialize the query rows into a DataFrame
+    progress = db.query('select distinct(kw) from progress_list2')  # keywords that already have a progress row
+    progress = pd.DataFrame([i for i in progress])
+
+    if len(progress) != 0:
+        keyword = result[~result['keyword'].isin(progress.kw.to_list())].iloc[0].values[0]  # first keyword absent from progress; NOTE(review): .iloc[0] raises IndexError when none is left -- confirm callers expect that
+    else:
+        keyword = result.iloc[0].values[0]  # no progress rows at all: take the first keyword
+        
+    return keyword
+
+
 def main():
 #     data = pd.read_csv('lat_long_location.csv', index_col = 0)
 #     keyword = '麻辣火鍋'
+    lon_lat = [[121.567,25.038], [121.567,25.046], [121.543,25.046], [121.543,25.038]]
     port=4447
     if len(sys.argv) > 1 :
         port=int(sys.argv[1])
@@ -128,36 +143,36 @@ def main():
     driver = brower_start(port)
 #     db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
     db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
-    table=db['shop_item_list']
+    table=db['shop_item_list2']
     table2=db['progress_list2']
     
-    keyword, data  = get_crawler_list(db)
-    print( keyword, len(data))
-    
-    for row in data:
+    keyword  = get_crawler_list(db)
+    print(keyword)
+
+    c = 0
+    for row in lon_lat:
+        c += 1
         try:
-            latitude = row['lat'] #緯度
-            longitude = row['lon'] #精度
-            table2.upsert({'kw':keyword,'num':row['num']},['kw'])
+            # latitude = row['lat'] #緯度
+            # longitude = row['lon'] #精度
+            latitude = row[1] #緯度
+            longitude = row[0] #精度
+            # table2.upsert({'kw':keyword,'num':row['num']},['kw'])
+            table2.insert({'kw':keyword,'num':c})
 
             url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
             driver.get(url)
             keyin_keyword(driver, keyword)
             failcnt = 0
-            for page in range(4):
+            for page in range(5):
                 print(keyword, row['loc'], latitude, longitude, page)
                 url_list = get_url_list(driver)
-                # if url_list == 'EMPTY':
-                #     failcnt+=1
-                #     if failcnt >=2:
-                #         break
-                #     continue
-
                 duplicate = 0
-                shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
+                # shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
                 for item in url_list:
                     try:
-                        table.insert({'name':item[1],'lon':longitude, 'lat':latitude, 'keyword':keyword, 'item_url':item[0],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
+                        table.insert({'name':item[1],'lon':longitude, 'lat':latitude, \
+                                      'keyword':keyword, 'item_url':item[0],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
                     except:
                         duplicate += 1
                 print(len(url_list), duplicate)