noodles 3 lat temu
rodzic
commit
3e72f15efd
1 zmienionych plików z 66 dodań i 41 usunięć
  1. 66 41
      shop_item_list.py

+ 66 - 41
shop_item_list.py

@@ -32,21 +32,33 @@ def brower_start(port):
 
 
 def page_down_(driver, xpath_css, time_):
-    elmts = driver.find_elements_by_xpath(xpath_css)
-    print(elmts)
-    if len(elmts)>1:
-        elmt=elmts[1]
-    else:
-        elmt=elmts[0]
-    actions = ActionChains(driver)
-    actions.move_to_element(elmt).click().perform()
-    for i in range(time_):
-        try:
-            actions = ActionChains(driver)
-            actions.send_keys(Keys.PAGE_DOWN).perform()
-        except:
-            traceback.print_exc()
-        time.sleep(0.5)
+    e = driver.find_element_by_css_selector('span[class="Jl2AFb"]')
+    result_count = e.text.split('-')[1].replace(' 項結果','')
+    print(result_count)
+    if int(result_count) > 5:
+        for i in range(time_):
+            e = driver.find_elements_by_css_selector('div[class="TFQHme"]')
+            action = webdriver.common.action_chains.ActionChains(driver)
+            action.move_to_element_with_offset(e[-1], e[-1].size['width'] + 1 , 0)
+            action.click()
+            action.perform()
+            time.sleep(0.5)
+
+    # elmts = driver.find_elements_by_xpath(xpath_css)
+    # print(elmts)
+    # if len(elmts)>1:
+    #     elmt=elmts[1]
+    # else:
+    #     elmt=elmts[0]
+    # actions = ActionChains(driver)
+    # actions.move_to_element(elmt).click().perform()
+    # for i in range(time_):
+    #     try:
+    #         actions = ActionChains(driver)
+    #         actions.send_keys(Keys.PAGE_DOWN).perform()
+    #     except:
+    #         traceback.print_exc()
+    #     time.sleep(0.5)
 
 
 def get_url_list(driver):
@@ -113,59 +125,70 @@ def keyin_keyword(driver, keyword):
     
 
 def get_crawler_list(db):
-    result = db.query('select distinct(keyword) from shop_item_list order by keyword')
+    result = db.query('select * from shop_item_list order by keyword')
     result = pd.DataFrame([i for i in result])
-    progress = db.query('select distinct(kw) from progress_list2')
+    result = result[~result.keyword.str.contains('項')]
+
+    progress = db.query('select distinct(kw) from progress_list2 where num < 367')
     progress = pd.DataFrame([i for i in progress])
 
     if len(progress) != 0:
-        keyword = result[~result['keyword'].isin(progress.kw.to_list())].iloc[0].values[0]
+        keyword = result[~result['keyword'].isin(progress.kw.to_list())].iloc[0]['keyword']
     else:
-        keyword = result.iloc[0].values[0]
+        keyword = result.iloc[0]['keyword']
         
     return keyword
 
 
+def get_lon_lat_list(db, keyword):
+    num=0
+    cursor=db.query('select num from progress_list where kw = "'+keyword+'"')
+    for c in cursor:
+        num=c['num']
+        break
+
+    cursor=db.query('select * from lat_lon_loc where num >= '+str(num))
+
+    lst=[]
+    for c in cursor:
+        lst.append({'num':c['num'],'loc':c['loc'],'lat':c['lat'],'lon':c['lon']})
+
+    return lst
+
+
 def main():
-#     data = pd.read_csv('lat_long_location.csv', index_col = 0)
-#     keyword = '麻辣火鍋'
-    lon_lat = [[121.567,25.038], [121.567,25.046], [121.543,25.046], [121.543,25.038]]
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
+    table = db['shop_item_list3']
+    table2 = db['progress_list2']
+
     port=4447
     if len(sys.argv) > 1 :
         port=int(sys.argv[1])
         print('restart docker p{}'.format(port))
-        os.system('sudo docker container restart pp'+str(port))
+        os.system('sudo docker container restart p'+str(port))
         time.sleep(8)
 
-#     if len(sys.argv) >2:
-#         port=int(sys.argv[2])
     print('drvier start...')
     driver = brower_start(port)
-    #     db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
-    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
-    table=db['shop_item_list3']
-    table2=db['progress_list2']
+    
 
-    for i in range(20):
+    for i in range(10):
         try:
             keyword  = get_crawler_list(db)
             print(keyword)
+            lst = get_lon_lat_list(db, keyword)
+            print(keyword, len(lst))
 
-            c = 0
-            for row in lon_lat:
-                c += 1
-                # latitude = row['lat'] #緯度
-                # longitude = row['lon'] #精度
-                latitude = row[1] #緯度
-                longitude = row[0] #精度
-                # table2.upsert({'kw':keyword,'num':row['num']},['kw'])
-                table2.insert({'kw':keyword,'num':c})
+            for r in lst:
+                latitude = r['lat'] #緯度
+                longitude = r['lon'] #精度
+                table2.upsert({'kw':keyword,'num':r['num']},['kw'])
 
                 url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
                 driver.get(url)
                 keyin_keyword(driver, keyword)
                 failcnt = 0
-                for page in range(5):
+                for page in range(10):
                     print(keyword, latitude, longitude, page)
                     url_list = get_url_list(driver)
                     duplicate = 0
@@ -173,7 +196,7 @@ def main():
                     for item in url_list:
                         try:
                             table.insert({'name':item[1],'lon':longitude, 'lat':latitude, \
-                                        'keyword':keyword, 'item_url':item[0],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
+                                          'keyword':keyword, 'item_url':item[0],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
                         except:
                             duplicate += 1
                     print(len(url_list), duplicate)
@@ -185,6 +208,8 @@ def main():
 
                     if page < 2 :
                         element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
+                        if element.get_attribute('disabled'):
+                            break
                         driver.implicitly_wait(30)
                         ActionChains(driver).move_to_element(element).click(element).perform() 
         except: