noodles 3 anni fa
parent
commit
dbaf6ea51e
1 file modificato con 72 aggiunte e 46 eliminazioni
  1. 72 46
      shop_item_list.py

+ 72 - 46
shop_item_list.py

@@ -18,26 +18,47 @@ import time
 import json
 import re
 
def brower_start(port=4444):
    """Create a remote Selenium Chrome session.

    Args:
        port: TCP port of the locally running Selenium server
            (default 4444, the Selenium standalone default).

    Returns:
        A ``selenium.webdriver.Remote`` browser instance connected to
        ``http://127.0.0.1:<port>/wd/hub``.
    """
    options = webdriver.ChromeOptions()
    browser = webdriver.Remote(
        command_executor='http://127.0.0.1:{}/wd/hub'.format(port),
        desired_capabilities=options.to_capabilities()
    )
    return browser
 
 
def page_down_(driver, xpath_css, time_):
    """Scroll the results pane by repeatedly sending PAGE_DOWN.

    Focuses the element matched by *xpath_css* (clicking it first), then
    sends PAGE_DOWN *time_* times with a 0.5s pause between presses so
    lazily-loaded results have time to render.

    Args:
        driver: Selenium WebDriver instance.
        xpath_css: XPath of the scrollable results container.
        time_: number of PAGE_DOWN key presses to send.
    """
    elmts = driver.find_elements_by_xpath(xpath_css)
    if not elmts:
        # Layout changed or results not loaded yet: bail out rather than
        # raising IndexError on elmts[0].
        return
    # Prefer the second match when more than one exists — presumably the
    # first is an overlay element; TODO confirm against the live page.
    elmt = elmts[1] if len(elmts) > 1 else elmts[0]
    ActionChains(driver).move_to_element(elmt).click().perform()
    for _ in range(time_):
        try:
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
        except Exception:
            # Transient WebDriver errors shouldn't abort the scroll loop,
            # but do record them instead of swallowing silently.
            traceback.print_exc()
        time.sleep(0.5)
+
+def get_url_list(driver):
+    # for i in range(5, 43, 2):
+    #     try:
+    #         wait = WebDriverWait(driver, 60)
+    #         wait.until(
+    #             EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)))
+    #         )
+    #         driver.find_element(By.XPATH,'//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)
+    #         time.sleep(0.5)
+    #     except:
+    #         pass
+    page_down_(driver, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]', 8)
+
     url_soup = BeautifulSoup(driver.page_source, 'html.parser')
     url_list = []
     for i in url_soup.find_all('a'):
@@ -46,7 +67,7 @@ def get_url_list(driver):
                 url_list += [[i['href'], i['aria-label']]]
         except:
             pass
-    
+    print(len(url_list))
     return url_list
 
 
@@ -59,42 +80,47 @@ def keyin_keyword(driver, keyword):
 
 def main():
     data = pd.read_csv('lat_long_location.csv', index_col = 0)
-    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
+
+    keyword = '麻辣火鍋'
+    if len(sys.argv) >1:
+        keyword=sys.argv[1]
+    port=4444
+    if len(sys.argv) >2:
+        port=int(sys.argv[2])
+
     print('drvier start...')
-    driver = brower_start()
-
-#    for keyword in ['碗粿','炒麵','肉粽']:
-    for keyword in ['碗粿']:
-
-        for k, row in data.iterrows():
-            try:
-                latitude = row['latitude'] #緯度
-                longitude = row['longitude'] #精度
-                url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
-                driver.get(url)
-                keyin_keyword(driver, keyword)
+    driver = brower_start(port)
+    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
+
+    for k, row in data.iterrows():
+        try:
+            latitude = row['latitude'] #緯度
+            longitude = row['longitude'] #精度
+            url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
+            driver.get(url)
+            keyin_keyword(driver, keyword)
+            
+            for page in range(4):
+                print(keyword, k, row['location'], latitude, longitude, page)
+                url_list = get_url_list(driver)
                 
-                for page in range(4):
-                    print(keyword, k, row['location'], latitude, longitude, page)
-                    url_list = get_url_list(driver)
-                    
-                    shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
-                    for item in url_list:
-                        result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime("%Y/%m/%d %H:%M")]
-                        insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
-                                        .format('shop_item_list', str(tuple(shop_item_list_col)).replace('\'',''), tuple(result))
-
-                        DA.mysql_insert_data(db, insert_sql)
-                    
-                    if page < 2 :
-                        element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
-                        driver.implicitly_wait(30)
-                        ActionChains(driver).move_to_element(element).click(element).perform() 
-            except:
-                error = pd.DataFrame([row])
-                error.to_csv('error_shop_item_list.csv', mode='a', header = False)
-                driver.close()
-                driver = brower_start()
+                shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
+                for item in url_list:
+                    result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime("%Y/%m/%d %H:%M")]
+                    insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
+                                    .format('shop_item_list', str(tuple(shop_item_list_col)).replace('\'',''), tuple(result))
+
+                    DA.mysql_insert_data(db, insert_sql)
+                
+                if page < 2 :
+                    element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
+                    driver.implicitly_wait(30)
+                    ActionChains(driver).move_to_element(element).click(element).perform() 
+        except:
+            error = pd.DataFrame([row])
+            error.to_csv('error_shop_item_list.csv', mode='a', header = False)
+            #driver.close()
+            #driver = brower_start()
 
 
 if __name__ == '__main__':