noodles před 3 roky
rodič
revize
80b422a878
změnil 3 soubory, kde provedl 126 přidání a 23 odebrání
  1. 27 10
      run.py
  2. 93 0
      shop_item_list.py
  3. 6 13
      utility/googlemapsearch.sql

+ 27 - 10
run.py

@@ -262,21 +262,28 @@ def get_reviews(driver, output):
 
 
 def find_photo_list(driver):
-    wait = WebDriverWait(driver, 30)
+    wait = WebDriverWait(driver, 60)
     wait.until(
-        EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[data-photo-index="5"]'))
+        EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a'))
     )
-    for photo_id in range(6):
-        driver.find_element(By.CSS_SELECTOR,'a[data-photo-index="{}"]'.format(photo_id)).send_keys(Keys.DOWN)
-        time.sleep(1)
+    count_list = []
+    for i in range(1, 6):
+        try:
+            element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[{}]/div/a'.format(i))
+            count_list += [element.get_attribute('data-photo-index')]
+            actions = ActionChains(driver)
+            actions.move_to_element(element).perform()
+        except:
+            break
         
     photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
     photo_url = []
-    for photo_id in range(5):
+    for photo_id in count_list:
         for i in photo_soup.select('a[data-photo-index="{}"]'.format(photo_id))[0].find_all('div'):
             if i['style'].find('width') != -1:
                 sentence = i['style']
                 photo = re.search(r'https:(.*)\"', sentence)
+                print(sentence)
                 photo_url += [photo.group(0).replace('\"','')]
                 break
     return photo_url
@@ -287,11 +294,21 @@ def find_big_photo(output, driver):
     ActionChains(driver).move_to_element(element).click(element).perform()
     
     photo_map = {
-        1: 'shop_photo',
-        2: 'menu_photo'
+        '全部': 'shop_photo',
+        '菜單': 'menu_photo'
     }
     
-    for tab_index in [1, 2]:
+    tab_dict = {}
+    for tab_index in [0, 1, 2]:
+        photo_name = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)).text
+        if photo_name == '菜單':
+            tab_dict[photo_name] = tab_index
+        elif photo_name == '全部':
+            tab_dict[photo_name] = tab_index
+    print(tab_dict)                            
+    for tab_ in tab_dict:
+        tab_index = tab_dict[tab_]
+        print(tab_index)
         wait = WebDriverWait(driver, 60)
         wait.until(
             EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
@@ -299,7 +316,7 @@ def find_big_photo(output, driver):
         element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
         ActionChains(driver).move_to_element(element).click(element).perform()
         photo_list = find_photo_list(driver)
-        output[photo_map[tab_index]] = str(photo_list)
+        output[photo_map[tab_]] = str(photo_list)
         
     return output
 

+ 93 - 0
shop_item_list.py

@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+from selenium import webdriver
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.common.by import By
+
+from bs4 import BeautifulSoup
+
+from utility import database_access as DA
+from utility.parseutils import *
+from utility.connect import *
+
+from datetime import datetime
+import pandas as pd
+import time
+import json
+import re
+
def brower_start(remote_url='http://192.53.174.202:4444/wd/hub'):
    """Open a Chrome session on a remote Selenium grid.

    Args:
        remote_url: Selenium hub endpoint. Defaults to the original
            hard-coded grid address for backward compatibility.

    Returns:
        A connected ``selenium.webdriver.Remote`` browser instance.
    """
    options = webdriver.ChromeOptions()
    browser = webdriver.Remote(
        command_executor=remote_url,
        desired_capabilities=options.to_capabilities()
    )
    return browser
+
+
def get_url_list(driver):
    """Scroll the Maps result pane and collect place links.

    Sends DOWN keystrokes to every other result slot (divs 5, 7, ..., 41)
    so lazily rendered entries load, then parses the page source for
    anchors pointing at 'maps/place'.

    Args:
        driver: a Selenium WebDriver showing a Maps search-result pane.

    Returns:
        list[list[str]]: ``[href, aria-label]`` pairs for each place link.
    """
    for i in range(5, 43, 2):
        try:
            wait = WebDriverWait(driver, 60)
            wait.until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)))
            )
            driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)
            time.sleep(1)
        except Exception:
            # Best effort: a result slot may be absent or never clickable;
            # keep trying the remaining slots. (Narrowed from bare except.)
            pass
    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for anchor in url_soup.find_all('a'):
        try:
            if anchor['href'].find('maps/place') != -1:
                url_list += [[anchor['href'], anchor['aria-label']]]
        except KeyError:
            # Anchors lacking href/aria-label are not place entries.
            pass

    return url_list
+
+
def keyin_keyword(driver, keyword):
    """Type *keyword* into the Maps search box and submit it.

    Args:
        driver: a Selenium WebDriver on a Google Maps page.
        keyword: search text to enter (e.g. '咖啡').
    """
    # find_element(By.ID, ...) instead of the deprecated Selenium-3
    # find_element_by_id shorthand; matches the style used elsewhere here.
    button = driver.find_element(By.ID, "searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)
+
+
def main():
    """Crawl Google Maps '咖啡' results around each seed coordinate.

    Reads seed points from lat_long_location.csv, searches Maps at each
    point, pages through the result list, and inserts every collected
    shop link into the `shop_item_list` table.
    """
    data = pd.read_csv('lat_long_location.csv', index_col=0)
    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
    print('driver start...')
    driver = brower_start()

    # Loop invariants hoisted: column order must match the VALUES tuple below.
    shop_item_list_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
    keyword = '咖啡'

    for k, row in data.iterrows():
        latitude = row['latitude']    # 緯度 (latitude)
        longitude = row['longitude']  # 精度 (longitude)
        url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
        driver.get(url)

        keyin_keyword(driver, keyword)

        for page in range(4):
            print(k, row['location'], latitude, longitude, page)
            url_list = get_url_list(driver)

            for item in url_list:
                result = [item[1], longitude, latitude, keyword, item[0],
                          datetime.today().strftime("%Y/%m/%d %H:%M")]
                # NOTE(review): SQL built by string interpolation — a shop
                # name containing a quote breaks the statement. Prefer a
                # parameterized insert if DA supports one.
                insert_sql = """INSERT IGNORE INTO {}{} VALUES {}""" \
                    .format('shop_item_list', str(tuple(shop_item_list_col)).replace('\'', ''), tuple(result))
                DA.mysql_insert_data(db, insert_sql)

            # NOTE(review): only two "next page" clicks for four loop passes,
            # so passes 2 and 3 re-scrape the same view — confirm intent.
            if page < 2:
                # Fragile: obfuscated element id from Maps' build output.
                element = driver.find_element(By.ID, 'ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                driver.implicitly_wait(30)
                ActionChains(driver).move_to_element(element).click(element).perform()


if __name__ == '__main__':
    main()

+ 6 - 13
utility/googlemapsearch.sql

@@ -43,21 +43,14 @@ CREATE TABLE `shop_list` (
 
 CREATE TABLE `shop_item_list` (
    `id` int NOT NULL AUTO_INCREMENT,
-   `google_url` VARCHAR(200),
-   `services` VARCHAR(100),
-   `products` VARCHAR(100),
-   `choices` VARCHAR(100),
-   `facilities` VARCHAR(100),
-   `groups` VARCHAR(100),
-   `plans` VARCHAR(100),
-   `payments` VARCHAR(100),
-   `safeties` VARCHAR(100),
-   `specials` VARCHAR(100),
-   `barrierlevels` VARCHAR(100),
-   `items` VARCHAR(100),
+   `name` VARCHAR(100),
+   `lon` DOUBLE,
+   `lat` DOUBLE,
+   `keyword` VARCHAR(20),
+   `item_url` VARCHAR(200),
    `crawler_date` char(20) NOT NULL,
    PRIMARY KEY (`id`),
-   UNIQUE KEY (`google_url`)
+   UNIQUE KEY (`item_url`)
 ) ENGINE=InnoDB DEFAULT CHARSET=utf8;