noodles 3 år sedan
förälder
incheckning
4902969765
4 ändrade filer med 54 tillägg och 49 borttagningar
  1. run.py: +48 −30
  2. utility/database_access.py: +0 −15
  3. utility/googlemapsearch.sql: +2 −1
  4. utility/parseutils.py: +4 −3

+ 48 - 30
run.py

@@ -12,6 +12,7 @@ from utility import database_access as DA
 from utility.parseutils import *
 from utility.connect import *
 
+from datetime import datetime
 import pandas as pd
 import time
 import json
@@ -50,9 +51,9 @@ def keyin_keyword(driver, keyword):
     ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
     time.sleep(3)
 
-    element = driver.find_element_by_class_name("V0h1Ob-haAclf")
-    driver.implicitly_wait(30)
-    ActionChains(driver).move_to_element(element).click(element).perform()
+    # element = driver.find_element_by_class_name("V0h1Ob-haAclf")
+    # driver.implicitly_wait(30)
+    # ActionChains(driver).move_to_element(element).click(element).perform()
 
 
 def open_time(driver):
@@ -74,11 +75,10 @@ def get_shop_info(driver, output, shop_soup):
     location = shop_soup.find('button',{'data-item-id':'oloc'})['aria-label'].split(' ')
     output['city'] = location[-1]
     output['area'] = location[-2]
-    print(location)
 
     output['addr'] = shop_soup.find('button',{'data-item-id':'address'})['aria-label'].replace('地址:', '')
     output['tel'] = blank_check(shop_soup.find('button',{'data-tooltip':'複製電話號碼'})['aria-label'].split(':')[1])
-    print(output['addr'], output['tel'])
+    print(output['addr'], ', ' ,output['tel'])
 
     for key in element_list:
         element = element_list[key]
@@ -101,18 +101,19 @@ def get_intro_info(driver, output):
     element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[6]')
     driver.implicitly_wait(20)
     ActionChains(driver).move_to_element(element).click(element).perform()
-
-    wait = WebDriverWait(driver, 30)
-    item_xpath = "div[aria-label='{}簡介']".format(output['name'])
-    wait.until(
-        EC.element_to_be_clickable((By.CSS_SELECTOR, item_xpath))
-    )
-    time.sleep(1)
+    
+    for i in range(5, 35, 3):
+        try:
+            element = driver.find_element(By.XPATH,'//*[@id="pane"]/div/div[1]/div/div/div[2]/div[{}]'.format(i))
+            actions = ActionChains(driver)
+            actions.move_to_element(element).perform()
+        except:
+            break
+    
     intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
-
+    
     for key in intro_list:
         elements = intro_soup.find('div',{'aria-label':key})
-        # print(elements)
         if elements:
             element = elements.find_all('li',{'class':'LQjNnc-p83tee-JNdkSc-ibnC6b'})
             # print(element)
@@ -125,9 +126,10 @@ def get_intro_info(driver, output):
                         intro_list[key][1]: blank_check(ele.text)
                     }]
                     count += 1
+            print(str(tmp))
             output[intro_list[key][0]] = str(tmp)
         else:
-            output[intro_list[key][0]] = []
+            output[intro_list[key][0]] = '[]'
     driver.back()
     return output
 
@@ -241,7 +243,7 @@ def get_reviews(driver, output):
         }]
         count += 1
 
-    output['reviews'] = reviews
+    output['reviews'] = str(reviews)
     driver.back()
     return output
 
@@ -285,31 +287,29 @@ def find_big_photo(output, driver):
     ActionChains(driver).move_to_element(element).click(element).perform()
     
     photo_map = {
-        0: 'shop_photo',
+        1: 'shop_photo',
         2: 'menu_photo'
     }
     
-    for tab_index in [0, 2]:
-        wait = WebDriverWait(driver, 30)
+    for tab_index in [1, 2]:
+        wait = WebDriverWait(driver, 60)
         wait.until(
             EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
         )
         element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
         ActionChains(driver).move_to_element(element).click(element).perform()
         photo_list = find_photo_list(driver)
-        output[photo_map[tab_index]] = photo_list
+        output[photo_map[tab_index]] = str(photo_list)
         
-    for i in range(2):
-        driver.back()
-        time.sleep(1)
     return output
 
+
 def get_url_list(driver):
-    wait = WebDriverWait(driver, 10)
-    wait.until(
-        EC.element_to_be_clickable((By.XPATH, '//*[@id="sGi9mc-m5SR9c-bottom-pane"]/div/div[1]/div/div/div/div[1]/div[2]/div[2]'))
-    )
-    driver.back()
+    # wait = WebDriverWait(driver, 10)
+    # wait.until(
+    #     EC.element_to_be_clickable((By.XPATH, '//*[@id="sGi9mc-m5SR9c-bottom-pane"]/div/div[1]/div/div/div/div[1]/div[2]/div[2]'))
+    # )
+    # driver.back()
 
     time.sleep(2)
     for i in range(5, 43, 2):
@@ -326,6 +326,21 @@ def get_url_list(driver):
     return url_list
 
 
+def data_select_insert(db, table_name, table_col, data):
+    tmp = []
+    for name_ in table_col:
+        if name_ == 'crawler_date':
+            continue
+        tmp += [data[name_]]
+
+    tmp += [datetime.today().strftime("%Y/%m/%d %H:%M")]
+
+    insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
+                    .format(table_name, str(tuple(table_col)).replace('\'',''), tuple(tmp))
+
+    DA.mysql_insert_data(db, insert_sql)
+
+
 def main():
     data = pd.read_csv('lat_long_location.csv', index_col = 0)
     tmp = data.iloc[0]
@@ -376,13 +391,16 @@ def main():
         output = find_big_photo(output, driver)
 
         output_name = output['name'].replace('(','').replace(')', '')
-        output['google_url'] = 'https://www.google.com.tw/search?q={}+{}'.format(output_name, output['addr'])
+        query_name = '{}+{}'.format(output_name, output['addr'])
+        query_name = query_name.replace(' ','')
+        output['query_name'] = query_name
+        output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
         time.sleep(2)
         result += [output]
         with open('result/20211207_{}.json'.format(name), 'w') as f:
             json.dump(output, f)
 
-        DA.data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
+        data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
         break
     # except:
         # shop_soup = BeautifulSoup(driver.page_source, 'html.parser')

+ 0 - 15
utility/database_access.py

@@ -97,18 +97,3 @@ def mysql_select_data(db, query_sql):
         print ("Error: unable to fetch data")
         
     return results
-
-
-def data_select_insert(db, table_name, table_col, data):
-    tmp = []
-    for name_ in table_col:
-        if name_ == 'crawler_date':
-            continue
-        tmp += [data[name_]]
-
-    tmp += [datetime.today().strftime("%Y/%m/%d %H:%M")]
-
-    insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
-                    .format(table_name, str(tuple(table_col)).replace('\'',''), tuple(tmp))
-
-    DA.mysql_insert_data(db, insert_sql)

+ 2 - 1
utility/googlemapsearch.sql

@@ -34,9 +34,10 @@ CREATE TABLE `shop_list` (
    `menu_photo` JSON,
 
    `google_url` VARCHAR(200),
+   `query_name` VARCHAR(200),
    `crawler_date` char(20) NOT NULL,
    PRIMARY KEY (`id`),
-   UNIQUE KEY (`google_url`)
+   UNIQUE KEY (`query_name`)
 ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
 
 

+ 4 - 3
utility/parseutils.py

@@ -5,9 +5,10 @@ SHOP_LIST_TABLE_COL = ['name', 'lon', 'lat', 'city', 'area',
                       'rating', 'user_ratings_total', 'category', 'price_level', 
                       'addr', 'tel', 'services', 'products', 'choices', 
                       'facilities', 'groups', 'plans', 'payments', 'safeties', 
-                      'specials', 'barrierlevels', 'items', 
-                      'open_now', 'periods', 'weekday_text', 'reviews', 
-                      'google_url', 'crawler_date']
+                      'specials', 'barrierlevels', 'items' ,
+                      'open_now', 'periods', 'weekday_text','reviews',
+                      'shop_photo','menu_photo',
+                      'google_url', 'query_name','crawler_date']
 
 element_list = {
     'category': ['button', {'jsaction':'pane.rating.category'}],