@@ -14,6 +14,7 @@ from utility.parseutils import *
 from utility.connect import *

 from datetime import datetime
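+# traceback is imported for page_down_() below, which logs scroll failures
+# instead of letting them abort the crawl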
+import traceback
 import dataset
 import pandas as pd
 import time
@@ -77,24 +78,34 @@ def get_shop_info(driver, output, shop_soup):
     location = shop_soup.find('button',{'data-item-id':'oloc'})['aria-label'].split(' ')
     output['city'] = location[-1]
     output['area'] = location[-2]
-
-    output['addr'] = shop_soup.find('button',{'data-item-id':'address'})['aria-label'].replace('地址:', '')
-    output['tel'] = blank_check(shop_soup.find('button',{'data-tooltip':'複製電話號碼'})['aria-label'].split(':')[1])
+
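+    # Address and phone buttons may be missing on some listings; fall back to
+    # an empty string instead of letting the lookup raise and drop the record.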
+    try:
+        output['addr'] = shop_soup.find('button',{'data-item-id':'address'})['aria-label'].replace('地址:', '')
+    except:
+        output['addr'] = ''
+
+    try:
+        output['tel'] = blank_check(shop_soup.find('button',{'data-tooltip':'複製電話號碼'})['aria-label'].split(':')[1])
+    except:
+        output['tel'] = ''
     print(output['addr'], ', ' ,output['tel'])

     for key in element_list:
-        element = element_list[key]
-        if len(element) == 3:
-            value = shop_soup.find(element[0],element[1])[element[2]]
-
-        else:
-            tmp_value = shop_soup.find(element[0],element[1])
-            if tmp_value:
-                value = tmp_value.text
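+        # Guard each optional field so a single missing selector does not
+        # abort the remaining fields for this shop.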
+        try:
+            element = element_list[key]
+            if len(element) == 3:
+                value = shop_soup.find(element[0],element[1])[element[2]]
+
             else:
-                value = ''
+                tmp_value = shop_soup.find(element[0],element[1])
+                if tmp_value:
+                    value = tmp_value.text
+                else:
+                    value = ''

-        output[key] = value_check(key, value)
+            output[key] = value_check(key, value)
+        except:
+            output[key] = ''

     return output

@@ -104,14 +115,8 @@ def get_intro_info(driver, output):
     driver.implicitly_wait(20)
     ActionChains(driver).move_to_element(element).click(element).perform()

-    for i in range(5, 35, 3):
-        try:
-            element = driver.find_element(By.XPATH,'//*[@id="pane"]/div/div[1]/div/div/div[2]/div[{}]'.format(i))
-            actions = ActionChains(driver)
-            actions.move_to_element(element).perform()
-        except:
-            break
-
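+    # Scroll the details pane with PAGE_DOWN instead of hovering fixed child
+    # indexes; the class string below comes from Google Maps' generated markup
+    # and may change without notice.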
+    page_down_(driver, "//div[@class='siAUzd-neVct section-scrollbox cYB2Ge-oHo7ed cYB2Ge-ti6hGc']", 3)
+
     intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
     for key in intro_list:
         elements = intro_soup.find('div',{'aria-label':key})
@@ -207,8 +212,9 @@ def get_reviews(driver, output):
     element = driver.find_element_by_css_selector(more_reviews_css)
     driver.implicitly_wait(20)
     ActionChains(driver).move_to_element(element).click(element).perform()
-    time.sleep(1)
-
+    time.sleep(0.5)
+
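+    # Page through the reviews pane so lazily loaded reviews are in the DOM
+    # before they are collected below.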
+    page_down_(driver, "//div[@class='siAUzd-neVct siAUzd-neVct-H9tDt']", 10)
     all_photo = driver.find_elements_by_class_name('ODSEW-ShBeI-xJzy8c-bF1uUb')
     for ap in all_photo:
         ap.click()
@@ -264,31 +270,25 @@ def get_reviews(driver, output):


 def find_photo_list(driver):
-    time.sleep(2)
+    time.sleep(0.5)
     wait = WebDriverWait(driver, 60)
     wait.until(
         EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a'))
     )
-    count_list = []
-    for i in range(1, 6):
-        try:
-            element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[{}]/div/a'.format(i))
-            count_list += [element.get_attribute('data-photo-index')]
-            actions = ActionChains(driver)
-            actions.move_to_element(element).perform()
-        except:
-            break
-    time.sleep(1)
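+    # Scroll the photo strip so the thumbnails load before parsing the page.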
+    page_down_(driver,'//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a' , 10)
     photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
+
     photo_url = []
-    for photo_id in count_list:
-        for i in photo_soup.select('a[data-photo-index="{}"]'.format(photo_id))[0].find_all('div'):
-            if i['style'].find('width') != -1:
-                sentence = i['style']
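+    # Pull at most six photo URLs out of the anchors' background-image styles;
+    # these class names come from Google Maps' generated markup and may change.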
+    count = 0
+    for i in photo_soup.find_all('a', class_='mWq4Rd-eEDwDf'):
+        if count > 5: break
+        a_url = i.find('div', class_='mWq4Rd-HiaYvf-CNusmb-gevUs loaded')
+        if a_url:
+            if a_url['style'].find('width') != -1:
+                sentence = a_url['style']
                 photo = re.search(r'https:(.*)\"', sentence)
-                print(sentence)
                 photo_url += [photo.group(0).replace('\"','')]
-                break
+                count += 1
     return photo_url


@@ -426,6 +426,26 @@ def serive_create_linux(profilepath):

     return driver

+
+def page_down_(driver, xpath_css, time_):
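+    # Click the element matched by xpath_css (the second match when several
+    # exist) and send PAGE_DOWN `time_` times so lazily loaded content renders.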
+    elmts = driver.find_elements_by_xpath(xpath_css)
+    print(elmts)
+    if len(elmts)>1:
+        elmt=elmts[1]
+    else:
+        elmt=elmts[0]
+
+    actions = ActionChains(driver)
+    actions.move_to_element(elmt).click().perform()
+    for i in range(time_):
+        try:
+            actions = ActionChains(driver)
+            actions.send_keys(Keys.PAGE_DOWN).perform()
+        except:
+            traceback.print_exc()
+        time.sleep(0.5)
+
+
 def main():
     keyword = '咖啡'
     db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
@@ -444,13 +464,10 @@ def main():
         item_url = row['item_url']
         print(key, name, ': ' ,item_url)

-        driver.get(item_url)
-        for i in range(4, 26, 2):
-            element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[{}]'.format(i))
-            actions = ActionChains(driver)
-            actions.move_to_element(element).perform()
-        time.sleep(0.5)
         print('start...')
+        driver.get(item_url)
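+        # Click the place title and page down so the lazily rendered detail
+        # sections appear before the page source is parsed.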
+        page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu']", 6)
+
         time_status = time_click(driver)
         time.sleep(0.5)
         shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
@@ -466,7 +483,10 @@ def main():

         output = get_time_list(shop_soup, output)

-        output = get_reviews(driver, output)
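+        # Skip the review pass when the listing reports no ratings at all.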
+        if output['user_ratings_total'] == '':
+            output['reviews'] = ''
+        else:
+            output = get_reviews(driver, output)

         output = find_big_photo(output, driver)
