@@ -14,6 +14,7 @@ from utility.parseutils import *
 from utility.connect import *

 from datetime import datetime
+import traceback
 import dataset
 import pandas as pd
 import time
@@ -62,7 +63,7 @@ def open_time(driver):
     element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
     if element.text.find('預訂') == -1:
         element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
-        driver.implicitly_wait(20)
+        driver.implicitly_wait(10)
         ActionChains(driver).move_to_element(element).click(element).perform()
         return 1
     else:
@@ -77,41 +78,46 @@ def get_shop_info(driver, output, shop_soup):
     location = shop_soup.find('button',{'data-item-id':'oloc'})['aria-label'].split(' ')
     output['city'] = location[-1]
     output['area'] = location[-2]
-
-    output['addr'] = shop_soup.find('button',{'data-item-id':'address'})['aria-label'].replace('地址:', '')
-    output['tel'] = blank_check(shop_soup.find('button',{'data-tooltip':'複製電話號碼'})['aria-label'].split(':')[1])
+
+    try:
+        output['addr'] = shop_soup.find('button',{'data-item-id':'address'})['aria-label'].replace('地址:', '')
+    except:
+        output['addr'] = ''
+
+    try:
+        output['tel'] = blank_check(shop_soup.find('button',{'data-tooltip':'複製電話號碼'})['aria-label'].split(':')[1])
+    except:
+        output['tel'] = ''
     print(output['addr'], ', ' ,output['tel'])

     for key in element_list:
-        element = element_list[key]
-        if len(element) == 3:
-            value = shop_soup.find(element[0],element[1])[element[2]]
-
-        else:
-            tmp_value = shop_soup.find(element[0],element[1])
-            if tmp_value:
-                value = tmp_value.text
+        try:
+            element = element_list[key]
+            if len(element) == 3:
+                value = shop_soup.find(element[0],element[1])[element[2]]
+
             else:
-                value = ''
+                tmp_value = shop_soup.find(element[0],element[1])
+                if tmp_value:
+                    value = tmp_value.text
+                else:
+                    value = ''

-        output[key] = value_check(key, value)
+            output[key] = value_check(key, value)
+        except:
+            output[key] = ''

     return output


 def get_intro_info(driver, output):
-    element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[6]')
-    driver.implicitly_wait(20)
+    # element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[6]')
+    element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}簡介']".format(output['name']))
+    driver.implicitly_wait(10)
     ActionChains(driver).move_to_element(element).click(element).perform()

-    for i in range(5, 35, 3):
-        try:
-            element = driver.find_element(By.XPATH,'//*[@id="pane"]/div/div[1]/div/div/div[2]/div[{}]'.format(i))
-            actions = ActionChains(driver)
-            actions.move_to_element(element).perform()
-        except:
-            break
-
+    page_down_(driver, "//div[@class='siAUzd-neVct section-scrollbox cYB2Ge-oHo7ed cYB2Ge-ti6hGc']", 3)
+
     intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
     for key in intro_list:
         elements = intro_soup.find('div',{'aria-label':key})
@@ -205,10 +211,11 @@ def get_reviews(driver, output):
         EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
     )
     element = driver.find_element_by_css_selector(more_reviews_css)
-    driver.implicitly_wait(20)
+    driver.implicitly_wait(10)
     ActionChains(driver).move_to_element(element).click(element).perform()
-    time.sleep(1)
-
+    time.sleep(0.5)
+
+    page_down_(driver, "//div[@class='siAUzd-neVct siAUzd-neVct-H9tDt']", 10)
     all_photo = driver.find_elements_by_class_name('ODSEW-ShBeI-xJzy8c-bF1uUb')
     for ap in all_photo:
         ap.click()
@@ -264,31 +271,25 @@ def get_reviews(driver, output):


 def find_photo_list(driver):
-    time.sleep(2)
+    time.sleep(0.5)
     wait = WebDriverWait(driver, 60)
     wait.until(
         EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a'))
     )
-    count_list = []
-    for i in range(1, 6):
-        try:
-            element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[{}]/div/a'.format(i))
-            count_list += [element.get_attribute('data-photo-index')]
-            actions = ActionChains(driver)
-            actions.move_to_element(element).perform()
-        except:
-            break
-        time.sleep(1)
+    page_down_(driver,'//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a' , 10)
     photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
+
     photo_url = []
-    for photo_id in count_list:
-        for i in photo_soup.select('a[data-photo-index="{}"]'.format(photo_id))[0].find_all('div'):
-            if i['style'].find('width') != -1:
-                sentence = i['style']
+    count = 0
+    for i in photo_soup.find_all('a', class_='mWq4Rd-eEDwDf'):
+        if count > 5: break
+        a_url = i.find('div', class_='mWq4Rd-HiaYvf-CNusmb-gevUs loaded')
+        if a_url:
+            if a_url.find('width') != -1:
+                sentence = a_url['style']
                 photo = re.search(r'https:(.*)\"', sentence)
-                print(sentence)
                 photo_url += [photo.group(0).replace('\"','')]
-                break
+        count += 1
     return photo_url

@@ -305,11 +306,14 @@ def find_big_photo(output, driver):

     tab_dict = {}
     for tab_index in [0, 1, 2]:
-        photo_name = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)).text
-        if photo_name == '菜單':
-            tab_dict[photo_name] = tab_index
-        elif photo_name == '全部':
-            tab_dict[photo_name] = tab_index
+        try:
+            photo_name = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)).text
+            if photo_name == '菜單':
+                tab_dict[photo_name] = tab_index
+            elif photo_name == '全部':
+                tab_dict[photo_name] = tab_index
+        except:
+            traceback.print_exc()
     print(tab_dict)
     for tab_ in tab_dict:
         tab_index = tab_dict[tab_]
@@ -371,7 +375,7 @@ def time_click(driver):
     try:
         time_css = "span[aria-label='顯示本週營業時間']"
         element = driver.find_element_by_css_selector(time_css)
-        driver.implicitly_wait(30)
+        driver.implicitly_wait(10)
         ActionChains(driver).move_to_element(element).click(element).perform()
         status = '正常'

@@ -392,7 +396,7 @@ def get_not_cralwer_url(keyword):

     url_pd = pd.DataFrame(url_list, columns=url_list[0].keys())
     url_pd['item_url_length'] = url_pd.item_url.apply(lambda x: len(x))
-    url_pd = url_pd[url_pd['item_url_length']!=1000]
+    url_pd = url_pd[(url_pd['item_url_length']!=1000) & (url_pd['item_url_length']!=600)]
     url_pd = url_pd[~url_pd['item_url'].isin(shop_item)]
     url_pd = url_pd[~url_pd['item_url'].isin(error_item)]

@@ -426,17 +430,37 @@ def serive_create_linux(profilepath):

     return driver

+
+def page_down_(driver, xpath_css, time_):
+    elmts = driver.find_elements_by_xpath(xpath_css)
+    print(elmts)
+    if len(elmts)>1:
+        elmt=elmts[1]
+    else:
+        elmt=elmts[0]
+
+    actions = ActionChains(driver)
+    actions.move_to_element(elmt).click().perform()
+    for i in range(time_):
+        try:
+            actions = ActionChains(driver)
+            actions.send_keys(Keys.PAGE_DOWN).perform()
+        except:
+            traceback.print_exc()
+        time.sleep(0.5)
+
+
 def main():
     keyword = '咖啡'
     db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
     url_pd = get_not_cralwer_url(keyword)

     print('drvier start...')
-    driver = brower_start()
+    # driver = brower_start()

     # driver = serive_create('Profile 1')
-    # profilepath = 'Profile 1'
-    # driver = serive_create_linux(profilepath)
+    profilepath = 'Profile 1'
+    driver = serive_create_linux(profilepath)

     for key, row in url_pd.iterrows():
         try:
@@ -444,30 +468,30 @@ def main():
             item_url = row['item_url']
             print(key, name, ': ' ,item_url)

-            driver.get(item_url)
-            for i in range(4, 26, 2):
-                element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[{}]'.format(i))
-                actions = ActionChains(driver)
-                actions.move_to_element(element).perform()
-                time.sleep(0.5)
             print('start...')
+            driver.get(item_url)
+            page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu']", 3)
+
             time_status = time_click(driver)
-            time.sleep(0.5)
+            time.sleep(1)
             shop_soup = BeautifulSoup(driver.page_source, 'html.parser')

             output = {
                 'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text)
             }
             print(output['name'])
-
+            print('get_shop_info')
             output = get_shop_info(driver, output, shop_soup)
-
+            print('get_intro_info')
             output = get_intro_info(driver, output)
-
+            print('get_time_list')
             output = get_time_list(shop_soup, output)
-
-            output = get_reviews(driver, output)
-
+            print('user_ratings_total')
+            if output['user_ratings_total'] == '':
+                output['reviews'] = ''
+            else:
+                output = get_reviews(driver, output)
+            print('find_big_photo')
             output = find_big_photo(output, driver)

             output_name = output['name'].replace('(','').replace(')', '')
@@ -482,8 +506,8 @@ def main():
             error_table_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
             data_select_insert(db, 'error_list', error_table_col, row)
             driver.close()
-            driver = brower_start()
-            # driver = serive_create_linux(profilepath)
+            # driver = brower_start()
+            driver = serive_create_linux(profilepath)
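
Note: the page_down_ helper added in this patch follows a common Selenium pattern for Google Maps panels: click a scrollable container to give it focus, then send PAGE_DOWN repeatedly so lazily loaded content appears before driver.page_source is parsed. Below is a minimal standalone sketch of that pattern, assuming the Selenium 4 API and chromedriver on PATH; the function name scroll_panel, the '//div[@role="feed"]' XPath, and the page count are illustrative placeholders, not values taken from the patch (which targets obfuscated class names such as 'siAUzd-neVct ...').

# Sketch of the scroll-to-load pattern used by page_down_ above (assumptions noted in the note).
import time

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys


def scroll_panel(driver, panel_xpath, pages):
    """Focus the panel, then page down `pages` times so lazy content loads."""
    panels = driver.find_elements(By.XPATH, panel_xpath)
    if not panels:
        return  # nothing to scroll; the caller decides how to handle this
    panel = panels[1] if len(panels) > 1 else panels[0]

    ActionChains(driver).move_to_element(panel).click().perform()
    for _ in range(pages):
        ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
        time.sleep(0.5)  # give the page time to fetch and render new items


if __name__ == '__main__':
    driver = webdriver.Chrome()
    driver.get('https://www.google.com/maps')
    scroll_panel(driver, '//div[@role="feed"]', 3)  # example target only
    driver.quit()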