|
@@ -12,6 +12,7 @@ from utility import database_access as DA
|
|
|
from utility.parseutils import *
|
|
|
from utility.connect import *
|
|
|
|
|
|
+from datetime import datetime
|
|
|
import pandas as pd
|
|
|
import time
|
|
|
import json
|
|
@@ -50,9 +51,9 @@ def keyin_keyword(driver, keyword):
|
|
|
ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
|
|
|
time.sleep(3)
|
|
|
|
|
|
- element = driver.find_element_by_class_name("V0h1Ob-haAclf")
|
|
|
- driver.implicitly_wait(30)
|
|
|
- ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
+ # element = driver.find_element_by_class_name("V0h1Ob-haAclf")
|
|
|
+ # driver.implicitly_wait(30)
|
|
|
+ # ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
|
|
|
|
|
|
def open_time(driver):
|
|
@@ -74,11 +75,10 @@ def get_shop_info(driver, output, shop_soup):
|
|
|
location = shop_soup.find('button',{'data-item-id':'oloc'})['aria-label'].split(' ')
|
|
|
output['city'] = location[-1]
|
|
|
output['area'] = location[-2]
|
|
|
- print(location)
|
|
|
|
|
|
output['addr'] = shop_soup.find('button',{'data-item-id':'address'})['aria-label'].replace('地址:', '')
|
|
|
output['tel'] = blank_check(shop_soup.find('button',{'data-tooltip':'複製電話號碼'})['aria-label'].split(':')[1])
|
|
|
- print(output['addr'], output['tel'])
|
|
|
+ print(output['addr'], ', ' ,output['tel'])
|
|
|
|
|
|
for key in element_list:
|
|
|
element = element_list[key]
|
|
@@ -101,18 +101,19 @@ def get_intro_info(driver, output):
|
|
|
element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[6]')
|
|
|
driver.implicitly_wait(20)
|
|
|
ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
-
|
|
|
- wait = WebDriverWait(driver, 30)
|
|
|
- item_xpath = "div[aria-label='{}簡介']".format(output['name'])
|
|
|
- wait.until(
|
|
|
- EC.element_to_be_clickable((By.CSS_SELECTOR, item_xpath))
|
|
|
- )
|
|
|
- time.sleep(1)
|
|
|
+
|
|
|
+ for i in range(5, 35, 3):
|
|
|
+ try:
|
|
|
+ element = driver.find_element(By.XPATH,'//*[@id="pane"]/div/div[1]/div/div/div[2]/div[{}]'.format(i))
|
|
|
+ actions = ActionChains(driver)
|
|
|
+ actions.move_to_element(element).perform()
|
|
|
+ except:
|
|
|
+ break
|
|
|
+
|
|
|
intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
|
|
|
-
|
|
|
+
|
|
|
for key in intro_list:
|
|
|
elements = intro_soup.find('div',{'aria-label':key})
|
|
|
- # print(elements)
|
|
|
if elements:
|
|
|
element = elements.find_all('li',{'class':'LQjNnc-p83tee-JNdkSc-ibnC6b'})
|
|
|
# print(element)
|
|
@@ -125,9 +126,10 @@ def get_intro_info(driver, output):
|
|
|
intro_list[key][1]: blank_check(ele.text)
|
|
|
}]
|
|
|
count += 1
|
|
|
+ print(str(tmp))
|
|
|
output[intro_list[key][0]] = str(tmp)
|
|
|
else:
|
|
|
- output[intro_list[key][0]] = []
|
|
|
+ output[intro_list[key][0]] = '[]'
|
|
|
driver.back()
|
|
|
return output
|
|
|
|
|
@@ -241,7 +243,7 @@ def get_reviews(driver, output):
|
|
|
}]
|
|
|
count += 1
|
|
|
|
|
|
- output['reviews'] = reviews
|
|
|
+ output['reviews'] = str(reviews)
|
|
|
driver.back()
|
|
|
return output
|
|
|
|
|
@@ -285,31 +287,29 @@ def find_big_photo(output, driver):
|
|
|
ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
|
|
|
photo_map = {
|
|
|
- 0: 'shop_photo',
|
|
|
+ 1: 'shop_photo',
|
|
|
2: 'menu_photo'
|
|
|
}
|
|
|
|
|
|
- for tab_index in [0, 2]:
|
|
|
- wait = WebDriverWait(driver, 30)
|
|
|
+ for tab_index in [1, 2]:
|
|
|
+ wait = WebDriverWait(driver, 60)
|
|
|
wait.until(
|
|
|
EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
|
|
|
)
|
|
|
element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
|
|
|
ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
photo_list = find_photo_list(driver)
|
|
|
- output[photo_map[tab_index]] = photo_list
|
|
|
+ output[photo_map[tab_index]] = str(photo_list)
|
|
|
|
|
|
- for i in range(2):
|
|
|
- driver.back()
|
|
|
- time.sleep(1)
|
|
|
return output
|
|
|
|
|
|
+
|
|
|
def get_url_list(driver):
|
|
|
- wait = WebDriverWait(driver, 10)
|
|
|
- wait.until(
|
|
|
- EC.element_to_be_clickable((By.XPATH, '//*[@id="sGi9mc-m5SR9c-bottom-pane"]/div/div[1]/div/div/div/div[1]/div[2]/div[2]'))
|
|
|
- )
|
|
|
- driver.back()
|
|
|
+ # wait = WebDriverWait(driver, 10)
|
|
|
+ # wait.until(
|
|
|
+ # EC.element_to_be_clickable((By.XPATH, '//*[@id="sGi9mc-m5SR9c-bottom-pane"]/div/div[1]/div/div/div/div[1]/div[2]/div[2]'))
|
|
|
+ # )
|
|
|
+ # driver.back()
|
|
|
|
|
|
time.sleep(2)
|
|
|
for i in range(5, 43, 2):
|
|
@@ -326,6 +326,21 @@ def get_url_list(driver):
|
|
|
return url_list
|
|
|
|
|
|
|
|
|
def data_select_insert(db, table_name, table_col, data):
    """Insert one crawled shop record into ``table_name``.

    Parameters
    ----------
    db : database handle, passed through unchanged to ``DA.mysql_insert_data``.
    table_name : str, target MySQL table.
    table_col : sequence of column names; may contain ``'crawler_date'``.
    data : mapping from column name to crawled value (must cover every
        column in ``table_col`` except ``'crawler_date'``).

    The previous version skipped ``'crawler_date'`` and appended the crawl
    timestamp at the END of the value list, which only lines up with the
    column tuple when ``'crawler_date'`` is the last column (and produces a
    column/value count mismatch when it is missing).  Each value is now
    placed at its own column's position instead.
    """
    now = datetime.today().strftime("%Y/%m/%d %H:%M")
    # Values stay aligned one-to-one with table_col; the crawl timestamp
    # fills the 'crawler_date' slot wherever that column appears.
    values = [now if col == 'crawler_date' else data[col] for col in table_col]

    # NOTE(review): values are embedded via Python's tuple repr, so a quote
    # inside a crawled string can still break the statement (injection risk)
    # — switch to a parameterized execute() if DA supports it.
    insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
        .format(table_name, str(tuple(table_col)).replace('\'', ''), tuple(values))

    DA.mysql_insert_data(db, insert_sql)
|
|
|
+
|
|
|
+
|
|
|
def main():
|
|
|
data = pd.read_csv('lat_long_location.csv', index_col = 0)
|
|
|
tmp = data.iloc[0]
|
|
@@ -376,13 +391,16 @@ def main():
|
|
|
output = find_big_photo(output, driver)
|
|
|
|
|
|
output_name = output['name'].replace('(','').replace(')', '')
|
|
|
- output['google_url'] = 'https://www.google.com.tw/search?q={}+{}'.format(output_name, output['addr'])
|
|
|
+ query_name = '{}+{}'.format(output_name, output['addr'])
|
|
|
+ query_name = query_name.replace(' ','')
|
|
|
+ output['query_name'] = query_name
|
|
|
+ output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
|
|
|
time.sleep(2)
|
|
|
result += [output]
|
|
|
with open('result/20211207_{}.json'.format(name), 'w') as f:
|
|
|
json.dump(output, f)
|
|
|
|
|
|
- DA.data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
|
|
|
+ data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
|
|
|
break
|
|
|
# except:
|
|
|
# shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
|