|
@@ -14,11 +14,14 @@ from utility.parseutils import *
|
|
|
from utility.connect import *
|
|
|
|
|
|
from datetime import datetime
|
|
|
+import traceback
|
|
|
import dataset
|
|
|
import pandas as pd
|
|
|
import time
|
|
|
import json
|
|
|
import re
|
|
|
+import sys
|
|
|
+import os
|
|
|
# import pyautogui as pag
|
|
|
|
|
|
def serive_create(profilepath):
|
|
@@ -26,10 +29,10 @@ def serive_create(profilepath):
|
|
|
|
|
|
option.add_argument('--disable-web-security')
|
|
|
option.add_argument('--allow-running-insecure-content')
|
|
|
- option.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
|
|
|
+ option.add_argument("--user-data-dir=C:\\Users\\user\\AppData\\Local\\Google\\Chrome\\User Data")
|
|
|
option.add_argument("profile-directory="+profilepath)
|
|
|
|
|
|
- driver = webdriver.Chrome('./utility/chromedriver_20211103/chromedriver', options=option)
|
|
|
+ driver = webdriver.Chrome('./utility/chromedriver_win32/chromedriver', options=option)
|
|
|
executor_url = driver.command_executor._url
|
|
|
session_id = driver.session_id
|
|
|
print (session_id)
|
|
@@ -38,10 +41,13 @@ def serive_create(profilepath):
|
|
|
|
|
|
return driver
|
|
|
|
|
|
-def brower_start():
|
|
|
+def brower_start(port):
|
|
|
options = webdriver.ChromeOptions()
|
|
|
+# browser = webdriver.Chrome(options=options)
|
|
|
+
|
|
|
browser = webdriver.Remote(
|
|
|
- command_executor='http://192.53.174.202:4444/wd/hub',
|
|
|
+ command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
|
|
|
+ # command_executor='http://192.53.174.202:'+str(port)+'/wd/hub',
|
|
|
desired_capabilities=options.to_capabilities()
|
|
|
)
|
|
|
return browser
|
|
@@ -62,7 +68,7 @@ def open_time(driver):
|
|
|
element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
|
|
|
if element.text.find('預訂') == -1:
|
|
|
element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
|
|
|
- driver.implicitly_wait(20)
|
|
|
+ driver.implicitly_wait(10)
|
|
|
ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
return 1
|
|
|
else:
|
|
@@ -70,69 +76,85 @@ def open_time(driver):
|
|
|
|
|
|
|
|
|
def get_shop_info(driver, output, shop_soup):
|
|
|
- current_url_split = driver.current_url.split('@')[1].split(',')
|
|
|
- output['lon'] = current_url_split[1]
|
|
|
- output['lat'] = current_url_split[0]
|
|
|
+ # current_url_split = driver.current_url.split('@')[1].split(',')
|
|
|
+ # output['lon'] = current_url_split[1]
|
|
|
+ # output['lat'] = current_url_split[0]
|
|
|
|
|
|
location = shop_soup.find('button',{'data-item-id':'oloc'})['aria-label'].split(' ')
|
|
|
output['city'] = location[-1]
|
|
|
output['area'] = location[-2]
|
|
|
-
|
|
|
- output['addr'] = shop_soup.find('button',{'data-item-id':'address'})['aria-label'].replace('地址:', '')
|
|
|
- output['tel'] = blank_check(shop_soup.find('button',{'data-tooltip':'複製電話號碼'})['aria-label'].split(':')[1])
|
|
|
+
|
|
|
+ try:
|
|
|
+ output['addr'] = shop_soup.find('button',{'data-item-id':'address'})['aria-label'].replace('地址:', '')
|
|
|
+ except:
|
|
|
+ output['addr'] = ''
|
|
|
+
|
|
|
+ try:
|
|
|
+ output['tel'] = blank_check(shop_soup.find('button',{'data-tooltip':'複製電話號碼'})['aria-label'].split(':')[1])
|
|
|
+ except:
|
|
|
+ output['tel'] = ''
|
|
|
print(output['addr'], ', ' ,output['tel'])
|
|
|
|
|
|
for key in element_list:
|
|
|
- element = element_list[key]
|
|
|
- if len(element) == 3:
|
|
|
- value = shop_soup.find(element[0],element[1])[element[2]]
|
|
|
-
|
|
|
- else:
|
|
|
- tmp_value = shop_soup.find(element[0],element[1])
|
|
|
- if tmp_value:
|
|
|
- value = tmp_value.text
|
|
|
+ try:
|
|
|
+ element = element_list[key]
|
|
|
+ if len(element) == 3:
|
|
|
+ value = shop_soup.find(element[0],element[1])[element[2]]
|
|
|
+
|
|
|
else:
|
|
|
- value = ''
|
|
|
+ tmp_value = shop_soup.find(element[0],element[1])
|
|
|
+ if tmp_value:
|
|
|
+ value = tmp_value.text
|
|
|
+ else:
|
|
|
+ value = ''
|
|
|
|
|
|
- output[key] = value_check(key, value)
|
|
|
+ output[key] = value_check(key, value)
|
|
|
+ except:
|
|
|
+ output[key] = ''
|
|
|
|
|
|
return output
|
|
|
|
|
|
|
|
|
def get_intro_info(driver, output):
|
|
|
- element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[6]')
|
|
|
- driver.implicitly_wait(20)
|
|
|
- ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
-
|
|
|
- for i in range(5, 35, 3):
|
|
|
- try:
|
|
|
- element = driver.find_element(By.XPATH,'//*[@id="pane"]/div/div[1]/div/div/div[2]/div[{}]'.format(i))
|
|
|
- actions = ActionChains(driver)
|
|
|
- actions.move_to_element(element).perform()
|
|
|
- except:
|
|
|
- break
|
|
|
-
|
|
|
- intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
|
|
|
- for key in intro_list:
|
|
|
- elements = intro_soup.find('div',{'aria-label':key})
|
|
|
- if elements:
|
|
|
- element = elements.find_all('li',{'class':'LQjNnc-p83tee-JNdkSc-ibnC6b'})
|
|
|
- count = 0
|
|
|
- tmp = []
|
|
|
- for ele in element:
|
|
|
- # if ele.find('img',{'src':"//www.gstatic.com/images/icons/material/system_gm/2x/check_black_18dp.png"}):
|
|
|
- if ele.find('img',{'src':"//www.gstatic.com/images/icons/material/system_gm/1x/check_black_18dp.png"}):
|
|
|
- tmp += [{
|
|
|
- 'id':count,
|
|
|
- intro_list[key][1]: blank_check(ele.text)
|
|
|
- }]
|
|
|
- count += 1
|
|
|
- print(str(tmp))
|
|
|
- output[intro_list[key][0]] = str(tmp)
|
|
|
- else:
|
|
|
+ # element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[6]')
|
|
|
+ try:
|
|
|
+ element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}簡介']".format(output['name']))
|
|
|
+ driver.implicitly_wait(5)
|
|
|
+ ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
+
|
|
|
+ # pageSource = driver.page_source
|
|
|
+ # fileToWrite = open("page_source.html", "w")
|
|
|
+ # fileToWrite.write(pageSource)
|
|
|
+ # fileToWrite.close()
|
|
|
+
|
|
|
+ page_down_(driver, '//*[@id="pane"]/div/div[1]', 3)
|
|
|
+
|
|
|
+ intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
|
|
|
+ for key in intro_list:
|
|
|
+ elements = intro_soup.find('div',{'aria-label':key})
|
|
|
+ if elements:
|
|
|
+ element = elements.find_all('li',{'class':'LQjNnc-p83tee-JNdkSc-ibnC6b'})
|
|
|
+ count = 0
|
|
|
+ tmp = []
|
|
|
+ for ele in element:
|
|
|
+ # if ele.find('img',{'src':"//www.gstatic.com/images/icons/material/system_gm/2x/check_black_18dp.png"}):
|
|
|
+ if ele.find('img',{'src':"//www.gstatic.com/images/icons/material/system_gm/1x/check_black_18dp.png"}):
|
|
|
+ tmp += [{
|
|
|
+ 'id':count,
|
|
|
+ intro_list[key][1]: blank_check(ele.text)
|
|
|
+ }]
|
|
|
+ count += 1
|
|
|
+ print(str(tmp))
|
|
|
+ output[intro_list[key][0]] = str(tmp)
|
|
|
+ else:
|
|
|
+ output[intro_list[key][0]] = '[]'
|
|
|
+ driver.back()
|
|
|
+ return output
|
|
|
+
|
|
|
+ except:
|
|
|
+ for key in intro_list:
|
|
|
output[intro_list[key][0]] = '[]'
|
|
|
- driver.back()
|
|
|
- return output
|
|
|
+ return output
|
|
|
|
|
|
|
|
|
def get_time_list(shop_soup, output):
|
|
@@ -205,19 +227,26 @@ def get_reviews(driver, output):
|
|
|
EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
|
|
|
)
|
|
|
element = driver.find_element_by_css_selector(more_reviews_css)
|
|
|
- driver.implicitly_wait(20)
|
|
|
+ driver.implicitly_wait(10)
|
|
|
ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
- time.sleep(1)
|
|
|
+ time.sleep(0.5)
|
|
|
|
|
|
- all_photo = driver.find_elements_by_class_name('ODSEW-ShBeI-xJzy8c-bF1uUb')
|
|
|
- for ap in all_photo:
|
|
|
- ap.click()
|
|
|
+ # page_down_(driver, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]', 5)
|
|
|
+ page_down_(driver, '//div[@class="PPCwl"]',5)
|
|
|
|
|
|
- all_review = driver.find_elements_by_css_selector('button[aria-label="顯示更多"')
|
|
|
- for ap in all_review:
|
|
|
- ap.click()
|
|
|
+ comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
|
|
|
+ if comment_soup.find_all('div',class_='ODSEW-ShBeI-xJzy8c-bF1uUb') != 0:
|
|
|
+ all_photo = driver.find_elements_by_class_name('ODSEW-ShBeI-xJzy8c-bF1uUb')
|
|
|
+ for ap in all_photo:
|
|
|
+ ap.click()
|
|
|
+
|
|
|
+ if comment_soup.select('button[aria-label="顯示更多"]') != 0:
|
|
|
+ all_review = driver.find_elements_by_css_selector('button[aria-label="顯示更多"]')
|
|
|
+ for ap in all_review:
|
|
|
+ ap.click()
|
|
|
|
|
|
comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
|
|
|
+
|
|
|
count = 0
|
|
|
reviews = []
|
|
|
for comment in comment_soup.find_all('div',{'class':'ODSEW-ShBeI'}):
|
|
@@ -264,36 +293,35 @@ def get_reviews(driver, output):
|
|
|
|
|
|
|
|
|
def find_photo_list(driver):
|
|
|
- time.sleep(2)
|
|
|
+ time.sleep(0.5)
|
|
|
wait = WebDriverWait(driver, 60)
|
|
|
wait.until(
|
|
|
EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a'))
|
|
|
)
|
|
|
- count_list = []
|
|
|
- for i in range(1, 6):
|
|
|
- try:
|
|
|
- element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[{}]/div/a'.format(i))
|
|
|
- count_list += [element.get_attribute('data-photo-index')]
|
|
|
- actions = ActionChains(driver)
|
|
|
- actions.move_to_element(element).perform()
|
|
|
- except:
|
|
|
- break
|
|
|
- time.sleep(1)
|
|
|
+ page_down_(driver,'//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a' , 10)
|
|
|
photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
|
|
|
+
|
|
|
photo_url = []
|
|
|
- for photo_id in count_list:
|
|
|
- for i in photo_soup.select('a[data-photo-index="{}"]'.format(photo_id))[0].find_all('div'):
|
|
|
- if i['style'].find('width') != -1:
|
|
|
- sentence = i['style']
|
|
|
+ count = 0
|
|
|
+ for i in photo_soup.find_all('a', class_='mWq4Rd-eEDwDf'):
|
|
|
+ if count > 5: break
|
|
|
+ a_url = i.find('div', class_='mWq4Rd-HiaYvf-CNusmb-gevUs loaded')
|
|
|
+ if a_url:
|
|
|
+ if a_url.find('width') != -1:
|
|
|
+ sentence = a_url['style']
|
|
|
photo = re.search(r'https:(.*)\"', sentence)
|
|
|
- print(sentence)
|
|
|
photo_url += [photo.group(0).replace('\"','')]
|
|
|
- break
|
|
|
+ count += 1
|
|
|
return photo_url
|
|
|
|
|
|
|
|
|
def find_big_photo(output, driver):
|
|
|
- element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
|
|
|
+ # element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
|
|
|
+ wait = WebDriverWait(driver, 60)
|
|
|
+ wait.until(
|
|
|
+ EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[1]/div[1]/button'))
|
|
|
+ )
|
|
|
+ element = driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[1]/div[1]/button')
|
|
|
ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
output['shop_photo'] = '[]'
|
|
|
output['menu_photo'] = '[]'
|
|
@@ -302,14 +330,18 @@ def find_big_photo(output, driver):
|
|
|
'全部': 'shop_photo',
|
|
|
'菜單': 'menu_photo'
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
+ driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='1']")
|
|
|
+ photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
|
|
|
tab_dict = {}
|
|
|
for tab_index in [0, 1, 2]:
|
|
|
- photo_name = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)).text
|
|
|
- if photo_name == '菜單':
|
|
|
- tab_dict[photo_name] = tab_index
|
|
|
- elif photo_name == '全部':
|
|
|
- tab_dict[photo_name] = tab_index
|
|
|
+ selector = photo_soup.select("button[data-tab-index='{}']".format(tab_index))
|
|
|
+ if len(selector) != 0:
|
|
|
+ photo_name = selector[0].text
|
|
|
+ if photo_name == '菜單':
|
|
|
+ tab_dict[photo_name] = tab_index
|
|
|
+ elif photo_name == '全部':
|
|
|
+ tab_dict[photo_name] = tab_index
|
|
|
print(tab_dict)
|
|
|
for tab_ in tab_dict:
|
|
|
tab_index = tab_dict[tab_]
|
|
@@ -367,32 +399,52 @@ def data_select_insert(db, table_name, table_col, data):
|
|
|
|
|
|
|
|
|
def time_click(driver):
|
|
|
+ shop_soup_tmp = BeautifulSoup(driver.page_source, 'html.parser')
|
|
|
status = ''
|
|
|
try:
|
|
|
- time_css = "span[aria-label='顯示本週營業時間']"
|
|
|
- element = driver.find_element_by_css_selector(time_css)
|
|
|
- driver.implicitly_wait(30)
|
|
|
- ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
- status = '正常'
|
|
|
+ if len(shop_soup_tmp.select("span[aria-label='顯示本週營業時間']")) != 0:
|
|
|
+ time_css = "span[aria-label='顯示本週營業時間']"
|
|
|
+ element = driver.find_element_by_css_selector(time_css)
|
|
|
+ driver.implicitly_wait(10)
|
|
|
+ ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
+ status = '正常'
|
|
|
+
|
|
|
+ elif len(shop_soup_tmp.select("img[aria-label='通知']")) != 0:
|
|
|
+ status = shop_soup_tmp.find('span',class_='LJKBpe-Tswv1b-text aSftqf').text
|
|
|
+# status = '永久停業' or '暫時關閉'
|
|
|
+
|
|
|
+ elif len(shop_soup_tmp.select('button[aria-label*="查看更詳細的營業時間"]')) != 0:
|
|
|
+ status = 'error'
|
|
|
+
|
|
|
+ return status
|
|
|
+ except:
|
|
|
+ return ''
|
|
|
|
|
|
- except NoSuchElementException:
|
|
|
- time_css = "div[aria-expanded='false']"
|
|
|
- elem = driver.find_element_by_css_selector(time_css)
|
|
|
- if elem:
|
|
|
- status = '暫時關閉'
|
|
|
- return status
|
|
|
+
|
|
|
def get_new_keyword(db):
    """Return the first keyword that is not yet listed in the progress table.

    Runs two queries through ``db.query``: the distinct ``keyword`` values of
    ``shop_item_list`` (ordered by keyword) and the distinct ``kw`` values of
    ``progress_list2``. The first keyword, in query order, that does not
    appear among the progress values is returned.

    Args:
        db: Object exposing ``query(sql)`` that yields one mapping per row.

    Returns:
        The first pending keyword.

    Raises:
        IndexError: If ``shop_item_list`` yields no keyword at all, or if
            every keyword is already present in ``progress_list2``.
    """
    keywords = pd.DataFrame(list(db.query(
        'select distinct(keyword) from shop_item_list order by keyword')))
    progress = pd.DataFrame(list(db.query(
        'select distinct(kw) from progress_list2')))

    # An empty result set produces a frame without a 'keyword' column, so it
    # has to be rejected before the column is accessed.
    if keywords.empty:
        raise IndexError('shop_item_list contains no keyword to crawl')

    pending = keywords['keyword']
    if not progress.empty:
        pending = pending[~pending.isin(progress['kw'].to_list())]

    # Fail with an explicit message instead of pandas' generic
    # "single positional indexer is out-of-bounds".
    if pending.empty:
        raise IndexError(
            'every keyword in shop_item_list is already in progress_list2')

    return pending.iloc[0]
|
|
|
|
|
|
|
|
|
def get_not_cralwer_url(keyword):
|
|
|
db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
|
|
|
- table = db['shop_item_list']
|
|
|
+ table = db['shop_item_list3']
|
|
|
url_list = list(table.find(keyword=keyword))
|
|
|
- shop_item = [i['item_url'] for i in db.query('SELECT item_url FROM shop_list where keyword="{}"'.format(keyword))]
|
|
|
- error_item = [i['item_url'] for i in db.query('SELECT item_url FROM error_list where keyword="{}"'.format(keyword))]
|
|
|
+ shop_item = [i['item_url'] for i in db.query('SELECT item_url FROM shop_list2 where keyword="{}"'.format(keyword))]
|
|
|
+ error_item = [i['item_url'] for i in db.query('SELECT item_url FROM error_list2 where keyword="{}"'.format(keyword))]
|
|
|
|
|
|
url_pd = pd.DataFrame(url_list, columns=url_list[0].keys())
|
|
|
- url_pd['item_url_length'] = url_pd.item_url.apply(lambda x: len(x))
|
|
|
- url_pd = url_pd[url_pd['item_url_length']!=1000]
|
|
|
+ # url_pd['item_url_length'] = url_pd.item_url.apply(lambda x: len(x))
|
|
|
+ # url_pd = url_pd[(url_pd['item_url_length']!=1000) & (url_pd['item_url_length']!=600)]
|
|
|
url_pd = url_pd[~url_pd['item_url'].isin(shop_item)]
|
|
|
url_pd = url_pd[~url_pd['item_url'].isin(error_item)]
|
|
|
|
|
@@ -426,66 +478,162 @@ def serive_create_linux(profilepath):
|
|
|
|
|
|
return driver
|
|
|
|
|
|
-def main():
|
|
|
- keyword = '咖啡'
|
|
|
- db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
|
|
|
- url_pd = get_not_cralwer_url(keyword)
|
|
|
|
|
|
- print('drvier start...')
|
|
|
- driver = brower_start()
|
|
|
+def find_lon_lat(driver):
|
|
|
+ e = driver.find_element_by_css_selector("#scene > div.widget-scene > canvas")
|
|
|
+ size = e.size
|
|
|
+ total_height = size['height']
|
|
|
+ total_width = size['width']
|
|
|
|
|
|
- # driver = serive_create('Profile 1')
|
|
|
- # profilepath = 'Profile 1'
|
|
|
- # driver = serive_create_linux(profilepath)
|
|
|
-
|
|
|
- for key, row in url_pd.iterrows():
|
|
|
- try:
|
|
|
- name = row['name']
|
|
|
- item_url = row['item_url']
|
|
|
- print(key, name, ': ' ,item_url)
|
|
|
-
|
|
|
- driver.get(item_url)
|
|
|
- for i in range(4, 26, 2):
|
|
|
- element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[{}]'.format(i))
|
|
|
- actions = ActionChains(driver)
|
|
|
- actions.move_to_element(element).perform()
|
|
|
- time.sleep(0.5)
|
|
|
- print('start...')
|
|
|
- time_status = time_click(driver)
|
|
|
- time.sleep(0.5)
|
|
|
- shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
|
|
|
-
|
|
|
- output = {
|
|
|
- 'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text)
|
|
|
- }
|
|
|
- print(output['name'])
|
|
|
+ size2 = driver.find_element_by_css_selector("#pane > div.Yr7JMd-pane").size
|
|
|
+ left_width = size2['width']
|
|
|
+ print(total_height, total_width, left_width)
|
|
|
+ x = (total_width - left_width) / 2 + left_width
|
|
|
+ y = total_height / 2
|
|
|
|
|
|
- output = get_shop_info(driver, output, shop_soup)
|
|
|
+ e = driver.find_element_by_css_selector("#pane > div.Yr7JMd-pane")
|
|
|
+ action = webdriver.common.action_chains.ActionChains(driver)
|
|
|
+ action.move_to_element_with_offset(e, x, y)
|
|
|
+ action.context_click()
|
|
|
+ action.perform()
|
|
|
|
|
|
- output = get_intro_info(driver, output)
|
|
|
+ time.sleep(0.5)
|
|
|
|
|
|
- output = get_time_list(shop_soup, output)
|
|
|
+ element = driver.find_element_by_css_selector('#action-menu > ul > li:nth-child(1)')
|
|
|
+ lat, lon = element.text.split(',')
|
|
|
+ return float(lat), float(lon)
|
|
|
|
|
|
- output = get_reviews(driver, output)
|
|
|
|
|
|
- output = find_big_photo(output, driver)
|
|
|
def get_unique_id(driver):
    """Read the identifier at the end of the link shown after clicking '分享'.

    Clicks the button whose ``data-value`` is '分享' (share), then polls the
    first ``input`` element on the page up to five times, half a second
    apart, taking the text after the last '/' of its ``value`` attribute
    (presumably the dialog's short link -- confirm against the page).
    Finally clicks the button labelled '關閉' (close).

    Args:
        driver: Selenium WebDriver positioned on a shop page.

    Returns:
        The trailing path segment of the input's value; an empty string if
        it was still empty after the last attempt.
    """
    share_button = driver.find_element(By.CSS_SELECTOR, "button[data-value='分享']")
    driver.implicitly_wait(5)
    ActionChains(driver).move_to_element(share_button).click(share_button).perform()
    time.sleep(0.5)

    for _attempt in range(5):
        link_box = driver.find_element(By.CSS_SELECTOR, "input")
        unique_id = link_box.get_attribute('value').split('/')[-1]
        if unique_id:
            break
        # The value was still empty; wait before polling again.
        time.sleep(0.5)

    close_button = driver.find_element(By.CSS_SELECTOR, "button[aria-label='關閉']")
    driver.implicitly_wait(5)
    ActionChains(driver).move_to_element(close_button).click(close_button).perform()
    return unique_id
|
|
|
|
|
|
- output_name = output['name'].replace('(','').replace(')', '')
|
|
|
- query_name = '{}+{}'.format(output_name, output['addr'])
|
|
|
- query_name = query_name.replace(' ','')
|
|
|
- output['item_url'] = item_url
|
|
|
- output['keyword'] = keyword
|
|
|
- output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
|
|
|
- data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
|
|
|
|
|
|
+def page_down_(driver, xpath_css, time_):
|
|
|
+ elmts = driver.find_elements_by_xpath(xpath_css)
|
|
|
+ print(elmts)
|
|
|
+ if len(elmts)>1:
|
|
|
+ elmt=elmts[1]
|
|
|
+ else:
|
|
|
+ elmt=elmts[0]
|
|
|
+ actions = ActionChains(driver)
|
|
|
+ actions.move_to_element(elmt).click().perform()
|
|
|
+ for i in range(time_):
|
|
|
+ try:
|
|
|
+ actions = ActionChains(driver)
|
|
|
+ actions.send_keys(Keys.PAGE_DOWN).perform()
|
|
|
except:
|
|
|
- error_table_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
|
|
|
- data_select_insert(db, 'error_list', error_table_col, row)
|
|
|
- driver.close()
|
|
|
- driver = brower_start()
|
|
|
- # driver = serive_create_linux(profilepath)
|
|
|
-
|
|
|
+ traceback.print_exc()
|
|
|
+ time.sleep(0.5)
|
|
|
|
|
|
|
|
|
def main():
    """Crawl shop detail pages for up to five pending keywords.

    Usage: the first command-line argument is the port of the Selenium hub;
    it also names the docker container (``p<port>``) that is restarted before
    crawling starts. The script exits with a usage message when it is missing.

    For each of five rounds the next pending keyword is fetched with
    ``get_new_keyword``, recorded in ``progress_list2``, and every URL
    returned by ``get_not_cralwer_url`` is visited. A shop that is scraped
    successfully is stored via ``data_select_insert`` into
    ``SHOP_LIST_TABLE``; a shop whose opening-hours status is ``'error'``,
    whose unique id is empty, or whose scraping raises is stored in
    ``error_list2`` instead. The browser session opened for a keyword is quit
    once that keyword's URLs have been processed.
    """
    # Fail early with a clear message; the port is needed both for the docker
    # container name and for the WebDriver endpoint.
    if len(sys.argv) < 2:
        sys.exit('usage: {} <selenium_port>'.format(sys.argv[0]))
    port = int(sys.argv[1])

    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
    # NOTE(review): credentials are hard-coded in this connection string;
    # consider loading them from configuration instead.
    db2 = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table2 = db2['progress_list2']

    error_table_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']

    print('restart docker p{}'.format(port))
    os.system('sudo docker container restart p'+str(port))
    # Presumably gives the restarted container time to accept connections.
    time.sleep(8)

    for _ in range(5):
        keyword = get_new_keyword(db2)
        # Record the keyword before crawling so the next lookup skips it.
        table2.insert({'kw':keyword,'num':0})
        url_pd = get_not_cralwer_url(keyword)
        print('drvier start {}...'.format(keyword))
        driver = brower_start(port)

        try:
            for key, row in url_pd.iterrows():
                try:
                    name = row['name']
                    item_url = row['item_url']
                    print(key, name, ': ' ,item_url)

                    print('start...')
                    driver.get(item_url)
                    page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu']", 3)
                    lat, lon = find_lon_lat(driver)
                    unique_id = get_unique_id(driver)
                    time_status = time_click(driver)
                    if time_status == 'error' or len(unique_id) == 0:
                        data_select_insert(db, 'error_list2', error_table_col, row)
                        continue
                    time.sleep(0.5)
                    shop_soup = BeautifulSoup(driver.page_source, 'html.parser')

                    output = {
                        'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text),
                        'lon': lon,
                        'lat': lat,
                        'unique_id': unique_id.replace('?share','')
                    }
                    print(output['name'], lon, lat, unique_id)

                    print('get_shop_info')
                    output = get_shop_info(driver, output, shop_soup)

                    print('get_intro_info')
                    if len(shop_soup.select("div[aria-label='{}簡介']".format(output['name']))) != 0:
                        output = get_intro_info(driver, output)
                    else:
                        # No intro panel on the page: store empty lists.
                        for intro_key in intro_list:
                            output[intro_list[intro_key][0]] = '[]'

                    print('get_time_list')
                    if time_status == '正常':
                        output = get_time_list(shop_soup, output)
                    else:
                        output['open_now'] = False
                        output['periods'] = ''
                        output['weekday_text'] = ''

                    print('user_ratings_total')
                    if output['user_ratings_total'] == '':
                        output['reviews'] = ''
                    else:
                        output = get_reviews(driver, output)

                    print('find_big_photo')
                    output = find_big_photo(output, driver)

                    output_name = output['name'].replace('(','').replace(')', '')
                    query_name = '{}+{}'.format(output_name, output['addr'])
                    query_name = query_name.replace(' ','')
                    output['item_url'] = item_url
                    output['keyword'] = keyword
                    output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
                    data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)

                except Exception as e:
                    # Best effort per shop: log, record the failure, move on.
                    print(e)
                    data_select_insert(db, 'error_list2', error_table_col, row)
                    time.sleep(1)
        finally:
            # End this keyword's browser session so sessions do not pile up
            # on the hub across rounds.
            try:
                driver.quit()
            except Exception as e:
                # The session may already be gone; keep going with the next keyword.
                print(e)
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
|
|
|
main()
|