|
@@ -75,9 +75,9 @@ def open_time(driver):
|
|
|
|
|
|
|
|
|
def get_shop_info(driver, output, shop_soup):
|
|
|
- current_url_split = driver.current_url.split('@')[1].split(',')
|
|
|
- output['lon'] = current_url_split[1]
|
|
|
- output['lat'] = current_url_split[0]
|
|
|
+ # current_url_split = driver.current_url.split('@')[1].split(',')
|
|
|
+ # output['lon'] = current_url_split[1]
|
|
|
+ # output['lat'] = current_url_split[0]
|
|
|
|
|
|
location = shop_soup.find('button',{'data-item-id':'oloc'})['aria-label'].split(' ')
|
|
|
output['city'] = location[-1]
|
|
@@ -420,16 +420,30 @@ def time_click(driver):
|
|
|
return ''
|
|
|
|
|
|
|
|
|
def get_new_keyword(db):
    """Return the first keyword that has no entry in the progress table yet.

    Keywords are read from ``shop_item_list`` in ascending order, and every
    keyword already present in ``progress_list2.kw`` is skipped.

    Args:
        db: Object exposing ``query(sql)`` that yields mapping-like rows
            (the caller passes a ``dataset`` connection).

    Returns:
        The first keyword, in sorted order, not yet recorded as processed.

    Raises:
        IndexError: If there is no unprocessed keyword left (including the
            case where ``shop_item_list`` has no keywords at all).
    """
    keywords = [
        row['keyword']
        for row in db.query(
            'select distinct(keyword) from shop_item_list order by keyword')
    ]
    # A set gives O(1) membership checks; an empty progress table simply
    # yields an empty set, so no special-casing is needed.
    finished = {
        row['kw'] for row in db.query('select distinct(kw) from progress_list2')
    }

    for keyword in keywords:
        if keyword not in finished:
            return keyword

    # Same exception type the positional lookup on an empty frame used to
    # raise, but with a message that explains what actually happened.
    raise IndexError('no unprocessed keyword left in shop_item_list')
|
|
|
+
|
|
|
+
|
|
|
def get_not_cralwer_url(keyword):
|
|
|
db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
|
|
|
- table = db['shop_item_list']
|
|
|
+ table = db['shop_item_list3']
|
|
|
url_list = list(table.find(keyword=keyword))
|
|
|
- shop_item = [i['item_url'] for i in db.query('SELECT item_url FROM shop_list where keyword="{}"'.format(keyword))]
|
|
|
- error_item = [i['item_url'] for i in db.query('SELECT item_url FROM error_list where keyword="{}"'.format(keyword))]
|
|
|
+ shop_item = [i['item_url'] for i in db.query('SELECT item_url FROM shop_list2 where keyword="{}"'.format(keyword))]
|
|
|
+ error_item = [i['item_url'] for i in db.query('SELECT item_url FROM error_list2 where keyword="{}"'.format(keyword))]
|
|
|
|
|
|
url_pd = pd.DataFrame(url_list, columns=url_list[0].keys())
|
|
|
- url_pd['item_url_length'] = url_pd.item_url.apply(lambda x: len(x))
|
|
|
- url_pd = url_pd[(url_pd['item_url_length']!=1000) & (url_pd['item_url_length']!=600)]
|
|
|
+ # url_pd['item_url_length'] = url_pd.item_url.apply(lambda x: len(x))
|
|
|
+ # url_pd = url_pd[(url_pd['item_url_length']!=1000) & (url_pd['item_url_length']!=600)]
|
|
|
url_pd = url_pd[~url_pd['item_url'].isin(shop_item)]
|
|
|
url_pd = url_pd[~url_pd['item_url'].isin(error_item)]
|
|
|
|
|
@@ -464,6 +478,44 @@ def serive_create_linux(profilepath):
|
|
|
return driver
|
|
|
|
|
|
|
|
|
def find_lon_lat(driver=None):
    """Read a coordinate pair from the map's right-click context menu.

    Right-clicks the map canvas at the centre of the area that is not covered
    by the left-hand pane, then parses the text of the first context-menu
    entry as ``"<lat>,<lon>"``.

    Args:
        driver: Selenium WebDriver to operate on. When omitted, the
            module-level ``driver`` is used, which keeps existing
            zero-argument calls working.

    Returns:
        A ``(lat, lon)`` tuple of floats.

    Raises:
        NameError: If no driver is passed and no module-level ``driver``
            exists.
        ValueError: If the first menu entry is not two comma-separated
            numbers.
    """
    if driver is None:
        try:
            driver = globals()['driver']
        except KeyError:
            raise NameError("name 'driver' is not defined") from None

    canvas = driver.find_element_by_css_selector("#scene > div.widget-scene > canvas")
    canvas_size = canvas.size
    pane_width = driver.find_element_by_css_selector(
        "#pane > div.Yr7JMd-pane").size['width']

    # Aim at the middle of the part of the canvas that lies to the right of
    # the side pane.
    # NOTE(review): assumes the offset is measured from the canvas' top-left
    # corner (Selenium < 4.3 behaviour) - confirm against the installed version.
    x = (canvas_size['width'] - pane_width) / 2 + pane_width
    y = canvas_size['height'] / 2

    action = webdriver.common.action_chains.ActionChains(driver)
    action.move_to_element_with_offset(canvas, x, y)
    action.context_click()
    action.perform()

    # Give the context menu a moment to render before querying it.
    time.sleep(0.5)

    menu_entry = driver.find_element_by_css_selector(
        '#action-menu > ul > li:nth-child(1)')
    parts = menu_entry.text.split(',')
    if len(parts) != 2:
        raise ValueError(
            'unexpected coordinate text in context menu: {!r}'.format(menu_entry.text))
    lat, lon = parts
    return float(lat), float(lon)
|
|
|
+
|
|
|
+
|
|
|
def get_unique_id(driver=None):
    """Return the last path segment of the URL shown in the share dialog.

    Clicks the button whose ``data-value`` is ``'分享'``, reads the ``value``
    attribute of the first ``<input>`` element found, and then clicks the
    button whose ``aria-label`` is ``'關閉'``.

    Args:
        driver: Selenium WebDriver to operate on. When omitted, the
            module-level ``driver`` is used, which keeps existing
            zero-argument calls working.

    Returns:
        The text after the final ``'/'`` of the share URL.

    Raises:
        NameError: If no driver is passed and no module-level ``driver``
            exists.
    """
    if driver is None:
        try:
            driver = globals()['driver']
        except KeyError:
            raise NameError("name 'driver' is not defined") from None

    # implicitly_wait only affects lookups made *after* it is called, so it
    # has to be set before the first find_element for that lookup to wait.
    driver.implicitly_wait(5)

    share_button = driver.find_element(By.CSS_SELECTOR, "button[data-value='分享']")
    ActionChains(driver).move_to_element(share_button).click(share_button).perform()

    # NOTE(review): "input" matches the first <input> in the document;
    # presumably that is the share dialog's URL field - confirm.
    url_input = driver.find_element(By.CSS_SELECTOR, "input")
    short_url = url_input.get_attribute('value')
    unique_id = short_url.split('/')[-1]

    close_button = driver.find_element(By.CSS_SELECTOR, "button[aria-label='關閉']")
    ActionChains(driver).move_to_element(close_button).click(close_button).perform()
    return unique_id
|
|
|
+
|
|
|
+
|
|
|
def page_down_(driver, xpath_css, time_):
|
|
|
elmts = driver.find_elements_by_xpath(xpath_css)
|
|
|
print(elmts)
|
|
@@ -484,89 +536,99 @@ def page_down_(driver, xpath_css, time_):
|
|
|
|
|
|
def main():
    """Crawl shop detail pages for up to five not-yet-processed keywords.

    For each keyword returned by ``get_new_keyword`` the pending item URLs are
    loaded with ``get_not_cralwer_url``, a browser is started, every URL is
    visited and scraped, and the result is stored through
    ``data_select_insert``. Rows that fail are written to ``error_list2``.
    After a keyword's rows are done, one row is inserted into
    ``progress_list2``.

    Command line:
        argv[1] (optional): port passed to ``brower_start``. When given, the
        docker container ``pp<port>`` is restarted first. Defaults to 4444.
    """
    # find_lon_lat() and get_unique_id() are called without arguments and look
    # up a module-level `driver`, so the browser handle must be a global
    # rather than a local of this function.
    global driver

    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
    # NOTE(review): credentials are hard-coded in this connection string;
    # consider loading them from configuration instead.
    db2 = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table2 = db2['progress_list2']

    # Default keeps the script runnable without arguments; previously `port`
    # was only bound when argv[1] was supplied.
    port = 4444
    if len(sys.argv) > 1:
        port = int(sys.argv[1])
        print('restart docker p{}'.format(port))
        os.system('sudo docker container restart pp' + str(port))
        # Wait for the restarted container to come back up.
        time.sleep(8)

    error_table_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']

    for _ in range(5):
        keyword = get_new_keyword(db2)
        url_pd = get_not_cralwer_url(keyword)
        print('drvier start {}...'.format(keyword))
        driver = brower_start(port)

        # Index of the last row visited for this keyword; stays None when
        # there was nothing to crawl.
        last_key = None
        try:
            for key, row in url_pd.iterrows():
                last_key = key
                try:
                    name = row['name']
                    item_url = row['item_url']
                    print(key, name, ': ', item_url)

                    print('start...')
                    driver.get(item_url)
                    page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu']", 3)
                    lat, lon = find_lon_lat()
                    unique_id = get_unique_id()
                    time_status = time_click(driver)
                    if time_status == 'error':
                        data_select_insert(db, 'error_list2', error_table_col, row)
                        continue
                    time.sleep(0.5)
                    shop_soup = BeautifulSoup(driver.page_source, 'html.parser')

                    output = {
                        'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text),
                        'lon': lon,
                        'lat': lat,
                        'unique_id': unique_id,
                    }
                    print(output['name'], lon, lat)

                    print('get_shop_info')
                    output = get_shop_info(driver, output, shop_soup)

                    print('get_intro_info')
                    if len(shop_soup.select("div[aria-label='{}簡介']".format(output['name']))) != 0:
                        output = get_intro_info(driver, output)
                    else:
                        # Use a distinct loop name so the row index `key`
                        # is not overwritten.
                        for intro_key in intro_list:
                            output[intro_list[intro_key][0]] = '[]'

                    print('get_time_list')
                    if time_status == '正常':
                        output = get_time_list(shop_soup, output)
                    else:
                        output['open_now'] = False
                        output['periods'] = ''
                        output['weekday_text'] = ''

                    print('user_ratings_total')
                    if output['user_ratings_total'] == '':
                        output['reviews'] = ''
                    else:
                        output = get_reviews(driver, output)

                    print('find_big_photo')
                    output = find_big_photo(output, driver)

                    output_name = output['name'].replace('(', '').replace(')', '')
                    query_name = '{}+{}'.format(output_name, output['addr'])
                    query_name = query_name.replace(' ', '')
                    output['item_url'] = item_url
                    output['keyword'] = keyword
                    output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
                    data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)

                except Exception as e:
                    # Best effort: record the failing row and move on to the
                    # next one instead of aborting the whole keyword.
                    print(e)
                    data_select_insert(db, 'error_list2', error_table_col, row)
                    time.sleep(1)
        finally:
            # Release the browser session before the next keyword starts a
            # new one.
            driver.quit()

        # NOTE(review): progress is recorded once per keyword, after all of
        # its rows were attempted - confirm this matches the intended
        # placement of the insert.
        table2.insert({'kw': keyword, 'num': last_key})
|
|
|
+
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
main()
|