@@ -18,26 +18,47 @@ import time
 import json
 import re

-def brower_start():
+def brower_start(port):
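+    # Connect to a remote Selenium WebDriver server (e.g. a standalone server or Grid) on the given localhost port.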
     options = webdriver.ChromeOptions()
     browser = webdriver.Remote(
-        command_executor='http://192.53.174.202:4444/wd/hub',
+        #command_executor='http://192.53.174.202:4444/wd/hub',
+        command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
         desired_capabilities=options.to_capabilities()
     )
     return browser


-def get_url_list(driver):
-    for i in range(5, 43, 2):
+def page_down_(driver, xpath_css, time_):
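+    # Click into the results panel located by xpath_css, then send PAGE_DOWN `time_` times
+    # so the lazily-loaded result entries are rendered before the page source is parsed.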
+    elmts = driver.find_elements_by_xpath(xpath_css)
+    print(elmts)
+    if len(elmts)>1:
+        elmt=elmts[1]
+    else:
+        elmt=elmts[0]
+    actions = ActionChains(driver)
+    actions.move_to_element(elmt).click().perform()
+    for i in range(time_):
         try:
-            wait = WebDriverWait(driver, 60)
-            wait.until(
-                EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)))
-            )
-            driver.find_element(By.XPATH,'//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)
-            time.sleep(0.5)
+            actions = ActionChains(driver)
+            actions.send_keys(Keys.PAGE_DOWN).perform()
         except:
-            pass
+            traceback.print_exc()
+        time.sleep(0.5)
+
+
+def get_url_list(driver):
+    # for i in range(5, 43, 2):
+    #     try:
+    #         wait = WebDriverWait(driver, 60)
+    #         wait.until(
+    #             EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)))
+    #         )
+    #         driver.find_element(By.XPATH,'//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)
+    #         time.sleep(0.5)
+    #     except:
+    #         pass
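+    # Scroll the results panel so every entry on this page is present in the page source.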
+    page_down_(driver, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]', 8)
+
     url_soup = BeautifulSoup(driver.page_source, 'html.parser')
     url_list = []
     for i in url_soup.find_all('a'):
@@ -46,7 +67,7 @@ def get_url_list(driver):
             url_list += [[i['href'], i['aria-label']]]
         except:
             pass
-
+    print(len(url_list))
     return url_list


@@ -59,42 +80,47 @@ def keyin_keyword(driver, keyword):

 def main():
     data = pd.read_csv('lat_long_location.csv', index_col = 0)
-    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
+
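+    # The search keyword and Selenium port can be overridden from the command line
+    # (argv[1] = keyword, argv[2] = port); defaults are '麻辣火鍋' ("spicy hot pot") and 4444.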
+    keyword = '麻辣火鍋'
+    if len(sys.argv) >1:
+        keyword=sys.argv[1]
+    port=4444
+    if len(sys.argv) >2:
+        port=int(sys.argv[2])
+
     print('drvier start...')
-    driver = brower_start()
-
-# for keyword in ['碗粿','炒麵','肉粽']:
-    for keyword in ['碗粿']:
-
-        for k, row in data.iterrows():
-            try:
-                latitude = row['latitude'] #緯度
-                longitude = row['longitude'] #精度
-                url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
-                driver.get(url)
-                keyin_keyword(driver, keyword)
+    driver = brower_start(port)
+    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
+
+    for k, row in data.iterrows():
+        try:
+            latitude = row['latitude']   # latitude
+            longitude = row['longitude'] # longitude
+            url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
+            driver.get(url)
+            keyin_keyword(driver, keyword)
+
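+            # Crawl up to 4 pages of search results for this location.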
+            for page in range(4):
+                print(keyword, k, row['location'], latitude, longitude, page)
+                url_list = get_url_list(driver)

-                for page in range(4):
-                    print(keyword, k, row['location'], latitude, longitude, page)
-                    url_list = get_url_list(driver)
-
-                    shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
-                    for item in url_list:
-                        result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime("%Y/%m/%d %H:%M")]
-                        insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
-                            .format('shop_item_list', str(tuple(shop_item_list_col)).replace('\'',''), tuple(result))
-
-                        DA.mysql_insert_data(db, insert_sql)
-
-                    if page < 2 :
-                        element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
-                        driver.implicitly_wait(30)
-                        ActionChains(driver).move_to_element(element).click(element).perform()
-            except:
-                error = pd.DataFrame([row])
-                error.to_csv('error_shop_item_list.csv', mode='a', header = False)
-                driver.close()
-                driver = brower_start()
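+                # Write each result into the shop_item_list table; INSERT IGNORE skips rows that violate a unique key.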
+                shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
+                for item in url_list:
+                    result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime("%Y/%m/%d %H:%M")]
+                    insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
+                        .format('shop_item_list', str(tuple(shop_item_list_col)).replace('\'',''), tuple(result))
+
+                    DA.mysql_insert_data(db, insert_sql)
+
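+                # On the first two pages, click the element assumed to be the "next page" control.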
+                if page < 2 :
+                    element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
+                    driver.implicitly_wait(30)
+                    ActionChains(driver).move_to_element(element).click(element).perform()
+        except:
+            error = pd.DataFrame([row])
+            error.to_csv('error_shop_item_list.csv', mode='a', header = False)
+            #driver.close()
+            #driver = brower_start()


 if __name__ == '__main__':