# -*- coding: utf-8 -*-
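# Crawl Google Maps search results (shop name + place URL) for each keyword
# around a grid of lat/long points from lat_long_location.csv, and store the
# results in a MySQL table via a remote Selenium node.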
import json
import re
import time
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

from utility import database_access as DA
from utility.connect import *
from utility.parseutils import *
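# The local ``utility`` package is not shown here; based on how it is used
# below, it is assumed to provide DA.mysql_connect(config, db_name),
# DA.mysql_insert_data(db, sql), and the MYSQL_CONFIG / DB_NAME constants
# exported by utility.connect.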

def browser_start():
    # Connect to a remote Selenium hub (Selenium 4 style: pass the Options
    # object directly instead of the deprecated desired_capabilities).
    options = webdriver.ChromeOptions()
    browser = webdriver.Remote(
        command_executor='http://192.53.174.202:4444/wd/hub',
        options=options
    )
    return browser

def get_url_list(driver):
    # Google Maps lazy-loads results as they scroll into view, so walk down
    # the result pane (odd-numbered child divs hold the entries) and press
    # the DOWN key on each link to force the remaining entries to render.
    for i in range(5, 43, 2):
        try:
            xpath = '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)
            wait = WebDriverWait(driver, 60)
            wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
            driver.find_element(By.XPATH, xpath).send_keys(Keys.DOWN)
            time.sleep(0.5)
        except Exception:
            pass

    # Collect every place link ('maps/place' href) together with its
    # aria-label, which carries the shop name.
    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for a in url_soup.find_all('a'):
        href = a.get('href', '')
        if 'maps/place' in href and a.has_attr('aria-label'):
            url_list.append([href, a['aria-label']])

    return url_list
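
# get_url_list returns [place_url, shop_name] pairs, e.g. (illustrative only):
#   [['https://www.google.com.tw/maps/place/...', '<shop name>'], ...]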

def keyin_keyword(driver, keyword):
    # Type the search keyword into the Maps search box and submit it.
    button = driver.find_element(By.ID, 'searchbox')
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)
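
# The shop_item_list schema is not part of this script; below is a minimal
# sketch of a table that would accept these INSERTs. The column types and the
# unique key (which is what makes INSERT IGNORE skip duplicates) are
# assumptions, not taken from the source:
#
#   CREATE TABLE IF NOT EXISTS shop_item_list (
#       name         VARCHAR(255),
#       lon          DOUBLE,
#       lat          DOUBLE,
#       keyword      VARCHAR(64),
#       item_url     VARCHAR(1024),
#       crawler_date VARCHAR(32),
#       UNIQUE KEY uq_shop (name, keyword)
#   );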

def main():
    data = pd.read_csv('lat_long_location.csv', index_col=0)
    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
    print('driver start...')
    driver = browser_start()

    # Column order must match the order of the values in `result` below.
    shop_item_list_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']

    # Full keyword list (bowl rice cake, fried noodles, rice dumplings):
    # for keyword in ['碗粿', '炒麵', '肉粽']:
    for keyword in ['碗粿']:
        for k, row in data.iterrows():
            try:
                latitude = row['latitude']    # latitude
                longitude = row['longitude']  # longitude
                url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
                driver.get(url)
                keyin_keyword(driver, keyword)

                for page in range(4):
                    print(keyword, k, row['location'], latitude, longitude, page)
                    url_list = get_url_list(driver)

                    for item in url_list:
                        result = [item[1], longitude, latitude, keyword, item[0],
                                  datetime.today().strftime("%Y/%m/%d %H:%M")]
                        # INSERT IGNORE skips rows that hit a unique key, so
                        # re-crawled shops are not inserted twice.
                        insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
                            .format('shop_item_list', str(tuple(shop_item_list_col)).replace('\'', ''), tuple(result))
                        DA.mysql_insert_data(db, insert_sql)

                    # Advance to the next result page; three clicks cover the
                    # four scraped pages.
                    if page < 3:
                        element = driver.find_element(By.ID, 'ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                        driver.implicitly_wait(30)
                        ActionChains(driver).move_to_element(element).click(element).perform()
            except Exception:
                # Log the failing row, then tear down and restart the session.
                error = pd.DataFrame([row])
                error.to_csv('error_shop_item_list.csv', mode='a', header=False)
                driver.quit()
                driver = browser_start()


if __name__ == '__main__':
    main()