# -*- coding: utf-8 -*-
"""Google Maps POI crawler.

Searches Google Maps for a keyword around a grid of lat/lon points stored in
the ``lat_lon_loc`` table, scrolls each result page, and stores every
``maps/place`` link (with its aria-label name) into ``shop_item_list``.
Progress per keyword is checkpointed in ``progress_list``.
"""
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *
import dataset
import sys
from datetime import datetime
import pandas as pd
import time
import traceback
import json
import re
import os
import selenium


def brower_start(port):
    """Connect to a dockerised remote Selenium Chrome hub on 127.0.0.1:*port*.

    Returns a ``webdriver.Remote`` instance.
    """
    options = webdriver.ChromeOptions()
    # browser = webdriver.Chrome(options=options)  # local fallback for debugging
    # Talk to the per-port docker container (restarted by main()) instead of a
    # locally-installed chromedriver.
    browser = webdriver.Remote(
        command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
        desired_capabilities=options.to_capabilities()
    )
    return browser


def get_url_list(driver):
    """Scroll the current result panel and collect place links.

    Waits for the "next page" button to become clickable; if it never does,
    returns the sentinel string ``"EMPTY"`` (callers test for it).  Otherwise
    scrolls the results scrollbox with PAGE_DOWN eight times, then parses the
    page source and returns a list of ``[href, aria_label]`` pairs for every
    anchor whose href contains ``maps/place``.
    """
    wait = WebDriverWait(driver, 30)
    try:
        wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="ppdPk-Ej1Yeb-LgbsSe-tJiF1e"]')))
    except selenium.common.exceptions.TimeoutException:
        traceback.print_exc()
        return "EMPTY"

    # elmts=driver.find_elements_by_xpath("//div[contains(@class,'siAUzd-neVct section-scrollbox') and not( contains(@role,'region') )]")
    elmts = driver.find_elements_by_xpath(
        "//div[@class='siAUzd-neVct section-scrollbox cYB2Ge-oHo7ed cYB2Ge-ti6hGc siAUzd-neVct-Q3DXx-BvBYQ']")
    print(elmts)
    # Google sometimes renders two scrollboxes; the second one is the result list.
    if len(elmts) > 1:
        elmt = elmts[1]
    else:
        elmt = elmts[0]

    # webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
    for _ in range(8):
        try:
            elmt.send_keys(Keys.PAGE_DOWN)
        except Exception:
            # Element may go stale while the page re-renders; log and keep
            # scrolling on the next iteration rather than aborting the page.
            traceback.print_exc()
        time.sleep(0.5)

    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for anchor in url_soup.find_all('a'):
        try:
            if anchor['href'].find('maps/place') != -1:
                url_list += [[anchor['href'], anchor['aria-label']]]
        except KeyError:
            # Anchor without href or aria-label — not a place link, skip it.
            pass

    return url_list


def keyin_keyword(driver, keyword):
    """Type *keyword* into the Maps search box and submit with RETURN."""
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)


def main():
    """Crawl Maps results for a keyword over all stored lat/lon locations.

    CLI: ``script.py [keyword] [selenium_port]`` (defaults: '虱目魚', 4444).
    Restarts the docker container ``p<port>``, resumes from the checkpoint in
    ``progress_list``, and upserts progress after each location.
    """
    # NOTE(review): credentials are hardcoded — move to env vars / config.
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table = db['shop_item_list']

    keyword = '虱目魚'
    if len(sys.argv) > 1:
        keyword = sys.argv[1]
    port = 4444
    if len(sys.argv) > 2:
        port = int(sys.argv[2])

    # Restart the matching Selenium container and give it time to come up.
    os.system('docker container restart p' + str(port))
    time.sleep(8)
    print('driver start...')
    driver = brower_start(port)

    # Resume point: last processed location number for this keyword.
    # Bound parameter (:kw) instead of string concatenation — avoids SQL injection.
    num = 0
    cursor = db.query('select num from progress_list where kw = :kw', kw=keyword)
    for c in cursor:
        num = c['num']
        break

    table2 = db['progress_list']
    cursor = db.query('select * from lat_lon_loc where num >= :num', num=num)
    # cursor=db.query('select * from lat_lon_loc')
    lst = []
    for c in cursor:
        lst.append({'num': c['num'], 'loc': c['loc'], 'lat': c['lat'], 'lon': c['lon']})

    for r in lst:
        latitude = r['lat']   # latitude (緯度)
        longitude = r['lon']  # longitude (經度)
        # Checkpoint progress so a crash resumes from this location.
        table2.upsert({'kw': keyword, 'num': r['num']}, ['kw'])

        url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
        driver.get(url)
        keyin_keyword(driver, keyword)

        failcnt = 0
        for page in range(4):
            print(r['loc'], latitude, longitude, page)
            url_list = get_url_list(driver)
            if url_list == 'EMPTY':
                # Two consecutive empty pages: give up on this location.
                failcnt += 1
                if failcnt >= 2:
                    break
                continue
            print(url_list)

            shop_item_list_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
            for item in url_list:
                try:
                    table.insert({'name': item[1],
                                  'lon': longitude,
                                  'lat': latitude,
                                  'keyword': keyword,
                                  'item_url': item[0],
                                  'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M")})
                except Exception:
                    # Unique-key violation on an already-crawled item is
                    # expected; best-effort insert, so just note and continue.
                    print('dup entry')
            # result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime("%Y/%m/%d %H:%M")]
            # print(result)

            # Only three "next page" clicks are possible for 4 pages.
            if page < 2:
                element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                driver.implicitly_wait(30)
                ActionChains(driver).move_to_element(element).click(element).perform()


if __name__ == '__main__':
    main()