# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *

import dataset
import sys
from datetime import datetime
import pandas as pd
import time
import json
import re


def browser_start(port):
    """Connect to a remote Selenium Chrome session on the given port."""
    options = webdriver.ChromeOptions()
#    browser = webdriver.Chrome(options=options)
#    Once the local line above works, switch to the Docker remote below.
    browser = webdriver.Remote(
        command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
        desired_capabilities=options.to_capabilities()
    )
    return browser


def get_url_list(driver):
    """Scroll through the result pane, then collect all place links on the page."""
    for i in range(5, 43, 2):
        try:
            wait = WebDriverWait(driver, 60)
            wait.until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)))
            )
            driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)
            time.sleep(1)
        except Exception:
            pass

    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for i in url_soup.find_all('a'):
        try:
            if i['href'].find('maps/place') != -1:
                url_list += [[i['href'], i['aria-label']]]
        except Exception:
            pass

    # Return after the loop so every collected link is kept, not just the first.
    return url_list


def keyin_keyword(driver, keyword):
    """Type the keyword into the Maps search box and submit it."""
    button = driver.find_element(By.ID, "searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)


def main():
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    keyword = '青年旅館'  # default search keyword ("hostel")
    if len(sys.argv) > 1:
        keyword = sys.argv[1]
    port = 4444
    if len(sys.argv) > 2:
        port = int(sys.argv[2])

    print('driver start...')
    driver = browser_start(port)

    # Resume from the last position recorded for this keyword.
    num = 0
    cursor = db.query('select num from progress_list where kw = "' + keyword + '"')
    for c in cursor:
        num = c['num']
        break
    table2 = db['progress_list']

    cursor = db.query('select * from lat_lon_loc where num >= ' + str(num))
#    cursor = db.query('select * from lat_lon_loc')
    lst = []
    for c in cursor:
        lst.append({'num': c['num'], 'loc': c['loc'], 'lat': c['lat'], 'lon': c['lon']})

    for r in lst:
        latitude = r['lat']    # latitude
        longitude = r['lon']   # longitude
        table2.upsert({'kw': keyword, 'num': r['num']}, ['kw'])
        url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
        driver.get(url)
        keyin_keyword(driver, keyword)

        for page in range(4):
            print(r['loc'], latitude, longitude, page)
            url_list = get_url_list(driver)
            print(url_list)
            shop_item_list_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
            for item in url_list:
                result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime("%Y/%m/%d %H:%M")]
                print(result)
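                # Hedged sketch: the raw-SQL insert below is commented out in the
                # original and relies on the project's DA.mysql_insert_data helper,
                # whose signature is not shown in this file. An assumed equivalent
                # using the already-imported dataset API would be the two lines that
                # follow; insert_ignore() exists in dataset, but the 'item_url'
                # dedup key is a guess, so both lines stay commented like the
                # original insert.
                # row = dict(zip(shop_item_list_col, result))
                # db['shop_item_list'].insert_ignore(row, keys=['item_url'])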
               insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\#                                .format('shop_item_list', str(tuple(shop_item_list_col)).replace('\'',''), tuple(result))#                DA.mysql_insert_data(db, insert_sql)                        if page < 2 :                element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')                driver.implicitly_wait(30)                ActionChains(driver).move_to_element(element).click(element).perform() if __name__ == '__main__':    main()