# -*- coding: utf-8 -*-
"""Google Maps POI crawler.

Searches Google Maps for a keyword around a grid of lat/lon points read
from the ``lat_lon_loc`` table, scrapes the result-list place URLs and
writes them into ``shop_item_list``, tracking resume position per
keyword in ``progress_list``.

Usage: python <script> [keyword] [selenium_port]
"""
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *
import dataset
import sys
from datetime import datetime
import pandas as pd
import time
import traceback
import json
import re
import os
import selenium


def brower_start(port):
    """Connect to a remote (dockerized) Selenium Chrome node on *port*.

    NOTE(review): the name keeps the original 'brower' typo so existing
    callers/importers keep working.
    """
    options = webdriver.ChromeOptions()
    # Local fallback (works without docker):
    #     browser = webdriver.Chrome(options=options)
    # Once that works, use the dockerized remote driver below.
    browser = webdriver.Remote(
        command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
        desired_capabilities=options.to_capabilities()
    )
    return browser


def get_url_list(driver):
    """Scroll the Google Maps result pane and collect place links.

    Returns a list of ``[href, aria-label]`` pairs for every anchor whose
    href contains ``'maps/place'``, or the sentinel string ``"EMPTY"``
    when the result pane never becomes clickable (timeout) or cannot be
    located at all.
    """
    wait = WebDriverWait(driver, 30)
    try:
        wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="ppdPk-Ej1Yeb-LgbsSe-tJiF1e"]')))
    except selenium.common.exceptions.TimeoutException:
        traceback.print_exc()
        return "EMPTY"

    # NOTE(review): these obfuscated class names are tied to a specific
    # Google Maps frontend build and will break when Google redeploys.
    elmts = driver.find_elements_by_xpath(
        "//div[@class='siAUzd-neVct section-scrollbox cYB2Ge-oHo7ed "
        "cYB2Ge-ti6hGc siAUzd-neVct-Q3DXx-BvBYQ']")
    print(elmts)
    if not elmts:
        # Fix: the original indexed elmts[0] unconditionally and crashed
        # with IndexError when no scrollbox matched; treat it like the
        # timeout case instead.
        return "EMPTY"
    elmt = elmts[1] if len(elmts) > 1 else elmts[0]

    # Page down a few times so lazily loaded results get rendered.
    for _ in range(8):
        try:
            elmt.send_keys(Keys.PAGE_DOWN)
        except Exception:  # narrowed from a bare except
            traceback.print_exc()
        time.sleep(0.5)

    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for anchor in url_soup.find_all('a'):
        try:
            if anchor['href'].find('maps/place') != -1:
                url_list += [[anchor['href'], anchor['aria-label']]]
        except KeyError:
            # Anchor lacking an href or aria-label attribute - skip it.
            pass
    return url_list


def keyin_keyword(driver, keyword):
    """Type *keyword* into the Maps search box and press Enter."""
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button) \
        .send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)  # give the results page time to load


def main():
    """Entry point: crawl every pending lat/lon location for the keyword."""
    # SECURITY(review): credentials are hard-coded in the DSN; move them
    # to environment variables or a config file.
    db = dataset.connect(
        'mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table = db['shop_item_list']

    keyword = '虱目魚'
    if len(sys.argv) > 1:
        keyword = sys.argv[1]
    port = 4444
    if len(sys.argv) > 2:
        port = int(sys.argv[2])
        # Restart the matching dockerized selenium node before connecting.
        os.system('docker container restart p' + str(port))
        time.sleep(8)

    print('driver start...')
    driver = brower_start(port)

    # Resume from the last recorded progress for this keyword.
    # Fix: bound parameters instead of string-concatenated SQL
    # (the keyword comes from argv - injection risk).
    num = 0
    cursor = db.query('select num from progress_list where kw = :kw',
                      kw=keyword)
    for c in cursor:
        num = c['num']
        break

    table2 = db['progress_list']
    cursor = db.query('select * from lat_lon_loc where num >= :num', num=num)
    lst = [{'num': c['num'], 'loc': c['loc'],
            'lat': c['lat'], 'lon': c['lon']} for c in cursor]

    for r in lst:
        latitude = r['lat']   # latitude
        longitude = r['lon']  # longitude (original comment said "精度",
                              # presumably a typo for 經度/longitude)
        table2.upsert({'kw': keyword, 'num': r['num']}, ['kw'])

        url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(
            latitude, longitude)
        driver.get(url)
        keyin_keyword(driver, keyword)

        failcnt = 0
        for page in range(4):
            print(r['loc'], latitude, longitude, page)
            url_list = get_url_list(driver)
            if url_list == 'EMPTY':
                failcnt += 1
                if failcnt >= 2:
                    break  # two empty result pages: give up on this point
                continue
            print(url_list)

            for item in url_list:
                try:
                    table.insert({
                        'name': item[1],
                        'lon': longitude,
                        'lat': latitude,
                        'keyword': keyword,
                        'item_url': item[0],
                        'crawler_date':
                            datetime.today().strftime("%Y/%m/%d %H:%M"),
                    })
                except Exception:
                    # Best-effort insert; duplicates are expected and skipped.
                    print('dup entry')

            if page < 2:
                # Click the "next page" arrow for the first two pages.
                element = driver.find_element_by_id(
                    'ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                driver.implicitly_wait(30)
                ActionChains(driver).move_to_element(element) \
                    .click(element).perform()


if __name__ == '__main__':
    main()