# -*- coding: utf-8 -*-
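"""Google Maps POI crawler.

For every (lat, lon) grid point in the `lat_lon_loc` table, open Google
Maps, search for a keyword, scroll through up to four pages of results,
and insert each /maps/place link into the `shop_item_list` table.  The
last processed grid point per keyword is tracked in `progress_list` so an
interrupted run can resume where it stopped.
"""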
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from utility.parseutils import *
from utility.connect import *
import dataset
import selenium
import sys
from datetime import datetime
import time
import traceback
import os

def browser_start(port):
    """Connect to the remote Chrome served by the Selenium container on `port`."""
    options = webdriver.ChromeOptions()
    # browser = webdriver.Chrome(options=options)
    # Once the local browser above works, switch to the Docker remote below.
    browser = webdriver.Remote(
        command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
        desired_capabilities=options.to_capabilities()
    )
    return browser
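
# browser_start() assumes a Selenium standalone Chrome container listening on
# the given port (named p<port> to match the restart call in main()), e.g.:
#   docker run -d --name p4444 -p 4444:4444 selenium/standalone-chrome:3.141.59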

def get_url_list(driver):
    """Scroll the results pane and collect [item_url, name] pairs for every
    /maps/place link; return the string "EMPTY" if the pane never loads."""
    wait = WebDriverWait(driver, 30)
    try:
        wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="ppdPk-Ej1Yeb-LgbsSe-tJiF1e"]')))
    except selenium.common.exceptions.TimeoutException:
        traceback.print_exc()
        return "EMPTY"

    # The class string below is a generated Maps class and breaks on new
    # builds; the contains()-based selector is an earlier fallback.
    # elmts = driver.find_elements_by_xpath("//div[contains(@class,'siAUzd-neVct section-scrollbox') and not(contains(@role,'region'))]")
    elmts = driver.find_elements_by_xpath(
        "//div[@class='siAUzd-neVct section-scrollbox cYB2Ge-oHo7ed cYB2Ge-ti6hGc siAUzd-neVct-Q3DXx-BvBYQ']")
    print(elmts)
    # The second scrollbox, when present, is the result list.
    if len(elmts) > 1:
        elmt = elmts[1]
    else:
        elmt = elmts[0]

    # Page down repeatedly so lazy-loaded results are rendered.
    for i in range(8):
        try:
            elmt.send_keys(Keys.PAGE_DOWN)
        except Exception:
            traceback.print_exc()
        time.sleep(0.5)

    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for a in url_soup.find_all('a'):
        try:
            if a['href'].find('maps/place') != -1:
                url_list.append([a['href'], a['aria-label']])
        except KeyError:
            # Skip anchors without an href or aria-label.
            pass
    return url_list

def keyin_keyword(driver, keyword):
    """Type `keyword` into the Maps search box and press Enter."""
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)

def main():
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table = db['shop_item_list']

    keyword = '虱目魚'
    if len(sys.argv) > 1:
        keyword = sys.argv[1]
    port = 4444
    if len(sys.argv) > 2:
        port = int(sys.argv[2])

    # Restart this crawler's Selenium container so every run starts from a
    # clean browser session.
    os.system('docker container restart p' + str(port))
    time.sleep(8)

    print('driver start...')
    driver = browser_start(port)

    # Resume from the last grid point recorded for this keyword.
    num = 0
    cursor = db.query('select num from progress_list where kw = "' + keyword + '"')
    for c in cursor:
        num = c['num']
        break
    table2 = db['progress_list']

    cursor = db.query('select * from lat_lon_loc where num >= ' + str(num))
    lst = []
    for c in cursor:
        lst.append({'num': c['num'], 'loc': c['loc'], 'lat': c['lat'], 'lon': c['lon']})

    for r in lst:
        latitude = r['lat']    # latitude
        longitude = r['lon']   # longitude
        table2.upsert({'kw': keyword, 'num': r['num']}, ['kw'])
        # Open Maps centred on the grid point at zoom level 15.
        url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
        driver.get(url)
        keyin_keyword(driver, keyword)
        failcnt = 0
        for page in range(4):
            print(r['loc'], latitude, longitude, page)
            url_list = get_url_list(driver)
            if url_list == 'EMPTY':
                failcnt += 1
                if failcnt >= 2:
                    break
                continue
            print(url_list)
            # shop_item_list columns: name, lon, lat, keyword, item_url, crawler_date
            for item in url_list:
                try:
                    table.insert({'name': item[1],
                                  'lon': longitude,
                                  'lat': latitude,
                                  'keyword': keyword,
                                  'item_url': item[0],
                                  'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M")})
                except Exception:
                    # The table has a unique key, so re-crawled rows are skipped.
                    print('dup entry')

            # Click the "next page" button after the first two pages.
            if page < 2:
                element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                driver.implicitly_wait(30)
                ActionChains(driver).move_to_element(element).click(element).perform()
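
# Usage sketch (script name assumed; pass the keyword and Selenium port):
#   python poi_crawler.py 虱目魚 4445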

if __name__ == '__main__':
    main()