# -*- coding: utf-8 -*-
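"""Google Maps POI crawler.

For every (lat, lon) grid point in the `lat_lon_loc` table, open Google
Maps, search for a keyword, scroll through up to four pages of results,
and insert each /maps/place link into the `shop_item_list` table.  The
last processed grid point per keyword is tracked in `progress_list` so an
interrupted run can resume where it stopped.
"""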
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from utility.parseutils import *
from utility.connect import *
import dataset
import selenium
import sys
from datetime import datetime
import time
import traceback
import os

def browser_start(port):
    """Connect to the remote Chrome served by the Selenium container on `port`."""
    options = webdriver.ChromeOptions()
    # browser = webdriver.Chrome(options=options)
    # Once the local browser above works, switch to the Docker remote below.
    browser = webdriver.Remote(
        command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
        desired_capabilities=options.to_capabilities()
    )
    return browser
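
# browser_start() assumes a Selenium standalone Chrome container listening on
# the given port (named p<port> to match the restart call in main()), e.g.:
#   docker run -d --name p4444 -p 4444:4444 selenium/standalone-chrome:3.141.59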

def get_url_list(driver):
    """Scroll the results pane and collect [item_url, name] pairs for every
    /maps/place link; return the string "EMPTY" if the pane never loads."""
    wait = WebDriverWait(driver, 30)
    try:
        wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="ppdPk-Ej1Yeb-LgbsSe-tJiF1e"]')))
    except selenium.common.exceptions.TimeoutException:
        traceback.print_exc()
        return "EMPTY"

    # The class string below is a generated Maps class and breaks on new
    # builds; the contains()-based selector is an earlier fallback.
    # elmts = driver.find_elements_by_xpath("//div[contains(@class,'siAUzd-neVct section-scrollbox') and not(contains(@role,'region'))]")
    elmts = driver.find_elements_by_xpath(
        "//div[@class='siAUzd-neVct section-scrollbox cYB2Ge-oHo7ed cYB2Ge-ti6hGc siAUzd-neVct-Q3DXx-BvBYQ']")
    print(elmts)
    # The second scrollbox, when present, is the result list.
    if len(elmts) > 1:
        elmt = elmts[1]
    else:
        elmt = elmts[0]

    # Page down repeatedly so lazy-loaded results are rendered.
    for i in range(8):
        try:
            elmt.send_keys(Keys.PAGE_DOWN)
        except Exception:
            traceback.print_exc()
        time.sleep(0.5)

    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for a in url_soup.find_all('a'):
        try:
            if a['href'].find('maps/place') != -1:
                url_list.append([a['href'], a['aria-label']])
        except KeyError:
            # Skip anchors without an href or aria-label.
            pass
    return url_list

def keyin_keyword(driver, keyword):
    """Type `keyword` into the Maps search box and press Enter."""
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)

def main():
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table = db['shop_item_list']

    keyword = '虱目魚'
    if len(sys.argv) > 1:
        keyword = sys.argv[1]
    port = 4444
    if len(sys.argv) > 2:
        port = int(sys.argv[2])

    # Restart this crawler's Selenium container so every run starts from a
    # clean browser session.
    os.system('docker container restart p' + str(port))
    time.sleep(8)

    print('driver start...')
    driver = browser_start(port)

    # Resume from the last grid point recorded for this keyword.
    num = 0
    cursor = db.query('select num from progress_list where kw = "' + keyword + '"')
    for c in cursor:
        num = c['num']
        break
    table2 = db['progress_list']

    cursor = db.query('select * from lat_lon_loc where num >= ' + str(num))
    lst = []
    for c in cursor:
        lst.append({'num': c['num'], 'loc': c['loc'], 'lat': c['lat'], 'lon': c['lon']})

    for r in lst:
        latitude = r['lat']    # latitude
        longitude = r['lon']   # longitude
        table2.upsert({'kw': keyword, 'num': r['num']}, ['kw'])
        # Open Maps centred on the grid point at zoom level 15.
        url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
        driver.get(url)
        keyin_keyword(driver, keyword)
        failcnt = 0
        for page in range(4):
            print(r['loc'], latitude, longitude, page)
            url_list = get_url_list(driver)
            if url_list == 'EMPTY':
                failcnt += 1
                if failcnt >= 2:
                    break
                continue
            print(url_list)
            # shop_item_list columns: name, lon, lat, keyword, item_url, crawler_date
            for item in url_list:
                try:
                    table.insert({'name': item[1],
                                  'lon': longitude,
                                  'lat': latitude,
                                  'keyword': keyword,
                                  'item_url': item[0],
                                  'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M")})
                except Exception:
                    # The table has a unique key, so re-crawled rows are skipped.
                    print('dup entry')

            # Click the "next page" button after the first two pages.
            if page < 2:
                element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                driver.implicitly_wait(30)
                ActionChains(driver).move_to_element(element).click(element).perform()
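
# Usage sketch (script name assumed; pass the keyword and Selenium port):
#   python poi_crawler.py 虱目魚 4445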

if __name__ == '__main__':
    main()