@@ -0,0 +1,168 @@
+# -*- coding: utf-8 -*-
+from selenium import webdriver
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.common.by import By
+
+from bs4 import BeautifulSoup
+
+from utility import database_access as DA
+from utility.parseutils import *
+from utility.connect import *
+
+import dataset
+import sys
+from datetime import datetime
+import pandas as pd
+import time
+import traceback
+import json
+import re
+import os
+import selenium
+
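+# Google Maps POI crawler: for each (lat, lon) grid point in `lat_lon_loc`,
+# search the keyword on a Dockerized Selenium node and store every
+# "maps/place" result URL into `shop_item_list`, tracking progress per keyword.
+# Assumed usage: python <script>.py [keyword] [selenium_port] (defaults: '虱目魚', 4444).
+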
+def browser_start(port):
+    # Start a browser session on the remote (Dockerized) Selenium node at `port`.
+    options = webdriver.ChromeOptions()
+#    browser = webdriver.Chrome(options=options)
+    # Once the local Chrome line above works, switch to the Docker grid below.
+    # NOTE: `desired_capabilities` is the Selenium 3 API; Selenium 4 takes
+    # `options=options` in webdriver.Remote instead.
+    browser = webdriver.Remote(
+        command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
+        desired_capabilities=options.to_capabilities()
+    )
+    return browser
+
+
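+# Scroll the results panel so lazy-loaded entries render, then harvest every
+# place link (href containing "maps/place") plus its aria-label (the shop name).
+# Returns "EMPTY" if the pagination button never becomes clickable.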
+def get_url_list(driver):
+    wait = WebDriverWait(driver, 30)
+    try:
+        # Wait until the "next page" button is clickable before scraping.
+        wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="ppdPk-Ej1Yeb-LgbsSe-tJiF1e"]')))
+    except selenium.common.exceptions.TimeoutException:
+        traceback.print_exc()
+        return "EMPTY"
+
+#    elmts = driver.find_elements_by_xpath("//div[contains(@class,'siAUzd-neVct section-scrollbox') and not( contains(@role,'region') )]")
+    elmts = driver.find_elements_by_xpath("//div[@class='siAUzd-neVct section-scrollbox cYB2Ge-oHo7ed cYB2Ge-ti6hGc siAUzd-neVct-Q3DXx-BvBYQ']")
+    print(elmts)
+    # Some layouts render two scrollboxes; the second holds the results.
+    if len(elmts) > 1:
+        elmt = elmts[1]
+    else:
+        elmt = elmts[0]
+
+#    webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
+    # Page the scrollbox down so more results lazy-load.
+    for i in range(8):
+        try:
+            elmt.send_keys(Keys.PAGE_DOWN)
+        except Exception:
+            traceback.print_exc()
+        time.sleep(0.5)
+
+    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
+    url_list = []
+    for i in url_soup.find_all('a'):
+        try:
+            if i['href'].find('maps/place') != -1:
+                url_list += [[i['href'], i['aria-label']]]
+        except KeyError:
+            # Skip anchors without href/aria-label.
+            pass
+    return url_list
+
+
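+# Type the keyword into the Maps search box and submit it with ENTER.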
+def keyin_keyword(driver, keyword):
+    button = driver.find_element_by_id("searchbox")
+    driver.implicitly_wait(30)
+    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
+    time.sleep(3)
+
+
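+# Entry point: resolve CLI args, restart the Selenium container, resume from the
+# saved progress row, then crawl every remaining (lat, lon) grid point.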
+def main():
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
+    table = db['shop_item_list']
+
+    keyword = '虱目魚'  # default keyword: milkfish
+    if len(sys.argv) > 1:
+        keyword = sys.argv[1]
+    port = 4444
+    if len(sys.argv) > 2:
+        port = int(sys.argv[2])
+    # Restart the Selenium container (named "p<port>") for a clean session.
+    os.system('docker container restart p' + str(port))
+    time.sleep(8)
+
+    print('driver start...')
+    driver = browser_start(port)
+
+    # Resume: fetch the last processed grid-point number for this keyword.
+    num = 0
+    cursor = db.query('select num from progress_list where kw = "' + keyword + '"')
+    for c in cursor:
+        num = c['num']
+        break
+
+    table2 = db['progress_list']
+
+    cursor = db.query('select * from lat_lon_loc where num >= ' + str(num))
+#    cursor = db.query('select * from lat_lon_loc')
+    lst = []
+    for c in cursor:
+        lst.append({'num': c['num'], 'loc': c['loc'], 'lat': c['lat'], 'lon': c['lon']})
+
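+    # Crawl each grid point: open Maps at that coordinate, search the keyword,
+    # then walk up to four result pages, inserting every place link found.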
+    for r in lst:
+        latitude = r['lat']    # latitude
+        longitude = r['lon']   # longitude
+        # Record progress so a rerun can resume from this grid point.
+        table2.upsert({'kw': keyword, 'num': r['num']}, ['kw'])
+        url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
+        driver.get(url)
+
+        keyin_keyword(driver, keyword)
+        failcnt = 0
+        for page in range(4):
+            print(r['loc'], latitude, longitude, page)
+            url_list = get_url_list(driver)
+            if url_list == 'EMPTY':
+                failcnt += 1
+                if failcnt >= 2:
+                    break
+                continue
+            print(url_list)
+            shop_item_list_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
+            for item in url_list:
+                try:
+                    table.insert({'name': item[1], 'lon': longitude, 'lat': latitude, 'keyword': keyword, 'item_url': item[0], 'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M")})
+                except Exception:
+                    # A unique index on the table rejects duplicates; skip them.
+                    print('dup entry')
+#            result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime("%Y/%m/%d %H:%M")]
+#            print(result)
+
+            # Click the "next page" button (absent past the last page).
+            if page < 2:
+                element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
+                driver.implicitly_wait(30)
+                ActionChains(driver).move_to_element(element).click(element).perform()
+
+
+if __name__ == '__main__':
+    main()