# -*- coding: utf-8 -*-
"""Google Maps POI crawler: for each pending keyword, search around a set of
longitude/latitude anchor points and store the result links in MySQL."""
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import traceback
from bs4 import BeautifulSoup

from datetime import datetime
import pandas as pd
import dataset
import time
import sys, os

# NOTE: this script uses the Selenium 3.x API
# (find_element_by_*, desired_capabilities).


def browser_start(port):
    """Connect to a remote Selenium Chrome node listening on the given local port."""
    options = webdriver.ChromeOptions()
    browser = webdriver.Remote(
        # command_executor='http://192.53.174.202:4444/wd/hub',
        command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
        desired_capabilities=options.to_capabilities()
    )
    return browser


def page_down_(driver, xpath_css, time_):
    """Click into the result pane and press PAGE_DOWN `time_` times so that
    Google Maps lazy-loads more results."""
    elmts = driver.find_elements_by_xpath(xpath_css)
    if not elmts:
        return
    # When the selector matches twice, the second match is the scrollable pane.
    elmt = elmts[1] if len(elmts) > 1 else elmts[0]

    actions = ActionChains(driver)
    actions.move_to_element(elmt).click().perform()
    for _ in range(time_):
        try:
            actions = ActionChains(driver)
            actions.send_keys(Keys.PAGE_DOWN).perform()
        except Exception:
            traceback.print_exc()
        time.sleep(0.5)


def get_url_list(driver):
    """Scroll the result pane, then scrape every place link ('maps/place')
    together with its aria-label (the shop name)."""
    page_down_(driver, '//div[@class="TFQHme"]', 8)

    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for i in url_soup.find_all('a'):
        try:
            if i['href'].find('maps/place') != -1:
                url_list += [[i['href'], i['aria-label']]]
        except Exception:
            # Anchors without an href or aria-label are skipped.
            pass
    return url_list


def keyin_keyword(driver, keyword):
    """Type the keyword into the Google Maps search box and submit it."""
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)


def get_crawler_list(db):
    """Return the first keyword in shop_item_list that has no entry in
    progress_list2 yet, i.e. the next keyword to crawl."""
    result = db.query('select distinct(keyword) from shop_item_list order by keyword')
    result = pd.DataFrame([i for i in result])
    progress = db.query('select distinct(kw) from progress_list2')
    progress = pd.DataFrame([i for i in progress])
    if len(progress) != 0:
        keyword = result[~result['keyword'].isin(progress.kw.to_list())].iloc[0].values[0]
    else:
        keyword = result.iloc[0].values[0]
    return keyword
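# Table layout inferred from the queries in this file (no schema file is
# included, so column lists beyond those referenced are unknown):
#   shop_item_list(keyword, ...)   -- keywords still to be crawled
#   shop_item_list3(name, lon, lat, keyword, item_url, crawler_date)
#   progress_list2(kw, num)        -- keywords already started
# The duplicate counting in main() assumes shop_item_list3 carries a unique
# key, so re-inserting a known shop raises and is counted instead of stored.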
def main():
    # keyword = '麻辣火鍋'  # example keyword ("spicy hot pot")
    # Four [lon, lat] anchor points spanning the target area in Taipei.
    lon_lat = [[121.567, 25.038], [121.567, 25.046], [121.543, 25.046], [121.543, 25.038]]

    port = 4447
    if len(sys.argv) > 1:
        port = int(sys.argv[1])
        print('restart docker p{}'.format(port))
        os.system('sudo docker container restart pp' + str(port))
        time.sleep(8)

    print('driver start...')
    driver = browser_start(port)
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table = db['shop_item_list3']
    table2 = db['progress_list2']

    for _ in range(20):
        try:
            keyword = get_crawler_list(db)
            print(keyword)
            c = 0
            for row in lon_lat:
                c += 1
                latitude = row[1]   # latitude
                longitude = row[0]  # longitude
                table2.insert({'kw': keyword, 'num': c})

                url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
                driver.get(url)
                keyin_keyword(driver, keyword)

                for page in range(5):
                    print(keyword, latitude, longitude, page)
                    url_list = get_url_list(driver)
                    duplicate = 0
                    for item in url_list:
                        try:
                            table.insert({'name': item[1], 'lon': longitude, 'lat': latitude,
                                          'keyword': keyword, 'item_url': item[0],
                                          'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M")})
                        except Exception:
                            # Insert failed, most likely a duplicate row.
                            duplicate += 1
                    print(len(url_list), duplicate)

                    if page < 2:
                        # Obfuscated id of the "next page" button in the result pane.
                        element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                        driver.implicitly_wait(30)
                        ActionChains(driver).move_to_element(element).click(element).perform()
        except Exception:
            traceback.print_exc()


if __name__ == '__main__':
    main()
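# Example invocation. The code above only requires a Selenium hub reachable
# at http://127.0.0.1:<port>/wd/hub inside a container named pp<port>; the
# image tag and the script file name below are assumptions, not fixed by
# anything in this file.
#
#   sudo docker run -d --name pp4447 -p 4447:4444 selenium/standalone-chrome:3.141.59
#   python3 crawler.py 4447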