# -*- coding: utf-8 -*-
"""Google Maps store crawler driven through a selenium-wire proxied Chrome.

For every job (a keyword plus a numbered map location read from the
database) the script opens Google Maps at that location, types the keyword
into the search box, reads the intercepted ``search?tbm=map`` responses
captured by selenium-wire, and upserts the stores found in them into the
``swire_store_list`` table.
"""
#from selenium import webdriver
from seleniumwire import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
import selenium
import traceback
from bs4 import BeautifulSoup
from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *
from datetime import datetime
import pandas as pd
import dataset
import time
import json
import re
import sys, os
import socket
import brotli
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# When true, brower_start() launches a local Chrome instead of attaching to
# the remote Selenium hub.
chrome_window=False


def brower_start(port):
    """Create and return the selenium-wire Chrome driver.

    Args:
        port: Port of the Selenium hub on 127.0.0.1 (used only when
            ``chrome_window`` is false).

    Returns:
        The started driver, resized to 1400x1000.
    """
    if chrome_window:
        options = webdriver.ChromeOptions()
        browser = webdriver.Chrome(
            desired_capabilities=options.to_capabilities()
        )
    else:
        chrome_options = webdriver.ChromeOptions()
        # The remote browser sends its traffic through the selenium-wire
        # proxy that this process opens on port 8787 (see seleniumwire_options).
        chrome_options.add_argument('--proxy-server=host.docker.internal:8787')  # Specify your Kubernetes service-name here
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        browser = webdriver.Remote(
            command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
            desired_capabilities=chrome_options.to_capabilities(),
            seleniumwire_options={'addr':'0.0.0.0','port':8787,'auto_config': False}
        )

    browser.set_window_size(1400, 1000)
    return browser


def page_down_(driver, xpath_css, time_):
    """Click just right of the last result item to scroll the result list.

    Args:
        driver: Active web driver showing a Google Maps result list.
        xpath_css: Unused; kept so existing callers keep working.
        time_: Number of click/scroll repetitions.

    Nothing is clicked when the result counter reports 5 or fewer results.
    """
    counter = driver.find_element_by_css_selector('span[class="Jl2AFb"]')
    # The counter text is split on '-' and the second part is the count;
    # presumably it looks like "1-20 項結果" -- TODO confirm.
    result_count = counter.text.split('-')[1].replace(' 項結果','')
    print(result_count)
    if int(result_count) <= 5:
        return

    for _ in range(time_):
        items = driver.find_elements_by_css_selector('div[class="TFQHme"]')
        if not items:
            # Nothing to anchor the click on; indexing [-1] would raise.
            break
        last_item = items[-1]
        # Use the ActionChains class imported at the top of the file, the
        # same way keyin_keyword() and main() do.
        action = ActionChains(driver)
        action.move_to_element_with_offset(last_item, last_item.size['width'] + 1, 0)
        action.click()
        action.perform()
        time.sleep(0.5)


def get_url_list(driver):
    """Return ``[href, aria-label]`` pairs for every place link on the page.

    The result list is scrolled first, then every ``<a>`` whose ``href``
    contains ``maps/place`` is collected. Anchors lacking either attribute
    are skipped.
    """
    page_down_(driver, '//div[@class="TFQHme"]', 8)
    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for anchor in url_soup.find_all('a'):
        href = anchor.get('href')
        label = anchor.get('aria-label')
        if href is None or label is None:
            continue
        if 'maps/place' in href:
            url_list.append([href, label])
    return url_list


def keyin_keyword(driver, keyword):
    """Type ``keyword`` into the Maps search box and submit it with RETURN."""
    # Set the implicit wait before the lookup so the very first search-box
    # lookup also benefits from it; the setting stays active afterwards.
    driver.implicitly_wait(30)
    button = driver.find_element_by_id("searchbox")
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)


def get_next_job(db):
    """Pick a random pending job and attach its coordinates.

    Args:
        db: ``dataset`` database handle.

    Returns:
        A dict with ``kw`` and ``num`` plus, when the location row exists,
        ``lat``, ``lon`` and ``loc``. An empty dict when the progress table
        yields no row.
    """
    result = {}
    cursor = db.query('select distinct(kw),num+1 as num from swire_progress_list where num < 367 order by rand() limit 1')
    for row in cursor:
        result['kw'] = row['kw']
        result['num'] = row['num']
        break
    if not result:
        # No pending job: return early instead of failing on result['num'].
        return result

    # Bind the value instead of concatenating it into the SQL text. It is
    # passed as a string, matching the quoted literal used previously.
    cursor = db.query('select lat,lon,loc from lat_lon_loc where num = :num',
                      num=str(result['num']))
    for row in cursor:
        result['lat'] = row['lat']
        result['lon'] = row['lon']
        result['loc'] = row['loc']
        break
    return result


def write_to_file(jsobj,fname):
    """Write ``str(jsobj)`` to ``fname`` as UTF-8 (debugging helper)."""
    import codecs
    # The context manager closes the file even if the write fails.
    with codecs.open(fname,'w','utf-8') as fw:
        fw.write(str(jsobj))


def parsing_js(orig):
    """Extract store records from the text of a Maps search response.

    The text is un-escaped (two passes of ``\\"`` -> ``"``), its lines are
    joined, and the JSON array that starts at the first ``[["`` and ends at
    the first ``]]"`` is decoded.

    Args:
        orig: Decoded response body.

    Returns:
        A list of dicts with keys ``name``, ``fid``, ``addr``, ``place_id``
        and ``crawler_date``. Records missing an expected field are logged
        and skipped.

    Raises:
        ValueError: If the payload markers are not found, or (raised by
            ``json.loads``) if the extracted text is not valid JSON.
    """
    # Two replace passes on purpose: the second one unescapes quotes that
    # were escaped twice in the response.
    content = ''.join(
        line.replace('\\"','"').replace('\\"','"')
        for line in orig.split('\n')
    )

    begin_match = re.search(r'\[\["',content)
    print(begin_match)
    end_match = re.search(r'\]\]"',content)
    print(end_match)
    if begin_match is None or end_match is None:
        raise ValueError('search response does not contain the expected [[" ... ]]" payload')

    # Drop the trailing quote matched by the end pattern.
    jscontent = content[begin_match.start():end_match.end()-1]
    jsobj = json.loads(jscontent)

    crawler_date = datetime.today().strftime("%Y/%m/%d %H:%M")
    resultobj = []
    for entry in jsobj[0][1][1:]:
        try:
            info = entry[14]
            name = info[11]
            fid = info[10]
            addr = info[2]
            place_id = info[78]
            print(name)
            print(fid)
            print(addr)
            print(place_id)
            resultobj.append({'name':name,'fid':fid,'addr':addr[0],'place_id':place_id,'crawler_date':crawler_date})
        except (IndexError, TypeError, KeyError):
            # One malformed entry must not discard the rest of the page.
            traceback.print_exc()
    return resultobj


def save_js_to_db(jsobj,num,keyword):
    """Tag each record with ``num`` and ``keyword`` and upsert it by ``place_id``.

    Uses the module-level ``store_list_table`` that ``main()`` sets up.
    """
    for record in jsobj:
        record['num'] = num
        record['keyword'] = keyword
        store_list_table.upsert(record,keys=['place_id'])


def process_web_request(driver,area_num,keyword):
    """Parse the captured Maps search responses and store their records.

    Waits for the ``vet="19128"`` button to be present, gives the page a
    few seconds to finish its requests, then decodes every captured
    ``search?tbm=map`` response and saves the parsed records under
    ``area_num`` / ``keyword``.

    The captured requests are cleared afterwards. Without that, every
    later call would parse the same responses again and upsert them with
    the then-current ``area_num`` / ``keyword``, overwriting the labels of
    rows saved for earlier jobs.
    """
    try:
        WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//button[@vet="19128"]')))
        time.sleep(8)
        print("ppppppppp&**********************")
        for request in driver.requests:
            if not request.response:
                continue
            if 'https://www.google.com.tw/search?tbm=map' not in request.url:
                continue
            print('parsing js:')
            # NOTE(review): assumes the body is brotli-compressed -- confirm
            # against the response Content-Encoding if this ever fails.
            resp = brotli.decompress(request.response.body)
            jstext = resp.decode('utf-8')
            resultobj = parsing_js(jstext)
            save_js_to_db(resultobj,area_num,keyword)
    finally:
        del driver.requests


def main():
    """Run the crawl loop until 15 jobs have failed or no job is left."""
    global chrome_window
    global store_list_table

    failcnt = 0
    localip = socket.gethostbyname(socket.gethostname())
    if localip=='192.168.1.108':
        chrome_window=False

    # NOTE(review): database credentials are hard-coded here; consider
    # moving them to configuration or the environment.
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    store_list_table = db['swire_store_list']
    table2 = db['swire_progress_list']

    port = 4444
    print('restart docker p{}'.format(port))
    os.system('docker container restart p'+str(port))
    time.sleep(10)

    print('drvier start...')
    driver = brower_start(port)

    try:
        while True:
            try:
                job = get_next_job(db)
                print(job)
                if not job:
                    # Nothing left to crawl.
                    break
                keyword = job['kw']
                latitude = job['lat']  # latitude
                longitude = job['lon']  # longitude
                area_num = job['num']

                # Drop anything captured by an earlier (possibly failed)
                # job so it cannot be attributed to this one.
                del driver.requests

                url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
                driver.get(url)
                keyin_keyword(driver, keyword)
                process_web_request(driver,area_num,keyword)

                # Keep clicking this button and parsing until it reports
                # "disabled" (presumably the next-page button -- TODO confirm).
                while True:
                    element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                    if element.get_attribute('disabled'):
                        break
                    ActionChains(driver).move_to_element(element).click(element).perform()
                    process_web_request(driver,area_num,keyword)

                table2.upsert({'kw':keyword,'num':job['num']},['kw'])
            except Exception:
                # ``Exception`` rather than a bare except, so Ctrl-C still
                # stops the crawler instead of counting as a failed job.
                traceback.print_exc()
                failcnt += 1
                if failcnt>=15:
                    sys.exit()
    finally:
        # Always release the browser session, including on sys.exit().
        driver.quit()


if __name__ == '__main__':
    main()