# -*- coding: utf-8 -*-
"""Crawl Google Maps search results and store the parsed places in MySQL.

A selenium-wire controlled Chrome opens Google Maps at the coordinates of an
area, types a keyword into the search box, and the intercepted ``search?``
responses are parsed into place records that are inserted into the
``swire_store_list`` table.

Command line (all optional): ``<keyword> <webdriver port> <proxy port> [extra]``.
"""
import gzip
import json
import os
import re
import socket
import sys
import time
import traceback
import urllib.parse
from datetime import datetime

import brotli
import dataset
import selenium
# selenium-wire's webdriver replaces selenium's so that network traffic can be
# inspected through ``driver.requests``.
from seleniumwire import webdriver
from seleniumwire.utils import decode as sw_decode
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# True: start a local headless Chrome; False: attach to a remote
# (docker-hosted) webdriver through the proxy port.
chrome_window = True
globalkw = None
proxyport = 8787

# NOTE(review): credentials are embedded in source. Set GOOGLE_POI_DB_URL to
# override them; the literal is kept only as a backward-compatible fallback.
DEFAULT_DB_URL = 'mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4'


def build_cache(db):
    """Return a dict whose keys are every place_id already stored.

    The values are always 1; the dict is only used for membership checks.
    """
    cursor = db.query('SELECT place_id FROM google_poi.swire_store_list;')
    return {row['place_id']: 1 for row in cursor}


def brower_start(port):
    """Create and return the browser used for crawling.

    With ``chrome_window`` set, a local headless Chrome is started; otherwise
    a remote webdriver listening on 127.0.0.1:``port`` is used, with the
    selenium-wire proxy bound to ``proxyport``.
    """
    print(proxyport)
    if chrome_window:
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument("--no-sandbox")
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--disable-dev-shm-usage")
        browser = webdriver.Chrome(options=options)
    else:
        chrome_options = webdriver.ChromeOptions()
        # The remote browser reaches the selenium-wire proxy through the
        # docker host alias.
        chrome_options.add_argument('--proxy-server=host.docker.internal:' + str(proxyport))
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        browser = webdriver.Remote(
            command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
            desired_capabilities=chrome_options.to_capabilities(),
            seleniumwire_options={'addr': '0.0.0.0', 'port': proxyport, 'auto_config': False},
        )
    browser.set_window_size(1400, 1000)
    return browser


def page_down_(driver, xpath_css, time_):
    """Click just right of the last result card ``time_`` times.

    Nothing is clicked unless the result counter reports more than 5 results.
    ``xpath_css`` is unused and kept only for interface compatibility.
    """
    counter = driver.find_element_by_css_selector('span[class="Jl2AFb"]')
    result_count = counter.text.split('-')[1].replace(' 項結果', '')
    print(result_count)
    if int(result_count) <= 5:
        return
    for _ in range(time_):
        cards = driver.find_elements_by_css_selector('div[class="TFQHme"]')
        last_card = cards[-1]
        # Use the directly imported ActionChains: the selenium-wire
        # ``webdriver`` module is not the selenium package, so the original
        # ``webdriver.common.action_chains`` lookup is not reliable.
        action = ActionChains(driver)
        action.move_to_element_with_offset(last_card, last_card.size['width'] + 1, 0)
        action.click()
        action.perform()
        time.sleep(0.5)


def keyin_keyword(driver, keyword):
    """Type ``keyword`` into the Maps search box and submit it."""
    print('key in keyword:' + keyword)
    # The implicit wait must be set before the lookup it is meant to cover;
    # it then stays in effect for later element lookups on this driver.
    driver.implicitly_wait(30)
    button = driver.find_element_by_id("searchbox")
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)


def scan_job(db, kw):
    """Build a job for a random area whose latest conv_log row has next > prev.

    Returns a dict with ``kw`` and ``num``, plus ``lat``/``lon``/``loc`` when
    the area is present in ``lat_lon_loc``.

    Raises:
        KeyError: when no area qualifies (the original failed the same way,
            but implicitly, on the missing ``num`` key).
    """
    result = {'kw': kw}
    # ``limit 1``: only the first row was ever used.
    cursor = db.query(
        'select t1.num,next-prev as diff from google_poi.conv_log t1, '
        '(SELECT num,max(id) mid FROM google_poi.conv_log group by num ) t2 '
        'where t1.id=t2.mid having diff>0 order by rand() limit 1'
    )
    for row in cursor:
        result['num'] = row['num']
        break
    if 'num' not in result:
        raise KeyError('num')
    # Bound parameter instead of string concatenation; still compared as a
    # string, exactly like the original quoted literal.
    cursor = db.query('select lat,lon,loc from lat_lon_loc where num = :num',
                      num=str(result['num']))
    for row in cursor:
        result['lat'] = row['lat']
        result['lon'] = row['lon']
        result['loc'] = row['loc']
    return result


def get_next_job(db, repeat=False, repkw=None, repnum=None):
    """Pick the next (keyword, area) job.

    A random ``areacodes`` row supplies the area number and, unless ``repkw``
    is given, the keyword. The area's coordinates come from ``lat_lon_loc``.
    With ``repeat`` set, keyword, area and coordinates are finally replaced
    by those of a random already-stored place.

    Returns a dict with the keys ``kw``, ``num``, ``lat``, ``lon`` and
    ``loc``; keys are absent when the corresponding query returned no row.
    """
    result = {}
    # ``limit 1``: only the first random row was ever consumed.
    cursor = db.query('select kw,num from areacodes order by rand() limit 1')
    for row in cursor:
        if repkw is None:
            repkw = row['kw']
        result['kw'] = row['kw']
        result['num'] = row['num']
        break
    if repkw is not None:
        result['kw'] = repkw

    if result.get('num') is not None:
        cursor = db.query('select lat,lon,loc from lat_lon_loc where num = :num',
                          num=str(result['num']))
        for row in cursor:
            result['lat'] = row['lat']
            result['lon'] = row['lon']
            result['loc'] = row['loc']
            break

    if repeat and repkw != 'REP':
        result['kw'] = repkw
        result['num'] = repnum

    if repeat:
        cursor = db.query('select lat_txt,lon_txt,keyword,num from swire_store_list where keyword <> "火鍋餐廳" order by rand() limit 1')
        for row in cursor:
            result['kw'] = row['keyword']
            result['lat'] = row['lat_txt']
            result['lon'] = row['lon_txt']
            result['num'] = row['num']
            result['loc'] = ''
    return result


def write_to_file(jsobj, fname):
    """Debug helper: write ``str(jsobj)`` to ``fname`` as UTF-8."""
    # The context manager closes the file even if the write fails.
    with open(fname, 'w', encoding='utf-8') as fw:
        fw.write(str(jsobj))


def _dig(node, *path):
    """Follow ``path`` through nested containers; None if a step is missing."""
    for key in path:
        try:
            node = node[key]
        except (IndexError, KeyError, TypeError):
            return None
    return node


def parsing_js(orig):
    """Extract place records from the text of a Maps ``search?`` response.

    The text is unescaped (``\\"`` -> ``"``, applied twice so doubly escaped
    quotes are handled too), the span from the first ``[["`` to the following
    ``]]`` is parsed as JSON, and every entry of ``[0][1]`` after the first
    is turned into a dict ready for insertion.

    Optional fields are None when absent. A record missing a mandatory field
    (name, fid, address, place_id, coordinates, category, topic) is skipped
    with a traceback instead of aborting the whole page.

    Raises:
        ValueError: when the payload markers cannot be found.
    """
    content = ''.join(
        line.replace('\\"', '"').replace('\\"', '"') for line in orig.split('\n')
    )
    content_begin = content.find('[["')
    if content_begin < 0:
        raise ValueError('payload start marker [[" not found')
    # Search for the end marker after the start, so an earlier stray
    # occurrence cannot produce an empty or negative slice.
    marker_pos = content.find(']]"', content_begin)
    if marker_pos < 0:
        raise ValueError('payload end marker ]]" not found')
    # Keep the two closing brackets, drop the trailing quote.
    jscontent = content[content_begin:marker_pos + 2]
    jsobj = json.loads(jscontent)

    resultobj = []
    for entry in jsobj[0][1][1:]:
        try:
            info = entry[14]
            print(info[11])
            print(info[9])

            # Optional fields are looked up independently, so one missing
            # value no longer suppresses another (tel vs. biz_id) and nothing
            # leaks over from the previous record (num_photos).
            rating = _dig(info, 4, 7)
            reviews_cnt = _dig(info, 4, 8)
            photo = _dig(info, 37, 0, 0, 0)
            num_photos = _dig(info, 37, 0, 0, 6, 1)
            loc_x = _dig(info, 37, 0, 0, 29, 0)
            loc_y = _dig(info, 37, 0, 0, 29, 1)
            biz_id = _dig(info, 57, 2)
            tel = _dig(info, 178, 0, 3)
            try:
                addr_elmts = str(info[82])
            except (IndexError, TypeError):
                addr_elmts = None

            category = str(info[13])
            topic = str(info[89])
            print(info[13])
            print(info[10])
            print(info[2])
            print(info[78])

            resultobj.append({
                'name': info[11],
                'fid': info[10],
                'addr': info[2][0],
                'addr_elmts': addr_elmts,
                'place_id': info[78],
                'category': category,
                'rating': rating,
                'reviews_cnt': reviews_cnt,
                'lat': info[9][2],
                'lat_txt': str(info[9][2]),
                'lon': info[9][3],
                'lon_txt': str(info[9][3]),
                'topic': topic,
                'photo': photo,
                'num_photos': num_photos,
                'loc_x': loc_x,
                'loc_y': loc_y,
                'biz_id': biz_id,
                'tel': tel,
                'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M"),
            })
        except (IndexError, KeyError, TypeError):
            traceback.print_exc()
    return resultobj


def save_js_to_db(jsobj, num, keyword):
    """Insert the records whose place_id is not yet known.

    Each record is tagged with the area ``num`` and the ``keyword`` that
    produced it. Relies on the module globals ``store_list_table`` and
    ``iddict`` that ``main`` initialises.
    """
    for record in jsobj:
        place_id = record['place_id']
        if iddict.get(place_id) is not None:
            continue
        record['num'] = num
        record['keyword'] = keyword
        try:
            store_list_table.insert(record)
        except Exception:
            traceback.print_exc()
        else:
            # Remember the new place so later pages of the same run do not
            # insert it again.
            iddict[place_id] = 1


def _decode_body(response):
    """Return the response body as text, undoing gzip or brotli encoding."""
    encoding = response.headers.get('Content-Encoding')
    print(encoding)
    # A missing header means the body is not compressed.
    encoding = encoding or ''
    body = response.body
    if 'gzip' in encoding:
        body = gzip.decompress(body)
    elif 'br' in encoding:
        body = brotli.decompress(body)
    return body.decode('utf-8')


def process_web_request(db, driver, area_num, keyword):
    """Parse and store every captured ``search?`` response, then clear them.

    ``db`` is unused and kept for interface compatibility. Parsing errors
    propagate to the caller.
    """
    # Give the page time to issue its search requests.
    time.sleep(0.8)
    time.sleep(3)
    print("ppppppppp&**********************")
    try:
        for request in driver.requests:
            if 'search?' not in request.url:
                continue
            print('searching.....')
            if not request.response:
                continue
            print('parsing js:')
            jstext = _decode_body(request.response)
            resultobj = parsing_js(jstext)
            print("before", datetime.now())
            print("num: " + str(area_num))
            save_js_to_db(resultobj, area_num, keyword)
            print("after", datetime.now())
    finally:
        # Always drop the captured traffic: if it survived a failure, the
        # next job would re-parse it under a different area and keyword.
        del driver.requests


def main():
    """Run the crawl loop until 15 jobs have failed.

    argv[1] is the keyword, argv[2] the webdriver port, argv[3] the
    selenium-wire proxy port. With more than four argv entries the script
    switches to repeat/scan mode.
    """
    global store_list_table
    global globalkw
    global proxyport
    global iddict

    port = 4444
    # Every argument is optional; the original indexed argv[2] and argv[3]
    # unconditionally and crashed when fewer were given.
    if len(sys.argv) > 1:
        globalkw = sys.argv[1]
    if len(sys.argv) > 2:
        port = int(sys.argv[2])
    if len(sys.argv) > 3:
        proxyport = int(sys.argv[3])
    print(globalkw, port, proxyport)

    failcnt = 0
    db = dataset.connect(os.environ.get('GOOGLE_POI_DB_URL', DEFAULT_DB_URL))
    iddict = build_cache(db)
    store_list_table = db['swire_store_list']
    table2 = db['swire_area_progress']

    if not chrome_window:
        print('restart docker p{}'.format(port))
        os.system('docker container restart p' + str(port))
        time.sleep(10)

    print('drvier start...')
    driver = brower_start(port)
    repeating = False
    try:
        while True:
            try:
                if len(sys.argv) > 4:
                    # NOTE(review): argv[2] is used both as the webdriver
                    # port above and as repnum here; confirm this is intended.
                    repkw = sys.argv[1]
                    repnum = sys.argv[2]
                    if 'SCAN' in repkw:
                        job = scan_job(db, repnum)
                    else:
                        repeating = True
                        job = get_next_job(db, repeat=True, repkw=repkw, repnum=repnum)
                else:
                    job = get_next_job(db, repkw=globalkw)
                print(job)

                keyword = job['kw']
                latitude = job['lat']    # latitude
                longitude = job['lon']   # longitude
                area_num = job['num']
                url = 'https://www.google.com.tw/maps/@{},{},18z?hl=zh-TW'.format(latitude, longitude)

                driver.get(url)
                time.sleep(3)
                keyin_keyword(driver, keyword)
                process_web_request(db, driver, area_num, keyword)

                # Follow the paging button for at most 5 further pages
                # (a single one in repeat mode) or until it is disabled.
                pagecnt = 0
                while True:
                    element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                    if element.get_attribute('disabled'):
                        break
                    ActionChains(driver).move_to_element(element).click(element).perform()
                    process_web_request(db, driver, area_num, keyword)
                    if repeating:
                        break
                    pagecnt += 1
                    if pagecnt >= 5:
                        break

                table2.insert({'kw': keyword, 'num': job['num']})
                # Bound parameters: the keyword may come straight from the
                # command line.
                db.query('update areacodes set expand = 1 where num = :num and kw = :kw',
                         num=str(job['num']), kw=keyword)
            except Exception:
                # Exception, not a bare except, so Ctrl-C still stops the run.
                traceback.print_exc()
                failcnt += 1
                if failcnt >= 15:
                    sys.exit()
    finally:
        # Do not leave a browser process behind when the loop ends.
        driver.quit()


if __name__ == '__main__':
    main()