# -*- coding: utf-8 -*-
"""Google Maps store crawler.

Drives a Chrome instance through selenium-wire, types a keyword into the
Maps search box around a set of coordinates, captures the ``search?``
responses seen by the proxy, extracts store records from them and writes
the records to MySQL through ``dataset``.

Table names and ``MYSQL_CONFIG`` are not defined in this file; presumably
they are provided by the ``utility`` star imports below -- TODO confirm.
"""
from seleniumwire import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
import selenium
import traceback
from bs4 import BeautifulSoup
from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *
from datetime import datetime
import pandas as pd
import dataset
import requests, random, time, json
import re, sys, os
import socket, brotli
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import urllib.parse

# When True a local Chrome is started; when False a remote (dockerised)
# Chrome is used and its traffic is routed through the selenium-wire proxy.
chrome_window=False
#chrome_window=True
globalkw=None
proxyport=8787

# Errors that indexing into JSON-decoded data (list/dict/str/None/number)
# can raise; used to treat optional fields as "missing" instead of fatal.
_LOOKUP_ERRORS = (IndexError, KeyError, TypeError)


def build_cache(db):
    """Return a dict keyed by every ``place_id`` already in the store table.

    The values are always 1; the dict is only used for membership checks.
    """
    query = 'SELECT place_id FROM {}.{};'.format(MYSQL_CONFIG['MYSQL_DB'], TABLE_STORE_LIST)
    return {row['place_id']: 1 for row in db.query(query)}


def brower_start(port):
    """Create and return the selenium-wire Chrome driver.

    Args:
        port: port of the remote WebDriver hub on 127.0.0.1 (only used when
            ``chrome_window`` is False).

    Returns:
        The browser instance, resized to 1400x1000.
    """
    print(proxyport)
    if chrome_window:
        options = webdriver.ChromeOptions()
        browser = webdriver.Chrome(
            desired_capabilities=options.to_capabilities()
        )
    else:
        chrome_options = webdriver.ChromeOptions()
        # Specify your Kubernetes service-name here
        chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport))
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        browser = webdriver.Remote(
            command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
            desired_capabilities=chrome_options.to_capabilities(),
            seleniumwire_options={'addr':'0.0.0.0','port':proxyport,'auto_config': False}
        )
    browser.set_window_size(1400,1000)
    return browser


def keyin_keyword(driver, keyword):
    """Type ``keyword`` into the element with id ``searchbox`` and press RETURN."""
    # The implicit wait must be configured before the lookup it is meant to
    # protect; the original set it afterwards, so the first lookup of a
    # session did not wait for the page to render the search box.
    driver.implicitly_wait(30)
    button = driver.find_element(By.ID, "searchbox")
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)


def scan_job(db, kw):
    """Pick a random area whose latest conversion-log row still grew.

    Selects, among the newest log row of every ``num``, one with
    ``next - prev > 0`` and attaches that area's ``lat``/``lon``/``loc``.

    Returns:
        dict with ``kw``, ``num`` and, when found, ``lat``, ``lon``, ``loc``.

    Raises:
        KeyError: when no log row qualifies (``num`` is never set).
    """
    result = {'kw' : kw}
    table_name = '{}.{}'.format(MYSQL_CONFIG['MYSQL_DB'],MYSQL_CONFIG['TABLE_CONV_LOG'])
    query = (
        'select t1.num,next-prev as diff from {} t1, '
        '(SELECT num,max(id) mid FROM {} group by num ) t2 '
        'where t1.id=t2.mid having diff>0 order by rand()'
    ).format(table_name, table_name)
    for c in db.query(query):
        result['num']=c['num']
        break

    cursor = db.query('select lat,lon,loc from {} where num ="{}"'.format(TABLE_LAT_LON, result['num']))
    for c in cursor:
        result['lat'] = c['lat']
        result['lon'] = c['lon']
        result['loc'] = c['loc']
    return result


def get_next_job(db, repeat=False, repkw=None, repnum=None):
    """Choose the next (keyword, area) job to crawl.

    Args:
        db: ``dataset`` database handle.
        repeat: when True, the job is finally replaced by a random row of the
            store table.
        repkw: keyword to force; a value containing ``'REP'`` switches to
            re-crawling a store already in the store table.
        repnum: area number to use in the repeat modes; ``'REP'`` or ``None``
            means "pick a random one".

    Returns:
        dict that may contain ``kw``, ``num``, ``lat``, ``lon``, ``loc``.
        Keys are missing when the corresponding lookup found no row.
    """
    result={}
    # NOTE(review): this query is not filtered by keyword, so with a forced
    # ``repkw`` the area number may come from another keyword's row -- confirm
    # this is intended.
    cursor = db.query('select kw, num from {} where expand = 0 order by rand()'.format(TABLE_AREACODES))
    for c in cursor:
        if repkw is None:
            repkw = c['kw']
        result['kw'] = c['kw']
        result['num'] = c['num']
        break

    if repkw is not None:
        result['kw'] = repkw

    if result.get('num') is not None:
        cursor = db.query('select lat,lon,loc from {} where num ="{}"'.format(TABLE_LAT_LON, str(result['num'])))
        for c in cursor:
            result['lat']=c['lat']
            result['lon']=c['lon']
            result['loc']=c['loc']
            break

    if repeat and repkw!= 'REP':
        result['kw']=repkw
        result['num']=repnum

    # ``repkw`` is still None when no keyword was forced and no unexpanded row
    # exists; the membership test would raise TypeError without the guard.
    if repkw is not None and 'REP' in repkw:
        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source; 'REP' and None are both treated as "pick a random area".
        if repnum == 'REP' or repnum is None:
            repnum = None
            cursor = db.query('select num from {} order by rand() limit 1'.format(TABLE_STORE_LIST))
            for c in cursor:
                repnum=c['num']
                break

        cursor = db.query('select lat_txt,lon_txt,keyword,num from {} where num="{}" limit 1'.format(TABLE_STORE_LIST, str(repnum)))
        for c in cursor:
            result['kw']=c['keyword']
            result['num']=c['num']
            result['lat']=c['lat_txt']
            result['lon']=c['lon_txt']
            result['loc']=''
        return result

    if repeat:
        cursor = db.query('select lat_txt,lon_txt,keyword from {} order by rand() limit 1'.format(TABLE_STORE_LIST))
        for c in cursor:
            result['kw']=c['keyword']
            result['lat']=c['lat_txt']
            result['lon']=c['lon_txt']

    return result


def write_to_file(jsobj,fname):
    """Write ``str(jsobj)`` to ``fname`` as UTF-8 (debug helper)."""
    import codecs
    # ``with`` closes the handle even when the write raises.
    with codecs.open(fname,'w','utf-8') as fw:
        fw.write(str(jsobj))


def parsing_js(orig):
    """Extract store records from the text of a Maps ``search?`` response.

    The text between the first ``[["`` and the first ``]]"`` (after
    un-escaping ``\\"`` twice and joining lines) is decoded as JSON; every
    entry of ``jsobj[0][1][1:]`` becomes one record built from fixed
    positions inside ``entry[14]``.

    Args:
        orig: decoded response body.

    Returns:
        list of dicts ready for insertion into the store table; empty when
        the payload markers are not present.
    """
    resultobj=[]
    # Two passes so that doubly escaped quotes (\\") are reduced as well.
    content = ''.join(
        line.replace('\\"','"').replace('\\"','"')
        for line in orig.split('\n')
    )

    begin_match = re.search(r'\[\["',content)
    print(begin_match)
    end_match = re.search(r'\]\]"',content)
    print(end_match)
    if begin_match is None or end_match is None:
        # Not the expected payload (e.g. another URL containing 'search?').
        return resultobj

    # ``end() - 1`` drops the trailing quote that follows the closing brackets.
    jscontent = content[begin_match.start():end_match.end()-1]
    jsobj = json.loads(jscontent)

    for x in jsobj[0][1][1:]:
        place = x[14]
        print(place[11])
        print(place[9])

        # Optional fields: reset for every entry so that a failed lookup can
        # neither leave a name unbound nor leak the previous entry's value.
        reviews_cnt=None
        photo=None
        num_photos=None
        rating=None
        biz_id=None
        loc_x=None
        loc_y=None
        addr_elmts=None
        tel=None

        try:
            rating=place[4][7]
            reviews_cnt=place[4][8]
        except _LOOKUP_ERRORS:
            traceback.print_exc()

        try:
            photo=place[37][0][0][0]
            num_photos=place[37][0][0][6][1]
        except _LOOKUP_ERRORS:
            traceback.print_exc()

        try:
            loc_x=place[37][0][0][29][0]
            loc_y=place[37][0][0][29][1]
        except _LOOKUP_ERRORS:
            traceback.print_exc()

        # biz_id and tel live in unrelated sub-structures, so a missing
        # biz_id must not prevent the phone number from being read.
        try:
            biz_id=place[57][2]
        except _LOOKUP_ERRORS:
            traceback.print_exc()

        try:
            tel=place[178][0][3]
        except _LOOKUP_ERRORS:
            traceback.print_exc()

        try:
            addr_elmts=str(place[82])
        except _LOOKUP_ERRORS:
            traceback.print_exc()

        category=str(place[13])
        topic=str(place[89])

        print(place[13])
        print(place[10])
        print(place[2])
        print(place[78])

        try:
            resultobj.append({
                'name':place[11],
                'fid':place[10],
                'addr':place[2][0],
                'addr_elmts':addr_elmts,
                'place_id':place[78],
                'category':category,
                'rating':rating,
                'reviews_cnt':reviews_cnt,
                'lat':place[9][2],
                'lat_txt':str(place[9][2]),
                'lon':place[9][3],
                'lon_txt':str(place[9][3]),
                'topic':topic,
                'photo':photo,
                'num_photos':num_photos,
                'loc_x':loc_x,
                'loc_y':loc_y,
                'biz_id':biz_id,
                'tel':tel,
                'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M"),
            })
        except _LOOKUP_ERRORS:
            traceback.print_exc()

    return resultobj


def save_js_to_db(jsobj,num,keyword):
    """Insert parsed records that are not yet known into the store table.

    Reads the module globals ``store_list_table`` and ``iddict`` (both set by
    ``main``). Each new record is tagged with ``num`` and ``keyword``.
    """
    for r in jsobj:
        if iddict.get(r['place_id']) is not None:
            continue
        r['num']=num
        r['keyword']=keyword
        try:
            store_list_table.insert(r)
        except Exception:
            traceback.print_exc()
        else:
            # Remember the id so the same place is not inserted again when it
            # shows up in another response of the same pass.
            iddict[r['place_id']] = 1


def process_web_request(db, driver, area_num, keyword):
    """Parse every captured ``search?`` response and store its records.

    After each parsed response a row with the store count before
    (global ``prev_cnt``) and after the insert is written to the conversion
    log. The captured requests are cleared at the end.
    """
    time.sleep(0.8)
    time.sleep(3)
    print("ppppppppp&**********************")
    for request in driver.requests:
        if 'search?' not in request.url:
            continue
        print('searching.....')
        if not request.response:
            continue

        print('parsing js:')
        print(request.url)
        try:
            # NOTE(review): the body is assumed to be brotli-encoded, as in
            # the original code; other encodings end up in the except branch.
            resp = brotli.decompress(request.response.body)
            jstext = resp.decode('utf-8')
            resultobj = parsing_js(jstext)
        except Exception:
            # One undecodable response must not abort the whole crawl.
            traceback.print_exc()
            continue

        print("before",datetime.now())
        print("num: "+str(area_num))
        save_js_to_db(resultobj, area_num, keyword)
        print("after",datetime.now())

        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source; the log row is written once per parsed response.
        aft_cnt=0
        cursor = db.query('select count(*) as cnt from {} where num="{}" '.format(TABLE_STORE_LIST, str(area_num)))
        for c in cursor:
            aft_cnt=c['cnt']
            break
        db[TABLE_CONV_LOG].insert({'num':area_num,'prev':prev_cnt,'next':aft_cnt,'dt':datetime.now()})

    del driver.requests


def check_area_code(db, kw):
    """Make sure the area-code table has one row per area for ``kw``.

    When ``kw`` is not yet present, a row ``(num, kw, expand=0)`` is inserted
    for every ``num`` of the lat/lon table. Does nothing for a falsy ``kw``.
    """
    if not kw:
        return

    table_name = '{}.{}'.format(MYSQL_CONFIG['MYSQL_DB'], TABLE_AREACODES)
    known = [row['kw'] for row in db.query('select distinct(kw) from {}'.format(table_name))]
    if kw in known:
        return

    try:
        # The keyword is bound as a parameter so that quotes inside it cannot
        # break (or alter) the statement.
        sql = 'insert into {} (select num, :kw as kw, 0 as expand from {}) '.format(table_name, TABLE_LAT_LON)
        db.query(sql, kw=kw)
    except Exception:
        traceback.print_exc()


def page_down_(driver, time_):
    """Click the ``div.TFQHme`` element, go back, then press END ``time_`` times.

    Any error is printed and swallowed (best effort).
    """
    try:
        element = driver.find_element(By.CSS_SELECTOR, 'div[class="TFQHme"]')
        ActionChains(driver).move_to_element(element).click().perform()
        time.sleep(1)
        driver.back()
        time.sleep(1)
        for i in range(time_):
            print(i)
            ActionChains(driver).send_keys(Keys.END).perform()
            time.sleep(0.5)
    except Exception:
        traceback.print_exc()


def main():
    """Run the crawler.

    Command line: ``[keyword [webdriver_port [proxy_port]]]``. Missing
    arguments keep their defaults (``None``, 4447 and the module-level
    ``proxyport``).
    """
    global chrome_window
    global store_list_table
    global globalkw
    global proxyport
    global iddict
    global prev_cnt

    port=4447
    args = sys.argv[1:]
    if args:
        # Each argument is optional; the original indexed argv[2] and argv[3]
        # as soon as a single argument was given and raised IndexError.
        globalkw=args[0]
        if len(args) > 1:
            port=int(args[1])
        if len(args) > 2:
            proxyport=int(args[2])
        print(globalkw, port, proxyport)

    db = dataset.connect('mysql://{}:{}@{}/{}?charset=utf8mb4'.format(
        MYSQL_CONFIG['MYSQL_USER'],
        MYSQL_CONFIG['MYSQL_PASSWORD'],
        MYSQL_CONFIG['MYSQL_HOST'],
        MYSQL_CONFIG['MYSQL_DB']))
    store_list_table = db[TABLE_STORE_LIST]
    table2 = db[TABLE_PROGRESS_LIST]

    if not chrome_window:
        print('restart docker pw{}'.format(port))
        os.system('sudo docker container restart pw'+str(port))
        time.sleep(10)

    print('drvier start...')
    driver = brower_start(port)
    try:
        check_area_code(db, globalkw)

        for i in range(368):
            # NOTE: an alternative job selection driven by extra command-line
            # arguments (scan_job for 'SCAN' keywords, otherwise
            # get_next_job(repeat=True, ...)) existed here and is disabled.
            job = get_next_job(db, repkw=globalkw)
            print(job)
            if any(key not in job for key in ('kw', 'num', 'lat', 'lon')):
                # Nothing left to crawl (or the area has no coordinates).
                print('no runnable job, stopping')
                break

            keyword = job['kw']
            globalkw = keyword
            latitude = job['lat']    # latitude
            longitude = job['lon']   # longitude
            area_num = job['num']

            for j in range(5):
                iddict = build_cache(db)
                if j != 0:
                    # Later passes search around a randomly shifted centre
                    # (up to +/-0.0999 degrees).
                    latitude_ = float(latitude) + (random.randint(-999,999) / 10000)
                    longitude_ = float(longitude) + (random.randint(-999,999) / 10000)
                else:
                    latitude_, longitude_ = latitude, longitude

                url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude_, longitude_)
                print(url)

                prev_cnt=0
                cursor = db.query('select count(*) as cnt from {} where num="{}" '.format(TABLE_STORE_LIST, str(area_num)))
                for c in cursor:
                    prev_cnt = c['cnt']
                    break

                driver.get(url)
                time.sleep(2)
                keyin_keyword(driver, keyword)
                process_web_request(db, driver, area_num, keyword)
                time.sleep(1)

            # NOTE(review): the second positional argument of dataset's
            # Table.insert is ``ensure``, not a key list; if "one row per kw"
            # was intended this should probably be upsert -- confirm.
            table2.insert({'kw':keyword,'num':job['num']},['kw'])
            # Bound parameters keep a keyword containing quotes from breaking
            # the statement.
            db.query(
                'update {} set expand = 1 where num=:num and kw=:kw '.format(TABLE_AREACODES),
                num=str(job['num']),
                kw=keyword,
            )
    finally:
        # Always release the browser session, also when the crawl fails.
        driver.quit()


if __name__ == '__main__':
    main()