|
@@ -0,0 +1,441 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+#from selenium import webdriver
|
|
|
+from seleniumwire import webdriver
|
|
|
+from selenium.webdriver.common.action_chains import ActionChains
|
|
|
+from selenium.webdriver.common.keys import Keys
|
|
|
+from selenium.webdriver.support import expected_conditions as EC
|
|
|
+from selenium.webdriver.support.wait import WebDriverWait
|
|
|
+from selenium.webdriver.common.by import By
|
|
|
+import selenium
|
|
|
+import traceback
|
|
|
+from bs4 import BeautifulSoup
|
|
|
+
|
|
|
+from utility import database_access as DA
|
|
|
+from utility.parseutils import *
|
|
|
+from utility.connect import *
|
|
|
+
|
|
|
+from datetime import datetime
|
|
|
+import pandas as pd
|
|
|
+import dataset
|
|
|
+import time
|
|
|
+import json
|
|
|
+import re
|
|
|
+import sys, os
|
|
|
+import socket
|
|
|
+import brotli
|
|
|
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|
|
+import urllib.parse
|
|
|
# Browser mode switch read by brower_start() and main():
#   True  -> brower_start() launches a local headless Chrome
#   False -> main() restarts the docker container and brower_start()
#            attaches to a remote Selenium hub through the proxy port
# chrome_window = False
chrome_window = True

# Keyword override; main() replaces it with sys.argv[1] when arguments are given.
globalkw = None

# Port of the selenium-wire proxy; main() may replace it with sys.argv[3].
proxyport = 8787
|
|
|
+
|
|
|
+
|
|
|
def build_cache(db):
    """Load every stored place_id into an in-memory lookup table.

    Args:
        db: database handle exposing ``query(sql)`` that yields rows
            indexable by column name.

    Returns:
        dict mapping each place_id found in google_poi.swire_store_list
        to the constant 1.
    """
    rows = db.query('SELECT place_id FROM google_poi.swire_store_list;')
    return {row['place_id']: 1 for row in rows}
|
|
|
+
|
|
|
def brower_start(port):
    """Create the browser session used by the crawler.

    When the module-level ``chrome_window`` flag is true a local headless
    Chrome is started; otherwise a remote session is opened against the
    Selenium hub at ``127.0.0.1:<port>`` with Chrome pointed at the
    selenium-wire proxy listening on the module-level ``proxyport``.

    Args:
        port: port of the remote Selenium hub (only used in remote mode).

    Returns:
        The started driver, resized to 1400x1000.
    """
    print(proxyport)
    options = webdriver.ChromeOptions()

    if not chrome_window:
        remote_flags = (
            # Specify your Kubernetes service-name here
            '--proxy-server=host.docker.internal:' + str(proxyport),
            '--ignore-certificate-errors',
            '--no-sandbox',
            '--disable-dev-shm-usage',
        )
        for flag in remote_flags:
            options.add_argument(flag)
        browser = webdriver.Remote(
            command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
            desired_capabilities=options.to_capabilities(),
            seleniumwire_options={
                'addr': '0.0.0.0',
                'port': proxyport,
                'auto_config': False,
            },
        )
    else:
        local_flags = (
            '--ignore-certificate-errors',
            '--no-sandbox',
            '--headless',
            '--disable-dev-shm-usage',
        )
        for flag in local_flags:
            options.add_argument(flag)
        browser = webdriver.Chrome(
            desired_capabilities=options.to_capabilities()
        )

    browser.set_window_size(1400, 1000)
    return browser
|
|
|
+
|
|
|
+
|
|
|
def page_down_(driver, xpath_css, time_):
    """Click just past the last result separator up to *time_* times.

    The result counter element is read first; nothing is clicked unless
    it reports more than 5 results. Presumably the clicks make the result
    list load further entries -- confirm against the live page.

    Args:
        driver: Selenium driver showing a search result page.
        xpath_css: unused; kept so existing callers keep working.
        time_: maximum number of clicks.

    Raises:
        IndexError, ValueError: when the counter text is not of the form
            ``<first>-<last> 項結果``.
    """
    counter = driver.find_element_by_css_selector('span[class="Jl2AFb"]')
    result_count = counter.text.split('-')[1].replace(' 項結果', '')
    print(result_count)
    if int(result_count) <= 5:
        return

    for _ in range(time_):
        separators = driver.find_elements_by_css_selector('div[class="TFQHme"]')
        if not separators:
            # Nothing to click on; the original indexed [-1] and crashed here.
            break
        last = separators[-1]
        # Use the ActionChains name imported at the top of the file rather
        # than reaching through the selenium-wire module's attributes.
        action = ActionChains(driver)
        action.move_to_element_with_offset(last, last.size['width'] + 1, 0)
        action.click()
        action.perform()
        time.sleep(0.5)
|
|
|
+
|
|
|
+
|
|
|
def get_url_list(driver):
    """Collect the place links shown on the current result page.

    Calls ``page_down_`` first, then parses ``driver.page_source``.

    Args:
        driver: Selenium driver showing a search result page.

    Returns:
        list of ``[href, aria_label]`` pairs, one per ``<a>`` element whose
        href contains ``maps/place`` and which carries an ``aria-label``.
    """
    page_down_(driver, '//div[@class="TFQHme"]', 8)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        label = anchor.get('aria-label')
        # Anchors missing either attribute are skipped explicitly instead
        # of relying on a bare ``except`` that would hide any other error.
        if href is None or label is None:
            continue
        if 'maps/place' in href:
            url_list.append([href, label])
    return url_list
|
|
|
+
|
|
|
+
|
|
|
def keyin_keyword(driver, keyword):
    """Type *keyword* into the element with id ``searchbox`` and press Return.

    Args:
        driver: Selenium driver with the map page loaded.
        keyword: text to search for.
    """
    # The implicit wait has to be configured before the lookup it is meant
    # to cover; set afterwards it only affects later find_element calls.
    driver.implicitly_wait(30)
    search_box = driver.find_element_by_id("searchbox")
    (
        ActionChains(driver)
        .move_to_element(search_box)
        .send_keys(keyword)
        .send_keys(Keys.RETURN)
        .perform()
    )
    # Fixed pause after submitting, as in the original.
    time.sleep(3)
|
|
|
+
|
|
|
+
|
|
|
def scan_job(db, kw):
    """Build a job for a random area whose latest conv_log row shows growth.

    The area is the first row of a randomly ordered query over the newest
    conv_log entry per ``num`` with ``next - prev > 0``; its coordinates
    are then read from lat_lon_loc.

    Args:
        db: ``dataset``-style handle whose ``query(sql, **params)`` yields
            rows indexable by column name.
        kw: keyword stored unchanged under ``'kw'``.

    Returns:
        dict with ``kw`` and ``num`` plus ``lat``/``lon``/``loc`` when the
        area has a lat_lon_loc row (the last matching row wins).

    Raises:
        KeyError: when no area qualifies.
    """
    result = {'kw': kw}

    rows = db.query('select t1.num,next-prev as diff from google_poi.conv_log t1, (SELECT num,max(id) mid FROM google_poi.conv_log group by num ) t2 where t1.id=t2.mid having diff>0 order by rand()')
    picked = next(iter(rows), None)
    if picked is None:
        # Same exception type the original produced, with a readable reason.
        raise KeyError('num: no area with a positive conv_log diff was found')
    result['num'] = picked['num']

    # Bound parameter instead of string concatenation; the value is still
    # compared as a string, as before.
    rows = db.query(
        'select lat,lon,loc from lat_lon_loc where num = :num',
        num=str(result['num']),
    )
    for row in rows:
        result['lat'] = row['lat']
        result['lon'] = row['lon']
        result['loc'] = row['loc']
    return result
|
|
|
+
|
|
|
def get_next_job(db, repeat=False, repkw=None, repnum=None):
    """Choose the next crawl job.

    Steps, in order:
      1. Take the first row of a randomly ordered ``areacodes`` query for
         ``kw``/``num``; *repkw*, when given, replaces the keyword.
      2. Add ``lat``/``lon``/``loc`` from lat_lon_loc for that ``num``.
      3. With *repeat* set and ``repkw != 'REP'``, overwrite ``kw``/``num``
         with *repkw*/*repnum*.
      4. When *repkw* contains ``'REP'``, re-crawl a swire_store_list row:
         *repnum* (or a random row if it is ``'REP'``/None) supplies
         keyword, num and coordinates, and the job is returned at once.
      5. With *repeat* set, keyword and coordinates are replaced by those
         of a random swire_store_list row (``num`` is left untouched).

    Args:
        db: ``dataset``-style handle whose ``query(sql, **params)`` yields
            rows indexable by column name.
        repeat: enables steps 3 and 5.
        repkw: keyword override / ``'REP'`` mode switch.
        repnum: store number for the ``'REP'`` mode or step 3.

    Returns:
        dict holding whichever of ``kw``, ``num``, ``lat``, ``lon``,
        ``loc`` could be determined; empty when nothing was found.
    """
    # NOTE(review): the nesting of this function was reconstructed from a
    # flattened listing -- confirm against the original file.
    result = {}

    area = next(iter(db.query('select kw,num from areacodes order by rand()')), None)
    if area is not None:
        if repkw is None:
            repkw = area['kw']
        result['kw'] = area['kw']
        result['num'] = area['num']
    if repkw is not None:
        result['kw'] = repkw

    if result.get('num') is not None:
        rows = db.query(
            'select lat,lon,loc from lat_lon_loc where num = :num',
            num=str(result['num']),
        )
        spot = next(iter(rows), None)
        if spot is not None:
            result['lat'] = spot['lat']
            result['lon'] = spot['lon']
            result['loc'] = spot['loc']

    if repeat and repkw != 'REP':
        result['kw'] = repkw
        result['num'] = repnum

    # repkw is still None when areacodes is empty and no override was
    # passed; the original crashed with a TypeError on the ``in`` test.
    if repkw is not None and 'REP' in repkw:
        if repnum == 'REP':
            repnum = None
        if repnum is None:
            rows = db.query('select num from swire_store_list order by rand() limit 1')
            picked = next(iter(rows), None)
            if picked is not None:
                repnum = picked['num']

        # repnum may come from the command line, so it is bound rather
        # than spliced into the statement.
        rows = db.query(
            'select lat_txt,lon_txt,keyword,num from swire_store_list where num = :num limit 1',
            num=str(repnum),
        )
        store = next(iter(rows), None)
        if store is not None:
            result['kw'] = store['keyword']
            result['num'] = store['num']
            result['lat'] = store['lat_txt']
            result['lon'] = store['lon_txt']
            result['loc'] = ''
            return result

    if repeat:
        rows = db.query('select lat_txt,lon_txt,keyword from swire_store_list order by rand() limit 1')
        for row in rows:
            result['kw'] = row['keyword']
            result['lat'] = row['lat_txt']
            result['lon'] = row['lon_txt']

    return result
|
|
|
+
|
|
|
+
|
|
|
def write_to_file(jsobj, fname):
    """Write ``str(jsobj)`` to *fname* as UTF-8, replacing any existing file.

    Args:
        jsobj: any object; its ``str()`` form is written.
        fname: destination path.
    """
    # ``with`` closes the handle even if the write fails. newline='' turns
    # off newline translation, matching the previous codecs.open output.
    with open(fname, 'w', encoding='utf-8', newline='') as fw:
        fw.write(str(jsobj))
|
|
|
+
|
|
|
# Marks "path does not exist", as opposed to a stored None value.
_JS_MISSING = object()


def _js_dig(node, *path, default=None):
    """Index into nested sequences along *path*; *default* if a step is absent."""
    for step in path:
        try:
            node = node[step]
        except (IndexError, KeyError, TypeError):
            return default
    return node


def _js_text(node, *path):
    """Return ``str()`` of the value at *path*, or None when the path is absent."""
    value = _js_dig(node, *path, default=_JS_MISSING)
    return None if value is _JS_MISSING else str(value)


def _parse_place(info):
    """Turn one place's info list into a row dict; raises if a required field is absent."""
    lat = info[9][2]
    lon = info[9][3]
    return {
        'name': info[11],
        'fid': info[10],
        'addr': info[2][0],
        'addr_elmts': _js_text(info, 82),
        'place_id': info[78],
        'category': _js_text(info, 13),
        'rating': _js_dig(info, 4, 7),
        'reviews_cnt': _js_dig(info, 4, 8),
        'lat': lat,
        'lat_txt': str(lat),
        'lon': lon,
        'lon_txt': str(lon),
        'topic': _js_text(info, 89),
        'photo': _js_dig(info, 37, 0, 0, 0),
        'num_photos': _js_dig(info, 37, 0, 0, 6, 1),
        'loc_x': _js_dig(info, 37, 0, 0, 29, 0),
        'loc_y': _js_dig(info, 37, 0, 0, 29, 1),
        'biz_id': _js_dig(info, 57, 2),
        'tel': _js_dig(info, 178, 0, 3),
        'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M"),
    }


def parsing_js(orig):
    """Extract place rows from a search response body.

    Newlines are dropped and ``\\"`` is unescaped twice per line; the text
    from the first ``[["`` up to the following ``]]"`` (closing quote
    excluded) is then parsed as JSON. Every element of ``data[0][1][1:]``
    is treated as one place whose details sit at index 14.

    Optional fields (rating, photo data, biz_id, tel, ...) are looked up
    independently and default to None, so one missing value no longer
    hides its neighbour or leaks a value from the previous place. Entries
    lacking a required field (name, fid, address, place_id, coordinates)
    are reported with a traceback and skipped.

    Args:
        orig: decoded response text.

    Returns:
        list of row dicts ready for insertion.

    Raises:
        ValueError: when the embedded array cannot be located or is not
            valid JSON.
    """
    content = ''.join(
        line.replace('\\"', '"').replace('\\"', '"')
        for line in orig.split('\n')
    )

    begin = re.search(r'\[\["', content)
    if begin is None:
        raise ValueError('response does not contain the [[" result marker')
    # Look for the terminator only after the start marker.
    end = re.compile(r'\]\]"').search(content, begin.start())
    if end is None:
        raise ValueError('response does not contain the ]]" result terminator')

    # end.end() - 1 drops the closing quote matched by the pattern.
    jsobj = json.loads(content[begin.start():end.end() - 1])

    resultobj = []
    for entry in jsobj[0][1][1:]:
        try:
            row = _parse_place(entry[14])
        except (IndexError, KeyError, TypeError):
            traceback.print_exc()
            continue
        print(row['name'], row['fid'], row['place_id'])
        resultobj.append(row)
    return resultobj
|
|
|
+
|
|
|
def save_js_to_db(jsobj, num, keyword):
    """Insert rows for places that are not yet known.

    Uses the module-level ``iddict`` cache and ``store_list_table``
    (both assigned in ``main``). Rows whose place_id is already cached are
    skipped; the rest are tagged with *num* and *keyword* and inserted.

    Args:
        jsobj: iterable of row dicts, each with a ``'place_id'`` key.
        num: area number stored in each inserted row.
        keyword: search keyword stored in each inserted row.
    """
    for row in jsobj:
        place_id = row['place_id']
        if iddict.get(place_id) is not None:
            continue
        row['num'] = num
        row['keyword'] = keyword

        try:
            store_list_table.insert(row)
        except Exception:
            # Best effort: report and carry on with the remaining rows.
            traceback.print_exc()
        else:
            # Remember the new place so it is not inserted again when it
            # shows up on a later page or job in the same run.
            if place_id is not None:
                iddict[place_id] = 1
|
|
|
+
|
|
|
def process_web_request(db, driver, area_num, keyword):
    """Parse captured search responses, store the places, log the row count.

    After a fixed pause, every request captured by the driver whose URL
    contains ``search?`` and that has a response is decompressed with
    brotli, parsed by ``parsing_js`` and saved through ``save_js_to_db``.
    The number of swire_store_list rows for *area_num* is then written to
    ``conv_log`` together with the module-level ``prev_cnt`` set by
    ``main``.

    Args:
        db: ``dataset`` database handle.
        driver: selenium-wire driver whose ``requests`` are inspected.
        area_num: area number the results belong to.
        keyword: search keyword the results belong to.
    """
    # NOTE(review): the nesting of the conv_log bookkeeping was
    # reconstructed from a flattened listing -- confirm it runs once per
    # call rather than once per parsed response.

    # Same total pause as the original 0.8 s + 3 s.
    time.sleep(3.8)
    print("ppppppppp&**********************")
    try:
        for request in driver.requests:
            if 'search?' not in request.url:
                continue
            print('searching.....')
            if not request.response:
                continue
            print('parsing js:')
            # Assumes the body is brotli-compressed, as the original did.
            body = brotli.decompress(request.response.body)
            resultobj = parsing_js(body.decode('utf-8'))
            print("before", datetime.now())
            print("num: " + str(area_num))
            save_js_to_db(resultobj, area_num, keyword)
            print("after", datetime.now())
    finally:
        # Forget what was captured so far; otherwise the next call parses
        # these responses again and files them under its own area/keyword.
        del driver.requests

    aft_cnt = 0
    rows = db.query(
        'select count(*) as cnt from swire_store_list where num = :num',
        num=str(area_num),
    )
    counted = next(iter(rows), None)
    if counted is not None:
        aft_cnt = counted['cnt']
    db['conv_log'].insert({
        'num': area_num,
        'prev': prev_cnt,
        'next': aft_cnt,
        'dt': datetime.now(),
    })
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+# time.sleep(9999)
|
|
|
+
|
|
|
+
|
|
|
def main():
    """Run the crawler loop until 15 jobs have failed.

    Command line (all optional, positional):
        argv[1]  keyword override (``globalkw``)
        argv[2]  Selenium hub port (default 4444)
        argv[3]  selenium-wire proxy port (``proxyport``)
    With more than four argv entries, argv[1]/argv[2] are additionally
    used as ``repkw``/``repnum``: a keyword containing ``SCAN`` selects
    ``scan_job``, anything else ``get_next_job(repeat=True, ...)``.

    The database URL can be overridden with the GOOGLE_POI_DB_URL
    environment variable.
    """
    # NOTE(review): the nesting of this function was reconstructed from a
    # flattened listing -- confirm against the original file.
    global store_list_table
    global globalkw
    global proxyport
    global iddict
    global prev_cnt

    port = 4444
    argv = sys.argv
    # Each argument is read only if it was actually supplied; the original
    # indexed argv[2] and argv[3] as soon as argv[1] existed.
    if len(argv) > 1:
        globalkw = argv[1]
    if len(argv) > 2:
        port = int(argv[2])
    if len(argv) > 3:
        proxyport = int(argv[3])
    if len(argv) > 1:
        print(globalkw, port, proxyport)

    # SECURITY: credentials are embedded in the default URL; prefer
    # supplying GOOGLE_POI_DB_URL from the environment.
    db_url = os.environ.get(
        'GOOGLE_POI_DB_URL',
        'mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4',
    )
    db = dataset.connect(db_url)
    iddict = build_cache(db)
    store_list_table = db['swire_store_list']
    table2 = db['swire_area_progress']

    if not chrome_window:
        print('restart docker p{}'.format(port))
        os.system('docker container restart p' + str(port))
        time.sleep(10)

    print('drvier start...')
    driver = brower_start(port)

    failcnt = 0
    try:
        while True:
            try:
                if len(sys.argv) > 4:
                    repkw = sys.argv[1]
                    repnum = sys.argv[2]
                    if 'SCAN' in repkw:
                        job = scan_job(db, repnum)
                    else:
                        job = get_next_job(db, repeat=True, repkw=repkw, repnum=repnum)
                else:
                    job = get_next_job(db, repkw=globalkw)
                print(job)

                keyword = job['kw']
                latitude = job['lat']
                longitude = job['lon']
                area_num = job['num']
                url = 'https://www.google.com.tw/maps/@{},{},18z?hl=zh-TW'.format(latitude, longitude)

                # Row count before crawling; process_web_request logs it
                # next to the count afterwards.
                prev_cnt = 0
                rows = db.query(
                    'select count(*) as cnt from swire_store_list where num = :num',
                    num=str(area_num),
                )
                counted = next(iter(rows), None)
                if counted is not None:
                    prev_cnt = counted['cnt']

                driver.get(url)
                keyin_keyword(driver, keyword)
                process_web_request(db, driver, area_num, keyword)

                # Follow the "next page" button for at most 5 extra pages.
                for _ in range(5):
                    element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                    if element.get_attribute('disabled'):
                        break
                    ActionChains(driver).move_to_element(element).click(element).perform()
                    process_web_request(db, driver, area_num, keyword)

                # The original passed ['kw'] as second argument, which
                # dataset reads as a truthy ``ensure`` flag.
                table2.insert({'kw': keyword, 'num': job['num']}, ensure=True)
                # The keyword may come from the command line: bind it.
                db.query(
                    'update areacodes set expand = 1 where num = :num and kw = :kw',
                    num=str(job['num']),
                    kw=keyword,
                )
            except Exception:
                # ``Exception`` (not a bare except) so Ctrl-C still stops
                # the crawler instead of counting as a failed job.
                traceback.print_exc()
                failcnt += 1
                if failcnt >= 15:
                    sys.exit()
    finally:
        # Always release the browser session, whatever ended the loop.
        driver.quit()
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
# Start crawling only when this file is executed as a script.
if __name__ == "__main__":
    main()
|