|
@@ -23,6 +23,7 @@ import re
|
|
|
import sys, os
|
|
|
import socket
|
|
|
import brotli
|
|
|
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|
|
|
|
|
chrome_window=False
|
|
|
|
|
@@ -33,11 +34,19 @@ def brower_start(port):
|
|
|
desired_capabilities=options.to_capabilities()
|
|
|
)
|
|
|
else:
|
|
|
+ chrome_options = webdriver.ChromeOptions()
|
|
|
+ chrome_options.add_argument('--proxy-server=host.docker.internal:8787') # Specify your Kubernetes service-name here
|
|
|
+ chrome_options.add_argument('--ignore-certificate-errors')
|
|
|
+ chrome_options.add_argument("--no-sandbox")
|
|
|
+ chrome_options.add_argument("--disable-dev-shm-usage")
|
|
|
browser = webdriver.Remote(
|
|
|
command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
|
|
|
- desired_capabilities=options.to_capabilities()
|
|
|
- )
|
|
|
+ desired_capabilities=chrome_options.to_capabilities(),
|
|
|
+ seleniumwire_options={'addr':'0.0.0.0','port':8787,'auto_config': False}
|
|
|
|
|
|
+ )
|
|
|
+# seleniumwire_options = {'addr': '172.17.0.2','port':4444})
|
|
|
+ browser.set_window_size(1400,1000)
|
|
|
return browser
|
|
|
|
|
|
|
|
@@ -78,40 +87,23 @@ def keyin_keyword(driver, keyword):
|
|
|
|
|
|
|
|
|
|
|
|
-def get_crawler_list(db):
|
|
|
-# result = db.query('select * from shop_item_list order by keyword')
|
|
|
-# result = pd.DataFrame([i for i in result])
|
|
|
-# result = result[~result.keyword.str.contains('項')]
|
|
|
-
|
|
|
-# progress = db.query('select distinct(kw) from progress_list2 where num < 367')
|
|
|
-# progress = pd.DataFrame([i for i in progress])
|
|
|
-
|
|
|
-# if len(progress) != 0:
|
|
|
-# keyword = result[~result['keyword'].isin(progress.kw.to_list())].iloc[0]['keyword']
|
|
|
-# else:
|
|
|
-# keyword = result.iloc[0]['keyword']
|
|
|
-#
|
|
|
-# return keyword
|
|
|
- return '滷味'
|
|
|
- cursor = db.query('select distinct(kw) from progress_list2 where num < 367 order by num asc limit 1')
|
|
|
+def get_next_job(db):
|
|
|
+ result={}
|
|
|
+ cursor = db.query('select distinct(kw),num+1 as num from progress_list2 where num < 367 order by num asc limit 1')
|
|
|
for c in cursor:
|
|
|
- return c['kw']
|
|
|
- return None
|
|
|
+ result['kw']=c['kw']
|
|
|
+ result['num']=c['num']
|
|
|
+ break
|
|
|
|
|
|
-def get_lon_lat_list(db, keyword):
|
|
|
- num=0
|
|
|
- cursor=db.query('select num from progress_list2 where kw = "'+keyword+'"')
|
|
|
+ cursor = db.query('select lat,lon,loc from lat_lon_loc where num ="'+str(result['num'])+'"')
|
|
|
for c in cursor:
|
|
|
- num=c['num']
|
|
|
+ result['lat']=c['lat']
|
|
|
+ result['lon']=c['lon']
|
|
|
+ result['loc']=c['loc']
|
|
|
break
|
|
|
|
|
|
- cursor=db.query('select * from lat_lon_loc where num >= '+str(num))
|
|
|
-
|
|
|
- lst=[]
|
|
|
- for c in cursor:
|
|
|
- lst.append({'num':c['num'],'loc':c['loc'],'lat':c['lat'],'lon':c['lon']})
|
|
|
+ return result
|
|
|
|
|
|
- return lst
|
|
|
|
|
|
def write_to_file(jsobj,fname):
|
|
|
import codecs
|
|
@@ -120,6 +112,7 @@ def write_to_file(jsobj,fname):
|
|
|
fw.close()
|
|
|
|
|
|
def parsing_js(orig):
|
|
|
+ resultobj=[]
|
|
|
content=""
|
|
|
lines=orig.split('\n')
|
|
|
for l in lines:
|
|
@@ -140,75 +133,93 @@ def parsing_js(orig):
|
|
|
content_end=result.end()
|
|
|
|
|
|
jscontent=content[content_begin:content_end-1]
|
|
|
- write_to_file(jscontent,'c:/tmp/debug.txt')
|
|
|
+# write_to_file(jscontent,'c:/tmp/debug.txt')
|
|
|
jsobj=json.loads(jscontent)
|
|
|
for x in jsobj[0][1][1:]:
|
|
|
print(x[14][11])
|
|
|
print(x[14][10])
|
|
|
print(x[14][2])
|
|
|
print(x[14][78])
|
|
|
-
|
|
|
+ try:
|
|
|
+ resultobj.append({'name':x[14][11],'fid':x[14][10],'addr':x[14][2][0],'place_id':x[14][78],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
|
|
|
+ except:
|
|
|
+ traceback.print_exc()
|
|
|
+ return resultobj
|
|
|
+
|
|
|
+def save_js_to_db(jsobj,num,keyword):
|
|
|
+ global store_list_table
|
|
|
+ for r in jsobj:
|
|
|
+ r['num']=num
|
|
|
+ r['keyword']=keyword
|
|
|
+ store_list_table.upsert(r,keys=['place_id'])
|
|
|
+
|
|
|
+def process_web_request(driver,area_num,keyword):
|
|
|
+ query = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//button[@vet="19128"]')))
|
|
|
+ time.sleep(14)
|
|
|
+
|
|
|
+ print("ppppppppp&**********************")
|
|
|
+ for request in driver.requests:
|
|
|
+ if request.response:
|
|
|
+ if 'https://www.google.com.tw/search?tbm=map' in request.url :
|
|
|
+ print('parsing js:')
|
|
|
+ resp = brotli.decompress(request.response.body)
|
|
|
+ jstext=resp.decode('utf-8')
|
|
|
+ resultobj=parsing_js(jstext)
|
|
|
+ save_js_to_db(resultobj,area_num,keyword)
|
|
|
|
|
|
|
|
|
def main():
|
|
|
global chrome_window
|
|
|
+ global store_list_table
|
|
|
localip=socket.gethostbyname(socket.gethostname())
|
|
|
if localip=='192.168.1.108':
|
|
|
- chrome_window=True
|
|
|
+# chrome_window=True
|
|
|
+ chrome_window=False
|
|
|
+
|
|
|
db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
|
|
|
- table = db['shop_item_list3']
|
|
|
- table2 = db['progress_list2']
|
|
|
+ store_list_table = db['swire_store_list']
|
|
|
+
|
|
|
+ table2 = db['swire_progress_list']
|
|
|
|
|
|
- port=4447
|
|
|
- if len(sys.argv) > 1 :
|
|
|
- port=int(sys.argv[1])
|
|
|
+
|
|
|
+ port=4444
|
|
|
+# if len(sys.argv) > 1 :
|
|
|
+# port=int(sys.argv[1])
|
|
|
+ if True:
|
|
|
print('restart docker p{}'.format(port))
|
|
|
- os.system('sudo docker container restart p'+str(port))
|
|
|
- time.sleep(8)
|
|
|
+# os.system('sudo docker container restart p'+str(port))
|
|
|
+ os.system('docker container restart p'+str(port))
|
|
|
+
|
|
|
+ time.sleep(10)
|
|
|
|
|
|
print('drvier start...')
|
|
|
driver = brower_start(port)
|
|
|
|
|
|
|
|
|
- for i in range(10):
|
|
|
+ while True:
|
|
|
try:
|
|
|
- keyword = get_crawler_list(db)
|
|
|
- print(keyword)
|
|
|
- lst = get_lon_lat_list(db, keyword)
|
|
|
-# print(lst)
|
|
|
- print(keyword, len(lst))
|
|
|
-
|
|
|
- for r in lst:
|
|
|
- latitude = r['lat'] #緯度
|
|
|
- longitude = r['lon'] #精度
|
|
|
- area_num=r['num']
|
|
|
- table2.upsert({'kw':keyword,'num':r['num']},['kw'])
|
|
|
-
|
|
|
- url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
|
|
|
- driver.get(url)
|
|
|
- keyin_keyword(driver, keyword)
|
|
|
- failcnt = 0
|
|
|
-
|
|
|
-# query = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//button[@vet="19128"]')))
|
|
|
- time.sleep(11)
|
|
|
- print("ppppppppp&**********************")
|
|
|
- for request in driver.requests:
|
|
|
- if request.response:
|
|
|
- if 'https://www.google.com.tw/search?tbm=map' in request.url :
|
|
|
- print('parsing js:')
|
|
|
- resp = brotli.decompress(request.response.body)
|
|
|
- jstext=resp.decode('utf-8')
|
|
|
- parsing_js(jstext)
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
- for page in range(10):
|
|
|
- if page < 2 :
|
|
|
- element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
|
|
|
- if element.get_attribute('disabled'):
|
|
|
- break
|
|
|
- driver.implicitly_wait(30)
|
|
|
- ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
+ job=get_next_job(db)
|
|
|
+ print(job)
|
|
|
+ keyword = job['kw']
|
|
|
+ latitude = job['lat'] #緯度
|
|
|
+ longitude = job['lon'] #精度
|
|
|
+ area_num=job['num']
|
|
|
+
|
|
|
+ url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
|
|
|
+ driver.get(url)
|
|
|
+ keyin_keyword(driver, keyword)
|
|
|
+
|
|
|
+ process_web_request(driver,area_num,keyword)
|
|
|
+
|
|
|
+ while True:
|
|
|
+ element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
|
|
|
+ if element.get_attribute('disabled'):
|
|
|
+ break
|
|
|
+ # driver.implicitly_wait(30)
|
|
|
+ ActionChains(driver).move_to_element(element).click(element).perform()
|
|
|
+ process_web_request(driver,area_num,keyword)
|
|
|
+ table2.upsert({'kw':keyword,'num':job['num']},['kw'])
|
|
|
+
|
|
|
except:
|
|
|
pass
|
|
|
|