|
@@ -24,12 +24,13 @@ import sys, os
|
|
import socket
|
|
import socket
|
|
import brotli
|
|
import brotli
|
|
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|
-
|
|
|
|
|
|
+import urllib.parse
|
|
chrome_window=False
|
|
chrome_window=False
|
|
globalkw=None
|
|
globalkw=None
|
|
proxyport=8787
|
|
proxyport=8787
|
|
def brower_start(port):
|
|
def brower_start(port):
|
|
global proxyport
|
|
global proxyport
|
|
|
|
+ global chrome_window
|
|
print(proxyport)
|
|
print(proxyport)
|
|
options = webdriver.ChromeOptions()
|
|
options = webdriver.ChromeOptions()
|
|
if chrome_window:
|
|
if chrome_window:
|
|
@@ -145,15 +146,60 @@ def parsing_js(orig):
|
|
content_end=result.end()
|
|
content_end=result.end()
|
|
|
|
|
|
jscontent=content[content_begin:content_end-1]
|
|
jscontent=content[content_begin:content_end-1]
|
|
-# write_to_file(jscontent,'c:/tmp/debug.txt')
|
|
|
|
|
|
+ write_to_file(jscontent,'c:/tmp/debug.txt')
|
|
jsobj=json.loads(jscontent)
|
|
jsobj=json.loads(jscontent)
|
|
for x in jsobj[0][1][1:]:
|
|
for x in jsobj[0][1][1:]:
|
|
print(x[14][11])
|
|
print(x[14][11])
|
|
|
|
+ print(x[14][9])
|
|
|
|
+ reviews_cnt=None
|
|
|
|
+ photo=None
|
|
|
|
+ rating=None
|
|
|
|
+ biz_id=None
|
|
|
|
+ loc_x=None
|
|
|
|
+ loc_y=None
|
|
|
|
+ addr_elmts=None
|
|
|
|
+ tel=None
|
|
|
|
+ try:
|
|
|
|
+ rating=x[14][4][7]
|
|
|
|
+ reviews_cnt=x[14][4][8]
|
|
|
|
+ except:
|
|
|
|
+ traceback.print_exc()
|
|
|
|
+
|
|
|
|
+ try:
|
|
|
|
+ photo=x[14][37][0][0][0]
|
|
|
|
+ num_photos=x[14][37][0][0][6][1]
|
|
|
|
+ except:
|
|
|
|
+ traceback.print_exc()
|
|
|
|
+
|
|
|
|
+ try:
|
|
|
|
+ loc_x=x[14][37][0][0][29][0]
|
|
|
|
+ loc_y=x[14][37][0][0][29][1]
|
|
|
|
+ except:
|
|
|
|
+ traceback.print_exc()
|
|
|
|
+
|
|
|
|
+ try:
|
|
|
|
+ biz_id=x[14][57][2]
|
|
|
|
+ tel=x[14][178][0][3]
|
|
|
|
+ except:
|
|
|
|
+ traceback.print_exc()
|
|
|
|
+
|
|
|
|
+ try:
|
|
|
|
+ addr_elmts=str(x[14][82])
|
|
|
|
+ except:
|
|
|
|
+ traceback.print_exc()
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ category=str(x[14][13])
|
|
|
|
+ topic=str(x[14][89])
|
|
|
|
+ print(x[14][13])
|
|
|
|
+
|
|
print(x[14][10])
|
|
print(x[14][10])
|
|
print(x[14][2])
|
|
print(x[14][2])
|
|
print(x[14][78])
|
|
print(x[14][78])
|
|
try:
|
|
try:
|
|
- resultobj.append({'name':x[14][11],'fid':x[14][10],'addr':x[14][2][0],'place_id':x[14][78],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
|
|
|
|
|
|
+ resultobj.append({'name':x[14][11],'fid':x[14][10],'addr':x[14][2][0],'addr_elmts':addr_elmts,'place_id':x[14][78],'category':category,'rating':rating,'reviews_cnt':reviews_cnt,'lat':x[14][9][2],'lat_txt':str(x[14][9][2]),'lon':x[14][9][3],'lon_txt':str(x[14][9][3]),'topic':topic,'photo':photo,'num_photos':num_photos,'loc_x':loc_x,'loc_y':loc_y,'biz_id':biz_id,'tel':tel,'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
|
|
except:
|
|
except:
|
|
traceback.print_exc()
|
|
traceback.print_exc()
|
|
return resultobj
|
|
return resultobj
|
|
@@ -163,16 +209,27 @@ def save_js_to_db(jsobj,num,keyword):
|
|
for r in jsobj:
|
|
for r in jsobj:
|
|
r['num']=num
|
|
r['num']=num
|
|
r['keyword']=keyword
|
|
r['keyword']=keyword
|
|
- store_list_table.upsert(r,keys=['place_id'])
|
|
|
|
|
|
+ try:
|
|
|
|
+ store_list_table.insert(r)
|
|
|
|
+
|
|
|
|
+# store_list_table.upsert(r,keys=['place_id'])
|
|
|
|
+ except:
|
|
|
|
+ traceback.print_exc()
|
|
|
|
+# store_list_table.upsert(r,keys=['place_id'])
|
|
|
|
|
|
def process_web_request(driver,area_num,keyword):
|
|
def process_web_request(driver,area_num,keyword):
|
|
- query = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//button[@vet="19128"]')))
|
|
|
|
|
|
+# query = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//button[@vet="19128"]')))
|
|
time.sleep(0.8)
|
|
time.sleep(0.8)
|
|
-
|
|
|
|
|
|
+ time.sleep(3)
|
|
print("ppppppppp&**********************")
|
|
print("ppppppppp&**********************")
|
|
for request in driver.requests:
|
|
for request in driver.requests:
|
|
|
|
+ if 'search?' in request.url :
|
|
|
|
+ print('searching.....')
|
|
|
|
+# else:
|
|
|
|
+# print(request.url[20:60])
|
|
if request.response:
|
|
if request.response:
|
|
- if 'https://www.google.com.tw/search?tbm=map' in request.url :
|
|
|
|
|
|
+# if 'https://www.google.com.tw/search?tbm=map' in request.url :
|
|
|
|
+ if 'search?' in request.url :
|
|
print('parsing js:')
|
|
print('parsing js:')
|
|
resp = brotli.decompress(request.response.body)
|
|
resp = brotli.decompress(request.response.body)
|
|
jstext=resp.decode('utf-8')
|
|
jstext=resp.decode('utf-8')
|
|
@@ -181,6 +238,8 @@ def process_web_request(driver,area_num,keyword):
|
|
save_js_to_db(resultobj,area_num,keyword)
|
|
save_js_to_db(resultobj,area_num,keyword)
|
|
print("after",datetime.now())
|
|
print("after",datetime.now())
|
|
|
|
|
|
|
|
+# time.sleep(9999)
|
|
|
|
+
|
|
|
|
|
|
def main():
|
|
def main():
|
|
global chrome_window
|
|
global chrome_window
|
|
@@ -191,9 +250,9 @@ def main():
|
|
globalkw=sys.argv[1]
|
|
globalkw=sys.argv[1]
|
|
failcnt=0
|
|
failcnt=0
|
|
localip=socket.gethostbyname(socket.gethostname())
|
|
localip=socket.gethostbyname(socket.gethostname())
|
|
- if localip=='192.168.1.108':
|
|
|
|
|
|
+# if localip=='192.168.1.108':
|
|
# chrome_window=True
|
|
# chrome_window=True
|
|
- chrome_window=False
|
|
|
|
|
|
+# chrome_window=False
|
|
|
|
|
|
db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
|
|
db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
|
|
store_list_table = db['swire_store_list']
|
|
store_list_table = db['swire_store_list']
|
|
@@ -227,9 +286,14 @@ def main():
|
|
latitude = job['lat'] #緯度
|
|
latitude = job['lat'] #緯度
|
|
longitude = job['lon'] #精度
|
|
longitude = job['lon'] #精度
|
|
area_num=job['num']
|
|
area_num=job['num']
|
|
-
|
|
|
|
- url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
|
|
|
|
|
|
+ safe_string = urllib.parse.quote_plus(keyword)
|
|
|
|
+ url = 'https://www.google.com.tw/maps/@{},{},18z?hl=zh-TW'.format(latitude, longitude)
|
|
|
|
+# url = 'https://www.google.com/maps/search/'+safe_string+'/@{},{},16z/data=!3m1!4b1'.format(latitude, longitude)
|
|
|
|
+# url='https://www.google.com/maps/search/'+safe_string+'/@24.7962279,121.0449762,15z/data=!3m1!4b1?hl=zh-TW'
|
|
|
|
+# print(url)
|
|
|
|
+# url='https://www.google.com/maps/search/%E7%81%AB%E9%8D%8B%E9%A4%90%E5%BB%B3/@24.772608,121.0515456,13z'
|
|
driver.get(url)
|
|
driver.get(url)
|
|
|
|
+# time.sleep(3)
|
|
keyin_keyword(driver, keyword)
|
|
keyin_keyword(driver, keyword)
|
|
|
|
|
|
process_web_request(driver,area_num,keyword)
|
|
process_web_request(driver,area_num,keyword)
|