|
@@ -16,7 +16,7 @@ import gzip
|
|
|
from utility import database_access as DA
|
|
|
from utility.parseutils import *
|
|
|
from utility.connect import *
|
|
|
-
|
|
|
+import redis
|
|
|
from datetime import datetime
|
|
|
from requests import session
|
|
|
import pandas as pd
|
|
@@ -99,7 +99,9 @@ def get_next_job(db):
|
|
|
result = {}
|
|
|
# result = db.query('select * from error_list2 where check_=0 ORDER BY RAND() limit 100')
|
|
|
|
|
|
- result = db.query('SELECT * FROM swire_store_list a WHERE not exists (select 1 from error_list2 tei where tei.fid = a.fid limit 1 ) ORDER BY RAND() limit 100')
|
|
|
+# result = db.query('SELECT * FROM swire_store_list a WHERE not exists (select 1 from error_list2 tei where tei.fid = a.fid limit 1 ) ORDER BY RAND() limit 30')
|
|
|
+# result = db.query('SELECT * FROM swire_store_list a WHERE not exists (select 1 from shop_list3 tei where tei.fid = a.fid limit 1 ) ORDER BY RAND() limit 30')
|
|
|
+ result = db.query('SELECT * FROM swire_store_list a WHERE fid not in (select fid from shop_list3 ) ORDER BY RAND() limit 30')
|
|
|
|
|
|
|
|
|
url_pd = pd.DataFrame([dict(i) for i in result])
|
|
@@ -110,11 +112,11 @@ def get_next_job(db):
|
|
|
|
|
|
# url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
|
|
|
|
|
|
- remove = db.query('select item_url from error_list3')
|
|
|
- remove = pd.DataFrame([dict(i) for i in remove])
|
|
|
- if len(remove) != 0:
|
|
|
- remove_fid_list = remove['item_url'].to_list()
|
|
|
- url_pd = url_pd[~url_pd['item_url'].isin(remove_fid_list)]
|
|
|
+# remove = db.query('select item_url from error_list3')
|
|
|
+# remove = pd.DataFrame([dict(i) for i in remove])
|
|
|
+# if len(remove) != 0:
|
|
|
+# remove_fid_list = remove['item_url'].to_list()
|
|
|
+# url_pd = url_pd[~url_pd['item_url'].isin(remove_fid_list)]
|
|
|
|
|
|
return url_pd
|
|
|
|
|
@@ -222,6 +224,19 @@ def parsing_js(resp):
|
|
|
def time_parsing_js(time_json, output):
|
|
|
weekday_text = []
|
|
|
periods = []
|
|
|
+ if time_json is None:
|
|
|
+ output['open_now'] = 'False'
|
|
|
+ output['periods'] = ''
|
|
|
+ output['weekday_text'] = ''
|
|
|
+ output['time_status'] = ''
|
|
|
+ return output
|
|
|
+ if time_json[1] is None:
|
|
|
+ output['open_now'] = 'False'
|
|
|
+ output['periods'] = ''
|
|
|
+ output['weekday_text'] = ''
|
|
|
+ output['time_status'] = ''
|
|
|
+ return output
|
|
|
+
|
|
|
|
|
|
for time_ in time_json[1]:
|
|
|
week = time_[0]
|
|
@@ -413,14 +428,14 @@ def photos_parsing_js(resp):
|
|
|
photo_category_map[row[0]] = row[2]
|
|
|
|
|
|
if photo_category_map[jsobj[13][0]] == '全部':
|
|
|
- for img in jsobj[0][:5]:
|
|
|
+ for img in jsobj[0]:
|
|
|
all += [image_url_change_size(img[6][0])]
|
|
|
|
|
|
elif photo_category_map[jsobj[13][0]] == '菜單':
|
|
|
- for img in jsobj[0][:5]:
|
|
|
+ for img in jsobj[0]:
|
|
|
menu += [image_url_change_size(img[6][0])]
|
|
|
|
|
|
- return menu, all
|
|
|
+ return list(set(menu)), list(set(all))
|
|
|
|
|
|
|
|
|
def process_web_request_photo(driver, output, fid):
|
|
@@ -461,8 +476,18 @@ def process_web_request_photo(driver, output, fid):
|
|
|
# print('parsing js:')
|
|
|
front, _ = fid.split(':')
|
|
|
if request.url.find(front) != -1:
|
|
|
+# resp = brotli.decompress(request.response.body)
|
|
|
print(request.url)
|
|
|
- resp = brotli.decompress(request.response.body)
|
|
|
+ resp=request.response.body
|
|
|
+ if 'gzip' in request.response.headers.get('Content-Encoding'):
|
|
|
+ resp = gzip.decompress(request.response.body)
|
|
|
+
|
|
|
+ if 'br' in request.response.headers.get('Content-Encoding'):
|
|
|
+ resp = brotli.decompress(request.response.body)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
jstext = resp.decode('utf-8')
|
|
|
menu, all = photos_parsing_js(jstext)
|
|
|
menu_list += menu
|
|
@@ -489,7 +514,8 @@ def main():
|
|
|
error_table = db['error_list2']
|
|
|
|
|
|
iddict=build_cache(db)
|
|
|
-
|
|
|
+ print("iddict...{}".format(datetime.now()))
|
|
|
+
|
|
|
port=4444
|
|
|
if len(sys.argv) == 3 :
|
|
|
port=int(sys.argv[1])
|
|
@@ -506,6 +532,7 @@ def main():
|
|
|
|
|
|
job = get_next_job(db)
|
|
|
c = 0
|
|
|
+
|
|
|
for row, group in job.iterrows():
|
|
|
try:
|
|
|
item_url = group['item_url']
|
|
@@ -526,6 +553,8 @@ def main():
|
|
|
print('parsing shop info....')
|
|
|
for i in range(5):
|
|
|
print('shop info try...{}'.format(i))
|
|
|
+ print("shop info try...{}".format(datetime.now()))
|
|
|
+
|
|
|
driver.get(item_url)
|
|
|
time.sleep(3)
|
|
|
|
|
@@ -541,16 +570,26 @@ def main():
|
|
|
|
|
|
if driver.current_url == item_url:continue
|
|
|
print(driver.current_url)
|
|
|
- output = process_web_request_start(driver, fid)
|
|
|
- if output != 0: break
|
|
|
+ try:
|
|
|
+ output = process_web_request_start(driver, fid)
|
|
|
+ if output != 0: break
|
|
|
+ except:
|
|
|
+ r = redis.Redis(host='db.ptt.cx', port=6379, db=1,password='choozmo9')
|
|
|
+ msg=traceback.format_exc()
|
|
|
+ r.set('google_error',msg)
|
|
|
+
|
|
|
|
|
|
# reivews
|
|
|
print('parsing reviews....')
|
|
|
+ print("parsing reviews.....{}".format(datetime.now()))
|
|
|
+
|
|
|
if not output['user_ratings_total']:
|
|
|
output['reviews'] = ''
|
|
|
else:
|
|
|
for i in range(3):
|
|
|
print('reviews try...{}'.format(i))
|
|
|
+ print("reviews try.....{}".format(datetime.now()))
|
|
|
+
|
|
|
try:
|
|
|
wait = WebDriverWait(driver, 30)
|
|
|
more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
|
|
@@ -570,14 +609,16 @@ def main():
|
|
|
driver.get(item_url)
|
|
|
time.sleep(0.5)
|
|
|
|
|
|
- if 'reviews' not in output.keys():
|
|
|
- continue
|
|
|
+# if 'reviews' not in output.keys():
|
|
|
+# continue
|
|
|
|
|
|
# photo
|
|
|
print('parsing photo....')
|
|
|
if output['header_image'] != '':
|
|
|
for i in range(3):
|
|
|
print('photo try...{}'.format(i))
|
|
|
+ print("photo try......{}".format(datetime.now()))
|
|
|
+
|
|
|
driver.get(item_url)
|
|
|
time.sleep(0.5)
|
|
|
print(driver.current_url)
|
|
@@ -608,6 +649,8 @@ def main():
|
|
|
|
|
|
print(output)
|
|
|
save_js_to_db(output, fid)
|
|
|
+ print("save_js_to_db......{}".format(datetime.now()))
|
|
|
+
|
|
|
error_table.upsert({'item_url':item_url,'check_':1},['item_url'])
|
|
|
print('*'*10)
|
|
|
|
|
@@ -620,9 +663,13 @@ def main():
|
|
|
break
|
|
|
|
|
|
except:
|
|
|
+ r = redis.Redis(host='db.ptt.cx', port=6379, db=1,password='choozmo9')
|
|
|
+ msg=traceback.format_exc()
|
|
|
+ r.set('google_error',msg)
|
|
|
error_table3 = db['error_list3']
|
|
|
error_table3.insert({'name':name,'keyword':keyword,'item_url':item_url,'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
|
|
|
traceback.print_exc()
|
|
|
+# sys.exit()
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|