@@ -79,28 +79,25 @@ def brower_start(port):


 def get_next_job(db):
     result = {}
-    result = db.query('select * from error_list2 where check_=0 ORDER BY RAND() limit 5')
+    result = db.query('select * from error_list2 where check_=0 ORDER BY RAND() limit 3')
     url_pd = pd.DataFrame([dict(i) for i in result])
     url_pd['fid'] = url_pd['item_url'].apply(lambda x: x.split('data=')[1].split('!')[3])
     # url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))

-    # remove = db.query('select fid from review_process')
-    # remove = pd.DataFrame([dict(i) for i in remove])
-    # remove_fid_list = remove['fid'].to_list()
+    remove = db.query('select item_url from error_list3')
+    remove = pd.DataFrame([dict(i) for i in remove])
+    remove_fid_list = remove['item_url'].to_list()

-    # url_pd = url_pd[~url_pd['fid'].isin(remove_fid_list)]
+    url_pd = url_pd[~url_pd['item_url'].isin(remove_fid_list)]

     return url_pd
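
Note: the three `remove` lines build a pandas anti-join that drops URLs already recorded in error_list3. The same filter could be pushed down into SQL; a minimal sketch, assuming error_list2 and error_list3 live in the same database and item_url is never NULL:

    result = db.query(
        "select * from error_list2 "
        "where check_=0 and item_url not in (select item_url from error_list3) "
        "ORDER BY RAND() limit 3"
    )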


-def parsing_js(resp, db_name):
+def parsing_js(resp):
     txt = json.loads(resp[5::])

     output = {}

-    if txt[6][11] != db_name:
-        return 0
-
     output['name'] = txt[6][11]
     output['adress_name'] = txt[6][18]
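
Note: parsing_js indexes deep into Google's pb-style JSON payload (txt[6][11], txt[6][18]), so a response with a shorter array raises IndexError. A defensive sketch; the helper name `pick` is illustrative, not part of the source:

    def pick(obj, *idx):
        # Walk nested list indices; return None instead of raising on short lists.
        for i in idx:
            try:
                obj = obj[i]
            except (IndexError, TypeError):
                return None
        return obj

    # e.g. output['name'] = pick(txt, 6, 11)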
@@ -267,7 +264,7 @@ def save_js_to_db(jsobj, fid):
         traceback.print_exc()


-def process_web_request_start(driver, db_name):
+def process_web_request_start(driver, fid):
     time.sleep(5)

     print("start&**********************")
@@ -277,15 +274,14 @@ def process_web_request_start(driver, db_name):
             if 'place?' in request.url :
                 print('parsing js:')
                 print(request.url)
-                resp = brotli.decompress(request.response.body)
-                jstext = resp.decode('utf-8')
-                output = parsing_js(jstext, db_name)
-                if output == 0:
-                    continue
-                else:
+                front, _ = fid.split(':')
+                if request.url.find(front) != -1:
+                    resp = brotli.decompress(request.response.body)
+                    jstext = resp.decode('utf-8')
+                    output = parsing_js(jstext)
                     time.sleep(1)
-                    return output
-    return 0
+                    return output, request.url
+    return 0, 0
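
Note: the function now returns an (output, url) pair, with (0, 0) as the no-match sentinel, and matches requests on the first half of the fid. `fid.split(':')` raises ValueError unless the fid contains exactly one colon; if malformed fids are possible (an assumption, not confirmed by the source), a tolerant variant is:

    front = fid.split(':', 1)[0]  # works for zero, one, or many colons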


 def reviews_parsing_js(resp):
@@ -320,7 +316,7 @@ def reviews_parsing_js(resp):
     return result


-def process_web_request_reviews(driver, output):
+def process_web_request_reviews(driver, output, start_js):
     time.sleep(0.8)
     time.sleep(3)
@@ -331,16 +327,16 @@ def process_web_request_reviews(driver, output):
             if 'listentitiesreviews?' in request.url :
                 print('parsing js:')
                 print(request.url)
-                resp = brotli.decompress(request.response.body)
-                jstext = resp.decode('utf-8')
-                result = reviews_parsing_js(jstext)
-                output['reviews'] = str(result)
-                time.sleep(1)
-
-                return output
+                if start_js.find(request.url.split('!')[-2]) != -1:
+                    resp = brotli.decompress(request.response.body)
+                    jstext = resp.decode('utf-8')
+                    result = reviews_parsing_js(jstext)
+                    output['reviews'] = str(result)
+                    time.sleep(1)
+                    return output
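
Note: when no captured request passes the start_js check, the function falls off the end and returns None, and the caller's output is overwritten. A guarded call-site sketch (keeping the previous output on a miss is an assumption about the desired behaviour):

    result = process_web_request_reviews(driver, output, start_js)
    if result is not None:
        output = result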


-def photos_parsing_js(resp, c):
+def photos_parsing_js(resp):
     def image_url_change_size(url):
         url_split = url.split('=')
         new_url = url_split[0] + '=s600-' + '-'.join(url_split[-1].split('-')[-2::])
@@ -351,18 +347,22 @@ def photos_parsing_js(resp, c):

     menu = []
     all = []
+    photo_category_map = {}
+    for row in jsobj[12][0]:
+        photo_category_map[row[0]] = row[2]

-    if jsobj[10] == 0:
-        for img in jsobj[0]:
+    if photo_category_map[jsobj[13][0]] == '全部':
+        for img in jsobj[0][:5]:
             all += [image_url_change_size(img[6][0])]
-    else:
-        for img in jsobj[0]:
+
+    elif photo_category_map[jsobj[13][0]] == '菜單':
+        for img in jsobj[0][:5]:
             menu += [image_url_change_size(img[6][0])]

     return menu, all
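
Note: the category labels compared above are the Chinese Maps UI strings '全部' ("All") and '菜單' ("Menu"), and photo_category_map[jsobj[13][0]] raises KeyError when the active tab id is missing from the map built just before it. A tolerant sketch of the same branch:

    category = photo_category_map.get(jsobj[13][0])  # None when the tab id is unmapped
    if category == '全部':    # the "All" photos tab
        all += [image_url_change_size(img[6][0]) for img in jsobj[0][:5]]
    elif category == '菜單':  # the "Menu" tab
        menu += [image_url_change_size(img[6][0]) for img in jsobj[0][:5]]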


-def process_web_request_photo(driver, output):
+def process_web_request_photo(driver, output, fid):
     try:
         driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='0']")
         photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
@@ -396,19 +396,19 @@ def process_web_request_photo(driver, output):
     for request in driver.requests:
         if request.response:
             # print(request.url)
-            c = 0
             if 'photo?' in request.url :
                 print('parsing js:')
                 print(request.url)
-                resp = brotli.decompress(request.response.body)
-                jstext = resp.decode('utf-8')
-                menu, all = photos_parsing_js(jstext, c)
-                menu_list += menu
-                all_list += all
-                c += 1
+                front, _ = fid.split(':')
+                if request.url.find(front) != -1:
+                    resp = brotli.decompress(request.response.body)
+                    jstext = resp.decode('utf-8')
+                    menu, all = photos_parsing_js(jstext)
+                    menu_list += menu
+                    all_list += all

-    output['shop_photo'] = str(all_list)
-    output['menu_photo'] = str(menu_list)
+    output['shop_photo'] = str(all_list[:5])
+    output['menu_photo'] = str(menu_list[:5])

     return output
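
Note: `menu, all = photos_parsing_js(jstext)` rebinds the builtin all() inside the loop (photos_parsing_js does the same internally). It is harmless as written, but a rename avoids surprises if all() is ever needed; the names below are illustrative:

    menu_urls, all_urls = photos_parsing_js(jstext)
    menu_list += menu_urls
    all_list += all_urls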
@@ -458,7 +458,7 @@ def main():
     else:
         db_name = num

-    print(fid, name, num, keyword, db_name)
+    print(fid, keyword, db_name)
     print(item_url)

     #shop_info
@@ -466,8 +466,7 @@ def main():
     for i in range(5):
         print('shop info try...{}'.format(i))
         driver.get(item_url)
-        driver.refresh()
-        time.sleep(5)
+        time.sleep(5)

         wait = WebDriverWait(driver, 10)
         wait.until(
@@ -481,7 +480,7 @@ def main():

         if driver.current_url == item_url:continue
         print(driver.current_url)
-        output = process_web_request_start(driver, db_name)
+        output, start_js = process_web_request_start(driver, fid)
         if output != 0: break
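
Note: the loop treats an unchanged URL as a failed load and retries. The fixed time.sleep(5) could instead wait on the redirect explicitly, reusing the WebDriverWait already in scope; a sketch (the 10-second timeout is an assumption):

    from selenium.common.exceptions import TimeoutException

    try:
        WebDriverWait(driver, 10).until(lambda d: d.current_url != item_url)
    except TimeoutException:
        continue  # same effect as the current_url check above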
@@ -503,7 +502,7 @@ def main():
             ActionChains(driver).move_to_element(element).click(element).perform()
             time.sleep(0.5)

-            output = process_web_request_reviews(driver, output)
+            output = process_web_request_reviews(driver, output, start_js)
             break
         except:
             driver.get(item_url)
@@ -525,7 +524,7 @@ def main():
             element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
             ActionChains(driver).move_to_element(element).click(element).perform()

-            output = process_web_request_photo(driver, output)
+            output = process_web_request_photo(driver, output, fid)
             break
         except:
             pass
@@ -541,10 +540,10 @@ def main():
         output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
         output['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")

-        # shop_table.insert(output,['item_url'])
-
         save_js_to_db(output, fid)
         error_table.upsert({'item_url':item_url,'check_':1},['item_url'])
+        print('*'*10)
+
     except:
         error_table3 = db['error_list3']
         error_table3.insert({'name':name,'keyword':keyword,'item_url':item_url,'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
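
Note: both except: blocks in this function are bare, so they also swallow KeyboardInterrupt and SystemExit. A narrower sketch for the failure path (printing the traceback is an assumption; traceback is already imported for save_js_to_db):

    except Exception:
        traceback.print_exc()
        error_table3 = db['error_list3']
        error_table3.insert({'name': name, 'keyword': keyword, 'item_url': item_url,
                             'crawler_date': datetime.today().strftime('%Y/%m/%d %H:%M')})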