@@ -7,6 +7,8 @@ from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
+from selenium.common.exceptions import TimeoutException
+from selenium.common.exceptions import WebDriverException
import selenium
import traceback
from bs4 import BeautifulSoup
@@ -76,7 +78,7 @@ def brower_start(port):

def get_next_job(db):
    result = {}
-    result = db.query('select * from error_list2 where check_=0 ORDER BY RAND() limit 10')
+    result = db.query('select * from error_list2 where check_=0 ORDER BY RAND() limit 100')
    url_pd = pd.DataFrame([dict(i) for i in result])
    url_pd['fid'] = url_pd['item_url'].apply(lambda x: x.split('data=')[1].split('!')[3])
    # url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
@@ -178,7 +180,14 @@ def parsing_js(resp):
    else:
        output['header_image'] = ''

-    print(output)
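+    # The decoded place payload appears to expose the shop's canonical Google URL at txt[6][126][4]; its ludocid query parameter is what the reviews request is later matched against.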
+    if txt[6][126]:
+        output['google_url'] = txt[6][126][4]
+        ludocid_str = [i for i in txt[6][126][4].split('&') if i.find('ludocid') != -1]
+        if len(ludocid_str) != 0:
+            output['ludocid'] = ludocid_str[0].split('=')[-1]
+    else:
+        output['google_url'] = ''
+    output.setdefault('ludocid', '')
    # write_to_file(orig,'debug.pickle')
    return output

@@ -269,7 +278,7 @@ def process_web_request_start(driver, fid):
        if request.response:
            # print(request.url)
            if 'place?' in request.url :
-                print('parsing js:')
+                # print('parsing js:')
                front, _ = fid.split(':')
                if request.url.find(front) != -1:
                    print(request.url)
@@ -277,8 +286,8 @@ def process_web_request_start(driver, fid):
                    jstext = resp.decode('utf-8')
                    output = parsing_js(jstext)
                    time.sleep(1)
-                    return output, request.url
-    return 0, 0
+                    return output
+    return 0


def reviews_parsing_js(resp):
@@ -312,7 +321,7 @@ def reviews_parsing_js(resp):
    return result


-def process_web_request_reviews(driver, output, start_js):
+def process_web_request_reviews(driver, output, ludocid):
    time.sleep(3)

    print("reviews&**********************")
@@ -320,8 +329,8 @@ def process_web_request_reviews(driver, output, start_js):
        if request.response:
            # print(request.url)
            if 'listentitiesreviews?' in request.url :
-                print('parsing js:')
-                if start_js.find(request.url.split('!')[-2]) != -1:
+                # print('parsing js:')
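+                # Match the reviews request to this shop by its ludocid (extracted in parsing_js) rather than by fragments of the start-page request URL.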
+                if request.url.find(ludocid) != -1:
                    print(request.url)
                    resp = brotli.decompress(request.response.body)
                    jstext = resp.decode('utf-8')
@@ -396,7 +405,7 @@ def process_web_request_photo(driver, output, fid):
        if request.response:
            # print(request.url)
            if 'photo?' in request.url :
-                print('parsing js:')
+                # print('parsing js:')
                front, _ = fid.split(':')
                if request.url.find(front) != -1:
                    print(request.url)
@@ -443,7 +452,7 @@ def main():
    driver = brower_start(port)

    job = get_next_job(db)
-
+    c = 0
    for row, group in job.iterrows():
        try:
            item_url = group['item_url']
@@ -459,9 +468,6 @@ def main():

            print(fid, keyword, db_name)
            print(item_url)
-            # ActionChains(driver).key_down(Keys.SHIFT).key_down(Keys.F5).perform()
-            # driver.find_element_by_tag_name('body').send_keys(Keys.F5)
-            # time.sleep(3)

            #shop_info
            print('parsing shop info....')
@@ -482,13 +488,12 @@ def main():

                if driver.current_url == item_url:continue
                print(driver.current_url)
-                output, start_js = process_web_request_start(driver, fid)
+                output = process_web_request_start(driver, fid)
                if output != 0: break

-
            # reivews
            print('parsing reviews....')
-            if output['user_ratings_total'] == '':
+            if not output['user_ratings_total']:
                output['reviews'] = ''
            else:
                for i in range(3):
@@ -504,7 +509,7 @@ def main():
                    ActionChains(driver).move_to_element(element).click(element).perform()
                    time.sleep(0.5)

-                    output_ = process_web_request_reviews(driver, output, start_js)
+                    output_ = process_web_request_reviews(driver, output, output['ludocid'])
                    if output_ != 0:
                        output = output_
                        break
@@ -539,25 +544,33 @@ def main():
            else:
                output['shop_photo'] = '[]'
                output['menu_photo'] = '[]'
-
-            print(output)
-            query_name = output['adress_name'].replace('(','').replace(')', '').replace(' ','')
+
            output['item_url'] = item_url
            output['keyword'] = keyword
-            output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
+
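+            # Keep the google_url captured in parsing_js when one was found; only fall back to a Google search URL built from the address name.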
+            if output['google_url'] == '':
+                query_name = output['adress_name'].replace('(','').replace(')', '').replace(' ','')
+                output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
            output['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")

+            print(output)
            save_js_to_db(output, fid)
            error_table.upsert({'item_url':item_url,'check_':1},['item_url'])
            print('*'*10)
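+        # Assumption: a page-load timeout or a dead WebDriver session is unlikely to recover mid-run, so stop the job loop instead of logging the row to error_list3.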
+        except TimeoutException as e:
+            traceback.print_exc()
+            break
+
+        except WebDriverException as e:
+            traceback.print_exc()
+            break
+
        except:
            error_table3 = db['error_list3']
            error_table3.insert({'name':name,'keyword':keyword,'item_url':item_url,'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
            traceback.print_exc()
-            driver.quit()
-            print('drvier restart...')
-            driver = brower_start(port)
+

if __name__ == '__main__':
    main()