noodles 2 years ago
parent
commit
f83698853e
1 changed file with 48 additions and 49 deletions
  1. 48 49
      run4.py

+ 48 - 49
run4.py

@@ -79,28 +79,25 @@ def brower_start(port):
 
 def get_next_job(db):
     result = {}
-    result = db.query('select * from error_list2 where check_=0 ORDER BY RAND() limit 5')
+    result = db.query('select * from error_list2 where check_=0 ORDER BY RAND() limit 3')
     url_pd = pd.DataFrame([dict(i) for i in result])
     url_pd['fid'] = url_pd['item_url'].apply(lambda x: x.split('data=')[1].split('!')[3])
     # url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
 
-    # remove = db.query('select fid from review_process')
-    # remove = pd.DataFrame([dict(i) for i in remove])
-    # remove_fid_list = remove['fid'].to_list()
+    remove = db.query('select item_url from error_list3')
+    remove = pd.DataFrame([dict(i) for i in remove])
+    remove_fid_list = remove['item_url'].to_list()
 
-    # url_pd = url_pd[~url_pd['fid'].isin(remove_fid_list)]
+    url_pd = url_pd[~url_pd['item_url'].isin(remove_fid_list)]
 
     return url_pd
 
 
-def parsing_js(resp, db_name):
+def parsing_js(resp):
     txt = json.loads(resp[5::])
 
     output = {}
 
-    if txt[6][11] != db_name:
-        return 0
-
     output['name'] = txt[6][11]
     output['adress_name'] = txt[6][18]
 
@@ -267,7 +264,7 @@ def save_js_to_db(jsobj, fid):
             traceback.print_exc()
 
 
-def process_web_request_start(driver, db_name):
+def process_web_request_start(driver, fid):
     time.sleep(5)
 
     print("start&**********************")
@@ -277,15 +274,14 @@ def process_web_request_start(driver, db_name):
             if 'place?' in request.url :
                 print('parsing js:')
                 print(request.url)
-                resp = brotli.decompress(request.response.body)
-                jstext = resp.decode('utf-8')
-                output = parsing_js(jstext, db_name)
-                if output  == 0:
-                    continue
-                else:
+                front, _ = fid.split(':')
+                if request.url.find(front) != -1:
+                    resp = brotli.decompress(request.response.body)
+                    jstext = resp.decode('utf-8')
+                    output = parsing_js(jstext)
                     time.sleep(1)
-                    return output
-    return 0
+                    return output, request.url                 
+    return 0, 0
 
 
 def reviews_parsing_js(resp):
@@ -320,7 +316,7 @@ def reviews_parsing_js(resp):
     return result 
 
 
-def process_web_request_reviews(driver, output):
+def process_web_request_reviews(driver, output, start_js):
     time.sleep(0.8)
     time.sleep(3)
 
@@ -331,16 +327,16 @@ def process_web_request_reviews(driver, output):
             if 'listentitiesreviews?' in request.url :
                 print('parsing js:')
                 print(request.url)
-                resp = brotli.decompress(request.response.body)
-                jstext = resp.decode('utf-8')
-                result = reviews_parsing_js(jstext)
-                output['reviews'] = str(result)
-                time.sleep(1)
-
-                return output
+                if start_js.find(request.url.split('!')[-2]) != -1:
+                    resp = brotli.decompress(request.response.body)
+                    jstext = resp.decode('utf-8')
+                    result = reviews_parsing_js(jstext)
+                    output['reviews'] = str(result)
+                    time.sleep(1)
+                    return output
 
 
-def photos_parsing_js(resp, c):
+def photos_parsing_js(resp):
     def image_url_change_size(url):
         url_split = url.split('=')
         new_url = url_split[0] + '=s600-' + '-'.join(url_split[-1].split('-')[-2::])
@@ -351,18 +347,22 @@ def photos_parsing_js(resp, c):
 
     menu = []
     all = []
+    photo_category_map = {}
+    for row in jsobj[12][0]:
+        photo_category_map[row[0]] = row[2]
 
-    if jsobj[10] == 0:
-        for img in jsobj[0]:
+    if photo_category_map[jsobj[13][0]] == '全部':
+        for img in jsobj[0][:5]:
             all += [image_url_change_size(img[6][0])]
-    else:
-        for img in jsobj[0]:
+
+    elif photo_category_map[jsobj[13][0]] == '菜單':
+        for img in jsobj[0][:5]:
             menu += [image_url_change_size(img[6][0])]
 
     return menu, all
     
 
-def process_web_request_photo(driver, output):
+def process_web_request_photo(driver, output, fid):
     try:
         driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='0']")
         photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
@@ -396,19 +396,19 @@ def process_web_request_photo(driver, output):
     for request in driver.requests:
         if request.response:
             # print(request.url)
-            c = 0
             if 'photo?' in request.url :
                 print('parsing js:')
                 print(request.url)
-                resp = brotli.decompress(request.response.body)
-                jstext = resp.decode('utf-8')
-                menu, all = photos_parsing_js(jstext, c)
-                menu_list += menu
-                all_list += all
-                c += 1
+                front, _ = fid.split(':')
+                if request.url.find(front) != -1:
+                    resp = brotli.decompress(request.response.body)
+                    jstext = resp.decode('utf-8')
+                    menu, all = photos_parsing_js(jstext)
+                    menu_list += menu
+                    all_list += all
 
-    output['shop_photo'] = str(all_list)
-    output['menu_photo'] = str(menu_list)
+    output['shop_photo'] = str(all_list[:5])
+    output['menu_photo'] = str(menu_list[:5])
 
     return output
     
@@ -458,7 +458,7 @@ def main():
             else:
                 db_name = num
 
-            print(fid, name, num, keyword, db_name)
+            print(fid, keyword, db_name)
             print(item_url)
 
             #shop_info
@@ -466,8 +466,7 @@ def main():
             for i in range(5):
                 print('shop info try...{}'.format(i))
                 driver.get(item_url)
-                driver.refresh()
-                time.sleep(5)
+                time.sleep(5)                
 
                 wait = WebDriverWait(driver, 10)
                 wait.until(
@@ -481,7 +480,7 @@ def main():
 
                 if driver.current_url == item_url:continue
                 print(driver.current_url)
-                output = process_web_request_start(driver, db_name)
+                output, start_js = process_web_request_start(driver, fid)
                 if output != 0: break
 
 
@@ -503,7 +502,7 @@ def main():
                         ActionChains(driver).move_to_element(element).click(element).perform()
                         time.sleep(0.5)
 
-                        output = process_web_request_reviews(driver, output)
+                        output = process_web_request_reviews(driver, output, start_js)
                         break
                     except:
                         driver.get(item_url)
@@ -525,7 +524,7 @@ def main():
                         element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
                         ActionChains(driver).move_to_element(element).click(element).perform()
 
-                        output = process_web_request_photo(driver, output)
+                        output = process_web_request_photo(driver, output, fid)
                         break
                     except:
                         pass
@@ -541,10 +540,10 @@ def main():
             output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
             output['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")
 
-            # shop_table.insert(output,['item_url'])
-
             save_js_to_db(output, fid)
             error_table.upsert({'item_url':item_url,'check_':1},['item_url'])
+            print('*'*10)
+
         except:
             error_table3 = db['error_list3']
             error_table3.insert({'name':name,'keyword':keyword,'item_url':item_url,'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})