noodles 2 years ago
parent
commit
25e5c70f5a
1 changed file with 37 additions and 24 deletions
  1. 37 24
      run4.py

+ 37 - 24
run4.py

@@ -7,6 +7,8 @@ from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.common.by import By
+from selenium.common.exceptions import TimeoutException
+from selenium.common.exceptions import WebDriverException
 import selenium
 import traceback
 from bs4 import BeautifulSoup
@@ -76,7 +78,7 @@ def brower_start(port):
 
 def get_next_job(db):
     result = {}
-    result = db.query('select * from error_list2 where check_=0 ORDER BY RAND() limit 10')
+    result = db.query('select * from error_list2 where check_=0 ORDER BY RAND() limit 100')
     url_pd = pd.DataFrame([dict(i) for i in result])
     url_pd['fid'] = url_pd['item_url'].apply(lambda x: x.split('data=')[1].split('!')[3])
     # url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
@@ -178,7 +180,14 @@ def parsing_js(resp):
     else:
         output['header_image'] = ''
 
-    print(output)
+    if txt[6][126]:
+        output['google_url'] = txt[6][126][4]
+        ludocid_str = [i for i in txt[6][126][4].split('&') if i.find('ludocid') != -1]
+        if len(ludocid_str) != 0:
+            ludocid = ludocid_str[0].split('=')[-1]
+            output['ludocid'] = ludocid
+    else:
+        output['google_url'] = ''
     # write_to_file(orig,'debug.pickle')
     return output 
 
@@ -269,7 +278,7 @@ def process_web_request_start(driver, fid):
         if request.response:
             # print(request.url)
             if 'place?' in request.url :
-                print('parsing js:')
+                # print('parsing js:')
                 front, _ = fid.split(':')
                 if request.url.find(front) != -1:
                     print(request.url)
@@ -277,8 +286,8 @@ def process_web_request_start(driver, fid):
                     jstext = resp.decode('utf-8')
                     output = parsing_js(jstext)
                     time.sleep(1)
-                    return output, request.url                 
-    return 0, 0
+                    return output              
+    return 0
 
 
 def reviews_parsing_js(resp):
@@ -312,7 +321,7 @@ def reviews_parsing_js(resp):
     return result 
 
 
-def process_web_request_reviews(driver, output, start_js):
+def process_web_request_reviews(driver, output, ludocid):
     time.sleep(3)
 
     print("reviews&**********************")
@@ -320,8 +329,8 @@ def process_web_request_reviews(driver, output, start_js):
         if request.response:
             # print(request.url)
             if 'listentitiesreviews?' in request.url :
-                print('parsing js:')
-                if start_js.find(request.url.split('!')[-2]) != -1:
+                # print('parsing js:')
+                if request.url.find(ludocid) != -1:
                     print(request.url)
                     resp = brotli.decompress(request.response.body)
                     jstext = resp.decode('utf-8')
@@ -396,7 +405,7 @@ def process_web_request_photo(driver, output, fid):
         if request.response:
             # print(request.url)
             if 'photo?' in request.url :
-                print('parsing js:')
+                # print('parsing js:')
                 front, _ = fid.split(':')
                 if request.url.find(front) != -1:
                     print(request.url)
@@ -443,7 +452,7 @@ def main():
     driver = brower_start(port)
 
     job = get_next_job(db)
-
+    c = 0
     for row, group in job.iterrows():
         try:
             item_url = group['item_url']
@@ -459,9 +468,6 @@ def main():
 
             print(fid, keyword, db_name)
             print(item_url)
-            # ActionChains(driver).key_down(Keys.SHIFT).key_down(Keys.F5).perform()
-            # driver.find_element_by_tag_name('body').send_keys(Keys.F5)
-            # time.sleep(3)  
              
             #shop_info
             print('parsing shop info....')
@@ -482,13 +488,12 @@ def main():
 
                 if driver.current_url == item_url:continue
                 print(driver.current_url)
-                output, start_js = process_web_request_start(driver, fid)
+                output = process_web_request_start(driver, fid)
                 if output != 0: break
 
-
             # reivews
             print('parsing reviews....')
-            if output['user_ratings_total'] == '':
+            if not output['user_ratings_total']:
                 output['reviews'] = ''
             else:
                 for i in range(3):
@@ -504,7 +509,7 @@ def main():
                         ActionChains(driver).move_to_element(element).click(element).perform()
                         time.sleep(0.5)
 
-                        output_ = process_web_request_reviews(driver, output, start_js)
+                        output_ = process_web_request_reviews(driver, output, output['ludocid'])
                         if output_ != 0:
                             output = output_
                             break
@@ -539,25 +544,33 @@ def main():
             else:
                 output['shop_photo'] = '[]'
                 output['menu_photo'] = '[]'
-
-            print(output)
-            query_name = output['adress_name'].replace('(','').replace(')', '').replace(' ','')
+         
             output['item_url'] = item_url
             output['keyword'] = keyword
-            output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
+
+            if output['google_url'] == '':
+                query_name = output['adress_name'].replace('(','').replace(')', '').replace(' ','')
+                output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
             output['crawler_date'] = datetime.today().strftime("%Y/%m/%d %H:%M")
 
+            print(output)
             save_js_to_db(output, fid)
             error_table.upsert({'item_url':item_url,'check_':1},['item_url'])
             print('*'*10)
 
+        except TimeoutException as e:
+            traceback.print_exc()
+            break
+        
+        except TimeoutException as e:
+            traceback.print_exc()
+            break
+
         except:
             error_table3 = db['error_list3']
             error_table3.insert({'name':name,'keyword':keyword,'item_url':item_url,'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
             traceback.print_exc()
-            driver.quit()
-            print('drvier restart...')
-            driver = brower_start(port)
+            
 
 if __name__ == '__main__':
     main()