noodles 2 years ago
parent
commit
7260da945f
1 changed file with 32 additions and 25 deletions
  1. 32 25
      run4.py

+ 32 - 25
run4.py

@@ -32,9 +32,6 @@ chrome_window=False
 globalkw=None
 proxyport=8787
 
-db_columns = ['id','author_page','author_name', 'profile_photo_url', 'author_review_count',
-              'created_at', 'text', 'photos', 'store_review_time','store_review']
-
 
 def write_to_file(jsobj,fname):
     with open(fname, 'wb') as handle:
@@ -79,16 +76,16 @@ def brower_start(port):
 
 def get_next_job(db):
     result = {}
-    result = db.query('select * from error_list2 where check_=0 ORDER BY RAND() limit 3')
+    result = db.query('select * from error_list2 where check_=0 ORDER BY RAND() limit 1')
     url_pd = pd.DataFrame([dict(i) for i in result])
     url_pd['fid'] = url_pd['item_url'].apply(lambda x: x.split('data=')[1].split('!')[3])
     # url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
 
     remove = db.query('select item_url from error_list3')
     remove = pd.DataFrame([dict(i) for i in remove])
-    remove_fid_list = remove['item_url'].to_list()
-
-    url_pd = url_pd[~url_pd['item_url'].isin(remove_fid_list)]
+    if len(remove) != 0:
+        remove_fid_list = remove['item_url'].to_list()
+        url_pd = url_pd[~url_pd['item_url'].isin(remove_fid_list)]
 
     return url_pd
 
@@ -265,7 +262,7 @@ def save_js_to_db(jsobj, fid):
 
 
 def process_web_request_start(driver, fid):
-    time.sleep(5)
+    time.sleep(3)
 
     print("start&**********************")
     for request in driver.requests:
@@ -273,9 +270,9 @@ def process_web_request_start(driver, fid):
             # print(request.url)
             if 'place?' in request.url :
                 print('parsing js:')
-                print(request.url)
                 front, _ = fid.split(':')
                 if request.url.find(front) != -1:
+                    print(request.url)
                     resp = brotli.decompress(request.response.body)
                     jstext = resp.decode('utf-8')
                     output = parsing_js(jstext)
@@ -285,9 +282,8 @@ def process_web_request_start(driver, fid):
 
 
 def reviews_parsing_js(resp):
-    columns_name = ['author_id','author_page','author_name', 'author_image', 'author_review_count',
-              'review_time', 'review_content', 'review_image',
-              'rating', 'store_review_time','store_review']
+    columns_name = ['id','author_page','author_name', 'profile_photo_url', 'author_review_count',
+              'created_at', 'text', 'photos', 'rating', 'store_review_time','store_review']
     jsobj = json.loads(resp[5::])
     result = []
     for i in range(len(jsobj[2])):
@@ -317,7 +313,6 @@ def reviews_parsing_js(resp):
 
 
 def process_web_request_reviews(driver, output, start_js):
-    time.sleep(0.8)
     time.sleep(3)
 
     print("reviews&**********************")
@@ -326,21 +321,25 @@ def process_web_request_reviews(driver, output, start_js):
             # print(request.url)
             if 'listentitiesreviews?' in request.url :
                 print('parsing js:')
-                print(request.url)
                 if start_js.find(request.url.split('!')[-2]) != -1:
+                    print(request.url)
                     resp = brotli.decompress(request.response.body)
                     jstext = resp.decode('utf-8')
                     result = reviews_parsing_js(jstext)
                     output['reviews'] = str(result)
                     time.sleep(1)
                     return output
+    return 0
 
 
 def photos_parsing_js(resp):
     def image_url_change_size(url):
-        url_split = url.split('=')
-        new_url = url_split[0] + '=s600-' + '-'.join(url_split[-1].split('-')[-2::])
-        return new_url
+        if url.find('streetviewpixels') != -1:
+            return url
+        else:
+            url_split = url.split('=')
+            new_url = url_split[0] + '=s600-' + '-'.join(url_split[-1].split('-')[-2::])
+            return new_url
 
     jsobj = json.loads(resp[5::])
     # write_to_file(jsobj,'tmp/debug_{}.pickle'.format(c))
@@ -388,7 +387,7 @@ def process_web_request_photo(driver, output, fid):
         )
         element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
         ActionChains(driver).move_to_element(element).click(element).perform()
-        time.sleep(2)
+        time.sleep(1)
 
     print("photo&**********************")
     menu_list = []
@@ -398,9 +397,9 @@ def process_web_request_photo(driver, output, fid):
             # print(request.url)
             if 'photo?' in request.url :
                 print('parsing js:')
-                print(request.url)
                 front, _ = fid.split(':')
                 if request.url.find(front) != -1:
+                    print(request.url)
                     resp = brotli.decompress(request.response.body)
                     jstext = resp.decode('utf-8')
                     menu, all = photos_parsing_js(jstext)
@@ -424,7 +423,7 @@ def main():
 
     db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
     store_list_table = db['swire_store_list']
-    shop_table = db['shop_list4']
+    shop_table = db['shop_list3']
     error_table = db['error_list2']
 
     iddict=build_cache(db)
@@ -460,13 +459,16 @@ def main():
 
             print(fid, keyword, db_name)
             print(item_url)
-
+            # ActionChains(driver).key_down(Keys.SHIFT).key_down(Keys.F5).perform()
+            # driver.find_element_by_tag_name('body').send_keys(Keys.F5)
+            # time.sleep(3)  
+             
             #shop_info
             print('parsing shop info....')
             for i in range(5):
                 print('shop info try...{}'.format(i))
                 driver.get(item_url)
-                time.sleep(5)                
+                time.sleep(3)                
 
                 wait = WebDriverWait(driver, 10)
                 wait.until(
@@ -475,7 +477,7 @@ def main():
                 element = driver.find_element_by_id('sb_cb50')
                 driver.implicitly_wait(10)
                 ActionChains(driver).move_to_element(element).click(element).perform()
-                time.sleep(5)
+                time.sleep(3)
                 driver.back()
 
                 if driver.current_url == item_url:continue
@@ -502,12 +504,17 @@ def main():
                         ActionChains(driver).move_to_element(element).click(element).perform()
                         time.sleep(0.5)
 
-                        output = process_web_request_reviews(driver, output, start_js)
-                        break
+                        output_ = process_web_request_reviews(driver, output, start_js)
+                        if output_ != 0:
+                            output = output_
+                            break
                     except:
                         driver.get(item_url)
                         time.sleep(0.5)
 
+                if 'reviews' not in output.keys():
+                    continue
+
             # photo
             print('parsing photo....')
             if output['header_image'] != '':