noodles před 2 lety
rodič
revize
b97b099e65
Změněn 1 soubor: 41 přidání a 30 odebrání
  1. 41 30
      swire_shop_review.py

+ 41 - 30
swire_shop_review.py

@@ -7,6 +7,7 @@ from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.common.by import By
 import selenium
+import gzip
 import traceback
 from bs4 import BeautifulSoup
 
@@ -35,13 +36,10 @@ db_columns = ['author_id','author_page','author_name', 'author_image', 'author_r
               'review_time', 'review_content', 'review_image',
               'store_review_time','store_review']
 
+
 def write_to_file(jsobj,fname):
     with open(fname, 'wb') as handle:
         pickle.dump(jsobj, handle, protocol=pickle.HIGHEST_PROTOCOL)
-    # import codecs
-    # fw=codecs.open(fname,'w','utf-8')
-    # fw.write(str(jsobj))
-    # fw.close()
 
 
 def build_cache(db):
@@ -82,15 +80,13 @@ def brower_start(port):
 
 def get_next_job(db):
     result = {}
-    result = db.query('select * from swire_store_list ORDER BY RAND() limit 1')
+    sql = '''select t1.name, t1.ludocid, t1.fid, t1.user_ratings_total, t2.place_id from 
+                (select * from shop_list3 where ludocid is NOT NULL and user_ratings_total is NOT NULL and 
+                fid not in (select fid from review_process ) ORDER BY RAND() limit 1 ) 
+            as t1 join google_poi.swire_store_list as t2 on t1.fid = t2.fid'''
+    result = db.query(sql)
     url_pd = pd.DataFrame([dict(i) for i in result])
-    url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
-
-    remove = db.query('select fid from review_process')
-    remove = pd.DataFrame([dict(i) for i in remove])
-    remove_fid_list = remove['fid'].to_list()
-
-    url_pd = url_pd[~url_pd['fid'].isin(remove_fid_list)]
+    url_pd['item_url'] = url_pd['place_id'].apply(lambda x: 'https://www.google.com/maps/place/?q=place_id:{}'.format(x) )
 
     return url_pd
 
@@ -141,22 +137,32 @@ def save_js_to_db(jsobj, fid):
             traceback.print_exc()
 
 
-def process_web_request(db, driver, fid):
-    time.sleep(0.8)
+def process_web_request(driver, fid, ludocid):
     time.sleep(3)
     print("ppppppppp&**********************")
     for request in driver.requests:
         if request.response:
             # print(request.url)
             if 'listentitiesreviews?' in request.url :
-                print('parsing js:')
-                print(request.url)
-                resp = brotli.decompress(request.response.body)
-                jstext = resp.decode('utf-8')
-                result = parsing_js(jstext)
+                if request.url.find(ludocid) != -1:
+
+                    print('parsing js:')
+                    print(request.url)
+                    resp = brotli.decompress(request.response.body)
+
+                    if 'gzip' in request.response.headers.get('Content-Encoding'):
+                        resp = gzip.decompress(request.response.body)
 
-                save_js_to_db(result, fid)
-                time.sleep(1)
+                    if 'br' in request.response.headers.get('Content-Encoding'):
+                        resp = brotli.decompress(request.response.body)
+
+                    jstext = resp.decode('utf-8')
+                    result = parsing_js(jstext)
+
+                    save_js_to_db(result, fid)
+                    time.sleep(1)
+                    return 1
+    return 0
 
 
 def page_down_(driver, xpath_css, time_):
@@ -200,7 +206,7 @@ def main():
     global proxyport
     global iddict
 
-    localip=socket.gethostbyname(socket.gethostname())
+    # localip=socket.gethostbyname(socket.gethostname())
 
     db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
     store_list_table = db['swire_store_list']
@@ -229,20 +235,25 @@ def main():
             item_url = group['item_url']
             reviews_cnt = group['reviews_cnt']
             fid = group['fid']
+            ludocid = group['ludocid']
 
             print(reviews_cnt, item_url)
-            driver.get(item_url)
-            time.sleep(0.5)
-            shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
-            tmp_value = shop_soup.find('span', {'jsaction':'pane.rating.moreReviews'})
 
-            if tmp_value:
+            for i in range(3):
+                print('reviews try...{}'.format(i))
+                print("reviews try.....{}".format(datetime.now()))
+
+                driver.get(item_url)
+                time.sleep(0.5)
+
                 get_reviews(driver, reviews_cnt)
-                process_web_request(db, driver, fid)
+                status = process_web_request(driver, fid, ludocid)
                 print(driver.current_url)
 
-            db['review_process'].insert({'fid':fid, 'dt':datetime.now()})
-            
+                if status:
+                    db['review_process'].insert({'fid':fid, 'dt':datetime.now()})
+                    break
+
         except:
             traceback.print_exc()