Jared 2 years ago
parent
commit
4e02107706
1 changed file with 30 additions and 4 deletions
  1. 30 4
      run4.py

+ 30 - 4
run4.py

@@ -12,7 +12,7 @@ from selenium.common.exceptions import WebDriverException
 import selenium
 import traceback
 from bs4 import BeautifulSoup
-
+import gzip
 from utility import database_access as DA
 from utility.parseutils import *
 from utility.connect import *
@@ -309,7 +309,17 @@ def process_web_request_start(driver, fid):
                 front, _ = fid.split(':')
                 if request.url.find(front) != -1:
                     print(request.url)
-                    resp = brotli.decompress(request.response.body)
+#                    resp = brotli.decompress(request.response.body)
+                    resp=request.response.body
+                    if 'gzip' in request.response.headers.get('Content-Encoding'):
+                        resp = gzip.decompress(request.response.body)
+
+                    if 'br' in request.response.headers.get('Content-Encoding'):
+                        resp = brotli.decompress(request.response.body)
+
+    #                resp = brotli.decompress(request.response.body)
+
+
                     jstext = resp.decode('utf-8')
                     output = parsing_js(jstext)
                     time.sleep(1)
@@ -359,9 +369,25 @@ def process_web_request_reviews(driver, output, ludocid):
                 # print('parsing js:')
                 if request.url.find(ludocid) != -1:
                     print(request.url)
-                    resp = brotli.decompress(request.response.body)
+
+#                    resp = brotli.decompress(request.response.body)
+#                    jstext = resp.decode('utf-8')
+#                    result = reviews_parsing_js(jstext)
+#                    resp = brotli.decompress(request.response.body)
+                    resp=request.response.body
+                    if 'gzip' in request.response.headers.get('Content-Encoding'):
+                        resp = gzip.decompress(request.response.body)
+
+                    if 'br' in request.response.headers.get('Content-Encoding'):
+                        resp = brotli.decompress(request.response.body)
+
+    #                resp = brotli.decompress(request.response.body)
+
+
                     jstext = resp.decode('utf-8')
-                    result = reviews_parsing_js(jstext)
+                    result = parsing_js(jstext)
+
+
                     output['reviews'] = str(result)
                     time.sleep(1)
                     return output