Jared hai 2 anos
pai
achega
c1aed08a9c
Modificouse 1 ficheiro con 28 adicións e 24 borrados
  1. 28 24
      swire_docker_itemlist.py

+ 28 - 24
swire_docker_itemlist.py

@@ -19,6 +19,8 @@ import socket
 import brotli
 from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 import urllib.parse
+from seleniumwire.utils import decode as sw_decode
+
 #chrome_window=False
 chrome_window=True
 
@@ -46,11 +48,13 @@ def brower_start(port):
         options.add_argument("--headless")
         options.add_argument("--disable-gpu")
         options.add_argument("--disable-dev-shm-usage")
-
         browser = webdriver.Chrome(
             options=options
+#            ,seleniumwire_options={'disable_encoding': True}
 #            desired_capabilities=options.to_capabilities()
         )
+        browser.set_window_size(1400,1000)
+
     else:
         chrome_options = webdriver.ChromeOptions()
         chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport))  # Specify your Kubernetes service-name here
@@ -82,20 +86,6 @@ def page_down_(driver, xpath_css, time_):
             time.sleep(0.5)
 
 
-def get_url_list(driver):
-    page_down_(driver, '//div[@class="TFQHme"]', 8)
-
-    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
-    url_list = []
-    for i in url_soup.find_all('a'):
-        try:
-            if i['href'].find('maps/place') != -1:
-                url_list += [[i['href'], i['aria-label']]]
-        except:
-            pass
-    # print(len(url_list))
-    return url_list
-
 
 def keyin_keyword(driver, keyword):
     button = driver.find_element_by_id("searchbox")
@@ -181,7 +171,8 @@ def get_next_job(db,repeat=False,repkw=None,repnum=None):
 
     if repeat:
 #        cursor = db.query('select  lat_txt,lon_txt from swire_store_list where num ="'+str(result['num'])+'" and keyword="'+result['kw']+'" order by rand() limit 1')
-        cursor = db.query('select  lat_txt,lon_txt,keyword from swire_store_list order by rand() limit 1')
+#        cursor = db.query('select  lat_txt,lon_txt,keyword from swire_store_list order by rand() limit 1')
+        cursor = db.query('select  lat_txt,lon_txt,keyword from swire_store_list where keyword <> "火鍋餐廳" order by rand() limit 1')
 
         for c in cursor:
             result['kw']=c['keyword']
@@ -219,7 +210,12 @@ def parsing_js(orig):
     content_end=result.end()
 
     jscontent=content[content_begin:content_end-1]
+
+
+
 #    write_to_file(jscontent,'c:/tmp/debug.txt')
+#    write_to_file(jscontent,'c:/tmp/headless.txt')
+
     jsobj=json.loads(jscontent)
     for x in jsobj[0][1][1:]:
         print(x[14][11])
@@ -310,15 +306,23 @@ def process_web_request(db,driver,area_num,keyword):
             if 'search?' in request.url :
                 print('parsing js:')
 #                resp=request.response.body
-#                resp = brotli.decompress(request.response.content)
-                resp = brotli.decompress(request.response.body)
-
-#                resp = gzip.decompress(request.response.body)
-#                print(resp)
-#                jstext=resp.decode('utf-8')
-                jstext=str(resp)
-                print(jstext)
+#                resp = sw_decode(request.response.body, request.response.headers.get('Content-Encoding', 'identity'))
+#                data = data.decode("utf8")
+#                print(request.response.header)
+#                sys.exit()
+#                driver.quit()
+                resp = request.response.body
+                print(request.response.headers.get('Content-Encoding'))
+                if 'gzip' in request.response.headers.get('Content-Encoding'):
+                    resp = gzip.decompress(request.response.body)
+
+                if 'br' in request.response.headers.get('Content-Encoding'):
+                    resp = brotli.decompress(request.response.body)
+                jstext=resp.decode('utf-8')
                 resultobj=parsing_js(jstext)
+
+                break
+
                 print("before",datetime.now())
                 print("num: "+str(area_num))
                 save_js_to_db(resultobj,area_num,keyword)