@@ -19,6 +19,8 @@ import socket
 import brotli
 from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 import urllib.parse
+from seleniumwire.utils import decode as sw_decode
+
 #chrome_window=False
 chrome_window=True
 
@@ -46,11 +48,13 @@ def brower_start(port):
         options.add_argument("--headless")
         options.add_argument("--disable-gpu")
         options.add_argument("--disable-dev-shm-usage")
-
         browser = webdriver.Chrome(
             options=options
+#            ,seleniumwire_options={'disable_encoding': True}
 #            desired_capabilities=options.to_capabilities()
             )
+        browser.set_window_size(1400,1000)
+
     else:
         chrome_options = webdriver.ChromeOptions()
         chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport)) # Specify your Kubernetes service-name here
@@ -82,20 +86,6 @@ def page_down_(driver, xpath_css, time_):
         time.sleep(0.5)
 
 
-def get_url_list(driver):
-    page_down_(driver, '//div[@class="TFQHme"]', 8)
-
-    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
-    url_list = []
-    for i in url_soup.find_all('a'):
-        try:
-            if i['href'].find('maps/place') != -1:
-                url_list += [[i['href'], i['aria-label']]]
-        except:
-            pass
-    # print(len(url_list))
-    return url_list
-
 
 def keyin_keyword(driver, keyword):
     button = driver.find_element_by_id("searchbox")
@@ -181,7 +171,8 @@ def get_next_job(db,repeat=False,repkw=None,repnum=None):
 
     if repeat:
 #        cursor = db.query('select lat_txt,lon_txt from swire_store_list where num ="'+str(result['num'])+'" and keyword="'+result['kw']+'" order by rand() limit 1')
-        cursor = db.query('select lat_txt,lon_txt,keyword from swire_store_list order by rand() limit 1')
+#        cursor = db.query('select lat_txt,lon_txt,keyword from swire_store_list order by rand() limit 1')
+        cursor = db.query('select lat_txt,lon_txt,keyword from swire_store_list where keyword <> "火鍋餐廳" order by rand() limit 1')
 
         for c in cursor:
             result['kw']=c['keyword']
@@ -219,7 +210,12 @@ def parsing_js(orig):
     content_end=result.end()
 
     jscontent=content[content_begin:content_end-1]
+
+
+
 #    write_to_file(jscontent,'c:/tmp/debug.txt')
+#    write_to_file(jscontent,'c:/tmp/headless.txt')
+
     jsobj=json.loads(jscontent)
     for x in jsobj[0][1][1:]:
         print(x[14][11])
@@ -310,15 +306,23 @@ def process_web_request(db,driver,area_num,keyword):
         if 'search?' in request.url :
             print('parsing js:')
 #            resp=request.response.body
-#            resp = brotli.decompress(request.response.content)
-            resp = brotli.decompress(request.response.body)
-
-#            resp = gzip.decompress(request.response.body)
-#            print(resp)
-#            jstext=resp.decode('utf-8')
-            jstext=str(resp)
-            print(jstext)
+#            resp = sw_decode(request.response.body, request.response.headers.get('Content-Encoding', 'identity'))
+#            data = data.decode("utf8")
+#            print(request.response.header)
+#            sys.exit()
+#            driver.quit()
+            resp = request.response.body
+            print(request.response.headers.get('Content-Encoding'))
+            if 'gzip' in request.response.headers.get('Content-Encoding', ''):
+                resp = gzip.decompress(request.response.body)
+
+            if 'br' in request.response.headers.get('Content-Encoding', ''):
+                resp = brotli.decompress(request.response.body)
+            jstext=resp.decode('utf-8')
             resultobj=parsing_js(jstext)
+
+            break
+
     print("before",datetime.now())
     print("num: "+str(area_num))
     save_js_to_db(resultobj,area_num,keyword)
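The last hunk decodes the response body by checking the Content-Encoding header against 'gzip' and 'br' by hand. A minimal sketch of the alternative hinted at by the commented-out sw_decode line, assuming the same selenium-wire request objects used in process_web_request(); the helper name decode_search_response is illustrative and not part of the patch:

    from seleniumwire.utils import decode as sw_decode

    def decode_search_response(request):
        # Default to 'identity' so a missing Content-Encoding header is treated
        # as "not compressed" instead of raising a TypeError.
        encoding = request.response.headers.get('Content-Encoding', 'identity')
        # sw_decode picks gzip, deflate or brotli decompression from the header value.
        body = sw_decode(request.response.body, encoding)
        return body.decode('utf-8')

This would collapse the two decompression branches into a single call and also cover responses that arrive uncompressed, e.g. if the commented-out seleniumwire_options={'disable_encoding': True} setting is ever re-enabled.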