| 
					
				 | 
			
			
				@@ -0,0 +1,441 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# -*- coding: utf-8 -*- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#from selenium import webdriver 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from seleniumwire import webdriver 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from selenium.webdriver.common.action_chains import ActionChains 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from selenium.webdriver.common.keys import Keys 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from selenium.webdriver.support import expected_conditions as EC 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from selenium.webdriver.support.wait import WebDriverWait 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from selenium.webdriver.common.by import By 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import selenium 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import traceback 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from bs4 import BeautifulSoup 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from utility import database_access as DA 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from utility.parseutils import * 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from utility.connect import * 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from datetime import datetime 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import pandas as pd 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import dataset 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import time 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import json 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import re 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import sys, os 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import socket 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import brotli 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import urllib.parse 
			 | 
		
	
		
			
				 | 
				 | 
			
			
# Selects the browser backend used by brower_start(): True launches a local
# headless Chrome, False attaches to a remote Selenium hub through the
# selenium-wire proxy (set to False to use the remote path).
chrome_window=True

# Keyword override; main() fills it from sys.argv[1] and passes it to
# get_next_job() as ``repkw``. None means no override was supplied.
globalkw=None
# Port of the selenium-wire proxy; main() may replace it from sys.argv[3].
proxyport=8787
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def build_cache(db): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    id_dict={} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    cursor = db.query('SELECT place_id FROM google_poi.swire_store_list;') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    for c in cursor: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        id_dict[c['place_id']]=1 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return id_dict 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def brower_start(port): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    global proxyport 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    global chrome_window 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    print(proxyport) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    options = webdriver.ChromeOptions() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if chrome_window: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        options.add_argument('--ignore-certificate-errors') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        options.add_argument("--no-sandbox") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        options.add_argument("--headless") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        options.add_argument("--disable-dev-shm-usage") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        browser = webdriver.Chrome( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            desired_capabilities=options.to_capabilities() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        chrome_options = webdriver.ChromeOptions() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport))  # Specify your Kubernetes service-name here 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        chrome_options.add_argument('--ignore-certificate-errors') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        chrome_options.add_argument("--no-sandbox") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        chrome_options.add_argument("--disable-dev-shm-usage") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        browser = webdriver.Remote( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            command_executor='http://127.0.0.1:'+str(port)+'/wd/hub', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            desired_capabilities=chrome_options.to_capabilities(), 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            seleniumwire_options={'addr':'0.0.0.0','port':proxyport,'auto_config': False} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#            seleniumwire_options = {'addr': '172.17.0.2','port':4444}) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        browser.set_window_size(1400,1000) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return browser 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def page_down_(driver, xpath_css, time_): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    e = driver.find_element_by_css_selector('span[class="Jl2AFb"]') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    result_count = e.text.split('-')[1].replace(' 項結果','') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    print(result_count) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if int(result_count) > 5: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        for i in range(time_): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            e = driver.find_elements_by_css_selector('div[class="TFQHme"]') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            action = webdriver.common.action_chains.ActionChains(driver) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            action.move_to_element_with_offset(e[-1], e[-1].size['width'] + 1 , 0) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            action.click() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            action.perform() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            time.sleep(0.5) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def get_url_list(driver): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    page_down_(driver, '//div[@class="TFQHme"]', 8) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    url_soup = BeautifulSoup(driver.page_source, 'html.parser') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    url_list = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    for i in url_soup.find_all('a'): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            if i['href'].find('maps/place') != -1: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                url_list += [[i['href'], i['aria-label']]] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        except: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            pass 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # print(len(url_list)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return url_list 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def keyin_keyword(driver, keyword): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    button = driver.find_element_by_id("searchbox") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    driver.implicitly_wait(30) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    time.sleep(3) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def scan_job(db,kw): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    result={'kw':kw} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    cursor = db.query('select t1.num,next-prev as diff from google_poi.conv_log t1, (SELECT num,max(id) mid  FROM google_poi.conv_log group by num  ) t2 where t1.id=t2.mid having diff>0 order by rand()') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    for c in cursor: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        result['num']=c['num'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        break 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    cursor = db.query('select lat,lon,loc from lat_lon_loc where num ="'+str(result['num'])+'"') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    for c in cursor: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        result['lat']=c['lat'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        result['lon']=c['lon'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        result['loc']=c['loc'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        return result 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def get_next_job(db,repeat=False,repkw=None,repnum=None): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    global globalkw 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    result={} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#    if globalkw is not None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#        cursor = db.query('select distinct(kw),num+1 as num from swire_progress_list where kw="'+globalkw+'"') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#    else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#        cursor = db.query('select distinct(kw),num+1 as num from swire_progress_list where num < 367 order by rand() limit 1') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#    cursor = db.query('select kw,num  from areacodes where expand=0 order by rand()') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    cursor = db.query('select kw,num  from areacodes order by rand()') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    for c in cursor: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#        repkw=c['kw'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if repkw is None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            repkw=c['kw'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        result['kw']=c['kw'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        result['num']=c['num'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        break 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if repkw is not None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        result['kw']=repkw 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if result.get('num') is not None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        cursor = db.query('select lat,lon,loc from lat_lon_loc where num ="'+str(result['num'])+'"') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        for c in cursor: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            result['lat']=c['lat'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            result['lon']=c['lon'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            result['loc']=c['loc'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            break 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if repeat and repkw!= 'REP': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        result['kw']=repkw 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        result['num']=repnum 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if 'REP' in repkw: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if repnum=='REP': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            repnum=None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#            cursor = db.query('select  num from swire_store_list where num not in (select num from conv_log) order by rand() limit 1') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            cursor = db.query('select  num from swire_store_list  order by rand() limit 1') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            for c in cursor: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                repnum=c['num'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                break 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if repnum is None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            cursor = db.query('select  num from swire_store_list  order by rand() limit 1') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            for c in cursor: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                repnum=c['num'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                break 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#        cursor = db.query('select  lat_txt,lon_txt,keyword,num from swire_store_list order by rand() limit 1') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        cursor = db.query('select  lat_txt,lon_txt,keyword,num from swire_store_list where num="'+str(repnum)+'" limit 1') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        for c in cursor: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            result['kw']=c['keyword'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            result['num']=c['num'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            result['lat']=c['lat_txt'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            result['lon']=c['lon_txt'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            result['loc']='' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            return result 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+     
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if repeat: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#        cursor = db.query('select  lat_txt,lon_txt from swire_store_list where num ="'+str(result['num'])+'" and keyword="'+result['kw']+'" order by rand() limit 1') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        cursor = db.query('select  lat_txt,lon_txt,keyword from swire_store_list order by rand() limit 1') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        for c in cursor: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            result['kw']=c['keyword'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            result['lat']=c['lat_txt'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            result['lon']=c['lon_txt'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return result 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def write_to_file(jsobj,fname): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    import codecs 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    fw=codecs.open(fname,'w','utf-8') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    fw.write(str(jsobj)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    fw.close() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def parsing_js(orig): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    resultobj=[] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    content="" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    lines=orig.split('\n') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    for l in lines: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        newl=l.replace('\\"','"') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#        if '\\\\"' in newl: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#            print(newl) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#        newl=newl.repace('\\\\"','') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        newl=newl.replace('\\"','"') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        content+=newl 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    result=re.search(r'\[\["',content) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    print(result) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    content_begin=result.start() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    result=re.search(r'\]\]"',content) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    print(result) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    content_end=result.end() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    jscontent=content[content_begin:content_end-1] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#    write_to_file(jscontent,'c:/tmp/debug.txt') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    jsobj=json.loads(jscontent) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    for x in jsobj[0][1][1:]: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print(x[14][11]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print(x[14][9]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        reviews_cnt=None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        photo=None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        rating=None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        biz_id=None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        loc_x=None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        loc_y=None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        addr_elmts=None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        tel=None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            rating=x[14][4][7] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            reviews_cnt=x[14][4][8] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        except: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            traceback.print_exc() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            photo=x[14][37][0][0][0] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            num_photos=x[14][37][0][0][6][1] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        except: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            traceback.print_exc() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            loc_x=x[14][37][0][0][29][0] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            loc_y=x[14][37][0][0][29][1] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        except: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            traceback.print_exc() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            biz_id=x[14][57][2] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            tel=x[14][178][0][3] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        except: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            traceback.print_exc() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            addr_elmts=str(x[14][82]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        except: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            traceback.print_exc() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        category=str(x[14][13]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        topic=str(x[14][89]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print(x[14][13]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print(x[14][10]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print(x[14][2]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print(x[14][78]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            resultobj.append({'name':x[14][11],'fid':x[14][10],'addr':x[14][2][0],'addr_elmts':addr_elmts,'place_id':x[14][78],'category':category,'rating':rating,'reviews_cnt':reviews_cnt,'lat':x[14][9][2],'lat_txt':str(x[14][9][2]),'lon':x[14][9][3],'lon_txt':str(x[14][9][3]),'topic':topic,'photo':photo,'num_photos':num_photos,'loc_x':loc_x,'loc_y':loc_y,'biz_id':biz_id,'tel':tel,'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")}) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        except: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            traceback.print_exc() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return resultobj 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def save_js_to_db(jsobj,num,keyword): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    global store_list_table 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    global iddict 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    for r in jsobj: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if iddict.get(r['place_id']) is not None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            continue 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        r['num']=num 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        r['keyword']=keyword 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            store_list_table.insert(r) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#            store_list_table.upsert(r,keys=['place_id']) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        except: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            traceback.print_exc() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#        store_list_table.upsert(r,keys=['place_id']) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def process_web_request(db,driver,area_num,keyword): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    global prev_cnt 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#    query = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//button[@vet="19128"]'))) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    time.sleep(0.8) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    time.sleep(3) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    print("ppppppppp&**********************") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    for request in driver.requests: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if 'search?' in request.url : 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            print('searching.....') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#        else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#            print(request.url[20:60]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if request.response: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#            if 'https://www.google.com.tw/search?tbm=map' in request.url : 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            if 'search?' in request.url : 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                print('parsing js:') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                resp = brotli.decompress(request.response.body) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                jstext=resp.decode('utf-8') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                resultobj=parsing_js(jstext) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                print("before",datetime.now()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                print("num: "+str(area_num)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                save_js_to_db(resultobj,area_num,keyword) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                print("after",datetime.now()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                aft_cnt=0 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                cursor = db.query('select count(*) as cnt from swire_store_list where num="'+str(area_num)+'" ') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                for c in cursor: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    aft_cnt=c['cnt'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    break 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                db['conv_log'].insert({'num':area_num,'prev':prev_cnt,'next':aft_cnt,'dt':datetime.now()}) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#    time.sleep(9999) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def main(): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    global chrome_window 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    global store_list_table 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    global globalkw 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    global proxyport 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    global iddict 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    global prev_cnt 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    port=4444 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # if len(sys.argv) == 3 : 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    #     port=int(sys.argv[1]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    #     proxyport=int(sys.argv[2]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if len(sys.argv)>1: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        globalkw=sys.argv[1] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        port=int(sys.argv[2]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        proxyport=int(sys.argv[3]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    print(globalkw, port, proxyport) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    failcnt=0 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    localip=socket.gethostbyname(socket.gethostname()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#    if localip=='192.168.1.108': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#        chrome_window=True 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#        chrome_window=False 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    iddict=build_cache(db) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    store_list_table = db['swire_store_list'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#    table2 = db['swire_progress_list'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    table2 = db['swire_area_progress'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if not chrome_window: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print('restart docker p{}'.format(port)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#        os.system('sudo docker container restart p'+str(port)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        os.system('docker container restart p'+str(port)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        time.sleep(10) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    print('drvier start...') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    driver = brower_start(port) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+     
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    area_num=None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    while True: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            if len(sys.argv) > 4 : 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                repkw=sys.argv[1] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                repnum=sys.argv[2] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                if 'SCAN' in repkw: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    job=scan_job(db,repnum) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    job=get_next_job(db,repeat=True,repkw=repkw,repnum=repnum) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                job=get_next_job(db, repkw=globalkw) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            print(job) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            keyword  = job['kw'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            latitude = job['lat'] #緯度 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            longitude = job['lon'] #精度 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            area_num=job['num'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            safe_string = urllib.parse.quote_plus(keyword) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            url = 'https://www.google.com.tw/maps/@{},{},18z?hl=zh-TW'.format(latitude, longitude) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            prev_cnt=0 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            cursor = db.query('select count(*) as cnt from swire_store_list where num="'+str(area_num)+'" ') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            for c in cursor: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                prev_cnt=c['cnt'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                break 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#            url = 'https://www.google.com/maps/search/'+safe_string+'/@{},{},16z/data=!3m1!4b1'.format(latitude, longitude) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#            url='https://www.google.com/maps/search/'+safe_string+'/@24.7962279,121.0449762,15z/data=!3m1!4b1?hl=zh-TW' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#            print(url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#            url='https://www.google.com/maps/search/%E7%81%AB%E9%8D%8B%E9%A4%90%E5%BB%B3/@24.772608,121.0515456,13z' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            driver.get(url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#            time.sleep(3) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            keyin_keyword(driver, keyword) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            process_web_request(db,driver,area_num,keyword) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            pagecnt=0 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            while True: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                if element.get_attribute('disabled'): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    break 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    #               driver.implicitly_wait(30) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                ActionChains(driver).move_to_element(element).click(element).perform()  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                process_web_request(db,driver,area_num,keyword) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                pagecnt+=1 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                if pagecnt>=5: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    break 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#            table2.upsert({'kw':keyword,'num':job['num']},['kw']) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            table2.insert({'kw':keyword,'num':job['num']},['kw']) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            db.query('update areacodes set expand = 1 where num="'+str(job['num'])+'" and kw="'+keyword+'" ') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        except: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            traceback.print_exc() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            failcnt+=1 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            if failcnt>=15: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                sys.exit() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            pass 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+if __name__ == '__main__': # Entry guard: call main() only when this file is run as a script, not when it is imported.
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    main() 
			 |