jared hace 3 años
padre
commit
f0f0d0f555
Se ha modificado 1 fichero con 89 adiciones y 78 borrados
  1. 89 78
      swire_shop_item_list.py

+ 89 - 78
swire_shop_item_list.py

@@ -23,6 +23,7 @@ import re
 import sys, os
 import socket
 import brotli
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 
 chrome_window=False
 
@@ -33,11 +34,19 @@ def brower_start(port):
             desired_capabilities=options.to_capabilities()
         )
     else:
+        chrome_options = webdriver.ChromeOptions()
+        chrome_options.add_argument('--proxy-server=host.docker.internal:8787')  # Specify your Kubernetes service-name here
+        chrome_options.add_argument('--ignore-certificate-errors')
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
         browser = webdriver.Remote(
             command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
-            desired_capabilities=options.to_capabilities()
-        )
+            desired_capabilities=chrome_options.to_capabilities(),
+            seleniumwire_options={'addr':'0.0.0.0','port':8787,'auto_config': False}
 
+            )
+#            seleniumwire_options = {'addr': '172.17.0.2','port':4444})
+        browser.set_window_size(1400,1000)
     return browser
 
 
@@ -78,40 +87,23 @@ def keyin_keyword(driver, keyword):
 
 
 
-def get_crawler_list(db):
-#    result = db.query('select * from shop_item_list order by keyword')
-#    result = pd.DataFrame([i for i in result])
-#    result = result[~result.keyword.str.contains('項')]
-
-#    progress = db.query('select distinct(kw) from progress_list2 where num < 367')
-#    progress = pd.DataFrame([i for i in progress])
-
-#    if len(progress) != 0:
-#        keyword = result[~result['keyword'].isin(progress.kw.to_list())].iloc[0]['keyword']
-#    else:
-#        keyword = result.iloc[0]['keyword']
-#        
-#    return keyword
-    return '滷味'
-    cursor = db.query('select distinct(kw) from progress_list2 where num < 367 order by num asc limit 1')
+def get_next_job(db):
+    result={}
+    cursor = db.query('select distinct(kw),num+1 as num from progress_list2 where num < 367 order by num asc limit 1')
     for c in cursor:
-        return c['kw']
-    return None
+        result['kw']=c['kw']
+        result['num']=c['num']
+        break
 
-def get_lon_lat_list(db, keyword):
-    num=0
-    cursor=db.query('select num from progress_list2 where kw = "'+keyword+'"')
+    cursor = db.query('select lat,lon,loc from lat_lon_loc where num ="'+str(result['num'])+'"')
     for c in cursor:
-        num=c['num']
+        result['lat']=c['lat']
+        result['lon']=c['lon']
+        result['loc']=c['loc']
         break
 
-    cursor=db.query('select * from lat_lon_loc where num >= '+str(num))
-
-    lst=[]
-    for c in cursor:
-        lst.append({'num':c['num'],'loc':c['loc'],'lat':c['lat'],'lon':c['lon']})
+    return result
 
-    return lst
 
 def write_to_file(jsobj,fname):
     import codecs
@@ -120,6 +112,7 @@ def write_to_file(jsobj,fname):
     fw.close()
 
 def parsing_js(orig):
+    resultobj=[]
     content=""
     lines=orig.split('\n')
     for l in lines:
@@ -140,75 +133,93 @@ def parsing_js(orig):
     content_end=result.end()
 
     jscontent=content[content_begin:content_end-1]
-    write_to_file(jscontent,'c:/tmp/debug.txt')
+#    write_to_file(jscontent,'c:/tmp/debug.txt')
     jsobj=json.loads(jscontent)
     for x in jsobj[0][1][1:]:
         print(x[14][11])
         print(x[14][10])
         print(x[14][2])
         print(x[14][78])
-
+        try:
+            resultobj.append({'name':x[14][11],'fid':x[14][10],'addr':x[14][2][0],'place_id':x[14][78],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
+        except:
+            traceback.print_exc()
+    return resultobj
+
+def save_js_to_db(jsobj,num,keyword):
+    global store_list_table
+    for r in jsobj:
+        r['num']=num
+        r['keyword']=keyword
+        store_list_table.upsert(r,keys=['place_id'])
+
+def process_web_request(driver,area_num,keyword):
+    query = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//button[@vet="19128"]')))
+    time.sleep(14)
+
+    print("ppppppppp&**********************")
+    for request in driver.requests:
+        if request.response:
+            if 'https://www.google.com.tw/search?tbm=map' in request.url :
+                print('parsing js:')
+                resp = brotli.decompress(request.response.body)
+                jstext=resp.decode('utf-8')
+                resultobj=parsing_js(jstext)
+                save_js_to_db(resultobj,area_num,keyword)
 
 
 def main():
     global chrome_window
+    global store_list_table
     localip=socket.gethostbyname(socket.gethostname())
     if localip=='192.168.1.108':
-        chrome_window=True
+#        chrome_window=True
+        chrome_window=False
+
     db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
-    table = db['shop_item_list3']
-    table2 = db['progress_list2']
+    store_list_table = db['swire_store_list']
+
+    table2 = db['swire_progress_list']
 
-    port=4447
-    if len(sys.argv) > 1 :
-        port=int(sys.argv[1])
+
+    port=4444
+#    if len(sys.argv) > 1 :
+#        port=int(sys.argv[1])
+    if True:
         print('restart docker p{}'.format(port))
-        os.system('sudo docker container restart p'+str(port))
-        time.sleep(8)
+#        os.system('sudo docker container restart p'+str(port))
+        os.system('docker container restart p'+str(port))
+
+        time.sleep(10)
 
     print('drvier start...')
     driver = brower_start(port)
     
 
-    for i in range(10):
+    while True:
         try:
-            keyword  = get_crawler_list(db)
-            print(keyword)
-            lst = get_lon_lat_list(db, keyword)
-#            print(lst)
-            print(keyword, len(lst))
-
-            for r in lst:
-                latitude = r['lat'] #緯度
-                longitude = r['lon'] #精度
-                area_num=r['num']
-                table2.upsert({'kw':keyword,'num':r['num']},['kw'])
-
-                url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
-                driver.get(url)
-                keyin_keyword(driver, keyword)
-                failcnt = 0
-
-#                query = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//button[@vet="19128"]')))
-                time.sleep(11)
-                print("ppppppppp&**********************")
-                for request in driver.requests:
-                    if request.response:
-                        if 'https://www.google.com.tw/search?tbm=map' in request.url :
-                            print('parsing js:')
-                            resp = brotli.decompress(request.response.body)
-                            jstext=resp.decode('utf-8')
-                            parsing_js(jstext)
-
-
-
-                for page in range(10):
-                    if page < 2 :
-                        element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
-                        if element.get_attribute('disabled'):
-                            break
-                        driver.implicitly_wait(30)
-                        ActionChains(driver).move_to_element(element).click(element).perform() 
+            job=get_next_job(db)
+            print(job)
+            keyword  = job['kw']
+            latitude = job['lat'] #緯度
+            longitude = job['lon'] #精度
+            area_num=job['num']
+
+            url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
+            driver.get(url)
+            keyin_keyword(driver, keyword)
+
+            process_web_request(driver,area_num,keyword)
+
+            while True:
+                element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
+                if element.get_attribute('disabled'):
+                    break
+    #               driver.implicitly_wait(30)
+                ActionChains(driver).move_to_element(element).click(element).perform() 
+                process_web_request(driver,area_num,keyword)
+            table2.upsert({'kw':keyword,'num':job['num']},['kw'])
+
         except:
             pass