jared hace 3 años
padre
commit
2396684043
Se ha modificado 1 fichero con 75 adiciones y 11 borrados
  1. 75 11
      swire_shop_item_list.py

+ 75 - 11
swire_shop_item_list.py

@@ -24,12 +24,13 @@ import sys, os
 import socket
 import brotli
 from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
-
+import urllib.parse
 chrome_window=False
 globalkw=None
 proxyport=8787
 def brower_start(port):
     global proxyport
+    global chrome_window
     print(proxyport)
     options = webdriver.ChromeOptions()
     if chrome_window:
@@ -145,15 +146,60 @@ def parsing_js(orig):
     content_end=result.end()
 
     jscontent=content[content_begin:content_end-1]
-#    write_to_file(jscontent,'c:/tmp/debug.txt')
+    write_to_file(jscontent,'c:/tmp/debug.txt')
     jsobj=json.loads(jscontent)
     for x in jsobj[0][1][1:]:
         print(x[14][11])
+        print(x[14][9])
+        reviews_cnt=None
+        photo=None
+        rating=None
+        biz_id=None
+        loc_x=None
+        loc_y=None
+        addr_elmts=None
+        tel=None
+        try:
+            rating=x[14][4][7]
+            reviews_cnt=x[14][4][8]
+        except:
+            traceback.print_exc()
+
+        try:
+            photo=x[14][37][0][0][0]
+            num_photos=x[14][37][0][0][6][1]
+        except:
+            traceback.print_exc()
+
+        try:
+            loc_x=x[14][37][0][0][29][0]
+            loc_y=x[14][37][0][0][29][1]
+        except:
+            traceback.print_exc()
+
+        try:
+            biz_id=x[14][57][2]
+            tel=x[14][178][0][3]
+        except:
+            traceback.print_exc()
+
+        try:
+            addr_elmts=str(x[14][82])
+        except:
+            traceback.print_exc()
+
+
+
+
+        category=str(x[14][13])
+        topic=str(x[14][89])
+        print(x[14][13])
+
         print(x[14][10])
         print(x[14][2])
         print(x[14][78])
         try:
-            resultobj.append({'name':x[14][11],'fid':x[14][10],'addr':x[14][2][0],'place_id':x[14][78],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
+            resultobj.append({'name':x[14][11],'fid':x[14][10],'addr':x[14][2][0],'addr_elmts':addr_elmts,'place_id':x[14][78],'category':category,'rating':rating,'reviews_cnt':reviews_cnt,'lat':x[14][9][2],'lat_txt':str(x[14][9][2]),'lon':x[14][9][3],'lon_txt':str(x[14][9][3]),'topic':topic,'photo':photo,'num_photos':num_photos,'loc_x':loc_x,'loc_y':loc_y,'biz_id':biz_id,'tel':tel,'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
         except:
             traceback.print_exc()
     return resultobj
@@ -163,16 +209,27 @@ def save_js_to_db(jsobj,num,keyword):
     for r in jsobj:
         r['num']=num
         r['keyword']=keyword
-        store_list_table.upsert(r,keys=['place_id'])
+        try:
+            store_list_table.insert(r)
+
+#            store_list_table.upsert(r,keys=['place_id'])
+        except:
+            traceback.print_exc()
+#        store_list_table.upsert(r,keys=['place_id'])
 
 def process_web_request(driver,area_num,keyword):
-    query = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//button[@vet="19128"]')))
+#    query = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//button[@vet="19128"]')))
     time.sleep(0.8)
-
+    time.sleep(3)
     print("ppppppppp&**********************")
     for request in driver.requests:
+        if 'search?' in request.url :
+            print('searching.....')
+#        else:
+#            print(request.url[20:60])
         if request.response:
-            if 'https://www.google.com.tw/search?tbm=map' in request.url :
+#            if 'https://www.google.com.tw/search?tbm=map' in request.url :
+            if 'search?' in request.url :
                 print('parsing js:')
                 resp = brotli.decompress(request.response.body)
                 jstext=resp.decode('utf-8')
@@ -181,6 +238,8 @@ def process_web_request(driver,area_num,keyword):
                 save_js_to_db(resultobj,area_num,keyword)
                 print("after",datetime.now())
 
+#    time.sleep(9999)
+
 
 def main():
     global chrome_window
@@ -191,9 +250,9 @@ def main():
         globalkw=sys.argv[1]
     failcnt=0
     localip=socket.gethostbyname(socket.gethostname())
-    if localip=='192.168.1.108':
+#    if localip=='192.168.1.108':
 #        chrome_window=True
-        chrome_window=False
+#        chrome_window=False
 
     db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
     store_list_table = db['swire_store_list']
@@ -227,9 +286,14 @@ def main():
             latitude = job['lat'] #緯度
             longitude = job['lon'] #經度
             area_num=job['num']
-
-            url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
+            safe_string = urllib.parse.quote_plus(keyword)
+            url = 'https://www.google.com.tw/maps/@{},{},18z?hl=zh-TW'.format(latitude, longitude)
+#            url = 'https://www.google.com/maps/search/'+safe_string+'/@{},{},16z/data=!3m1!4b1'.format(latitude, longitude)
+#            url='https://www.google.com/maps/search/'+safe_string+'/@24.7962279,121.0449762,15z/data=!3m1!4b1?hl=zh-TW'
+#            print(url)
+#            url='https://www.google.com/maps/search/%E7%81%AB%E9%8D%8B%E9%A4%90%E5%BB%B3/@24.772608,121.0515456,13z'
             driver.get(url)
+#            time.sleep(3)
             keyin_keyword(driver, keyword)
 
             process_web_request(driver,area_num,keyword)