Jared 2 anni fa
parent
commit
72e5d14360
4 file modificati con 81 aggiunte e 8 eliminazioni
  1. 10 5
      run3.py
  2. 29 3
      swire_shop_review.py
  3. 25 0
      utility/alston_exp.py
  4. 17 0
      utility/gen_areacodes.py

+ 10 - 5
run3.py

@@ -68,7 +68,7 @@ def brower_start(port):
 #    browser = webdriver.Chrome(options=options)
     options.add_argument('--ignore-certificate-errors')
     options.add_argument("--no-sandbox")
-    options.add_argument("--headless")
+#    options.add_argument("--headless")
     options.add_argument("--disable-gpu")
     options.add_argument("--disable-dev-shm-usage")
     browser = webdriver.Chrome(options=options)
@@ -584,15 +584,18 @@ def main():
     #     port=int(sys.argv[2])
     if len(sys.argv) > 1 :
         port=int(sys.argv[1])
-        print('restart docker p{}'.format(port))
-        os.system('sudo docker container restart p'+str(port))
-        time.sleep(8)
+#        print('restart docker p{}'.format(port))
+#        os.system('sudo docker container restart p'+str(port))
+#        time.sleep(8)
     else:
         port = 2
 
     for i in range(10):
-        result = db2.query('select * from swire_store_list where check_ is null and fid not in (select distinct fid from error_list2)  ORDER BY RAND() limit 500')
+#        result = db2.query('select * from swire_store_list where check_ is null and fid not in (select distinct fid from error_list2)  ORDER BY RAND() limit 500')
+        result = db2.query('SELECT * FROM swire_store_list a WHERE not exists (select 1 from error_list2 tei where tei.fid = a.fid limit 1 ) ORDER BY RAND() limit 500')
+
         url_pd = pd.DataFrame([dict(i) for i in result])
+#        print(url_pd)
         url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
 
         # keyword = get_new_keyword(db2)
@@ -615,6 +618,7 @@ def main():
     
                 print('start...')
                 driver.get(item_url)
+                time.sleep(9999)
 #                page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu']", 3)
                 page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu-haAclf']", 3)
 
@@ -667,6 +671,7 @@ def main():
                 data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
                 table2.upsert({'place_id':row['place_id'],'check_':1},['place_id'])
             except Exception as e:
+                traceback.print_exc()
                 table3 = db2['error_list2']
                 table3.insert({'fid':row['fid'],'num':row['name'],'keyword':row['keyword'],'item_url':row['item_url'],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
                 print(e)

+ 29 - 3
swire_shop_review.py

@@ -21,13 +21,16 @@ import dataset
 import time
 import json
 import re
+import gzip
 import sys, os
 import socket
 import brotli
 import pickle
 from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 import urllib.parse
-chrome_window=False
+chrome_window=True
+#chrome_window=False
+
 globalkw=None
 proxyport=8787
 
@@ -61,9 +64,23 @@ def brower_start(port):
     print(proxyport)
     options = webdriver.ChromeOptions()
     if chrome_window:
+#        browser = webdriver.Chrome(
+##            desired_capabilities=options.to_capabilities()
+#        )
+        options.add_argument('--ignore-certificate-errors')
+        options.add_argument("--no-sandbox")
+        options.add_argument("--headless")
+        options.add_argument("--disable-gpu")
+        options.add_argument("--disable-dev-shm-usage")
         browser = webdriver.Chrome(
-            desired_capabilities=options.to_capabilities()
+            options=options
+#            ,seleniumwire_options={'disable_encoding': True}
+#            desired_capabilities=options.to_capabilities()
         )
+        browser.set_window_size(1400,1000)
+
+
+
     else:
         chrome_options = webdriver.ChromeOptions()
         chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport))  # Specify your Kubernetes service-name here
@@ -151,12 +168,21 @@ def process_web_request(db, driver, fid):
             if 'listentitiesreviews?' in request.url :
                 print('parsing js:')
                 print(request.url)
-                resp = brotli.decompress(request.response.body)
+                resp=request.response.body
+                if 'gzip' in request.response.headers.get('Content-Encoding'):
+                    resp = gzip.decompress(request.response.body)
+
+                if 'br' in request.response.headers.get('Content-Encoding'):
+                    resp = brotli.decompress(request.response.body)
+
+#                resp = brotli.decompress(request.response.body)
                 jstext = resp.decode('utf-8')
                 result = parsing_js(jstext)
 
                 save_js_to_db(result, fid)
                 time.sleep(1)
+    del driver.requests
+
 
 
 def page_down_(driver, xpath_css, time_):

+ 25 - 0
utility/alston_exp.py

@@ -0,0 +1,25 @@
+"""Export the google_poi.shop_list3 table to the Excel sheet details.xls."""
+import dataset
+import xlwt
+import pandas.io.sql as sql
+from pymysql import connect  # only connect() is used from pymysql
+
+# NOTE(review): credentials are hard-coded; consider moving them out of source control.
+con = connect(user="choozmo", password="pAssw0rd", host="db.ptt.cx", database="google_poi")
+
+# Earlier queries, kept for reference:
+#df=sql.read_sql('select * from shop_list2',con)
+#df=sql.read_sql('SELECT * FROM google_poi.swire_store_list where keyword = "火鍋";',con)
+#df=sql.read_sql('SELECT name,fid,addr,place_id,keyword,num,crawler_date FROM google_poi.swire_store_list where keyword = "火鍋"',con)
+#df=sql.read_sql('SELECT * FROM google_poi.swire_store_list where keyword = "火鍋餐廳"',con)
+#df=sql.read_sql('SELECT * FROM google_poi.swire_store_list ',con)
+try:
+    df = sql.read_sql('SELECT * FROM google_poi.shop_list3; ', con)
+finally:
+    # Release the MySQL connection even when the query fails.
+    con.close()
+
+# Print the data, then export it into the Excel sheet.
+print(df)
+#df.to_excel('hot_pot.xls')
+df.to_excel('details.xls')

+ 17 - 0
utility/gen_areacodes.py

@@ -0,0 +1,17 @@
+"""Seed google_poi.areacodes with one (num, keyword, 0) row per lat_lon_loc entry and shop_list2 keyword."""
+import dataset
+import traceback
+# NOTE(review): credentials are hard-coded in the URL below; consider moving them out of source control.
+db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
+
+# Collect the distinct keywords into a list first, then print them one per line.
+keywords = [row['keyword'] for row in db.query('select distinct keyword from shop_list2')]
+for keyword in keywords:
+    print(keyword)
+
+for keyword in keywords:
+    try:
+        # Bind the keyword as a parameter: a quote inside it would break the concatenated SQL.
+        db.query('insert into google_poi.areacodes (select num, :kw, 0 from lat_lon_loc)', kw=keyword)
+    except Exception:
+        traceback.print_exc()  # best effort: report and continue with the next keyword