noodles 2 years ago
parent
commit
a84b8d6242

binary
details.xls


binary
hot_pot.xls


+ 6 - 5
run.py

@@ -44,12 +44,13 @@ def serive_create(profilepath):
 def brower_start(port):
     options = webdriver.ChromeOptions()
 #    browser = webdriver.Chrome(options=options)
+    browser = webdriver.Chrome(options=options)
 
-    browser = webdriver.Remote(
-        command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
-        # command_executor='http://192.53.174.202:'+str(port)+'/wd/hub',
-        desired_capabilities=options.to_capabilities()
-    )
+#    browser = webdriver.Remote(
+#        command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
+#        # command_executor='http://192.53.174.202:'+str(port)+'/wd/hub',
+#        desired_capabilities=options.to_capabilities()
+#    )
     return browser
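
Note: this hunk switches run.py from a Selenium hub back to a local ChromeDriver. For reference, a minimal sketch of the two launch modes being toggled, assuming the Selenium 3-style API used elsewhere in the repo (function name and hub address are illustrative placeholders):

    from selenium import webdriver

    def start_browser(port, use_remote=False):
        options = webdriver.ChromeOptions()
        if use_remote:
            # Talk to a Selenium hub/standalone container exposed on the given port.
            return webdriver.Remote(
                command_executor='http://127.0.0.1:{}/wd/hub'.format(port),
                desired_capabilities=options.to_capabilities()
            )
        # Drive a local ChromeDriver directly, which is what this commit reverts to.
        return webdriver.Chrome(options=options)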
 
 

+ 10 - 5
run3.py

@@ -68,7 +68,7 @@ def brower_start(port):
 #    browser = webdriver.Chrome(options=options)
     options.add_argument('--ignore-certificate-errors')
     options.add_argument("--no-sandbox")
-    options.add_argument("--headless")
+#    options.add_argument("--headless")
     options.add_argument("--disable-gpu")
     options.add_argument("--disable-dev-shm-usage")
     browser = webdriver.Chrome(options=options)
@@ -584,15 +584,18 @@ def main():
     #     port=int(sys.argv[2])
     if len(sys.argv) > 1 :
         port=int(sys.argv[1])
-        print('restart docker p{}'.format(port))
-        os.system('sudo docker container restart p'+str(port))
-        time.sleep(8)
+#        print('restart docker p{}'.format(port))
+#        os.system('sudo docker container restart p'+str(port))
+#        time.sleep(8)
     else:
         port = 2
 
     for i in range(10):
-        result = db2.query('select * from swire_store_list where check_ is null and fid not in (select distinct fid from error_list2)  ORDER BY RAND() limit 500')
+#        result = db2.query('select * from swire_store_list where check_ is null and fid not in (select distinct fid from error_list2)  ORDER BY RAND() limit 500')
+        result = db2.query('SELECT * FROM swire_store_list a WHERE not exists (select 1 from error_list2 tei where tei.fid = a.fid limit 1 ) ORDER BY RAND() limit 500')
+
         url_pd = pd.DataFrame([dict(i) for i in result])
+#        print(url_pd)
         url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
 
         # keyword = get_new_keyword(db2)
@@ -615,6 +618,7 @@ def main():
     
                 print('start...')
                 driver.get(item_url)
+                time.sleep(9999)
 #                page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu']", 3)
                 page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu-haAclf']", 3)
 
@@ -667,6 +671,7 @@ def main():
                 data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
                 table2.upsert({'place_id':row['place_id'],'check_':1},['place_id'])
             except Exception as e:
+                traceback.print_exc()
                 table3 = db2['error_list2']
                 table3.insert({'fid':row['fid'],'num':row['name'],'keyword':row['keyword'],'item_url':row['item_url'],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
                 print(e)
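
Note: the main() hunk above replaces a `fid NOT IN (SELECT DISTINCT fid ...)` filter with a correlated NOT EXISTS anti-join, which skips rows already present in error_list2 without materializing the full distinct-fid list and is not affected by NULL fids. A minimal sketch of the same query through the dataset library (table names as in the diff, DSN is a placeholder):

    import dataset
    import pandas as pd

    db2 = dataset.connect('mysql://user:password@host:3306/google_poi?charset=utf8mb4')  # placeholder DSN
    result = db2.query(
        'SELECT * FROM swire_store_list a '
        'WHERE NOT EXISTS (SELECT 1 FROM error_list2 tei WHERE tei.fid = a.fid LIMIT 1) '
        'ORDER BY RAND() LIMIT 500'
    )
    url_pd = pd.DataFrame([dict(r) for r in result])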

+ 63 - 16
run4.py

@@ -16,7 +16,7 @@ import gzip
 from utility import database_access as DA
 from utility.parseutils import *
 from utility.connect import *
-
+import redis
 from datetime import datetime
 from requests import session
 import pandas as pd
@@ -99,7 +99,9 @@ def get_next_job(db):
     result = {}
 #    result = db.query('select * from error_list2 where check_=0 ORDER BY RAND() limit 100')
 
-    result = db.query('SELECT * FROM swire_store_list a WHERE not exists (select 1 from error_list2 tei where tei.fid = a.fid limit 1 ) ORDER BY RAND() limit 100')
+#    result = db.query('SELECT * FROM swire_store_list a WHERE not exists (select 1 from error_list2 tei where tei.fid = a.fid limit 1 ) ORDER BY RAND() limit 30')
+#    result = db.query('SELECT * FROM swire_store_list a WHERE not exists (select 1 from shop_list3 tei where tei.fid = a.fid limit 1 ) ORDER BY RAND() limit 30')
+    result = db.query('SELECT * FROM swire_store_list a WHERE fid not in (select fid from shop_list3 ) ORDER BY RAND() limit 30')
 
 
     url_pd = pd.DataFrame([dict(i) for i in result])
@@ -110,11 +112,11 @@ def get_next_job(db):
 
     # url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
 
-    remove = db.query('select item_url from error_list3')
-    remove = pd.DataFrame([dict(i) for i in remove])
-    if len(remove) != 0:
-        remove_fid_list = remove['item_url'].to_list()
-        url_pd = url_pd[~url_pd['item_url'].isin(remove_fid_list)]
+#    remove = db.query('select item_url from error_list3')
+#    remove = pd.DataFrame([dict(i) for i in remove])
+#    if len(remove) != 0:
+#        remove_fid_list = remove['item_url'].to_list()
+#        url_pd = url_pd[~url_pd['item_url'].isin(remove_fid_list)]
 
     return url_pd
 
@@ -222,6 +224,19 @@ def parsing_js(resp):
 def time_parsing_js(time_json, output):
     weekday_text = []
     periods = []
+    if time_json is None:
+        output['open_now'] = 'False'
+        output['periods'] = ''
+        output['weekday_text'] = ''
+        output['time_status'] = ''
+        return output
+    if time_json[1] is None:
+        output['open_now'] = 'False'
+        output['periods'] = ''
+        output['weekday_text'] = ''
+        output['time_status'] = ''
+        return output
+
 
     for time_ in time_json[1]:
         week = time_[0]
@@ -413,14 +428,14 @@ def photos_parsing_js(resp):
         photo_category_map[row[0]] = row[2]
 
     if photo_category_map[jsobj[13][0]] == '全部':
-        for img in jsobj[0][:5]:
+        for img in jsobj[0]:
             all += [image_url_change_size(img[6][0])]
 
     elif photo_category_map[jsobj[13][0]] == '菜單':
-        for img in jsobj[0][:5]:
+        for img in jsobj[0]:
             menu += [image_url_change_size(img[6][0])]
 
-    return menu, all
+    return list(set(menu)), list(set(all))
     
 
 def process_web_request_photo(driver, output, fid):
@@ -461,8 +476,18 @@ def process_web_request_photo(driver, output, fid):
                 # print('parsing js:')
                 front, _ = fid.split(':')
                 if request.url.find(front) != -1:
+#                    resp = brotli.decompress(request.response.body)
                     print(request.url)
-                    resp = brotli.decompress(request.response.body)
+                    resp=request.response.body
+                    if 'gzip' in request.response.headers.get('Content-Encoding'):
+                        resp = gzip.decompress(request.response.body)
+
+                    if 'br' in request.response.headers.get('Content-Encoding'):
+                        resp = brotli.decompress(request.response.body)
+
+
+
+
                     jstext = resp.decode('utf-8')
                     menu, all = photos_parsing_js(jstext)
                     menu_list += menu
@@ -489,7 +514,8 @@ def main():
     error_table = db['error_list2']
 
     iddict=build_cache(db)
-    
+    print("iddict...{}".format(datetime.now()))
+
     port=4444
     if len(sys.argv) == 3 :
         port=int(sys.argv[1])
@@ -506,6 +532,7 @@ def main():
 
     job = get_next_job(db)
     c = 0
+
     for row, group in job.iterrows():
         try:
             item_url = group['item_url']
@@ -526,6 +553,8 @@ def main():
             print('parsing shop info....')
             for i in range(5):
                 print('shop info try...{}'.format(i))
+                print("shop info try...{}".format(datetime.now()))
+
                 driver.get(item_url)
                 time.sleep(3)                
 
@@ -541,16 +570,26 @@ def main():
 
                 if driver.current_url == item_url:continue
                 print(driver.current_url)
-                output = process_web_request_start(driver, fid)
-                if output != 0: break
+                try:
+                    output = process_web_request_start(driver, fid)
+                    if output != 0: break
+                except:
+                    r = redis.Redis(host='db.ptt.cx', port=6379, db=1,password='choozmo9')
+                    msg=traceback.format_exc()
+                    r.set('google_error',msg)
+
 
             # reivews
             print('parsing reviews....')
+            print("parsing reviews.....{}".format(datetime.now()))
+
             if not output['user_ratings_total']:
                 output['reviews'] = ''
             else:
                 for i in range(3):
                     print('reviews try...{}'.format(i))
+                    print("reviews try.....{}".format(datetime.now()))
+
                     try:
                         wait = WebDriverWait(driver, 30)
                         more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
@@ -570,14 +609,16 @@ def main():
                         driver.get(item_url)
                         time.sleep(0.5)
 
-                if 'reviews' not in output.keys():
-                    continue
+#                if 'reviews' not in output.keys():
+#                    continue
 
             # photo
             print('parsing photo....')
             if output['header_image'] != '':
                 for i in range(3):
                     print('photo try...{}'.format(i))
+                    print("photo try......{}".format(datetime.now()))
+
                     driver.get(item_url)
                     time.sleep(0.5)
                     print(driver.current_url)
@@ -608,6 +649,8 @@ def main():
 
             print(output)
             save_js_to_db(output, fid)
+            print("save_js_to_db......{}".format(datetime.now()))
+
             error_table.upsert({'item_url':item_url,'check_':1},['item_url'])
             print('*'*10)
 
@@ -620,9 +663,13 @@ def main():
             break
 
         except:
+            r = redis.Redis(host='db.ptt.cx', port=6379, db=1,password='choozmo9')
+            msg=traceback.format_exc()
+            r.set('google_error',msg)
             error_table3 = db['error_list3']
             error_table3.insert({'name':name,'keyword':keyword,'item_url':item_url,'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
             traceback.print_exc()
+#        sys.exit()
             
 
 if __name__ == '__main__':
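
Note: the process_web_request_photo hunk above stops assuming every captured response is Brotli-compressed and inspects Content-Encoding instead. A minimal sketch of that decoding step, assuming selenium-wire request/response objects as used in the file (the helper name is illustrative):

    import gzip
    import brotli

    def decode_body(request):
        # selenium-wire exposes the captured response on request.response
        body = request.response.body
        encoding = request.response.headers.get('Content-Encoding') or ''
        if 'gzip' in encoding:
            return gzip.decompress(body)
        if 'br' in encoding:
            return brotli.decompress(body)
        return body  # identity / no compression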

+ 23 - 2
swire_shop_review.py

@@ -22,13 +22,16 @@ import dataset
 import time
 import json
 import re
+import gzip
 import sys, os
 import socket
 import brotli
 import pickle
 from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 import urllib.parse
-chrome_window=False
+chrome_window=True
+#chrome_window=False
+
 globalkw=None
 proxyport=8787
 
@@ -59,9 +62,23 @@ def brower_start(port):
     print(proxyport)
     options = webdriver.ChromeOptions()
     if chrome_window:
+#        browser = webdriver.Chrome(
+##            desired_capabilities=options.to_capabilities()
+#        )
+        options.add_argument('--ignore-certificate-errors')
+        options.add_argument("--no-sandbox")
+        options.add_argument("--headless")
+        options.add_argument("--disable-gpu")
+        options.add_argument("--disable-dev-shm-usage")
         browser = webdriver.Chrome(
-            desired_capabilities=options.to_capabilities()
+            options=options
+#            ,seleniumwire_options={'disable_encoding': True}
+#            desired_capabilities=options.to_capabilities()
         )
+        browser.set_window_size(1400,1000)
+
+
+
     else:
         chrome_options = webdriver.ChromeOptions()
         chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport))  # Specify your Kubernetes service-name here
@@ -161,7 +178,11 @@ def process_web_request(driver, fid, ludocid):
 
                     save_js_to_db(result, fid)
                     time.sleep(1)
+
+                    del driver.requests
                     return 1
+                    
+    del driver.requests
     return 0
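
Note: selenium-wire keeps every captured request in memory, so the `del driver.requests` statements added above clear the capture buffer once a page has been handled, which keeps a long-running crawl from growing without bound. A minimal, self-contained sketch of the pattern (the target URL is a placeholder):

    from seleniumwire import webdriver

    driver = webdriver.Chrome()            # assumes chromedriver is on PATH
    driver.get('https://example.com')      # placeholder URL

    for req in driver.requests:
        if req.response is not None:       # only completed requests
            print(req.url, req.response.status_code)

    del driver.requests                    # selenium-wire: reset the captured-request list
    driver.quit()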
 
 

binary
utility/__pycache__/__init__.cpython-39.pyc


binary
utility/__pycache__/connect.cpython-39.pyc


binary
utility/__pycache__/database_access.cpython-39.pyc


binary
utility/__pycache__/parseutils.cpython-39.pyc


+ 25 - 0
utility/alston_exp.py

@@ -0,0 +1,25 @@
+import dataset
+
+
+from pymysql import*
+import xlwt
+import pandas.io.sql as sql
+# connect the mysql with the python
+con=connect(user="choozmo",password="pAssw0rd",host="db.ptt.cx",database="google_poi")
+
+# read the data
+#df=sql.read_sql('select * from shop_list2',con)
+#df=sql.read_sql('SELECT * FROM google_poi.swire_store_list where keyword = "火鍋";',con)
+#df=sql.read_sql('SELECT name,fid,addr,place_id,keyword,num,crawler_date FROM google_poi.swire_store_list where keyword = "火鍋"',con)
+#df=sql.read_sql('SELECT * FROM google_poi.swire_store_list where keyword = "火鍋餐廳"',con)
+#df=sql.read_sql('SELECT * FROM google_poi.swire_store_list ',con)
+df=sql.read_sql('SELECT * FROM google_poi.shop_list3; ',con)
+
+
+
+
+# print the data
+print(df)
+# export the data into the excel sheet
+#df.to_excel('hot_pot.xls')
+df.to_excel('details.xls')
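
Note on the new export script: DataFrame.to_excel writes the legacy .xls format through the xlwt engine, which is what the xlwt import is for; a .xlsx target would go through openpyxl instead. A sketch of the same export using a SQLAlchemy engine and placeholder credentials rather than hard-coded ones:

    import pandas as pd
    from sqlalchemy import create_engine

    engine = create_engine('mysql+pymysql://user:password@host/google_poi')  # placeholder DSN
    df = pd.read_sql('SELECT * FROM shop_list3', engine)
    df.to_excel('details.xls', index=False)  # .xls needs xlwt; newer pandas prefers .xlsx + openpyxl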

+ 17 - 0
utility/gen_areacodes.py

@@ -0,0 +1,17 @@
+import dataset
+import traceback
+
+db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
+
+cursor=db.query('select distinct keyword from shop_list2')
+
+lst=[]
+for c in cursor:
+    print(c['keyword'])
+    lst.append(c['keyword'])
+
+for l in lst:
+    try:
+        db.query('insert into google_poi.areacodes (select num,"'+l+'",0 from lat_lon_loc) ') 
+    except:
+        traceback.print_exc()
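
Note: gen_areacodes.py seeds google_poi.areacodes with one row per (location, keyword) pair by issuing an INSERT ... SELECT per keyword. The same seeding could be expressed as a single cross-join statement; a sketch under the same schema assumptions, with a placeholder DSN:

    import dataset
    import traceback

    db = dataset.connect('mysql://user:password@host:3306/google_poi?charset=utf8mb4')  # placeholder DSN
    try:
        db.query(
            'INSERT INTO google_poi.areacodes '
            'SELECT loc.num, kw.keyword, 0 '
            'FROM lat_lon_loc loc '
            'CROSS JOIN (SELECT DISTINCT keyword FROM shop_list2) kw'
        )
    except Exception:
        traceback.print_exc()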