noodles 3 years ago
parent
commit
4e733d0b9c
2 changed files with 35 additions and 23 deletions

+ 24 - 21
run.py

@@ -543,7 +543,8 @@ def page_down_(driver, xpath_css, time_):
 def main():
     db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
     db2 = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
-    table2 = db2['progress_list2']
+
+    table2 = db2['swire_store_list']
     # keyword = '麻辣火鍋'
     # if len(sys.argv) >1:
     #     keyword=sys.argv[1]
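The first hunk repoints the progress bookkeeping from `progress_list2` to `swire_store_list`. A minimal sketch of the `dataset` access pattern involved, reusing the connection string already present in run.py (no migration step is needed, since `dataset` creates tables lazily):

```python
import dataset

# Connection string copied from run.py; dataset wraps SQLAlchemy.
db2 = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')

# db2['name'] returns a Table handle; dataset creates the table (and any
# missing columns) automatically on the first insert/upsert.
table2 = db2['swire_store_list']
```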
@@ -556,11 +557,15 @@ def main():
         os.system('sudo docker container restart p'+str(port))
         time.sleep(8)
 
-    for i in range(5):
-        keyword = get_new_keyword(db2)
-        table2.insert({'kw':keyword,'num':0})
-        url_pd = get_not_cralwer_url(keyword)
-        print('drvier start {}...'.format(keyword))
+    for i in range(10):
+        result = db2.query('select * from swire_store_list where check_ is null ORDER BY RAND() limit 500')
+        url_pd = pd.DataFrame([dict(i) for i in result])
+        url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
+
+        # keyword = get_new_keyword(db2)
+        # table2.insert({'kw':keyword,'num':0})
+        # url_pd = get_not_cralwer_url(keyword)
+        # print('drvier start {}...'.format(keyword))
         driver = brower_start(port)
         #driver = serive_create('Profile 6')
         #profilepath = 'Profile 1'
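The per-keyword flow is replaced by batch selection straight from `swire_store_list`: each of the 10 outer iterations pulls 500 random unprocessed rows (`check_ is null`), loads them into a DataFrame, and rebuilds each row's Maps URL from its `fid`. A sketch of that step, assuming the columns visible in this diff:

```python
import dataset
import pandas as pd

db2 = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')

# ORDER BY RAND() spreads concurrent workers across the backlog at the cost
# of a full-table sort; acceptable at this scale, worth revisiting with an index.
result = db2.query('select * from swire_store_list where check_ is null ORDER BY RAND() limit 500')
url_pd = pd.DataFrame([dict(r) for r in result])

# Only the !1s{fid} segment varies per row; the @lat,lon viewport in the
# template is a fixed anchor point (Taichung coordinates).
url_pd['item_url'] = url_pd['fid'].apply(
    lambda fid: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/'
                'data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(fid))
```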
@@ -575,23 +580,18 @@ def main():
                 print('start...')
                 driver.get(item_url)
                 page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu']", 3)
-                lat, lon = find_lon_lat(driver)
-                unique_id = get_unique_id(driver)
+                # lat, lon = find_lon_lat(driver)
+                # unique_id = get_unique_id(driver)
                 time_status = time_click(driver)
-                if time_status == 'error' or len(unique_id) ==0:
-                    error_table_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
-                    data_select_insert(db, 'error_list2', error_table_col, row)
-                    continue
                 time.sleep(0.5)
                 shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
 
                 output = {
-                    'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text),
-                    'lon': lon,
-                    'lat': lat,
-                    'unique_id': unique_id.replace('?share','')
+                    # 'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text),
+                    'name': name,
+                    'fid': row['fid']
                 }
-                print(output['name'], lon, lat, unique_id)
+                print(output['name'])
 
                 print('get_shop_info')
                 output = get_shop_info(driver, output, shop_soup)
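The scraped record shrinks to `name` plus `fid`: coordinates and `unique_id` no longer need to be recovered from the page, because `fid` already identifies the place. Note that `name` is referenced in the new dict but not assigned inside this hunk; judging by the commented-out line above it, it is presumably produced earlier by the same header lookup. A hypothetical reconstruction:

```python
# Hypothetical: assumes `name` is assigned outside this hunk using the same
# header lookup as the commented-out line; blank_check, shop_soup, and row
# come from the surrounding run.py code.
name = blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text)
output = {
    'name': name,        # shop title scraped from the Maps header
    'fid': row['fid'],   # feature id carried over from swire_store_list
}
```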
@@ -624,14 +624,17 @@ def main():
                 query_name = '{}+{}'.format(output_name, output['addr'])
                 query_name = query_name.replace(' ','')
                 output['item_url'] = item_url
-                output['keyword'] = keyword
+                output['keyword'] = row['keyword']
                 output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
                 data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
-                
+                table2.upsert({'place_id':row['place_id'],'check_':1},['place_id'])
             except Exception as e:
+                table3 = db2['error_list2']
+                table3.insert({'num':row['name'],'keyword':row['keyword'],'item_url':row['item_url'],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
                 print(e)
-                error_table_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
-                data_select_insert(db, 'error_list2', error_table_col, row)
+                # error_table_col = ['name', 'keyword', 'item_url', 'crawler_date']
+                # db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
+                # data_select_insert(db, 'error_list2', error_table_col, row)
                 time.sleep(1)
 
         

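Completion and failure bookkeeping both move into `db2`: a successful row is flagged with `check_ = 1` via upsert keyed on `place_id`, and failures are written to `error_list2` directly instead of going through `data_select_insert`. A sketch of the two calls, assuming run.py imports `datetime` as `from datetime import datetime` (required for the `datetime.today()` call this hunk adds; `table2`, `db2`, and `row` are the names from the diff):

```python
from datetime import datetime  # needed for datetime.today() in the new error path

# Mark the row as crawled; upsert matches on place_id and inserts if missing.
table2.upsert({'place_id': row['place_id'], 'check_': 1}, ['place_id'])

# On failure, record enough context to retry or debug the row later.
table3 = db2['error_list2']
table3.insert({
    'num': row['name'],                # note: the shop name lands in a column named `num`
    'keyword': row['keyword'],
    'item_url': row['item_url'],
    'crawler_date': datetime.today().strftime('%Y/%m/%d %H:%M'),
})
```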
+ 11 - 2
utility/parseutils.py

@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 DB_NAME = 'google_poi'
-SHOP_LIST_TABLE = 'shop_list2'
-SHOP_LIST_TABLE_COL = ['unique_id', 'name', 'lon', 'lat', 'city', 'area', 
+SHOP_LIST_TABLE = 'shop_list3'
+SHOP_LIST_TABLE_COL = ['name', 'fid', 'city', 'area', 
                       'rating', 'user_ratings_total', 'category', 'price_level', 
                       'addr', 'tel', 'services', 'products', 'choices', 
                       'facilities', 'groups', 'plans', 'payments', 'safeties', 
@@ -9,6 +9,15 @@ SHOP_LIST_TABLE_COL = ['unique_id', 'name', 'lon', 'lat', 'city', 'area',
                       'open_now', 'periods', 'weekday_text','reviews',
                       'shop_photo','menu_photo',
                       'google_url', 'item_url', 'keyword', 'crawler_date']
+                      
+# SHOP_LIST_TABLE_COL = ['unique_id', 'name', 'lon', 'lat', 'city', 'area', 
+#                       'rating', 'user_ratings_total', 'category', 'price_level', 
+#                       'addr', 'tel', 'services', 'products', 'choices', 
+#                       'facilities', 'groups', 'plans', 'payments', 'safeties', 
+#                       'specials', 'barrierlevels', 'items' ,
+#                       'open_now', 'periods', 'weekday_text','reviews',
+#                       'shop_photo','menu_photo',
+#                       'google_url', 'item_url', 'keyword', 'crawler_date']
 
 element_list = {
     'category': ['button', {'jsaction':'pane.rating.category'}],
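In parseutils.py the target table becomes `shop_list3` and the column list drops `unique_id`, `lon`, and `lat` in favor of `fid`, matching the keys now emitted by run.py; the old list is kept commented out for reference. Since `data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)` consumes this list, the two sides must stay in sync. A sketch of a cheap guard (hypothetical helper, not in the repo):

```python
# Hypothetical sanity check: fail fast if run.py's output dict and
# parseutils.SHOP_LIST_TABLE_COL drift apart after a schema change like this one.
from utility.parseutils import SHOP_LIST_TABLE_COL

def check_output_columns(output):
    missing = [col for col in SHOP_LIST_TABLE_COL if col not in output]
    if missing:
        raise KeyError('output is missing columns: {}'.format(missing))
```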