noodles 3 lat temu
rodzic
commit
3c0139a053
2 zmienionych plików z 30 dodań i 13 usunięć
  1. 14 10
      run.py
  2. 16 3
      utility/googlemapsearch.sql

+ 14 - 10
run.py

@@ -386,13 +386,15 @@ def get_not_cralwer_url(keyword):
     table = db['shop_item_list']
     url_list = list(table.find(keyword=keyword))
     shop_item = [i['item_url'] for i in db.query('SELECT item_url FROM shop_list where keyword="{}"'.format(keyword))]
-
+    error_item = [i['item_url'] for i in db.query('SELECT item_url FROM error_list where keyword="{}"'.format(keyword))]
+    
     url_pd = pd.DataFrame(url_list, columns=url_list[0].keys())
     url_pd['item_url_length'] = url_pd.item_url.apply(lambda x: len(x))
-    url_pd = url_pd[url_pd['item_url_length']!=600]
+    url_pd = url_pd[url_pd['item_url_length']!=1000]
     url_pd = url_pd[~url_pd['item_url'].isin(shop_item)]
+    url_pd = url_pd[~url_pd['item_url'].isin(error_item)]
 
-    print('have {} URL list'.format(len(url_list)))
+    print('have {} URL list'.format(len(url_pd)))
     # url_list = pd.read_csv('result/shop_item_list_20211210.csv', index_col=0)
 
     return url_pd
@@ -407,8 +409,8 @@ def main():
     print('drvier start...')
     driver = brower_start()
     
-    try:
-        for key, row in url_pd.iterrows():
+    for key, row in url_pd.iterrows():
+        try:    
             name = row['name']
             item_url = row['item_url']
             # result = DA.mysql_select_data(db, 'select item_url from shop_list where item_url="{}"'.format(item_url))
@@ -450,11 +452,13 @@ def main():
             time.sleep(1)
             data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
 
-    except:
-        error = pd.DataFrame([row])
-        error.to_csv('error.csv', mode='a', header = False)
-        driver.close()
-        driver = brower_start()
+        except:
+            error_table_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
+            data_select_insert(db, 'error_list', error_table_col, row)
+            # error = pd.DataFrame([row])
+            # error.to_csv('error.csv', mode='a', header = False)
+            driver.close()
+            driver = brower_start()
    
 
 

+ 16 - 3
utility/googlemapsearch.sql

@@ -42,13 +42,26 @@ CREATE TABLE `shop_list` (
 ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
 
 
-CREATE TABLE `shop_item_list2` (
+CREATE TABLE `shop_item_list` (
    `id` int NOT NULL AUTO_INCREMENT,
    `name` VARCHAR(100),
    `lon` DOUBLE,
    `lat` DOUBLE,
    `keyword` VARCHAR(20),
-   `item_url` VARCHAR(600),
+   `item_url` VARCHAR(200),
+   `crawler_date` char(20) NOT NULL,
+   PRIMARY KEY (`id`),
+   UNIQUE KEY (`item_url`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
+
+
+CREATE TABLE `error_list` (
+   `id` int NOT NULL AUTO_INCREMENT,
+   `name` VARCHAR(100),
+   `lon` DOUBLE,
+   `lat` DOUBLE,
+   `keyword` VARCHAR(20),
+   `item_url` VARCHAR(200),
    `crawler_date` char(20) NOT NULL,
    PRIMARY KEY (`id`),
    UNIQUE KEY (`item_url`)
@@ -88,4 +101,4 @@ CREATE TABLE `shop_reviews_photo_list` (
    `crawler_date` char(20) NOT NULL,
    PRIMARY KEY (`id`),
    UNIQUE KEY (`id`,`google_url`)
-) ENGINE=InnoDB DEFAULT CHARSET=utf8;
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;