noodles 3 éve
szülő
commit
ce2baa0e42
3 módosított fájl, 28 hozzáadás és 4 törlés
  1. 13 3
      run.py
  2. 14 0
      utility/googlemapsearch.sql
  3. 1 1
      utility/parseutils.py

+ 13 - 3
run.py

@@ -14,6 +14,7 @@ from utility.parseutils import *
 from utility.connect import *
 
 from datetime import datetime
+import dataset
 import pandas as pd
 import time
 import json
@@ -383,7 +384,16 @@ def time_click(driver):
 def main():
     # driver = serive_create('Profile 1')
     db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
-    url_list = pd.read_csv('result/shop_item_list_20211210.csv', index_col=0)
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
+    table = db['shop_item_list2']
+    url_list = list(table.find(keyword='咖啡'))
+    shop_item = [i['item_url'] for i in db.query('SELECT item_url FROM shop_list where keyword="{}"'.format('咖啡'))]
+    url_pd = pd.DataFrame(url_list, columns=url_list[0].keys())
+    url_pd['item_url_length'] = url_pd.item_url.apply(lambda x: len(x))
+    url_pd = url_pd[url_pd['item_url_length']!=600]
+    url_list = url_pd[~url_pd['item_url'].isin(shop_item)]['item_url']
+
+    # url_list = pd.read_csv('result/shop_item_list_20211210.csv', index_col=0)
     print('drvier start...')
     driver = brower_start()
     
@@ -391,8 +401,8 @@ def main():
         for key, row in url_list.iterrows():
             name = row['name']
             item_url = row['item_url']
-            result = DA.mysql_select_data(db, 'select item_url from shop_list where item_url="{}"'.format(item_url))
-            if len(result) != 0: continue
+            # result = DA.mysql_select_data(db, 'select item_url from shop_list where item_url="{}"'.format(item_url))
+            # if len(result) != 0: continue
             print(key, name, ': ' ,item_url)
 
             driver.get(item_url)

+ 14 - 0
utility/googlemapsearch.sql

@@ -35,6 +35,7 @@ CREATE TABLE `shop_list` (
 
    `google_url` VARCHAR(200),
    `item_url` VARCHAR(200),
+   `keyword` VARCHAR(20),
    `crawler_date` char(20) NOT NULL,
    PRIMARY KEY (`id`),
    UNIQUE KEY (`item_url`)
@@ -54,6 +55,19 @@ CREATE TABLE `shop_item_list` (
 ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
 
 
+CREATE TABLE `error_list` (
+   `id` int NOT NULL AUTO_INCREMENT,
+   `name` VARCHAR(100),
+   `lon` DOUBLE,
+   `lat` DOUBLE,
+   `keyword` VARCHAR(20),
+   `item_url` VARCHAR(200),
+   `crawler_date` char(20) NOT NULL,
+   PRIMARY KEY (`id`),
+   UNIQUE KEY (`item_url`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
+
+
 CREATE TABLE `shop_time_list` (
    `id` int NOT NULL AUTO_INCREMENT,
    `google_url` VARCHAR(200),

+ 1 - 1
utility/parseutils.py

@@ -8,7 +8,7 @@ SHOP_LIST_TABLE_COL = ['name', 'lon', 'lat', 'city', 'area',
                       'specials', 'barrierlevels', 'items' ,
                       'open_now', 'periods', 'weekday_text','reviews',
                       'shop_photo','menu_photo',
-                      'google_url', 'item_url', 'crawler_date']
+                      'google_url', 'item_url', 'keyword', 'crawler_date']
 
 element_list = {
     'category': ['button', {'jsaction':'pane.rating.category'}],