noodles il y a 3 ans
Parent
commit
304a03832c
2 fichiers modifiés avec 17 ajouts et 5 suppressions
  1. 16 4
      run.py
  2. 1 1
      utility/parseutils.py

+ 16 - 4
run.py

@@ -14,6 +14,7 @@ from utility.parseutils import *
 from utility.connect import *
 
 from datetime import datetime
+import dataset
 import pandas as pd
 import time
 import json
@@ -382,17 +383,27 @@ def time_click(driver):
 
 def main():
     # driver = serive_create('Profile 1')
+    keyword = '咖啡'
     db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
-    url_list = pd.read_csv('result/shop_item_list_20211210.csv', index_col=0)
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
+    table = db['shop_item_list2']
+    url_list = list(table.find(keyword=keyword))
+    shop_item = [i['item_url'] for i in db.query('SELECT item_url FROM shop_list where keyword="{}"'.format(keyword))]
+    url_pd = pd.DataFrame(url_list, columns=url_list[0].keys())
+    url_pd['item_url_length'] = url_pd.item_url.apply(lambda x: len(x))
+    url_pd = url_pd[url_pd['item_url_length']!=600]
+    url_pd = url_pd[~url_pd['item_url'].isin(shop_item)]
+    print('have {} URL list'.format(len(url_list)))
+    # url_list = pd.read_csv('result/shop_item_list_20211210.csv', index_col=0)
     print('drvier start...')
     driver = brower_start()
     
     try:
-        for key, row in url_list.iterrows():
+        for key, row in url_pd.iterrows():
             name = row['name']
             item_url = row['item_url']
-            result = DA.mysql_select_data(db, 'select item_url from shop_list where item_url="{}"'.format(item_url))
-            if len(result) != 0: continue
+            # result = DA.mysql_select_data(db, 'select item_url from shop_list where item_url="{}"'.format(item_url))
+            # if len(result) != 0: continue
             print(key, name, ': ' ,item_url)
 
             driver.get(item_url)
@@ -425,6 +436,7 @@ def main():
             query_name = '{}+{}'.format(output_name, output['addr'])
             query_name = query_name.replace(' ','')
             output['item_url'] = item_url
+            output['keyword'] = keyword
             output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
             time.sleep(1)
             data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)

+ 1 - 1
utility/parseutils.py

@@ -8,7 +8,7 @@ SHOP_LIST_TABLE_COL = ['name', 'lon', 'lat', 'city', 'area',
                       'specials', 'barrierlevels', 'items' ,
                       'open_now', 'periods', 'weekday_text','reviews',
                       'shop_photo','menu_photo',
-                      'google_url', 'item_url', 'crawler_date']
+                      'google_url', 'item_url', 'keyword', 'crawler_date']
 
 element_list = {
     'category': ['button', {'jsaction':'pane.rating.category'}],