|
@@ -14,6 +14,7 @@ from utility.parseutils import *
|
|
|
from utility.connect import *
|
|
|
|
|
|
from datetime import datetime
|
|
|
+import dataset
|
|
|
import pandas as pd
|
|
|
import time
|
|
|
import json
|
|
@@ -382,17 +383,27 @@ def time_click(driver):
|
|
|
|
|
|
def main():
|
|
|
# driver = serive_create('Profile 1')
|
|
|
+ keyword = '咖啡'
|
|
|
db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
|
|
|
- url_list = pd.read_csv('result/shop_item_list_20211210.csv', index_col=0)
|
|
|
+ db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
|
|
|
+ table = db['shop_item_list2']
|
|
|
+ url_list = list(table.find(keyword=keyword))
|
|
|
+ shop_item = [i['item_url'] for i in db.query('SELECT item_url FROM shop_list where keyword="{}"'.format(keyword))]
|
|
|
+ url_pd = pd.DataFrame(url_list, columns=url_list[0].keys())
|
|
|
+ url_pd['item_url_length'] = url_pd.item_url.apply(lambda x: len(x))
|
|
|
+ url_pd = url_pd[url_pd['item_url_length']!=600]
|
|
|
+ url_pd = url_pd[~url_pd['item_url'].isin(shop_item)]
|
|
|
+ print('have {} URL list'.format(len(url_list)))
|
|
|
+ # url_list = pd.read_csv('result/shop_item_list_20211210.csv', index_col=0)
|
|
|
print('drvier start...')
|
|
|
driver = brower_start()
|
|
|
|
|
|
try:
|
|
|
- for key, row in url_list.iterrows():
|
|
|
+ for key, row in url_pd.iterrows():
|
|
|
name = row['name']
|
|
|
item_url = row['item_url']
|
|
|
- result = DA.mysql_select_data(db, 'select item_url from shop_list where item_url="{}"'.format(item_url))
|
|
|
- if len(result) != 0: continue
|
|
|
+ # result = DA.mysql_select_data(db, 'select item_url from shop_list where item_url="{}"'.format(item_url))
|
|
|
+ # if len(result) != 0: continue
|
|
|
print(key, name, ': ' ,item_url)
|
|
|
|
|
|
driver.get(item_url)
|
|
@@ -425,6 +436,7 @@ def main():
|
|
|
query_name = '{}+{}'.format(output_name, output['addr'])
|
|
|
query_name = query_name.replace(' ','')
|
|
|
output['item_url'] = item_url
|
|
|
+ output['keyword'] = keyword
|
|
|
output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
|
|
|
time.sleep(1)
|
|
|
data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
|