@@ -386,13 +386,15 @@ def get_not_cralwer_url(keyword):
     table = db['shop_item_list']
     url_list = list(table.find(keyword=keyword))
     shop_item = [i['item_url'] for i in db.query('SELECT item_url FROM shop_list where keyword="{}"'.format(keyword))]
-
+    error_item = [i['item_url'] for i in db.query('SELECT item_url FROM error_list where keyword="{}"'.format(keyword))]
+
     url_pd = pd.DataFrame(url_list, columns=url_list[0].keys())
     url_pd['item_url_length'] = url_pd.item_url.apply(lambda x: len(x))
-    url_pd = url_pd[url_pd['item_url_length']!=600]
+    url_pd = url_pd[url_pd['item_url_length']!=1000]
     url_pd = url_pd[~url_pd['item_url'].isin(shop_item)]
+    url_pd = url_pd[~url_pd['item_url'].isin(error_item)]
 
-    print('have {} URL list'.format(len(url_list)))
+    print('have {} URL list'.format(len(url_pd)))
     # url_list = pd.read_csv('result/shop_item_list_20211210.csv', index_col=0)
 
     return url_pd
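Taken together, the new lines in this hunk make get_not_cralwer_url skip URLs that previously failed (recorded in error_list) as well as those already crawled (shop_list). A minimal sketch of the resulting logic, assuming a dataset-style connection named db and the same tables as above; the function name pending_urls is invented for this illustration and the real function is the one in the hunk:

import pandas as pd

def pending_urls(db, keyword):
    # Candidate URLs gathered by the listing crawler.
    url_list = list(db['shop_item_list'].find(keyword=keyword))
    # URLs already crawled (shop_list) or previously failed (error_list).
    done = [r['item_url'] for r in db.query(
        'SELECT item_url FROM shop_list where keyword="{}"'.format(keyword))]
    failed = [r['item_url'] for r in db.query(
        'SELECT item_url FROM error_list where keyword="{}"'.format(keyword))]

    url_pd = pd.DataFrame(url_list, columns=url_list[0].keys())
    # Same length filter as in the hunk above (threshold raised from 600 to 1000).
    url_pd = url_pd[url_pd['item_url'].str.len() != 1000]
    url_pd = url_pd[~url_pd['item_url'].isin(done)]
    url_pd = url_pd[~url_pd['item_url'].isin(failed)]
    print('have {} URL list'.format(len(url_pd)))
    return url_pd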
@@ -407,8 +409,8 @@ def main():
     print('drvier start...')
     driver = brower_start()
 
-    try:
-        for key, row in url_pd.iterrows():
+    for key, row in url_pd.iterrows():
+        try:
             name = row['name']
             item_url = row['item_url']
             # result = DA.mysql_select_data(db, 'select item_url from shop_list where item_url="{}"'.format(item_url))
@@ -450,11 +452,13 @@ def main():
             time.sleep(1)
             data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
 
-    except:
-        error = pd.DataFrame([row])
-        error.to_csv('error.csv', mode='a', header = False)
-        driver.close()
-        driver = brower_start()
+        except:
+            error_table_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
+            data_select_insert(db, 'error_list', error_table_col, row)
+            # error = pd.DataFrame([row])
+            # error.to_csv('error.csv', mode='a', header = False)
+            driver.close()
+            driver = brower_start()
 
 
 
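The second and third hunks move the try/except inside the per-URL loop, so a single failing page no longer aborts the whole run: the failing row is written to error_list (which the filter above then excludes on later runs) and the browser is restarted before the loop continues. A condensed sketch of that control flow, assuming the repository's brower_start and data_select_insert helpers; crawl_one is a hypothetical stand-in for the scraping steps the diff elides:

ERROR_TABLE_COL = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']

for _, row in url_pd.iterrows():
    try:
        output = crawl_one(driver, row['item_url'])   # placeholder for the scraping steps
        data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
    except Exception:
        # Record the failing row so later runs skip it, then restart the browser
        # and move on to the next URL instead of stopping the crawl.
        data_select_insert(db, 'error_list', ERROR_TABLE_COL, row)
        driver.close()
        driver = brower_start()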