@@ -131,7 +131,7 @@ def get_trends(q, url_table, id_cache, driver):
                 print('done')
                 break
     logger_.info('{} news list update'.format(q))
-
+    return driver
 
 def our_rule(url, company, driver):
     url_domain_list = ['買購不動產新聞台', 'HiNet 新聞社群', '好房網News', '自由時報地產天下', '經濟日報',
@@ -243,7 +243,7 @@ def detail_crawler(data, detail_table, url_table, error_table, driver):
                 'crawler_date': current
             })
 
-    driver.close()
+    return driver
 
 
 def get_next_job(db, table, query_key):
@@ -254,9 +254,10 @@ def get_next_job(db, table, query_key):
 
 
 def main():
-    location_pd = pd.read_csv('location_list.csv')
-    location_pd = location_pd[location_pd['縣市']== '台北']
-    location = location_pd['地區'].to_list()
+    # location_pd = pd.read_csv('location_list.csv')
+    # location_pd = location_pd[location_pd['縣市']== '台北']
+    # location = location_pd['地區'].to_list()
+    location_list = ['台北大安', '台北文山']
 
     if len(sys.argv) > 1 :
         port=int(sys.argv[1])
@@ -272,21 +273,22 @@ def main():
     detail_table = db['gnews_detail2']
     error_table = db['error_list']
 
-    for keyword in location:
-        if keyword == '文山' or keyword == '大安' or keyword == '中正': continue
-        query_key = '{} 政治'.format(keyword)
-        logger_.info('{} start...'.format(query_key))
-
-        # find new news url
-        id_cache = build_cache(db, url_table_name)
-        get_trends(query_key, url_table, id_cache, driver)
-        time.sleep(5)
-
-        url_pd = get_next_job(db, url_table_name, query_key)
-        logger_.info('find {} news...'.format(len(url_pd)))
-
-        detail_crawler(url_pd, detail_table, url_table, error_table, driver)
-        logger_.info('{} news description update'.format(query_key))
+    for keyword in location_list:
+        # query_key = '{} 政治'.format(keyword)
+        for topic in ['', '政治', '疫情', '娛樂', '生活', '財經']:
+            query_key = '{} {}'.format(keyword, topic)
+            logger_.info('{} start...'.format(query_key))
+
+            # find new news url
+            id_cache = build_cache(db, url_table_name)
+            driver = get_trends(query_key, url_table, id_cache, driver)
+            time.sleep(5)
+
+            url_pd = get_next_job(db, url_table_name, query_key)
+            logger_.info('find {} news...'.format(len(url_pd)))
+
+            driver = detail_crawler(url_pd, detail_table, url_table, error_table, driver)
+            logger_.info('{} news description update'.format(query_key))
 
     db.close()
     driver.close()