@@ -131,7 +131,7 @@ def get_trends(q, url_table, id_cache, driver):
                 print('done')
                 break
     logger_.info('{} news list update'.format(q))
-
+    return driver
 
 def our_rule(url, company, driver):
     url_domain_list = ['買購不動產新聞台', 'HiNet 新聞社群', '好房網News', '自由時報地產天下', '經濟日報',
@@ -243,7 +243,7 @@ def detail_crawler(data, detail_table, url_table, error_table, driver):
                 'crawler_date': current
             })
 
-    driver.close()
+    return driver
 
 
 def get_next_job(db, table, query_key):
@@ -254,9 +254,10 @@ def get_next_job(db, table, query_key):
 
 
 def main():
-    location_pd = pd.read_csv('location_list.csv')
-    location_pd = location_pd[location_pd['縣市']== '台北']
-    location = location_pd['地區'].to_list()
+    # location_pd = pd.read_csv('location_list.csv')
+    # location_pd = location_pd[location_pd['縣市']== '台北']
+    # location = location_pd['地區'].to_list()
+    location_list = ['台北大安', '台北文山']
 
     if len(sys.argv) > 1 :
         port=int(sys.argv[1])
@@ -272,21 +273,22 @@ def main():
     detail_table = db['gnews_detail2']
     error_table = db['error_list']
 
-    for keyword in location:
-        if keyword == '文山' or keyword == '大安' or keyword == '中正': continue
-        query_key = '{} 政治'.format(keyword)
-        logger_.info('{} start...'.format(query_key))
-
-        # find new news url
-        id_cache = build_cache(db, url_table_name)
-        get_trends(query_key, url_table, id_cache, driver)
-        time.sleep(5)
-
-        url_pd = get_next_job(db, url_table_name, query_key)
-        logger_.info('find {} news...'.format(len(url_pd)))
-
-        detail_crawler(url_pd, detail_table, url_table, error_table, driver)
-        logger_.info('{} news description update'.format(query_key))
+    for keyword in location_list:
+        # query_key = '{} 政治'.format(keyword)
+        for topic in ['', '政治', '疫情', '娛樂', '生活', '財經']:
+            query_key = '{} {}'.format(keyword, topic)
+            logger_.info('{} start...'.format(query_key))
+
+            # find new news url
+            id_cache = build_cache(db, url_table_name)
+            driver = get_trends(query_key, url_table, id_cache, driver)
+            time.sleep(5)
+
+            url_pd = get_next_job(db, url_table_name, query_key)
+            logger_.info('find {} news...'.format(len(url_pd)))
+
+            driver = detail_crawler(url_pd, detail_table, url_table, error_table, driver)
+            logger_.info('{} news description update'.format(query_key))
 
     db.close()
     driver.close()