noodles před 2 lety
rodič
revize
2797e5d67a
Změněn 1 soubor: 22 přidání a 20 odebrání
  1. 22 20
      gnews.py

+ 22 - 20
gnews.py

@@ -131,7 +131,7 @@ def get_trends(q, url_table, id_cache, driver):
             print('done')
             break
     logger_.info('{} news list update'.format(q))
-
+    return driver 
 
 def our_rule(url, company, driver):
     url_domain_list = ['買購不動產新聞台', 'HiNet 新聞社群', '好房網News', '自由時報地產天下', '經濟日報',
@@ -243,7 +243,7 @@ def detail_crawler(data, detail_table, url_table, error_table, driver):
                 'crawler_date': current
             })
 
-    driver.close()
+    return driver
 
 
 def get_next_job(db, table, query_key):
@@ -254,9 +254,10 @@ def get_next_job(db, table, query_key):
 
 
 def main():
-    location_pd = pd.read_csv('location_list.csv')
-    location_pd = location_pd[location_pd['縣市']== '台北']
-    location = location_pd['地區'].to_list()
+    # location_pd = pd.read_csv('location_list.csv')
+    # location_pd = location_pd[location_pd['縣市']== '台北']
+    # location = location_pd['地區'].to_list()
+    location_list = ['台北大安', '台北文山']
 
     if len(sys.argv) > 1 :
         port=int(sys.argv[1])
@@ -272,21 +273,22 @@ def main():
     detail_table = db['gnews_detail2']
     error_table = db['error_list']
 
-    for keyword in location:
-        if keyword == '文山' or keyword == '大安' or keyword == '中正': continue
-        query_key = '{} 政治'.format(keyword)
-        logger_.info('{} start...'.format(query_key))
-
-        # find new news url
-        id_cache = build_cache(db, url_table_name)
-        get_trends(query_key, url_table, id_cache, driver)
-        time.sleep(5)
-
-        url_pd = get_next_job(db, url_table_name, query_key)
-        logger_.info('find {} news...'.format(len(url_pd)))
-        
-        detail_crawler(url_pd, detail_table, url_table, error_table, driver)
-        logger_.info('{} news description update'.format(query_key))
+    for keyword in location_list:
+        # query_key = '{} 政治'.format(keyword)
+        for topic in ['', '政治', '疫情', '娛樂', '生活', '財經']:
+            query_key = '{} {}'.format(keyword, topic)
+            logger_.info('{} start...'.format(query_key))
+
+            # find new news url
+            id_cache = build_cache(db, url_table_name)
+            driver = get_trends(query_key, url_table, id_cache, driver)
+            time.sleep(5)
+
+            url_pd = get_next_job(db, url_table_name, query_key)
+            logger_.info('find {} news...'.format(len(url_pd)))
+            
+            driver = detail_crawler(url_pd, detail_table, url_table, error_table, driver)
+            logger_.info('{} news description update'.format(query_key))
 
     db.close()
     driver.close()