noodles committed 2 years ago · commit ff5e8b41a9
6 changed files with 1081 additions and 190 deletions
  1. gnews (+190 -0)
  2. gnews.py (+155 -190)
  3. gnews_old.py (+330 -0)
  4. location_list.csv (+369 -0)
  5. utility/__init__.py (+0 -0)
  6. utility/log.py (+37 -0)

+ 190 - 0
gnews

@@ -0,0 +1,190 @@
+2022-04-30 12:26:29,570 INFO gnews: start...
+2022-04-30 12:28:30,605 INFO gnews: start...
+2022-04-30 12:29:20,197 INFO gnews: start...
+2022-04-30 12:33:26,837 INFO gnews: start...
+2022-04-30 13:38:30,554 INFO gnews: start...
+2022-04-30 13:40:47,568 INFO gnews: start...
+2022-04-30 13:51:25,575 INFO gnews: start...
+2022-04-30 13:53:28,962 INFO gnews: start...
+2022-04-30 13:53:56,912 INFO gnews: page 1
+2022-04-30 13:54:02,072 INFO gnews: 大安 news list update
+2022-04-30 13:59:17,184 INFO gnews: start...
+2022-04-30 13:59:48,493 INFO gnews: page 1
+2022-04-30 13:59:53,609 INFO gnews: 大安 news list update
+2022-04-30 14:00:38,131 INFO gnews: start...
+2022-04-30 14:01:05,316 INFO gnews: page 1
+2022-04-30 14:01:10,429 INFO gnews: 大安 news list update
+2022-04-30 14:04:45,635 INFO gnews: start...
+2022-04-30 14:05:31,189 INFO gnews: start...
+2022-04-30 14:05:54,263 INFO gnews: page 1
+2022-04-30 14:05:59,380 INFO gnews: 大安 news list update
+2022-04-30 14:08:36,970 INFO gnews: start...
+2022-04-30 14:09:42,782 INFO gnews: start...
+2022-04-30 14:10:07,358 INFO gnews: page 1
+2022-04-30 14:12:24,556 INFO gnews: start...
+2022-04-30 14:12:48,721 INFO gnews: page 1
+2022-04-30 14:17:27,862 INFO gnews: start...
+2022-04-30 14:17:56,703 INFO gnews: page 1
+2022-04-30 14:18:14,816 INFO gnews: start...
+2022-04-30 14:18:38,689 INFO gnews: page 1
+2022-04-30 14:21:10,358 INFO gnews: start...
+2022-04-30 14:21:32,466 INFO gnews: page 1
+2022-04-30 14:22:32,075 INFO gnews: start...
+2022-04-30 14:22:54,258 INFO gnews: page 1
+2022-04-30 14:23:52,472 INFO gnews: start...
+2022-04-30 14:24:13,095 INFO gnews: page 1
+2022-04-30 14:24:18,274 INFO gnews: page 2
+2022-04-30 14:25:06,195 INFO gnews: start...
+2022-04-30 14:25:28,137 INFO gnews: page 1
+2022-04-30 14:26:03,293 INFO gnews: start...
+2022-04-30 14:26:25,482 INFO gnews: page 1
+2022-04-30 14:26:32,794 INFO gnews: page 2
+2022-04-30 14:26:38,231 INFO gnews: page 3
+2022-04-30 14:26:43,604 INFO gnews: page 4
+2022-04-30 14:26:48,970 INFO gnews: page 5
+2022-04-30 14:26:54,364 INFO gnews: page 6
+2022-04-30 14:26:59,777 INFO gnews: page 7
+2022-04-30 14:27:05,214 INFO gnews: page 8
+2022-04-30 14:27:10,552 INFO gnews: page 9
+2022-04-30 14:27:15,936 INFO gnews: page 10
+2022-04-30 14:27:21,283 INFO gnews: page 11
+2022-04-30 14:27:26,586 INFO gnews: page 12
+2022-04-30 14:27:32,010 INFO gnews: page 13
+2022-04-30 14:27:37,354 INFO gnews: page 14
+2022-04-30 14:27:42,672 INFO gnews: page 15
+2022-04-30 14:27:48,082 INFO gnews: page 16
+2022-04-30 14:27:53,433 INFO gnews: page 17
+2022-04-30 14:27:58,786 INFO gnews: page 18
+2022-04-30 14:28:04,115 INFO gnews: page 19
+2022-04-30 14:28:10,176 INFO gnews: 大安 news list update
+2022-04-30 14:37:49,306 INFO gnews: start...
+2022-04-30 14:38:17,391 INFO gnews: page 1
+2022-04-30 14:38:23,254 INFO gnews: page 2
+2022-04-30 14:38:28,711 INFO gnews: page 3
+2022-04-30 14:38:34,090 INFO gnews: page 4
+2022-04-30 14:38:39,487 INFO gnews: page 5
+2022-04-30 14:38:44,825 INFO gnews: page 6
+2022-04-30 14:38:50,216 INFO gnews: page 7
+2022-04-30 14:38:55,576 INFO gnews: page 8
+2022-04-30 14:39:00,938 INFO gnews: page 9
+2022-04-30 14:39:06,185 INFO gnews: page 10
+2022-04-30 14:39:11,725 INFO gnews: 大安 政治 news list update
+2022-04-30 14:40:34,557 INFO gnews: start...
+2022-04-30 14:40:56,664 INFO gnews: page 1
+2022-04-30 14:41:02,538 INFO gnews: page 2
+2022-04-30 14:41:07,983 INFO gnews: page 3
+2022-04-30 14:41:13,294 INFO gnews: page 4
+2022-04-30 14:41:18,538 INFO gnews: page 5
+2022-04-30 14:41:23,910 INFO gnews: page 6
+2022-04-30 14:41:29,190 INFO gnews: page 7
+2022-04-30 14:41:34,466 INFO gnews: page 8
+2022-04-30 14:41:39,720 INFO gnews: page 9
+2022-04-30 14:41:44,954 INFO gnews: page 10
+2022-04-30 14:41:50,500 INFO gnews: 大安 政治 news list update
+2022-04-30 14:42:34,926 INFO gnews: start...
+2022-04-30 14:42:56,943 INFO gnews: page 1
+2022-04-30 14:43:01,643 INFO gnews: page 2
+2022-04-30 14:43:07,421 INFO gnews: page 3
+2022-04-30 14:43:12,698 INFO gnews: page 4
+2022-04-30 14:43:18,081 INFO gnews: page 5
+2022-04-30 14:43:23,530 INFO gnews: page 6
+2022-04-30 14:43:28,921 INFO gnews: page 7
+2022-04-30 14:43:34,312 INFO gnews: page 8
+2022-04-30 14:43:39,656 INFO gnews: page 9
+2022-04-30 14:43:44,925 INFO gnews: page 10
+2022-04-30 14:43:50,337 INFO gnews: page 11
+2022-04-30 14:43:55,829 INFO gnews: page 12
+2022-04-30 14:44:02,300 INFO gnews: 文山 政治 news list update
+2022-04-30 15:02:37,155 INFO gnews: start...
+2022-04-30 15:02:54,158 INFO gnews: find 3 news...
+2022-04-30 15:03:46,177 INFO gnews: start...
+2022-04-30 15:03:59,861 INFO gnews: find 3 news...
+2022-04-30 15:04:25,081 INFO gnews: start...
+2022-04-30 15:04:37,910 INFO gnews: find 3 news...
+2022-04-30 15:04:40,280 INFO gnews: 文山 政治 news description update
+2022-04-30 15:07:04,054 INFO gnews: start...
+2022-04-30 15:07:17,399 INFO gnews: find 3 news...
+2022-04-30 15:08:28,920 INFO gnews: start...
+2022-04-30 15:08:42,822 INFO gnews: find 3 news...
+2022-04-30 15:08:55,526 INFO gnews: 文山 政治 news description update
+2022-04-30 15:14:51,822 INFO gnews: start...
+2022-04-30 15:15:07,465 INFO gnews: find 110 news...
+2022-04-30 18:11:42,302 INFO gnews: start...
+2022-04-30 18:12:08,323 INFO gnews: page 1
+2022-04-30 18:23:58,795 INFO gnews: start...
+2022-04-30 18:24:24,695 INFO gnews: page 1
+2022-04-30 18:29:50,871 INFO gnews: start...
+2022-04-30 18:30:16,045 INFO gnews: page 1
+2022-04-30 18:30:18,631 INFO gnews: 文山 政治 news list update
+2022-04-30 18:31:09,102 INFO gnews: start...
+2022-04-30 18:31:32,041 INFO gnews: page 1
+2022-04-30 18:31:37,593 INFO gnews: page 2
+2022-04-30 18:31:42,939 INFO gnews: page 3
+2022-04-30 18:31:48,268 INFO gnews: page 4
+2022-04-30 18:31:53,584 INFO gnews: page 5
+2022-04-30 18:31:58,862 INFO gnews: page 6
+2022-04-30 18:32:04,157 INFO gnews: page 7
+2022-04-30 18:32:09,435 INFO gnews: page 8
+2022-04-30 18:32:14,724 INFO gnews: page 9
+2022-04-30 18:32:20,120 INFO gnews: page 10
+2022-04-30 18:32:25,354 INFO gnews: page 11
+2022-04-30 18:32:30,729 INFO gnews: page 12
+2022-04-30 18:32:36,641 INFO gnews: 文山 政治 news list update
+2022-04-30 18:32:48,271 INFO gnews: start...
+2022-04-30 18:33:10,769 INFO gnews: page 1
+2022-04-30 18:33:16,392 INFO gnews: page 2
+2022-04-30 18:33:21,561 INFO gnews: page 3
+2022-04-30 18:33:26,378 INFO gnews: page 4
+2022-04-30 18:33:31,110 INFO gnews: page 5
+2022-04-30 18:33:36,313 INFO gnews: page 6
+2022-04-30 18:33:41,356 INFO gnews: page 7
+2022-04-30 18:33:46,363 INFO gnews: page 8
+2022-04-30 18:33:51,518 INFO gnews: page 9
+2022-04-30 18:33:57,440 INFO gnews: 大安 政治 news list update
+2022-04-30 18:34:04,809 INFO gnews: start...
+2022-04-30 18:34:18,600 INFO gnews: find 40 news...
+2022-04-30 18:34:36,143 WARNING gnews: https://www.gvm.com.tw/article/89442 : cannot find content
+2022-04-30 18:35:37,684 WARNING gnews: https://times.hinet.net/news/23837482 : cannot find content
+2022-04-30 18:36:04,489 INFO gnews: 大安 政治 news description update
+2022-04-30 18:36:44,915 INFO gnews: start...
+2022-04-30 18:36:57,841 INFO gnews: find 116 news...
+2022-04-30 18:41:05,604 WARNING gnews: https://www.taiwannews.com.tw/ch/news/4506402 : cannot find content
+2022-04-30 18:42:52,341 WARNING gnews: https://www.peopo.org/news/579875 : cannot find content
+2022-04-30 18:43:03,249 INFO gnews: 文山 政治 news description update
+2022-04-30 18:48:16,124 INFO gnews: 中正 政治 start...
+2022-04-30 18:48:25,717 INFO gnews: page 1
+2022-04-30 18:48:31,340 INFO gnews: page 2
+2022-04-30 18:48:36,757 INFO gnews: page 3
+2022-04-30 18:48:42,211 INFO gnews: page 4
+2022-04-30 18:48:47,507 INFO gnews: page 5
+2022-04-30 18:48:52,893 INFO gnews: page 6
+2022-04-30 18:48:58,167 INFO gnews: page 7
+2022-04-30 18:49:03,339 INFO gnews: page 8
+2022-04-30 18:49:08,466 INFO gnews: page 9
+2022-04-30 18:49:13,739 INFO gnews: page 10
+2022-04-30 18:49:19,015 INFO gnews: page 11
+2022-04-30 18:49:24,276 INFO gnews: page 12
+2022-04-30 18:49:29,757 INFO gnews: page 13
+2022-04-30 18:49:34,825 INFO gnews: page 14
+2022-04-30 18:49:41,230 INFO gnews: 中正 政治 news list update
+2022-04-30 18:49:46,342 INFO gnews: find 122 news...
+2022-04-30 18:49:52,131 WARNING gnews: https://watchout.tw/reports/orG2cYOz3vJBFeg4v2Zy : cannot find content
+2022-04-30 18:51:08,267 WARNING gnews: https://www.taiwannews.com.tw/ch/news/4499961 : cannot find content
+2022-04-30 18:51:19,571 WARNING gnews: https://talk.ltn.com.tw/article/paper/1510746 : cannot find content
+2022-04-30 18:51:40,197 WARNING gnews: https://talk.ltn.com.tw/article/breakingnews/3889518 : cannot find content
+2022-04-30 18:53:22,050 WARNING gnews: https://talk.ltn.com.tw/article/breakingnews/3889407 : cannot find content
+2022-04-30 18:53:49,959 WARNING gnews: https://www.taiwannews.com.tw/ch/news/4508123 : cannot find content
+2022-04-30 18:54:03,339 WARNING gnews: https://www.upmedia.mg/news_info.php?Type=2&SerialNo=142191 : cannot find content
+2022-04-30 18:55:30,364 WARNING gnews: http://inews.nmgnews.com.cn/system/2022/04/06/013287021.shtml : cannot find content
+2022-04-30 19:45:05,937 INFO gnews: 大同 政治 start...
+2022-04-30 19:45:15,494 INFO gnews: page 1
+2022-04-30 19:45:20,730 INFO gnews: page 2
+2022-04-30 19:45:25,600 INFO gnews: page 3
+2022-04-30 19:45:30,623 INFO gnews: page 4
+2022-04-30 19:45:35,553 INFO gnews: page 5
+2022-04-30 19:45:40,762 INFO gnews: page 6
+2022-04-30 19:45:45,912 INFO gnews: page 7
+2022-04-30 19:45:51,023 INFO gnews: page 8
+2022-04-30 19:45:56,245 INFO gnews: page 9
+2022-04-30 19:46:01,688 INFO gnews: 大同 政治 news list update
+2022-04-30 19:46:06,732 INFO gnews: find 56 news...

+ 155 - 190
gnews.py

@@ -1,45 +1,35 @@
 from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.support.wait import WebDriverWait
 import time, pickle, sys, os, re, time, requests
-# import dataset
-import telegram
+import dataset
 import pandas as pd
 from datetime import datetime, timedelta
-from utility import database_access as DA
 from newspaper import Article
 from utility import log
 from bs4 import BeautifulSoup
-from ckiptagger import WS, POS, NER
-from utility.connect import *
+# from ckiptagger import WS, POS, NER
+
 
 logger_ = log.init_logging('gnews', 'gnews')
-bot = telegram.Bot(token='1661195930:AAG8TNHUuXoghSFf3VC-oKbo_erep7Y1io4')
-
-URL_LIST_COL = ['news_title', 'news_desc', 'news_url', 'search_keyword', 'company', 'news_day','crawler_date','_status']
-GNEWS_DETAIL_COL = ['news_url', 'news_content', 'news_day', 'crawler_date']
-
-def serive_create(profilepath):
-    option = webdriver.ChromeOptions()
-    option.add_argument('--headless')
-    option.add_argument('--no-sandbox')
-    option.add_argument('--disable-web-security')
-    option.add_argument('--allow-running-insecure-content')
-    option.add_argument('--incognito')
-    option.add_argument(
-        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0')
-    # option.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
-    option.add_argument(
-        "--user-data-dir=/home/noodlesloves/.config/google-chrome/")
-    option.add_argument("profile-directory="+profilepath)
-    # driver = webdriver.Chrome('./utility/chromedriver', options=option)
-    driver = webdriver.Chrome(executable_path='/usr/bin/chromedriver', chrome_options=option,
-                              service_args=['--verbose', '--log-path=/tmp/chromedriver.log'])
-
-    executor_url = driver.command_executor._url
-    session_id = driver.session_id
-    print(session_id)
-    print(executor_url)
-
-    return driver
+
+def brower_start(port):
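+    # attach to the remote Selenium Chrome session exposed on this local port
+    # (main() restarts the matching "pw<port>" docker container before connecting)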
+    options = webdriver.ChromeOptions()
+    browser = webdriver.Remote(
+        command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
+        desired_capabilities=options.to_capabilities()
+    )
+    return browser
+
+
+def build_cache(db, table):
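+    # collect the URLs already stored in this table so already-crawled links can be skipped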
+    id_dict=[]
+    cursor = db.query('SELECT url FROM gnews.{};'.format(table))
+
+    for c in cursor:
+        id_dict += [c['url']]
+    return id_dict
 
 
 def conv_time(t):
@@ -68,70 +58,79 @@ def conv_time(t):
     return s
 
 
-def get_trends(q, db, current, profilepath):
-    driver = serive_create(profilepath)
-
+def get_trends(q, url_table, id_cache, driver):
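+    # search Google News for q and narrow the results to the past month before paging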
+    
     driver.get("https://www.google.com/search?q={}&tbm=nws".format(q))
     time.sleep(3)
-    # soup = BeautifulSoup(driver.page_source, 'html.parser')
-    # content = soup.prettify()
-    # print(content)
+
+    # open the search "Tools" bar (hdtb-tls)
+    element = driver.find_element(By.ID, "hdtb-tls")
+    driver.implicitly_wait(5)
+    ActionChains(driver).move_to_element(element).click(element).perform()
+
+    # open the time-filter dropdown (second toolbar menu)
+    element = driver.find_elements(By.XPATH, "//div[@class='KTBKoe']")
+    driver.implicitly_wait(5)
+    ActionChains(driver).move_to_element(element[1]).click(element[1]).perform()
+
+    # pick "過去 1 個月" (past month) from the time menu
+    element = driver.find_element(By.XPATH,"//div[@id='lb']")
+    ele = element.find_elements(By.XPATH,"//g-menu-item[@class='ErsxPb']")
+    for e in ele:
+        if e.text == '過去 1 個月':
+            print(e.text)
+            driver.implicitly_wait(5)
+            ActionChains(driver).move_to_element(e).click(e).perform()
+            break
+
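+    # page through the result cards: pull title, url, company and date from each,
+    # insert URLs not yet in id_cache, then click "pnnext" until no next page exists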
     c = 0
     while True:
-        time.sleep(15)
+        time.sleep(3)
         c += 1
         logger_.info('page {}'.format(c))
-        elmts = driver.find_elements_by_xpath("//div[@class='yr3B8d KWQBje']")
+        elmts = driver.find_elements_by_xpath("//g-card[@class='ftSUBd']")
+
         for elmt in elmts:
-            try:
+            # try:
+                title, url, company = '', '', ''
                 e = elmt.find_element_by_xpath(".//div[@role='heading']")
                 title = e.text
-                # check whether it is traditional
-                title.encode('big5')
-                print(e.text)
-                e2 = elmt.find_element_by_xpath(".//div[@class='Y3v8qd']")
-                # print(e2.text)
-                desc = e2.text
-                e3 = elmt.find_element_by_xpath("..")
-                print(e3.get_attribute('href'))
-                url = e3.get_attribute('href')
-                e4 = elmt.find_element_by_xpath(
-                    ".//div[@class='XTjFC WF4CUc']")
-                # print(e4.text)
-                company = e4.text
-                print(company)
-                e5 = elmt.find_element_by_xpath(".//span[@class='WG9SHc']")
-                # print(e5.text)
-                day = e5.text
+                # print(title)
+
+                url = elmt.find_element_by_xpath(".//a[@class='WlydOe']").get_attribute('href')
+                # print(url)
+
+                company = elmt.find_element_by_xpath(".//div[@class='CEMjEf NUnG9d']").text
+                # print(company)
+
+                day = elmt.find_element_by_xpath(".//div[@class='OSrXXb ZE0LJd']").text
                 day = conv_time(day)
+                # print(day)
 
-                tmp = [title, desc, url, q, company, str(day), current, 0]
-                insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
-                .format('url_list', str(tuple(URL_LIST_COL)).replace('\'',''), tuple(tmp))
+                current = datetime.today().strftime("%Y{y}%m{m}%d{d}").format(y='年', m='月', d='日')
+                print(title, url, company, day)
                 
-                DA.mysql_insert_data(db, insert_sql)
-                # if DA.check_unique(table, url):
-                #     table.insert({'title': title,
-                #                   'desc': desc,
-                #                   'url': url,
-                #                   'keyword': q,
-                #                   'company': company,
-                #                   'day': day,
-                #                   'crawler_date': current,
-                #                   '_status': 0})
-            except:
-                print(title, url)
+                if url not in id_cache:
+                    url_table.insert({
+                        'title': title,
+                        'url': url,
+                        'keyword': q,
+                        'company': company,
+                        'day': str(day),
+                        'crawler_date': current,
+                        'page': c,
+                        '_status': 0
+                    })
+            # except:
+            #     print('***',title, url)
         try:
-            elmt = driver.find_element_by_xpath("//a[@id='pnnext']")
+            element = driver.find_element_by_xpath("//a[@id='pnnext']")
+            driver.implicitly_wait(5)
+            ActionChains(driver).move_to_element(element).click(element).perform()
         except:
             print('done')
             break
-        webdriver.ActionChains(driver).move_to_element(elmt).perform()
-        webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
-        
-
-    print('done...')
-    driver.close()
+    logger_.info('{} news list update'.format(q))
 
 
 def our_rule(url, company, driver):
@@ -198,69 +197,14 @@ def content_download(url):
     return article.text, article.publish_date
 
 
-def SendCheck(detail_content, url, title, ws, pos, ner):
-    pattern = r'[陳|林|黃|張|李|王|吳|劉|蔡|楊|許|鄭|謝|郭|洪|曾|邱|廖|賴|周|徐|蘇|葉|莊|呂|江|何|蕭|羅|高|簡|朱|鍾|施|游|詹|沈|彭|胡|余|盧|潘|顏|梁|趙|柯|翁|魏|方|孫|張簡|戴|范|歐陽|宋|鄧|杜|侯|曹|薛|傅|丁|溫|紀|范姜|蔣|歐|藍|連|唐|馬|董|石|卓|程|姚|康|馮|古|姜|湯|汪|白|田|涂|鄒|巫|尤|鐘|龔|嚴|韓|黎|阮|袁|童|陸|金|錢|邵][\u4E00-\u9fa5]{2}[^攝]'
-    pattern = re.compile(pattern)
-
-    push = 0
-    content_ = re.sub(r'[^\w\s]', '', detail_content)
-    # print(content_)
-
-    for i in pattern.findall(content_):
-        index_ = content_.find(i)
-        pre = content_[index_-10:index_]
-        after = content_[index_:index_+10]
-        ws_results = ws([pre + after])
-
-        skip = 0
-        for word in ['記者', '報導', '攝影', '攝', '新聞']:
-            if word in ws_results[0]:
-                skip = 1
-        if skip:
-            continue
-
-        pos_results = pos(ws_results)
-        ner_results = ner(ws_results, pos_results)
-        c = 0
-        for i in ner_results[0]:
-            if 'PERSON' in list(i):
-                # print(ner_results)
-                push = 1
-                c = 1
-        if c == 1:
-            break
-
-        if (content_[index_-3:index_]) == '設計師' or (content_[index_-3:index_]) == '發明者' or (content_[index_-3:index_]) == '建築師' or (content_[index_-3:index_]) == '總經理':
-            # print(content_[index_-3:index_] + '-' + content_[index_:3])
-            push = 1
-            break
-
-        elif (content_[index_-2:index_]) == '會長' or (content_[index_-2:index_]) == '副總':
-            # print(content_[index_-2:index_] + '-' + content_[index_:3])
-            push = 1
-            break
-
-        elif (content_[index_-4:index_]) == '專案經理':
-            # print(content_[index_-4:index_] + '-' + content_[index_:3])
-            push = 1
-            break
-
-    if push == 1:
-        # pass
-        # bot.sendMessage(chat_id=1605426233, text=url)
-        params = {"message":  '[ '+ title  + ' ]  ' + url}
-        r = requests.post("https://notify-api.line.me/api/notify",headers=LINE_HEADERS, params=params)
-
-
-def detail_content(data, current, profilepath, db, ws, pos, ner):
-    driver = serive_create(profilepath)
+def detail_crawler(data, detail_table, url_table, error_table, driver):
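+    # for each pending URL, fetch the article body (site-specific rules first,
+    # newspaper's Article parser as fallback), store it in detail_table and mark the row done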
     error_list = []
     for key, group in data.iterrows():
-        url = group['news_url']
-        company = group['company']
+        url = group['url']
         print(url)
-        print(group['news_title'])
-        date = None
+        company = group['company']
+        date = group['day']
+        # crawler_date is stamped here so the "cannot find content" branch below can use it
+        current = datetime.today().strftime("%Y{y}%m{m}%d{d}").format(y='年', m='月', d='日')
+
         try:
             detail_content = our_rule(url, company, driver)
             if detail_content == '':
@@ -269,62 +213,83 @@ def detail_content(data, current, profilepath, db, ws, pos, ner):
             if detail_content == '':
                 logger_.warning('{} : cannot find content'.format(url))
                 error_list += [url]
+                error_table.insert({
+                    'url':url,
+                    'keyword': group['keyword'],
+                    'error_message': 'cannot find content',
+                    'crawler_date': current
+                })
+            
+            current = datetime.today().strftime("%Y{y}%m{m}%d{d}").format(y='年', m='月', d='日')
+            detail_table.insert({
+                'url': url, 
+                'keyword': group['keyword'],
+                'detail_content': detail_content,
+                'date': str(date),
+                'company': company,
+                'page': group['page'],
+                'crawler_date': current
+            })
+
+            url_table.upsert({'url':url,'_status':1},['url'])
+            time.sleep(2)
 
-            tmp = [url, detail_content, str(date), current]
-            insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
-            .format('gnews_detail', str(tuple(GNEWS_DETAIL_COL)).replace('\'',''), tuple(tmp))
-            DA.mysql_insert_data(db, insert_sql)
-
-            update_sql = "UPDATE url_list SET _status = 1 WHERE news_url = '{}'".format(url)
-            DA.mysql_insert_data(db, update_sql)
-            # tmp = dict(url=group['url'],
-            #             detail_content=detail_content,
-            #             day2=date,
-            #             crawler_date=current,
-            #             _status=1)
-            # table.update(tmp, ['url'])
-
-            SendCheck(detail_content, url, group['news_title'], ws, pos, ner)
-            time.sleep(15)
         except Exception as e:
-            print(url)
-            logger_.error('{} / {} : cannot find content'.format(url, str(e)))
-            update_sql = "UPDATE url_list SET _status = -1 WHERE news_url = '{}'".format(url)
-            DA.mysql_insert_data(db, update_sql)
-            error_list += [url]
+            current = datetime.today().strftime("%Y{y}%m{m}%d{d}").format(y='年', m='月', d='日')
+            error_table.insert({
+                'url':url,
+                'keyword': group['keyword'],
+                'error_message': str(e),
+                'crawler_date': current
+            })
+
     driver.close()
 
 
+def get_next_job(db, table, query_key):
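+    # return the rows for this keyword that are still waiting to be crawled (_status=0)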
+    result = db.query("select * from gnews.{} where _status=0 and keyword='{}'".format(table, query_key))
+    url_pd = pd.DataFrame([dict(i) for i in result])
+
+    return url_pd
+
+
 def main():
-    query_key = u'建材'
-    profilepath = 'Profile 1'
-    logger_.info('start...')
-    current = datetime.today().strftime(
-        "%Y{y}%m{m}%d{d}").format(y='年', m='月', d='日')
-
-    db = DA.mysql_connect(MYSQL_CONFIG, 'gnews')
-    # db = DA.DBconnect()
-    # table = DA.Tabelconnect(db, 'gnews3')
-    get_trends(query_key, db, current, profilepath)
-    
-    logger_.info('{} news list update'.format(query_key))
-    time.sleep(120)
-    
-    ws = WS("./data")
-    pos = POS("./data")
-    ner = NER("./data")
-    # pos,ner='',''
-    # table = DA.Tabelconnect(db, 'gnews3')
-    # data = pd.DataFrame([dict(i) for i in table.find(_status=0)])
-    query_sql = "SELECT * FROM url_list WHERE _status=0"
-    results = DA.mysql_select_data(db, query_sql)
-    results = pd.DataFrame(results, columns=['id'] + URL_LIST_COL)
-
-    logger_.info('find {} news...'.format(len(results)))
-    detail_content(results, current, profilepath, db, ws, pos, ner)
-
-    logger_.info('{} news description update'.format(query_key))
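+    # build the keyword queue from the Taipei districts in location_list.csv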
+    location_pd = pd.read_csv('location_list.csv')
+    location_pd = location_pd[location_pd['縣市']== '台北']
+    location = location_pd['地區'].to_list()
+
+    if len(sys.argv) > 1:
+        port = int(sys.argv[1])
+        print('restart docker pw{}'.format(port))
+        os.system('sudo docker container restart pw'+str(port))
+        time.sleep(8)
+    else:
+        # the port is required by brower_start() below; bail out if it was not provided
+        sys.exit('usage: python gnews.py <selenium port>')
+
+    driver = brower_start(port)
+
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gnews?charset=utf8mb4')
+    url_table_name = 'url_list2'
+    url_table = db[url_table_name]
+    detail_table = db['gnews_detail2']
+    error_table = db['error_list']
+
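+    # for every district keyword, search "<district> 政治" and then crawl the article bodies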
+    for keyword in location:
+        # 文山, 大安 and 中正 were already crawled (see the gnews log above)
+        if keyword in ('文山', '大安', '中正'):
+            continue
+        query_key = '{} 政治'.format(keyword)
+        logger_.info('{} start...'.format(query_key))
+
+        # find new news url
+        id_cache = build_cache(db, url_table_name)
+        get_trends(query_key, url_table, id_cache, driver)
+        time.sleep(5)
+
+        url_pd = get_next_job(db, url_table_name, query_key)
+        logger_.info('find {} news...'.format(len(url_pd)))
+        
+        detail_crawler(url_pd, detail_table, url_table, error_table, driver)
+        logger_.info('{} news description update'.format(query_key))
+
     db.close()
+    driver.close()
 
 if __name__ == "__main__":
     main()

+ 330 - 0
gnews_old.py

@@ -0,0 +1,330 @@
+from selenium import webdriver
+import time, pickle, sys, os, re, time, requests
+# import dataset
+import telegram
+import pandas as pd
+from datetime import datetime, timedelta
+from utility import database_access as DA
+from newspaper import Article
+from utility import log
+from bs4 import BeautifulSoup
+from ckiptagger import WS, POS, NER
+from utility.connect import *
+
+logger_ = log.init_logging('gnews', 'gnews')
+bot = telegram.Bot(token='1661195930:AAG8TNHUuXoghSFf3VC-oKbo_erep7Y1io4')
+
+URL_LIST_COL = ['news_title', 'news_desc', 'news_url', 'search_keyword', 'company', 'news_day','crawler_date','_status']
+GNEWS_DETAIL_COL = ['news_url', 'news_content', 'news_day', 'crawler_date']
+
+def serive_create(profilepath):
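+    # launch a local headless Chrome with a persistent user profile and return the driver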
+    option = webdriver.ChromeOptions()
+    option.add_argument('--headless')
+    option.add_argument('--no-sandbox')
+    option.add_argument('--disable-web-security')
+    option.add_argument('--allow-running-insecure-content')
+    option.add_argument('--incognito')
+    option.add_argument(
+        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0')
+    # option.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
+    option.add_argument(
+        "--user-data-dir=/home/noodlesloves/.config/google-chrome/")
+    option.add_argument("profile-directory="+profilepath)
+    # driver = webdriver.Chrome('./utility/chromedriver', options=option)
+    driver = webdriver.Chrome(executable_path='/usr/bin/chromedriver', chrome_options=option,
+                              service_args=['--verbose', '--log-path=/tmp/chromedriver.log'])
+
+    executor_url = driver.command_executor._url
+    session_id = driver.session_id
+    print(session_id)
+    print(executor_url)
+
+    return driver
+
+
+def conv_time(t):
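+    # convert a relative timestamp (秒 / 分鐘 / 小時 / 天 / 週) into an absolute
+    # YYYY年MM月DD日 date string; anything else is returned unchanged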
+    min = int(re.findall('\d+', t)[0])
+    if u'秒' in t:
+        s = (datetime.now() - timedelta(seconds=min)
+             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
+
+    elif u'分鐘' in t:
+        s = (datetime.now() - timedelta(minutes=min)
+             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
+
+    elif u'小時' in t:
+        s = (datetime.now() - timedelta(hours=min)
+             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
+
+    elif u'天' in t:
+        s = (datetime.now() - timedelta(days=min)
+             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
+
+    elif u'週' in t:
+        s = (datetime.now() - timedelta(days=min*7)
+             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
+    else:
+        s = t
+    return s
+
+
+def get_trends(q, db, current, profilepath):
+    driver = serive_create(profilepath)
+
+    driver.get("https://www.google.com/search?q={}&tbm=nws".format(q))
+    time.sleep(3)
+    # soup = BeautifulSoup(driver.page_source, 'html.parser')
+    # content = soup.prettify()
+    # print(content)
+    c = 0
+    while True:
+        time.sleep(15)
+        c += 1
+        logger_.info('page {}'.format(c))
+        elmts = driver.find_elements_by_xpath("//div[@class='yr3B8d KWQBje']")
+        for elmt in elmts:
+            try:
+                e = elmt.find_element_by_xpath(".//div[@role='heading']")
+                title = e.text
+                # check whether it is traditional
+                title.encode('big5')
+                print(e.text)
+                e2 = elmt.find_element_by_xpath(".//div[@class='Y3v8qd']")
+                # print(e2.text)
+                desc = e2.text
+                e3 = elmt.find_element_by_xpath("..")
+                print(e3.get_attribute('href'))
+                url = e3.get_attribute('href')
+                e4 = elmt.find_element_by_xpath(
+                    ".//div[@class='XTjFC WF4CUc']")
+                # print(e4.text)
+                company = e4.text
+                print(company)
+                e5 = elmt.find_element_by_xpath(".//span[@class='WG9SHc']")
+                # print(e5.text)
+                day = e5.text
+                day = conv_time(day)
+
+                tmp = [title, desc, url, q, company, str(day), current, 0]
+                insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
+                .format('url_list', str(tuple(URL_LIST_COL)).replace('\'',''), tuple(tmp))
+                
+                DA.mysql_insert_data(db, insert_sql)
+                # if DA.check_unique(table, url):
+                #     table.insert({'title': title,
+                #                   'desc': desc,
+                #                   'url': url,
+                #                   'keyword': q,
+                #                   'company': company,
+                #                   'day': day,
+                #                   'crawler_date': current,
+                #                   '_status': 0})
+            except:
+                print(title, url)
+        try:
+            elmt = driver.find_element_by_xpath("//a[@id='pnnext']")
+        except:
+            print('done')
+            break
+        webdriver.ActionChains(driver).move_to_element(elmt).perform()
+        webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
+        
+
+    print('done...')
+    driver.close()
+
+
+def our_rule(url, company, driver):
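+    # site-specific extraction: for known publishers, load the page and read the
+    # article body from that site's container element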
+    url_domain_list = ['買購不動產新聞台', 'HiNet 新聞社群', '好房網News', '自由時報地產天下', '經濟日報',
+                       '台灣醒報 Awakening News Network', '自由時報電子報', '自由電子報市場動態', '自由財經',
+                       'Bella儂儂', '康健雜誌', '台灣蘋果日報 娛樂時尚', '台灣蘋果日報', '台灣蘋果日報 動新聞',
+                       '公視新聞', '公民新聞', '自由娛樂', 'HiNet生活誌 - 中華電信']
+    detail_content = ""
+
+    if url.find('hk') == -1 and url.find('hongkong') == -1 and url.find('youtube') == -1:
+        if company in url_domain_list:
+            driver.get(url)
+            if company == '買購不動產新聞台':
+                e = driver.find_elements_by_xpath(
+                    ".//div[@class='content-font']")
+            elif company == 'HiNet 新聞社群':
+                e = driver.find_elements_by_xpath(".//div[@id='detail']")
+            elif company == '好房網News':
+                e = driver.find_elements_by_xpath(
+                    ".//div[@itemprop='articleBody']")
+            elif company == '自由時報地產天下':
+                e = driver.find_elements_by_xpath(".//div[@data-desc='內文']")
+            elif company == '經濟日報':
+                e = driver.find_elements_by_xpath(".//div[@id='article_body']")
+            elif company == '台灣醒報 Awakening News Network':
+                e = driver.find_elements_by_xpath(
+                    ".//div[@class='markdown-body']")
+            elif company == '自由時報電子報' or company == '自由電子報市場動態' or company == '自由財經':
+                e = driver.find_elements_by_xpath(".//div[@class='text']")
+            elif company == 'Bella儂儂':
+                e = driver.find_elements_by_xpath(".//div[@id='content_div']")
+            elif company == '康健雜誌':
+                e = driver.find_elements_by_xpath(
+                    ".//div[@class='limitContent']")
+            elif company == '台灣蘋果日報' or company == '台灣蘋果日報 動新聞':
+                e = driver.find_elements_by_xpath(
+                    ".//div[@class='text--desktop text--mobile article-text-size_md  tw-max_width']")
+            elif company == '台灣蘋果日報 娛樂時尚':
+                e = driver.find_elements_by_xpath(
+                    ".//p[@class='text--desktop text--mobile article-text-size_md  tw-max_width']")
+            elif company == '公視新聞':
+                e = driver.find_elements_by_xpath(
+                    ".//article[@class='post-article']")
+            elif company == 'udn 房地產':
+                e = driver.find_elements_by_xpath(
+                    ".//div[@id='story_body_content']")
+            elif company == '公民新聞':
+                e = driver.find_elements_by_xpath(
+                    ".//div[@class='field-items']")
+            elif company == '自由娛樂':
+                e = driver.find_elements_by_xpath(".//div[@class='text']")
+            elif company == 'HiNet生活誌 - 中華電信':
+                e = driver.find_elements_by_xpath(".//div[@id='detail']")
+            for i in e:
+                detail_content += i.text
+    return detail_content
+
+
+def content_download(url):
+    article = Article(url)
+    article.download()
+    article.parse()
+
+    return article.text, article.publish_date
+
+
+def SendCheck(detail_content, url, title, ws, pos, ner):
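+    # scan the text for Chinese surname + two-character given-name patterns, confirm a
+    # PERSON entity with CKIP ws/pos/ner (or a job-title prefix), then push a LINE Notify message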
+    pattern = r'[陳|林|黃|張|李|王|吳|劉|蔡|楊|許|鄭|謝|郭|洪|曾|邱|廖|賴|周|徐|蘇|葉|莊|呂|江|何|蕭|羅|高|簡|朱|鍾|施|游|詹|沈|彭|胡|余|盧|潘|顏|梁|趙|柯|翁|魏|方|孫|張簡|戴|范|歐陽|宋|鄧|杜|侯|曹|薛|傅|丁|溫|紀|范姜|蔣|歐|藍|連|唐|馬|董|石|卓|程|姚|康|馮|古|姜|湯|汪|白|田|涂|鄒|巫|尤|鐘|龔|嚴|韓|黎|阮|袁|童|陸|金|錢|邵][\u4E00-\u9fa5]{2}[^攝]'
+    pattern = re.compile(pattern)
+
+    push = 0
+    content_ = re.sub(r'[^\w\s]', '', detail_content)
+    # print(content_)
+
+    for i in pattern.findall(content_):
+        index_ = content_.find(i)
+        pre = content_[index_-10:index_]
+        after = content_[index_:index_+10]
+        ws_results = ws([pre + after])
+
+        skip = 0
+        for word in ['記者', '報導', '攝影', '攝', '新聞']:
+            if word in ws_results[0]:
+                skip = 1
+        if skip:
+            continue
+
+        pos_results = pos(ws_results)
+        ner_results = ner(ws_results, pos_results)
+        c = 0
+        for i in ner_results[0]:
+            if 'PERSON' in list(i):
+                # print(ner_results)
+                push = 1
+                c = 1
+        if c == 1:
+            break
+
+        if (content_[index_-3:index_]) == '設計師' or (content_[index_-3:index_]) == '發明者' or (content_[index_-3:index_]) == '建築師' or (content_[index_-3:index_]) == '總經理':
+            # print(content_[index_-3:index_] + '-' + content_[index_:3])
+            push = 1
+            break
+
+        elif (content_[index_-2:index_]) == '會長' or (content_[index_-2:index_]) == '副總':
+            # print(content_[index_-2:index_] + '-' + content_[index_:3])
+            push = 1
+            break
+
+        elif (content_[index_-4:index_]) == '專案經理':
+            # print(content_[index_-4:index_] + '-' + content_[index_:3])
+            push = 1
+            break
+
+    if push == 1:
+        # pass
+        # bot.sendMessage(chat_id=1605426233, text=url)
+        params = {"message":  '[ '+ title  + ' ]  ' + url}
+        r = requests.post("https://notify-api.line.me/api/notify",headers=LINE_HEADERS, params=params)
+
+
+def detail_content(data, current, profilepath, db, ws, pos, ner):
+    driver = serive_create(profilepath)
+    error_list = []
+    for key, group in data.iterrows():
+        url = group['news_url']
+        company = group['company']
+        print(url)
+        print(group['news_title'])
+        date = None
+        try:
+            detail_content = our_rule(url, company, driver)
+            if detail_content == '':
+                detail_content, date = content_download(url)
+
+            if detail_content == '':
+                logger_.warning('{} : cannot find content'.format(url))
+                error_list += [url]
+
+            tmp = [url, detail_content, str(date), current]
+            insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
+            .format('gnews_detail', str(tuple(GNEWS_DETAIL_COL)).replace('\'',''), tuple(tmp))
+            DA.mysql_insert_data(db, insert_sql)
+
+            update_sql = "UPDATE url_list SET _status = 1 WHERE news_url = '{}'".format(url)
+            DA.mysql_insert_data(db, update_sql)
+            # tmp = dict(url=group['url'],
+            #             detail_content=detail_content,
+            #             day2=date,
+            #             crawler_date=current,
+            #             _status=1)
+            # table.update(tmp, ['url'])
+
+            SendCheck(detail_content, url, group['news_title'], ws, pos, ner)
+            time.sleep(15)
+        except Exception as e:
+            print(url)
+            logger_.error('{} / {} : cannot find content'.format(url, str(e)))
+            update_sql = "UPDATE url_list SET _status = -1 WHERE news_url = '{}'".format(url)
+            DA.mysql_insert_data(db, update_sql)
+            error_list += [url]
+    driver.close()
+
+
+def main():
+    query_key = u'建材'
+    profilepath = 'Profile 1'
+    logger_.info('start...')
+    current = datetime.today().strftime(
+        "%Y{y}%m{m}%d{d}").format(y='年', m='月', d='日')
+
+    db = DA.mysql_connect(MYSQL_CONFIG, 'gnews')
+    # db = DA.DBconnect()
+    # table = DA.Tabelconnect(db, 'gnews3')
+    get_trends(query_key, db, current, profilepath)
+    
+    logger_.info('{} news list update'.format(query_key))
+    time.sleep(120)
+    
+    ws = WS("./data")
+    pos = POS("./data")
+    ner = NER("./data")
+    # pos,ner='',''
+    # table = DA.Tabelconnect(db, 'gnews3')
+    # data = pd.DataFrame([dict(i) for i in table.find(_status=0)])
+    query_sql = "SELECT * FROM url_list WHERE _status=0"
+    results = DA.mysql_select_data(db, query_sql)
+    results = pd.DataFrame(results, columns=['id'] + URL_LIST_COL)
+
+    logger_.info('find {} news...'.format(len(results)))
+    detail_content(results, current, profilepath, db, ws, pos, ner)
+
+    logger_.info('{} news description update'.format(query_key))
+    db.close()
+
+if __name__ == "__main__":
+    main()

+ 369 - 0
location_list.csv

@@ -0,0 +1,369 @@
+縣市,地區,location
+基隆,仁愛,基隆市仁愛區
+基隆,中正,基隆市中正區
+基隆,信義,基隆市信義區
+基隆,中山,基隆市中山區
+基隆,安樂,基隆市安樂區
+基隆,暖暖,基隆市暖暖區
+基隆,七堵,基隆市七堵區
+台北,中正,台北市中正區
+台北,大同,台北市大同區
+台北,中山,台北市中山區
+台北,松山,台北市松山區
+台北,大安,台北市大安區
+台北,萬華,台北市萬華區
+台北,信義,台北市信義區
+台北,士林,台北市士林區
+台北,北投,台北市北投區
+台北,內湖,台北市內湖區
+台北,南港,台北市南港區
+台北,文山,台北市文山區
+新北,板橋,新北市板橋區
+新北,新莊,新北市新莊區
+新北,中和,新北市中和區
+新北,永和,新北市永和區
+新北,土城,新北市土城區
+新北,樹林,新北市樹林區
+新北,三峽,新北市三峽區
+新北,鶯歌,新北市鶯歌區
+新北,三重,新北市三重區
+新北,蘆洲,新北市蘆洲區
+新北,五股,新北市五股區
+新北,泰山,新北市泰山區
+新北,林口,新北市林口區
+新北,八里,新北市八里區
+新北,淡水,新北市淡水區
+新北,三芝,新北市三芝區
+新北,石門,新北市石門區
+新北,金山,新北市金山區
+新北,萬里,新北市萬里區
+新北,汐止,新北市汐止區
+新北,瑞芳,新北市瑞芳區
+新北,貢寮,新北市貢寮區
+新北,平溪,新北市平溪區
+新北,雙溪,新北市雙溪區
+新北,新店,新北市新店區
+新北,深坑,新北市深坑區
+新北,石碇,新北市石碇區
+新北,坪林,新北市坪林區
+新北,烏來,新北市烏來區
+宜蘭,宜蘭市,宜蘭縣宜蘭市
+宜蘭,頭城,宜蘭縣頭城鎮
+宜蘭,礁溪,宜蘭縣礁溪鄉
+宜蘭,壯圍,宜蘭縣壯圍鄉
+宜蘭,員山,宜蘭縣員山鄉
+宜蘭,羅東,宜蘭縣羅東鎮
+宜蘭,蘇澳,宜蘭縣蘇澳鎮
+宜蘭,五結,宜蘭縣五結鄉
+宜蘭,三星,宜蘭縣三星鄉
+宜蘭,冬山,宜蘭縣冬山鄉
+宜蘭,大同,宜蘭縣大同鄉
+宜蘭,南澳,宜蘭縣南澳鄉
+桃園,桃園,桃園市桃園區
+桃園,中壢,桃園市中壢區
+桃園,平鎮,桃園市平鎮區
+桃園,八德,桃園市八德區
+桃園,楊梅,桃園市楊梅區
+桃園,蘆竹,桃園市蘆竹區
+桃園,大溪,桃園市大溪區
+桃園,龍潭,桃園市龍潭區
+桃園,龜山,桃園市龜山區
+桃園,大園,桃園市大園區
+桃園,觀音,桃園市觀音區
+桃園,新屋,桃園市新屋區
+桃園,復興,桃園市復興區
+新竹,東區,新竹市東區
+新竹,北區,新竹市北區
+新竹,香山,新竹市香山區
+新竹,竹北,新竹縣竹北市
+新竹,竹東,新竹縣竹東鎮
+新竹,新埔,新竹縣新埔鎮
+新竹,關西,新竹縣關西鎮
+新竹,湖口,新竹縣湖口鄉
+新竹,新豐,新竹縣新豐鄉
+新竹,峨眉,新竹縣峨眉鄉
+新竹,寶山,新竹縣寶山鄉
+新竹,北埔,新竹縣北埔鄉
+新竹,芎林,新竹縣芎林鄉
+新竹,橫山,新竹縣橫山鄉
+新竹,尖石,新竹縣尖石鄉
+新竹,五峰,新竹縣五峰鄉
+苗栗,苗栗,苗栗縣苗栗市
+苗栗,頭份,苗栗縣頭份市
+苗栗,竹南,苗栗縣竹南鎮
+苗栗,後龍,苗栗縣後龍鎮
+苗栗,通霄,苗栗縣通霄鎮
+苗栗,苑裡,苗栗縣苑裡鎮
+苗栗,卓蘭,苗栗縣卓蘭鎮
+苗栗,造橋,苗栗縣造橋鄉
+苗栗,西湖,苗栗縣西湖鄉
+苗栗,頭屋,苗栗縣頭屋鄉
+苗栗,公館,苗栗縣公館鄉
+苗栗,銅鑼,苗栗縣銅鑼鄉
+苗栗,三義,苗栗縣三義鄉
+苗栗,大湖,苗栗縣大湖鄉
+苗栗,獅潭,苗栗縣獅潭鄉
+苗栗,三灣,苗栗縣三灣鄉
+苗栗,南庄,苗栗縣南庄鄉
+苗栗,泰安,苗栗縣泰安鄉
+台中,東區,台中市東區
+台中,南區,台中市南區
+台中,西區,台中市西區
+台中,北區,台中市北區
+台中,中區,台中市中區
+台中,北屯,台中市北屯區
+台中,西屯,台中市西屯區
+台中,南屯,台中市南屯區
+台中,太平,台中市太平區
+台中,大里,台中市大里區
+台中,霧峰,台中市霧峰區
+台中,烏日,台中市烏日區
+台中,豐原,台中市豐原區
+台中,后里,台中市后里區
+台中,石岡,台中市石岡區
+台中,東勢,台中市東勢區
+台中,新社,台中市新社區
+台中,潭子,台中市潭子區
+台中,大雅,台中市大雅區
+台中,神岡,台中市神岡區
+台中,大肚,台中市大肚區
+台中,沙鹿,台中市沙鹿區
+台中,龍井,台中市龍井區
+台中,梧棲,台中市梧棲區
+台中,清水,台中市清水區
+台中,大甲,台中市大甲區
+台中,外埔,台中市外埔區
+台中,大安,台中市大安區
+台中,和平,台中市和平區
+彰化,彰化,彰化縣彰化市
+彰化,員林,彰化縣員林市
+彰化,和美,彰化縣和美鎮
+彰化,鹿港,彰化縣鹿港鎮
+彰化,溪湖,彰化縣溪湖鎮
+彰化,二林,彰化縣二林鎮
+彰化,田中,彰化縣田中鎮
+彰化,北斗,彰化縣北斗鎮
+彰化,花壇,彰化縣花壇鄉
+彰化,芬園,彰化縣芬園鄉
+彰化,大村,彰化縣大村鄉
+彰化,永靖,彰化縣永靖鄉
+彰化,伸港,彰化縣伸港鄉
+彰化,線西,彰化縣線西鄉
+彰化,福興,彰化縣福興鄉
+彰化,秀水,彰化縣秀水鄉
+彰化,埔心,彰化縣埔心鄉
+彰化,埔鹽,彰化縣埔鹽鄉
+彰化,大城,彰化縣大城鄉
+彰化,芳苑,彰化縣芳苑鄉
+彰化,竹塘,彰化縣竹塘鄉
+彰化,社頭,彰化縣社頭鄉
+彰化,二水,彰化縣二水鄉
+彰化,田尾,彰化縣田尾鄉
+彰化,埤頭,彰化縣埤頭鄉
+彰化,溪州,彰化縣溪州鄉
+南投,南投,南投縣南投市
+南投,埔里,南投縣埔里鎮
+南投,草屯,南投縣草屯鎮
+南投,竹山,南投縣竹山鎮
+南投,集集,南投縣集集鎮
+南投,名間,南投縣名間鄉
+南投,鹿谷,南投縣鹿谷鄉
+南投,中寮,南投縣中寮鄉
+南投,魚池,南投縣魚池鄉
+南投,國姓,南投縣國姓鄉
+南投,水里,南投縣水里鄉
+南投,信義,南投縣信義鄉
+南投,仁愛,南投縣仁愛鄉
+雲林,斗六,雲林縣斗六市
+雲林,斗南,雲林縣斗南鎮
+雲林,林內,雲林縣林內鄉
+雲林,古坑,雲林縣古坑鄉
+雲林,大埤,雲林縣大埤鄉
+雲林,莿桐,雲林縣莿桐鄉
+雲林,虎尾,雲林縣虎尾鎮
+雲林,西螺,雲林縣西螺鎮
+雲林,土庫,雲林縣土庫鎮
+雲林,褒忠,雲林縣褒忠鄉
+雲林,二崙,雲林縣二崙鄉
+雲林,崙背,雲林縣崙背鄉
+雲林,麥寮,雲林縣麥寮鄉
+雲林,台西,雲林縣台西鄉
+雲林,東勢,雲林縣東勢鄉
+雲林,北港,雲林縣北港鎮
+雲林,元長,雲林縣元長鄉
+雲林,四湖,雲林縣四湖鄉
+雲林,口湖,雲林縣口湖鄉
+雲林,水林,雲林縣水林鄉
+嘉義,東區,嘉義市東區
+嘉義,西區,嘉義市西區
+嘉義,太保,嘉義縣太保市
+嘉義,朴子,嘉義縣朴子市
+嘉義,布袋,嘉義縣布袋鎮
+嘉義,大林,嘉義縣大林鎮
+嘉義,民雄,嘉義縣民雄鄉
+嘉義,溪口,嘉義縣溪口鄉
+嘉義,新港,嘉義縣新港鄉
+嘉義,六腳,嘉義縣六腳鄉
+嘉義,東石,嘉義縣東石鄉
+嘉義,義竹,嘉義縣義竹鄉
+嘉義,鹿草,嘉義縣鹿草鄉
+嘉義,水上,嘉義縣水上鄉
+嘉義,中埔,嘉義縣中埔鄉
+嘉義,竹崎,嘉義縣竹崎鄉
+嘉義,梅山,嘉義縣梅山鄉
+嘉義,番路,嘉義縣番路鄉
+嘉義,大埔,嘉義縣大埔鄉
+嘉義,阿里山,嘉義縣阿里山鄉
+台南,中西,台南市中西區
+台南,市東,台南市東區
+台南,市南,台南市南區
+台南,市北,台南市北區
+台南,安平,台南市安平區
+台南,安南,台南市安南區
+台南,永康,台南市永康區
+台南,歸仁,台南市歸仁區
+台南,新化,台南市新化區
+台南,左鎮,台南市左鎮區
+台南,玉井,台南市玉井區
+台南,楠西,台南市楠西區
+台南,南化,台南市南化區
+台南,仁德,台南市仁德區
+台南,關廟,台南市關廟區
+台南,龍崎,台南市龍崎區
+台南,官田,台南市官田區
+台南,麻豆,台南市麻豆區
+台南,佳里,台南市佳里區
+台南,西港,台南市西港區
+台南,七股,台南市七股區
+台南,將軍,台南市將軍區
+台南,學甲,台南市學甲區
+台南,北門,台南市北門區
+台南,新營,台南市新營區
+台南,後壁,台南市後壁區
+台南,白河,台南市白河區
+台南,東山,台南市東山區
+台南,六甲,台南市六甲區
+台南,下營,台南市下營區
+台南,柳營,台南市柳營區
+台南,鹽水,台南市鹽水區
+台南,善化,台南市善化區
+台南,大內,台南市大內區
+台南,山上,台南市山上區
+台南,新市,台南市新市區
+台南,安定,台南市安定區
+高雄,楠梓,高雄市楠梓區
+高雄,左營,高雄市左營區
+高雄,鼓山,高雄市鼓山區
+高雄,三民,高雄市三民區
+高雄,鹽埕,高雄市鹽埕區
+高雄,前金,高雄市前金區
+高雄,新興,高雄市新興區
+高雄,苓雅,高雄市苓雅區
+高雄,前鎮,高雄市前鎮區
+高雄,旗津,高雄市旗津區
+高雄,小港,高雄市小港區
+高雄,鳳山,高雄市鳳山區
+高雄,大寮,高雄市大寮區
+高雄,鳥松,高雄市鳥松區
+高雄,林園,高雄市林園區
+高雄,仁武,高雄市仁武區
+高雄,大樹,高雄市大樹區
+高雄,大社,高雄市大社區
+高雄,岡山,高雄市岡山區
+高雄,路竹,高雄市路竹區
+高雄,橋頭,高雄市橋頭區
+高雄,梓官,高雄市梓官區
+高雄,彌陀,高雄市彌陀區
+高雄,永安,高雄市永安區
+高雄,燕巢,高雄市燕巢區
+高雄,田寮,高雄市田寮區
+高雄,阿蓮,高雄市阿蓮區
+高雄,茄萣,高雄市茄萣區
+高雄,湖內,高雄市湖內區
+高雄,旗山,高雄市旗山區
+高雄,美濃,高雄市美濃區
+高雄,內門,高雄市內門區
+高雄,杉林,高雄市杉林區
+高雄,甲仙,高雄市甲仙區
+高雄,六龜,高雄市六龜區
+高雄,茂林,高雄市茂林區
+高雄,桃源,高雄市桃源區
+高雄,那瑪夏,高雄市那瑪夏區
+屏東,屏東,屏東縣屏東市
+屏東,潮州,屏東縣潮州鎮
+屏東,東港,屏東縣東港鎮
+屏東,恆春,屏東縣恆春鎮
+屏東,萬丹,屏東縣萬丹鄉
+屏東,崁頂,屏東縣崁頂鄉
+屏東,新園,屏東縣新園鄉
+屏東,林邊,屏東縣林邊鄉
+屏東,南州,屏東縣南州鄉
+屏東,琉球,屏東縣琉球鄉
+屏東,枋寮,屏東縣枋寮鄉
+屏東,枋山,屏東縣枋山鄉
+屏東,車城,屏東縣車城鄉
+屏東,滿州,屏東縣滿州鄉
+屏東,高樹,屏東縣高樹鄉
+屏東,九如,屏東縣九如鄉
+屏東,鹽埔,屏東縣鹽埔鄉
+屏東,里港,屏東縣里港鄉
+屏東,內埔,屏東縣內埔鄉
+屏東,竹田,屏東縣竹田鄉
+屏東,長治,屏東縣長治鄉
+屏東,麟洛,屏東縣麟洛鄉
+屏東,萬巒,屏東縣萬巒鄉
+屏東,新埤,屏東縣新埤鄉
+屏東,佳冬,屏東縣佳冬鄉
+屏東,霧台,屏東縣霧台鄉
+屏東,泰武,屏東縣泰武鄉
+屏東,瑪家,屏東縣瑪家鄉
+屏東,來義,屏東縣來義鄉
+屏東,春日,屏東縣春日鄉
+屏東,獅子,屏東縣獅子鄉
+屏東,牡丹,屏東縣牡丹鄉
+屏東,三地門,屏東縣三地門鄉
+花蓮,花蓮,花蓮縣花蓮市
+花蓮,吉安,花蓮縣吉安鄉
+花蓮,壽豐,花蓮縣壽豐鄉
+花蓮,新城,花蓮縣新城鄉
+花蓮,鳳林,花蓮縣鳳林鎮
+花蓮,光復,花蓮縣光復鄉
+花蓮,玉里,花蓮縣玉里鎮
+花蓮,豐濱,花蓮縣豐濱鄉
+花蓮,瑞穗,花蓮縣瑞穗鄉
+花蓮,富里,花蓮縣富里鄉
+花蓮,秀林,花蓮縣秀林鄉
+花蓮,萬榮,花蓮縣萬榮鄉
+花蓮,卓溪,花蓮縣卓溪鄉
+台東,台東,台東縣台東市
+台東,成功,台東縣成功鎮
+台東,關山,台東縣關山鎮
+台東,長濱,台東縣長濱鄉
+台東,池上,台東縣池上鄉
+台東,東河,台東縣東河鄉
+台東,鹿野,台東縣鹿野鄉
+台東,卑南,台東縣卑南鄉
+台東,大武,台東縣大武鄉
+台東,綠島,台東縣綠島鄉
+台東,太麻里,台東縣太麻里鄉
+台東,海端,台東縣海端鄉
+台東,延平,台東縣延平鄉
+台東,金峰,台東縣金峰鄉
+台東,達仁,台東縣達仁鄉
+台東,蘭嶼,台東縣蘭嶼鄉
+澎湖,馬公,澎湖縣馬公市
+澎湖,湖西,澎湖縣湖西鄉
+澎湖,白沙,澎湖縣白沙鄉
+澎湖,西嶼,澎湖縣西嶼鄉
+澎湖,望安,澎湖縣望安鄉
+澎湖,七美,澎湖縣七美鄉
+金門,金城,金門縣金城鎮
+金門,金湖,金門縣金湖鎮
+金門,金沙,金門縣金沙鎮
+金門,金寧,金門縣金寧鄉
+金門,烈嶼,金門縣烈嶼鄉
+金門,烏坵,金門縣烏坵鄉
+連江,南竿,連江縣南竿鄉
+連江,北竿,連江縣北竿鄉
+連江,莒光,連江縣莒光鄉
+連江,東引,連江縣東引鄉

+ 0 - 0
utility/__init__.py


+ 37 - 0
utility/log.py

@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+import os
+import sys
+from logging import DEBUG
+from logging import Formatter
+from logging import StreamHandler
+from logging import getLogger
+from logging.handlers import RotatingFileHandler
+
+
+def init_logging(module_name, log_file, level=DEBUG, max_bytes=65536000,
+                 backup_count=3):
+    """get the logger for the module_name and the log_file
+    Args:
+        module_name (str): the module name
+        log_file (str): the log file
+        level:
+        max_bytes: 65536000, 64MB
+        backup_count:
+    Returns:
+        Logger: the logger   
+    """
+    
+    logger = getLogger(module_name)
+    if logger.handlers:
+        return logger
+    logger.setLevel(level)
+    logger.propagate = True
+    handler = RotatingFileHandler(
+        filename=log_file, maxBytes=max_bytes, backupCount=backup_count)
+    formatter = Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s")
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    handler = StreamHandler(sys.stdout)
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    return logger
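+
+# Usage, as in gnews.py:
+#   logger_ = log.init_logging('gnews', 'gnews')
+#   logger_.info('start...')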