
gnews_with_keyword

noodles 2 years ago
parent
commit
2b78ba2934
3 changed files with 365 additions and 0 deletions
  1. gnew_md.py (+61 -0)
  2. gnews_keyword.py (+291 -0)
  3. start.sh (+13 -0)

gnew_md.py (+61 -0)

@@ -0,0 +1,61 @@
+import dataset
+import os, sys
+import zipfile
+import pandas as pd
+
+from datetime import date
+from datetime import datetime, timedelta
+from jinja2 import Environment, FileSystemLoader
+
+today = date.today().strftime("%Y/%m/%d")
+current = datetime.today().strftime("%Y{y}%m{m}%d{d}").format(y='年', m='月', d='日')
+
+# zip the generated markdown directory into <path>.zip
+def zip_dir(path):
+    with zipfile.ZipFile('{}.zip'.format(path), 'w', zipfile.ZIP_DEFLATED) as zf:
+        for root, dirs, files in os.walk(path):
+            for file_name in files:
+                zf.write(os.path.join(root, file_name))
+
+
+def data_read(keyword):
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gnews?charset=utf8mb4')
+
+    # join today's url list with the crawled article details for this keyword
+    result = db.query("select * from gnews.{} a join gnews.{} b on a.url = b.url and a.keyword='{}' and b.crawler_date='{}'".format('url_list2', 'gnews_detail2', keyword, current))
+    url_pd = pd.DataFrame([dict(i) for i in result])
+    
+    db.close()
+
+    return url_pd
+
+
+def main():
+    if len(sys.argv) > 1:
+        keyword = sys.argv[1]
+    else:
+        print('usage: python gnew_md.py <keyword>')
+        sys.exit(1)
+
+    output_path = 'gnews_md/{}'.format(today.replace('/',''))
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+
+    print('starting: {}'.format(keyword))
+    data = data_read(keyword)
+    data = data.head(20)
+
+    # load the markdown template once, outside the loop
+    file_loader = FileSystemLoader('gnews_md/template')
+    env = Environment(loader=file_loader)
+    template = env.get_template('gnews.md')
+
+    for key, row in data.iterrows():
+        output = template.render(title=row['title'], date=today,
+                                 keyword=[row['keyword']],
+                                 content=row['detail_content'])
+
+        # strip '/' from the title so the filename is not treated as a sub-directory
+        with open("{}/{}.md".format(output_path, row['title'].replace('/', ' ')), "w") as fh:
+            fh.write(output)
+
+    zip_dir(output_path)
+
+if __name__ == "__main__":
+    main()
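
Note: gnew_md.py renders each article through a Jinja2 template at gnews_md/template/gnews.md, which is not included in this commit. The snippet below is a hypothetical stand-in, written as a Jinja2 source string so it can be tried without the file; only the variable names (title, date, keyword, content) come from the render() call above, and the front-matter layout is a guess.

from jinja2 import Environment

# Hypothetical stand-in for gnews_md/template/gnews.md (not part of this commit);
# only the variable names are taken from the render() call in gnew_md.py.
GNEWS_MD_TEMPLATE = """---
title: "{{ title }}"
date: {{ date }}
keywords: {{ keyword | join(', ') }}
---

{{ content }}
"""

if __name__ == "__main__":
    # Render one sample row the same way gnew_md.py does.
    template = Environment().from_string(GNEWS_MD_TEMPLATE)
    print(template.render(title='範例標題', date='2023/01/15',
                          keyword=['董事會'], content='內文'))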

gnews_keyword.py (+291 -0)

@@ -0,0 +1,291 @@
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.support.wait import WebDriverWait
+import time, sys, os, re
+import dataset
+import pandas as pd
+from datetime import datetime, timedelta
+from newspaper import Article
+from utility import log
+from bs4 import BeautifulSoup
+# from ckiptagger import WS, POS, NER
+
+# remote : http://172.17.0.2:4444
+
+logger_ = log.init_logging('gnews', 'gnews')
+current = datetime.today().strftime("%Y{y}%m{m}%d{d}").format(y='年', m='月', d='日')
+
+# start a session on a remote Selenium Chrome node listening on the given port
+def browser_start(port):
+    options = webdriver.ChromeOptions()
+    browser = webdriver.Remote(
+        command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
+        desired_capabilities=options.to_capabilities()
+    )
+    return browser
+
+
+# return the list of URLs already stored, so duplicates are not inserted again
+def build_cache(db, table):
+    id_dict = []
+    cursor = db.query('SELECT url FROM gnews.{};'.format(table))
+
+    for c in cursor:
+        id_dict += [c['url']]
+    return id_dict
+
+
+# convert a relative Google News time string (e.g. '3 小時前') to an absolute date
+def conv_time(t):
+    num = int(re.findall(r'\d+', t)[0])
+    if u'秒' in t:
+        s = (datetime.now() - timedelta(seconds=num)
+             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
+
+    elif u'分鐘' in t:
+        s = (datetime.now() - timedelta(minutes=num)
+             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
+
+    elif u'小時' in t:
+        s = (datetime.now() - timedelta(hours=num)
+             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
+
+    elif u'天' in t:
+        s = (datetime.now() - timedelta(days=num)
+             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
+
+    elif u'週' in t:
+        s = (datetime.now() - timedelta(days=num * 7)
+             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
+    else:
+        s = t
+    return s
+
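+# Example (assuming the crawl runs on 2023年01月15日):
+#   conv_time('3 小時前')  ->  '2023年01月15日'
+#   conv_time('2 天前')    ->  '2023年01月13日'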
+
+def get_trends(q, url_table, id_cache, driver):
+    
+    driver.get("https://www.google.com/search?q={}&tbm=nws".format(q))
+    time.sleep(3)
+    # print(driver.page_source)
+    # click tool
+    # element = driver.find_element(By.ID, "hdtb-tls")
+    # driver.implicitly_wait(5)
+    # ActionChains(driver).move_to_element(element).click(element).perform()
+
+    # click time
+    # element = driver.find_elements(By.XPATH, "//div[@class='KTBKoe']")
+    # driver.implicitly_wait(5)
+    # ActionChains(driver).move_to_element(element[1]).click(element[1]).perform()
+
+    # click time
+    # element = driver.find_element(By.XPATH,"//div[@id='lb']")
+    # ele = element.find_elements(By.XPATH,"//g-menu-item[@class='ErsxPb']")
+    # for e in ele:
+    #     if e.text == '過去 24 小時':
+    #         print(e.text)
+    #         driver.implicitly_wait(5)
+    #         ActionChains(driver).move_to_element(e).click(e).perform()
+    #         break
+
+    c = 0
+    while True:
+        time.sleep(3)
+        c += 1
+        logger_.info('page {}'.format(c))
+        # print(driver.page_source)
+        elmts = driver.find_elements_by_xpath("//div[@class='xuvV6b BGxR7d']")
+        # print(elmts)
+        for elmt in elmts:
+            title, url, company = '', '', ''
+            e = elmt.find_element_by_xpath(".//div[@role='heading']")
+            title = e.text
+            # print(title)
+
+            url = elmt.find_element_by_xpath(".//a[@class='WlydOe']").get_attribute('href')
+            # print(url)
+
+            company = elmt.find_element_by_xpath(".//div[@class='CEMjEf NUnG9d']").text
+            # print(company)
+
+            day = elmt.find_element_by_xpath(".//div[@class='OSrXXb ZE0LJd']").text
+            day = conv_time(day)
+            # print(day)
+
+            print(title, url, company, day)
+            
+            if url not in id_cache:
+                url_table.insert({
+                    'title': title,
+                    'url': url,
+                    'keyword': q,
+                    'company': company,
+                    'day': str(day),
+                    'crawler_date': current,
+                    'page': c,
+                    '_status': 0
+                })
+
+        # stop paging after the first few result pages
+        if c > 3:
+            break
+
+        try:
+            element = driver.find_element_by_xpath("//a[@id='pnnext']")
+            driver.implicitly_wait(5)
+            ActionChains(driver).move_to_element(element).click(element).perform()
+        except Exception:
+            print('done')
+            break
+    logger_.info('{} news list update'.format(q))
+    return driver 
+
+# site-specific content extraction for publishers that need a dedicated XPath
+def our_rule(url, company, driver):
+    url_domain_list = ['買購不動產新聞台', 'HiNet 新聞社群', '好房網News', '自由時報地產天下', '經濟日報',
+                       '台灣醒報 Awakening News Network', '自由時報電子報', '自由電子報市場動態', '自由財經',
+                       'Bella儂儂', '康健雜誌', '台灣蘋果日報 娛樂時尚', '台灣蘋果日報', '台灣蘋果日報 動新聞',
+                       '公視新聞', '公民新聞', '自由娛樂', 'HiNet生活誌 - 中華電信', 'udn 房地產']
+    detail_content = ""
+
+    if url.find('hk') == -1 and url.find('hongkong') == -1 and url.find('youtube') == -1:
+        if company in url_domain_list:
+            driver.get(url)
+            if company == '買購不動產新聞台':
+                e = driver.find_elements_by_xpath(
+                    ".//div[@class='content-font']")
+            elif company == 'HiNet 新聞社群':
+                e = driver.find_elements_by_xpath(".//div[@id='detail']")
+            elif company == '好房網News':
+                e = driver.find_elements_by_xpath(
+                    ".//div[@itemprop='articleBody']")
+            elif company == '自由時報地產天下':
+                e = driver.find_elements_by_xpath(".//div[@data-desc='內文']")
+            elif company == '經濟日報':
+                e = driver.find_elements_by_xpath(".//div[@id='article_body']")
+            elif company == '台灣醒報 Awakening News Network':
+                e = driver.find_elements_by_xpath(
+                    ".//div[@class='markdown-body']")
+            elif company == '自由時報電子報' or company == '自由電子報市場動態' or company == '自由財經':
+                e = driver.find_elements_by_xpath(".//div[@class='text']")
+            elif company == 'Bella儂儂':
+                e = driver.find_elements_by_xpath(".//div[@id='content_div']")
+            elif company == '康健雜誌':
+                e = driver.find_elements_by_xpath(
+                    ".//div[@class='limitContent']")
+            elif company == '台灣蘋果日報' or company == '台灣蘋果日報 動新聞':
+                e = driver.find_elements_by_xpath(
+                    ".//div[@class='text--desktop text--mobile article-text-size_md  tw-max_width']")
+            elif company == '台灣蘋果日報 娛樂時尚':
+                e = driver.find_elements_by_xpath(
+                    ".//p[@class='text--desktop text--mobile article-text-size_md  tw-max_width']")
+            elif company == '公視新聞':
+                e = driver.find_elements_by_xpath(
+                    ".//article[@class='post-article']")
+            elif company == 'udn 房地產':
+                e = driver.find_elements_by_xpath(
+                    ".//div[@id='story_body_content']")
+            elif company == '公民新聞':
+                e = driver.find_elements_by_xpath(
+                    ".//div[@class='field-items']")
+            elif company == '自由娛樂':
+                e = driver.find_elements_by_xpath(".//div[@class='text']")
+            elif company == 'HiNet生活誌 - 中華電信':
+                e = driver.find_elements_by_xpath(".//div[@id='detail']")
+            for i in e:
+                detail_content += i.text
+    return detail_content
+
+
+# generic fallback extraction with the newspaper library
+def content_download(url):
+    article.download()
+    article.parse()
+
+    return article.text, article.publish_date
+
+
+def detail_crawler(data, detail_table, url_table, error_table, driver):
+    error_list = []
+    for key, group in data.iterrows():
+        url = group['url']
+        print(url)
+        company = group['company']
+        date = group['day']
+
+        try:
+            detail_content = our_rule(url, company, driver)
+            if detail_content == '':
+                detail_content, date = content_download(url)
+
+            if detail_content == '':
+                logger_.warning('{} : cannot find content'.format(url))
+                error_list += [url]
+                error_table.insert({
+                    'url':url,
+                    'keyword': group['keyword'],
+                    'error_message': 'cannot find content',
+                    'crawler_date': current
+                })
+            
+            detail_table.insert({
+                'url': url, 
+                'keyword': group['keyword'],
+                'detail_content': detail_content,
+                'date': str(date),
+                'company': company,
+                'page': group['page'],
+                'crawler_date': current
+            })
+
+            url_table.upsert({'url':url,'_status':1},['url'])
+            time.sleep(2)
+
+        except Exception as e:
+            error_table.insert({
+                'url':url,
+                'keyword': group['keyword'],
+                'error_message': str(e),
+                'crawler_date': current
+            })
+
+    return driver
+
+
+# fetch today's still-unprocessed urls (_status=0) for the keyword
+def get_next_job(db, table, query_key):
+    result = db.query("select * from gnews.{} where _status=0 and keyword='{}' and crawler_date='{}'".format(table, query_key, current))
+    url_pd = pd.DataFrame([dict(i) for i in result])
+
+    return url_pd
+
+
+def main():
+
+    if len(sys.argv) > 2:
+        port = int(sys.argv[1])
+        print('restart docker pw{}'.format(port))
+        os.system('sudo docker container restart pw' + str(port))
+        time.sleep(8)
+
+        keyword = sys.argv[2]
+    else:
+        print('usage: python gnews_keyword.py <selenium_port> <keyword>')
+        sys.exit(1)
+
+    driver = browser_start(port)
+
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gnews?charset=utf8mb4')
+    url_table_name = 'url_list2'
+    url_table = db[url_table_name]
+    detail_table = db['gnews_detail2']
+    error_table = db['error_list']
+
+    query_key = keyword
+    logger_.info('{} start...'.format(query_key))
+
+    # find new news url
+    id_cache = build_cache(db, url_table_name)
+    driver = get_trends(query_key, url_table, id_cache, driver)
+    time.sleep(5)
+
+    url_pd = get_next_job(db, url_table_name, query_key)
+    logger_.info('find {} news...'.format(len(url_pd)))
+    
+    driver = detail_crawler(url_pd, detail_table, url_table, error_table, driver)
+    logger_.info('{} news description update'.format(query_key))
+
+    db.close()
+    driver.quit()
+
+if __name__ == "__main__":
+    main()
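
Note: gnews_keyword.py does `from utility import log`, but the utility package is not part of this commit. Below is a minimal sketch of what utility/log.py could look like; the signature init_logging(name, filename) and the fact that it returns a standard logger used with .info() and .warning() are inferred from the call sites above, everything else is an assumption.

import logging

# Hypothetical utility/log.py (not part of this commit); inferred from
# log.init_logging('gnews', 'gnews') in gnews_keyword.py.
def init_logging(name, filename):
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    if not logger.handlers:
        fmt = logging.Formatter('%(asctime)s %(levelname)s %(name)s: %(message)s')
        file_handler = logging.FileHandler('{}.log'.format(filename))
        file_handler.setFormatter(fmt)
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(fmt)
        logger.addHandler(file_handler)
        logger.addHandler(stream_handler)
    return logger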

start.sh (+13 -0)

@@ -0,0 +1,13 @@
+#!/bin/bash
+# crawl Google News for each keyword, then render the results to markdown
+
+source /home/noodlesloves/miniconda3/bin/activate
+conda activate noodles
+
+for keyword in 董事會 股東會 股權結構
+do
+    echo "$keyword"
+    python gnews_keyword.py 4446 "$keyword"
+    python gnew_md.py "$keyword"
+done
+conda deactivate