@@ -0,0 +1,291 @@
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.support.wait import WebDriverWait
+import time, pickle, sys, os, re, requests
+import dataset
+import pandas as pd
+from datetime import datetime, timedelta
+from newspaper import Article
+from utility import log
+from bs4 import BeautifulSoup
+# from ckiptagger import WS, POS, NER
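+
+# Google News keyword crawler: collects search-result metadata into MySQL via a
+# remote Selenium Chrome node, then downloads each article's full text using
+# per-site xpath rules (falling back to newspaper3k).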
+
+# remote : http://172.17.0.2:4444
+
+logger_ = log.init_logging('gnews', 'gnews')
+current = datetime.today().strftime("%Y{y}%m{m}%d{d}").format(y='年', m='月', d='日')
+
+
+def browser_start(port):
+    # Connect to the remote Selenium Chrome node listening on the given port.
+    options = webdriver.ChromeOptions()
+    browser = webdriver.Remote(
+        command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
+        desired_capabilities=options.to_capabilities()
+    )
+    return browser
+
+
+def build_cache(db, table):
+    # Return every URL already stored, so previously crawled articles are skipped.
+    url_list = []
+    cursor = db.query('SELECT url FROM gnews.{};'.format(table))
+
+    for c in cursor:
+        url_list += [c['url']]
+    return url_list
+
+
+def conv_time(t):
+    # Convert a relative Google timestamp (e.g. "3 小時前") into an absolute
+    # "YYYY年MM月DD日" date string.
+    fmt = '%Y{y}%m{m}%d{d}'
+    value = int(re.findall(r'\d+', t)[0])
+    if u'秒' in t:
+        delta = timedelta(seconds=value)
+    elif u'分鐘' in t:
+        delta = timedelta(minutes=value)
+    elif u'小時' in t:
+        delta = timedelta(hours=value)
+    elif u'天' in t:
+        delta = timedelta(days=value)
+    elif u'週' in t:
+        delta = timedelta(days=value * 7)
+    else:
+        # Already an absolute date string; return it unchanged.
+        return t
+    return (datetime.now() - delta).strftime(fmt).format(y='年', m='月', d='日')
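+
+# Example: conv_time('3 小時前') returns the date three hours ago, formatted
+# like `current` ('%Y年%m月%d日'); absolute date strings are returned unchanged.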
+
+
+def get_trends(q, url_table, id_cache, driver):
+    # Search Google News for the keyword and record metadata for every result
+    # that is not already in the url table.
+    driver.get("https://www.google.com/search?q={}&tbm=nws".format(q))
+    time.sleep(3)
+    # print(driver.page_source)
+
+    # click tool
+    # element = driver.find_element(By.ID, "hdtb-tls")
+    # driver.implicitly_wait(5)
+    # ActionChains(driver).move_to_element(element).click(element).perform()
+
+    # click time
+    # element = driver.find_elements(By.XPATH, "//div[@class='KTBKoe']")
+    # driver.implicitly_wait(5)
+    # ActionChains(driver).move_to_element(element[1]).click(element[1]).perform()
+
+    # click time
+    # element = driver.find_element(By.XPATH, "//div[@id='lb']")
+    # ele = element.find_elements(By.XPATH, "//g-menu-item[@class='ErsxPb']")
+    # for e in ele:
+    #     if e.text == '過去 24 小時':
+    #         print(e.text)
+    #         driver.implicitly_wait(5)
+    #         ActionChains(driver).move_to_element(e).click(e).perform()
+    #         break
+
+    c = 0
+    while True:
+        time.sleep(3)
+        c += 1
+        logger_.info('page {}'.format(c))
+        # print(driver.page_source)
+        elmts = driver.find_elements_by_xpath("//div[@class='xuvV6b BGxR7d']")
+        for elmt in elmts:
+            title, url, company = '', '', ''
+            e = elmt.find_element_by_xpath(".//div[@role='heading']")
+            title = e.text
+
+            url = elmt.find_element_by_xpath(".//a[@class='WlydOe']").get_attribute('href')
+
+            company = elmt.find_element_by_xpath(".//div[@class='CEMjEf NUnG9d']").text
+
+            day = elmt.find_element_by_xpath(".//div[@class='OSrXXb ZE0LJd']").text
+            day = conv_time(day)
+
+            print(title, url, company, day)
+
+            # Insert only URLs that have not been crawled before.
+            if url not in id_cache:
+                url_table.insert({
+                    'title': title,
+                    'url': url,
+                    'keyword': q,
+                    'company': company,
+                    'day': str(day),
+                    'crawler_date': current,
+                    'page': c,
+                    '_status': 0
+                })
+
+        # Stop after four result pages, or when there is no "next" link.
+        if c > 3:
+            break
+        try:
+            element = driver.find_element_by_xpath("//a[@id='pnnext']")
+            driver.implicitly_wait(5)
+            ActionChains(driver).move_to_element(element).click(element).perform()
+        except Exception:
+            print('done')
+            break
+    logger_.info('{} news list update'.format(q))
+    return driver
+
+
+def our_rule(url, company, driver):
+    # Site-specific xpaths pointing at the article body of each supported outlet.
+    content_xpath = {
+        '買購不動產新聞台': ".//div[@class='content-font']",
+        'HiNet 新聞社群': ".//div[@id='detail']",
+        'HiNet生活誌 - 中華電信': ".//div[@id='detail']",
+        '好房網News': ".//div[@itemprop='articleBody']",
+        '自由時報地產天下': ".//div[@data-desc='內文']",
+        '經濟日報': ".//div[@id='article_body']",
+        '台灣醒報 Awakening News Network': ".//div[@class='markdown-body']",
+        '自由時報電子報': ".//div[@class='text']",
+        '自由電子報市場動態': ".//div[@class='text']",
+        '自由財經': ".//div[@class='text']",
+        '自由娛樂': ".//div[@class='text']",
+        'Bella儂儂': ".//div[@id='content_div']",
+        '康健雜誌': ".//div[@class='limitContent']",
+        '台灣蘋果日報': ".//div[@class='text--desktop text--mobile article-text-size_md tw-max_width']",
+        '台灣蘋果日報 動新聞': ".//div[@class='text--desktop text--mobile article-text-size_md tw-max_width']",
+        '台灣蘋果日報 娛樂時尚': ".//p[@class='text--desktop text--mobile article-text-size_md tw-max_width']",
+        '公視新聞': ".//article[@class='post-article']",
+        '公民新聞': ".//div[@class='field-items']",
+        'udn 房地產': ".//div[@id='story_body_content']",
+    }
+    detail_content = ""
+
+    # Skip Hong Kong editions and YouTube links; fetch only outlets with a known xpath.
+    if url.find('hk') == -1 and url.find('hongkong') == -1 and url.find('youtube') == -1:
+        if company in content_xpath:
+            driver.get(url)
+            for i in driver.find_elements_by_xpath(content_xpath[company]):
+                detail_content += i.text
+    return detail_content
+
+
+def content_download(url):
+    # Generic fallback: let newspaper3k download and parse the article.
+    article = Article(url)
+    article.download()
+    article.parse()
+
+    return article.text, article.publish_date
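+
+# Note: newspaper3k leaves publish_date as None when it cannot detect one; in
+# that case detail_crawler below ends up storing the string 'None' as the date.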
+
+
+def detail_crawler(data, detail_table, url_table, error_table, driver):
+    # Fetch the full text of every pending URL and store it in the detail table.
+    error_list = []
+    for key, group in data.iterrows():
+        url = group['url']
+        print(url)
+        company = group['company']
+        date = group['day']
+
+        try:
+            # Try the site-specific rule first, then fall back to newspaper3k.
+            detail_content = our_rule(url, company, driver)
+            if detail_content == '':
+                detail_content, date = content_download(url)
+
+            if detail_content == '':
+                logger_.warning('{} : cannot find content'.format(url))
+                error_list += [url]
+                error_table.insert({
+                    'url': url,
+                    'keyword': group['keyword'],
+                    'error_message': 'cannot find content',
+                    'crawler_date': current
+                })
+
+            detail_table.insert({
+                'url': url,
+                'keyword': group['keyword'],
+                'detail_content': detail_content,
+                'date': str(date),
+                'company': company,
+                'page': group['page'],
+                'crawler_date': current
+            })
+
+            # Mark the URL as processed.
+            url_table.upsert({'url': url, '_status': 1}, ['url'])
+            time.sleep(2)
+
+        except Exception as e:
+            error_table.insert({
+                'url': url,
+                'keyword': group['keyword'],
+                'error_message': str(e),
+                'crawler_date': current
+            })
+
+    return driver
+
+
+def get_next_job(db, table, query_key):
+    # Load today's unprocessed URLs (_status=0) for this keyword into a DataFrame.
+    result = db.query("select * from gnews.{} where _status=0 and keyword='{}' and crawler_date='{}'".format(table, query_key, current))
+    url_pd = pd.DataFrame([dict(i) for i in result])
+
+    return url_pd
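+
+# Entry point. Expects the remote Selenium node's port and the search keyword
+# as CLI arguments; the node is assumed to run in a docker container named
+# pw<port> on this host.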
+
+
+def main():
+    if len(sys.argv) < 3:
+        print('usage: python {} <selenium_port> <keyword>'.format(sys.argv[0]))
+        sys.exit(1)
+
+    port = int(sys.argv[1])
+    keyword = sys.argv[2]
+
+    # Restart the Selenium container that serves this port before crawling.
+    print('restart docker pw{}'.format(port))
+    os.system('sudo docker container restart pw' + str(port))
+    time.sleep(8)
+
+    driver = browser_start(port)
+
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gnews?charset=utf8mb4')
+    url_table_name = 'url_list2'
+    url_table = db[url_table_name]
+    detail_table = db['gnews_detail2']
+    error_table = db['error_list']
+
+    query_key = keyword
+    logger_.info('{} start...'.format(query_key))
+
+    # find new news url
+    id_cache = build_cache(db, url_table_name)
+    driver = get_trends(query_key, url_table, id_cache, driver)
+    time.sleep(5)
+
+    url_pd = get_next_job(db, url_table_name, query_key)
+    logger_.info('find {} news...'.format(len(url_pd)))
+
+    driver = detail_crawler(url_pd, detail_table, url_table, error_table, driver)
+    logger_.info('{} news description update'.format(query_key))
+
+    db.close()
+    driver.close()
+
+
+if __name__ == "__main__":
+    main()
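+
+# Tables are created/extended automatically by `dataset` on first insert.
+# Columns implied by the inserts above:
+#   url_list2:     title, url, keyword, company, day, crawler_date, page, _status
+#   gnews_detail2: url, keyword, detail_content, date, company, page, crawler_date
+#   error_list:    url, keyword, error_message, crawler_date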