# Google News keyword crawler.
# Flow: restart the Selenium container, collect search-result URLs for a keyword
# (get_trends), download each article body (our_rule, with content_download as a
# fallback), and store everything in the MySQL `gnews` database through `dataset`.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
import time, pickle, sys, os, re, requests
import dataset
import traceback
import pandas as pd
from datetime import datetime, timedelta
from newspaper import Article
from utility import log
from bs4 import BeautifulSoup
# from ckiptagger import WS, POS, NER

# remote : http://172.17.0.2:4444
logger_ = log.init_logging('gnews', 'gnews')

# Today's date rendered as YYYY年MM月DD日; stamped onto every record as crawler_date.
current = datetime.today().strftime("%Y{y}%m{m}%d{d}").format(y='年', m='月', d='日')

def brower_start(port):
    options = webdriver.ChromeOptions()
    browser = webdriver.Remote(
        command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
        desired_capabilities=options.to_capabilities()
    )
    return browser

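# brower_start() attaches to a Selenium server already listening on 127.0.0.1:<port>
# (main() restarts a docker container named p<port> before connecting). For local
# debugging without that container, a plain local driver would look like the sketch
# below (an assumption: chromedriver on PATH; this helper is not used by the script).
def local_browser():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    return webdriver.Chrome(options=options)
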
def build_cache(db, table):
    # Return every URL already stored in the table so previously seen articles are skipped.
    id_dict = []
    cursor = db.query('SELECT url FROM gnews.{};'.format(table))
    for c in cursor:
        id_dict += [c['url']]
    return id_dict

def conv_time(t):
    # Convert a relative timestamp from Google News (e.g. "3 小時前") into an absolute
    # date string in the same 年/月/日 format used elsewhere in this script.
    n = int(re.findall(r'\d+', t)[0])
    if u'秒' in t:
        s = (datetime.now() - timedelta(seconds=n)
             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
    elif u'分鐘' in t:
        s = (datetime.now() - timedelta(minutes=n)
             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
    elif u'小時' in t:
        s = (datetime.now() - timedelta(hours=n)
             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
    elif u'天' in t:
        s = (datetime.now() - timedelta(days=n)
             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
    elif u'週' in t:
        s = (datetime.now() - timedelta(days=n * 7)
             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
    else:
        s = t
    return s

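# Illustrative behaviour of conv_time (actual dates depend on when the crawl runs):
#   conv_time('5 分鐘前')      -> today's date, e.g. '2021年07月15日'
#   conv_time('3 天前')        -> the date three days ago, in the same 年/月/日 format
#   conv_time('2020年7月1日')  -> returned unchanged (no relative-time unit found)
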
def page_down_(driver, time_):
    # Press PAGE_DOWN `time_` times to trigger lazily loaded results.
    for i in range(time_):
        try:
            actions = ActionChains(driver)
            actions.send_keys(Keys.PAGE_DOWN).perform()
        except Exception:
            traceback.print_exc()
        time.sleep(0.5)

def our_rule(url, company, driver):
    # Site-specific extraction: for publishers we know, pull the article body with a
    # hand-written XPath; anything else falls through and returns an empty string.
    url_domain_list = ['買購不動產新聞台', 'HiNet 新聞社群', '好房網News', '自由時報地產天下', '經濟日報',
                       '台灣醒報 Awakening News Network', '自由時報電子報', '自由電子報市場動態', '自由財經',
                       'Bella儂儂', '康健雜誌', '台灣蘋果日報 娛樂時尚', '台灣蘋果日報', '台灣蘋果日報 動新聞',
                       '公視新聞', 'udn 房地產', '公民新聞', '自由娛樂', 'HiNet生活誌 - 中華電信']
    detail_content = ""
    # Skip Hong Kong editions and YouTube links.
    if url.find('hk') == -1 and url.find('hongkong') == -1 and url.find('youtube') == -1:
        if company in url_domain_list:
            driver.get(url)
            if company == '買購不動產新聞台':
                e = driver.find_elements_by_xpath(".//div[@class='content-font']")
            elif company == 'HiNet 新聞社群':
                e = driver.find_elements_by_xpath(".//div[@id='detail']")
            elif company == '好房網News':
                e = driver.find_elements_by_xpath(".//div[@itemprop='articleBody']")
            elif company == '自由時報地產天下':
                e = driver.find_elements_by_xpath(".//div[@data-desc='內文']")
            elif company == '經濟日報':
                e = driver.find_elements_by_xpath(".//div[@id='article_body']")
            elif company == '台灣醒報 Awakening News Network':
                e = driver.find_elements_by_xpath(".//div[@class='markdown-body']")
            elif company in ('自由時報電子報', '自由電子報市場動態', '自由財經'):
                e = driver.find_elements_by_xpath(".//div[@class='text']")
            elif company == 'Bella儂儂':
                e = driver.find_elements_by_xpath(".//div[@id='content_div']")
            elif company == '康健雜誌':
                e = driver.find_elements_by_xpath(".//div[@class='limitContent']")
            elif company in ('台灣蘋果日報', '台灣蘋果日報 動新聞'):
                e = driver.find_elements_by_xpath(
                    ".//div[@class='text--desktop text--mobile article-text-size_md tw-max_width']")
            elif company == '台灣蘋果日報 娛樂時尚':
                e = driver.find_elements_by_xpath(
                    ".//p[@class='text--desktop text--mobile article-text-size_md tw-max_width']")
            elif company == '公視新聞':
                e = driver.find_elements_by_xpath(".//article[@class='post-article']")
            elif company == 'udn 房地產':
                e = driver.find_elements_by_xpath(".//div[@id='story_body_content']")
            elif company == '公民新聞':
                e = driver.find_elements_by_xpath(".//div[@class='field-items']")
            elif company == '自由娛樂':
                e = driver.find_elements_by_xpath(".//div[@class='text']")
            elif company == 'HiNet生活誌 - 中華電信':
                e = driver.find_elements_by_xpath(".//div[@id='detail']")
            for i in e:
                detail_content += i.text
    return detail_content

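# The lookup above could equally be driven by a table of publisher -> XPath, e.g.
# (sketch only; the entries below are copied from the elif chain, remaining ones omitted):
#
#     XPATH_BY_COMPANY = {
#         '買購不動產新聞台': ".//div[@class='content-font']",
#         'HiNet 新聞社群': ".//div[@id='detail']",
#         '經濟日報': ".//div[@id='article_body']",
#     }
#     e = driver.find_elements_by_xpath(XPATH_BY_COMPANY[company])
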
def content_download(url):
    article = Article(url)
    article.download()
    article.parse()
    return article.text, article.publish_date

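# content_download() is the generic fallback when our_rule() has no site-specific
# selector: newspaper's Article extractor returns the main text and, when it can
# detect one, the publish date. If extraction quality is poor for Chinese pages,
# newspaper also accepts a language hint, e.g. Article(url, language='zh')
# (an option, not something this script currently sets).
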
def detail_crawler(data, detail_table, url_table, error_table, driver):
    # For every pending URL: try the site-specific rule, fall back to newspaper,
    # record failures in error_list, and mark the URL row as processed (_status=1).
    error_list = []
    for key, group in data.iterrows():
        url = group['url']
        print(url)
        company = group['company']
        date = group['day']
        try:
            detail_content = our_rule(url, company, driver)
            if detail_content == '':
                detail_content, date = content_download(url)
            if detail_content == '':
                logger_.warning('{} : cannot find content'.format(url))
                error_list += [url]
                error_table.insert({
                    'url': url,
                    'keyword': group['keyword'],
                    'error_message': 'cannot find content',
                    'crawler_date': current
                })
            detail_table.insert({
                'url': url,
                'keyword': group['keyword'],
                'detail_content': detail_content,
                'date': str(date),
                'company': company,
                'page': group['page'],
                'crawler_date': current
            })
            url_table.upsert({'url': url, '_status': 1}, ['url'])
            time.sleep(2)
        except Exception as e:
            error_table.insert({
                'url': url,
                'keyword': group['keyword'],
                'error_message': str(e),
                'crawler_date': current
            })
    return driver

def get_next_job(db, table, query_key):
    result = db.query("select * from gnews.{} where _status=0 and keyword='{}' and crawler_date='{}'".format(table, query_key, current))
    url_pd = pd.DataFrame([dict(i) for i in result])
    return url_pd

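# get_next_job() returns one row per URL still marked _status=0 for this keyword and
# crawl date; the columns mirror what get_trends() inserted (url, title, keyword,
# company, day, page, crawler_date, _status).
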
def get_trends(q, url_table, id_cache, driver, url, c):
    # Scrape one page of Google News search results for keyword `q`, insert any URLs
    # not already in the cache, and return the link to the next results page.
    print(url)
    driver.get(url)
    time.sleep(3)
    print(driver.current_url)
    # print(driver.page_source)
    elmts = driver.find_elements_by_xpath("//div[@class='xuvV6b BGxR7d']")
    print(elmts)
    for elmt in elmts:
        title, url, company = '', '', ''
        e = elmt.find_element_by_xpath(".//div[@role='heading']")
        title = e.text
        url = elmt.find_element_by_xpath(".//a[@class='WlydOe']").get_attribute('href')
        company = elmt.find_element_by_xpath(".//div[@class='CEMjEf NUnG9d']").text
        day = elmt.find_element_by_xpath(".//div[@class='OSrXXb ZE0LJd']").text
        day = conv_time(day)
        print(title, url, company, day)

        if url not in id_cache:
            url_table.insert({
                'title': title,
                'url': url,
                'keyword': q,
                'company': company,
                'day': str(day),
                'crawler_date': current,
                'page': c,
                '_status': 0
            })
    # Follow Google's own "next page" link rather than rewriting the start= parameter.
    next_url = driver.find_element_by_xpath("//a[@id='pnnext']").get_attribute('href')
    logger_.info('{} news list update'.format(q))
    return driver, next_url

def main():
    if len(sys.argv) > 1:
        port = int(sys.argv[1])
        print('restart docker p{}'.format(port))
        # Each Selenium container is named p<port>; restart it for a clean session.
        os.system('sudo docker container restart p' + str(port))
        time.sleep(8)
        keyword = sys.argv[2]

    print(port)
    driver = brower_start(port)
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gnews?charset=utf8mb4')
    url_table_name = 'url_list2'
    url_table = db[url_table_name]
    detail_table = db['gnews_detail2']
    error_table = db['error_list']

    query_key = keyword
    logger_.info('{} start...'.format(query_key))

    # find new news urls for the keyword
    id_cache = build_cache(db, url_table_name)
    url = "https://www.google.com/search?q={}&tbm=nws".format(query_key)
    # url = "https://www.google.com"
    for i in range(3):
        logger_.info('page {}'.format(i + 1))
        driver, url = get_trends(query_key, url_table, id_cache, driver, url, i)
        time.sleep(5)

    # crawl the article bodies for every URL that is still pending
    url_pd = get_next_job(db, url_table_name, query_key)
    logger_.info('find {} news...'.format(len(url_pd)))

    driver = detail_crawler(url_pd, detail_table, url_table, error_table, driver)
    logger_.info('{} news description update'.format(query_key))
    db.close()
    driver.close()


if __name__ == "__main__":
    main()
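
# Example invocation (assumed script name; the container name pattern p<port> and the
# Selenium endpoint on 127.0.0.1:<port> come from main() and brower_start() above):
#   sudo docker run -d --name p4444 -p 4444:4444 selenium/standalone-chrome
#   python gnews.py 4444 <keyword>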