gnews.py
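# Google News crawler: for each location/topic keyword it searches the Google News
# tab, restricts results to the past month, and stores new result URLs in the
# `gnews.url_list2` MySQL table (via `dataset`). Article bodies are then fetched
# either with the site-specific XPath rules in our_rule() or, as a fallback, with
# newspaper3k, and written to `gnews.gnews_detail2`; failures go to `gnews.error_list`.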

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.wait import WebDriverWait
import time, pickle, sys, os, re, requests
import dataset
import pandas as pd
from datetime import datetime, timedelta
from newspaper import Article
from utility import log
from bs4 import BeautifulSoup
# from ckiptagger import WS, POS, NER

logger_ = log.init_logging('gnews', 'gnews')
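
# Connect to a remote Selenium Chrome instance (assumed here to be a dockerized
# standalone-chrome container; see main(), where the container is named "pw<port>"
# and exposes the WebDriver endpoint on 127.0.0.1:<port>/wd/hub).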
def brower_start(port):
    options = webdriver.ChromeOptions()
    browser = webdriver.Remote(
        command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
        desired_capabilities=options.to_capabilities()
    )
    return browser
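
# Return every url already stored in the given table so get_trends() can skip
# records it has seen before.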
def build_cache(db, table):
    id_dict = []
    cursor = db.query('SELECT url FROM gnews.{};'.format(table))
    for c in cursor:
        id_dict += [c['url']]
    return id_dict
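
# Convert a relative Chinese time string from Google News (e.g. "3 小時前") into an
# absolute date formatted as YYYY年MM月DD日; anything unrecognised is returned unchanged.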
def conv_time(t):
    num = int(re.findall(r'\d+', t)[0])
    if u'秒' in t:  # seconds ago
        s = (datetime.now() - timedelta(seconds=num)
             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
    elif u'分鐘' in t:  # minutes ago
        s = (datetime.now() - timedelta(minutes=num)
             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
    elif u'小時' in t:  # hours ago
        s = (datetime.now() - timedelta(hours=num)
             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
    elif u'天' in t:  # days ago
        s = (datetime.now() - timedelta(days=num)
             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
    elif u'週' in t:  # weeks ago
        s = (datetime.now() - timedelta(days=num * 7)
             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
    else:
        s = t  # already an absolute date string
    return s
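
# Search the Google News tab for keyword q, switch the time filter to '過去 1 個月'
# ("past month"), then walk every result page and insert unseen URLs into url_table
# with _status=0 (meaning "detail not crawled yet").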
def get_trends(q, url_table, id_cache, driver):
    driver.get("https://www.google.com/search?q={}&tbm=nws".format(q))
    time.sleep(3)

    # click tools
    element = driver.find_element(By.ID, "hdtb-tls")
    driver.implicitly_wait(5)
    ActionChains(driver).move_to_element(element).click(element).perform()

    # open the time filter menu
    element = driver.find_elements(By.XPATH, "//div[@class='KTBKoe']")
    driver.implicitly_wait(5)
    ActionChains(driver).move_to_element(element[1]).click(element[1]).perform()

    # pick the time range: '過去 1 個月' ("past month")
    element = driver.find_element(By.XPATH, "//div[@id='lb']")
    ele = element.find_elements(By.XPATH, "//g-menu-item[@class='ErsxPb']")
    for e in ele:
        if e.text == '過去 1 個月':
            print(e.text)
            driver.implicitly_wait(5)
            ActionChains(driver).move_to_element(e).click(e).perform()
            break

    c = 0
    while True:
        time.sleep(3)
        c += 1
        logger_.info('page {}'.format(c))
        elmts = driver.find_elements(By.XPATH, "//g-card[@class='ftSUBd']")
        for elmt in elmts:
            # try:
            title, url, company = '', '', ''
            e = elmt.find_element(By.XPATH, ".//div[@role='heading']")
            title = e.text
            # print(title)
            url = elmt.find_element(By.XPATH, ".//a[@class='WlydOe']").get_attribute('href')
            # print(url)
            company = elmt.find_element(By.XPATH, ".//div[@class='CEMjEf NUnG9d']").text
            # print(company)
            day = elmt.find_element(By.XPATH, ".//div[@class='OSrXXb ZE0LJd']").text
            day = conv_time(day)
            # print(day)
            current = datetime.today().strftime("%Y{y}%m{m}%d{d}").format(y='年', m='月', d='日')
            print(title, url, company, day)
            if url not in id_cache:
                url_table.insert({
                    'title': title,
                    'url': url,
                    'keyword': q,
                    'company': company,
                    'day': str(day),
                    'crawler_date': current,
                    'page': c,
                    '_status': 0
                })
            # except:
            #     print('***', title, url)

        # go to the next result page; stop when there is no "next" link
        try:
            element = driver.find_element(By.XPATH, "//a[@id='pnnext']")
            driver.implicitly_wait(5)
            ActionChains(driver).move_to_element(element).click(element).perform()
        except:
            print('done')
            break

    logger_.info('{} news list update'.format(q))
    return driver
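
# Extract the article body with a hand-written XPath rule for sources we know,
# skipping Hong Kong editions and YouTube links; returns '' when no rule applies.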
def our_rule(url, company, driver):
    # sources with a site-specific extraction rule (must list every company handled below)
    url_domain_list = ['買購不動產新聞台', 'HiNet 新聞社群', '好房網News', '自由時報地產天下', '經濟日報',
                       '台灣醒報 Awakening News Network', '自由時報電子報', '自由電子報市場動態', '自由財經',
                       'Bella儂儂', '康健雜誌', '台灣蘋果日報 娛樂時尚', '台灣蘋果日報', '台灣蘋果日報 動新聞',
                       '公視新聞', '公民新聞', '自由娛樂', 'HiNet生活誌 - 中華電信', 'udn 房地產']
    detail_content = ""
    if url.find('hk') == -1 and url.find('hongkong') == -1 and url.find('youtube') == -1:
        if company in url_domain_list:
            driver.get(url)
            if company == '買購不動產新聞台':
                e = driver.find_elements(By.XPATH, ".//div[@class='content-font']")
            elif company == 'HiNet 新聞社群':
                e = driver.find_elements(By.XPATH, ".//div[@id='detail']")
            elif company == '好房網News':
                e = driver.find_elements(By.XPATH, ".//div[@itemprop='articleBody']")
            elif company == '自由時報地產天下':
                e = driver.find_elements(By.XPATH, ".//div[@data-desc='內文']")
            elif company == '經濟日報':
                e = driver.find_elements(By.XPATH, ".//div[@id='article_body']")
            elif company == '台灣醒報 Awakening News Network':
                e = driver.find_elements(By.XPATH, ".//div[@class='markdown-body']")
            elif company == '自由時報電子報' or company == '自由電子報市場動態' or company == '自由財經':
                e = driver.find_elements(By.XPATH, ".//div[@class='text']")
            elif company == 'Bella儂儂':
                e = driver.find_elements(By.XPATH, ".//div[@id='content_div']")
            elif company == '康健雜誌':
                e = driver.find_elements(By.XPATH, ".//div[@class='limitContent']")
            elif company == '台灣蘋果日報' or company == '台灣蘋果日報 動新聞':
                e = driver.find_elements(
                    By.XPATH, ".//div[@class='text--desktop text--mobile article-text-size_md tw-max_width']")
            elif company == '台灣蘋果日報 娛樂時尚':
                e = driver.find_elements(
                    By.XPATH, ".//p[@class='text--desktop text--mobile article-text-size_md tw-max_width']")
            elif company == '公視新聞':
                e = driver.find_elements(By.XPATH, ".//article[@class='post-article']")
            elif company == 'udn 房地產':
                e = driver.find_elements(By.XPATH, ".//div[@id='story_body_content']")
            elif company == '公民新聞':
                e = driver.find_elements(By.XPATH, ".//div[@class='field-items']")
            elif company == '自由娛樂':
                e = driver.find_elements(By.XPATH, ".//div[@class='text']")
            elif company == 'HiNet生活誌 - 中華電信':
                e = driver.find_elements(By.XPATH, ".//div[@id='detail']")
            for i in e:
                detail_content += i.text
    return detail_content
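
# Generic fallback: let newspaper3k download and parse the article.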
def content_download(url):
    article = Article(url)
    article.download()
    article.parse()
    return article.text, article.publish_date
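
# For every pending url: try the site-specific rule first, fall back to newspaper3k,
# record the body in detail_table, mark the url as done (_status=1), and log any
# failure into error_table.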
def detail_crawler(data, detail_table, url_table, error_table, driver):
    error_list = []
    for key, group in data.iterrows():
        url = group['url']
        print(url)
        company = group['company']
        date = group['day']
        try:
            detail_content = our_rule(url, company, driver)
            if detail_content == '':
                # no site-specific rule matched; fall back to newspaper3k
                detail_content, date = content_download(url)

            current = datetime.today().strftime("%Y{y}%m{m}%d{d}").format(y='年', m='月', d='日')
            if detail_content == '':
                logger_.warning('{} : cannot find content'.format(url))
                error_list += [url]
                error_table.insert({
                    'url': url,
                    'keyword': group['keyword'],
                    'error_message': 'cannot find content',
                    'crawler_date': current
                })

            detail_table.insert({
                'url': url,
                'keyword': group['keyword'],
                'detail_content': detail_content,
                'date': str(date),
                'company': company,
                'page': group['page'],
                'crawler_date': current
            })
            url_table.upsert({'url': url, '_status': 1}, ['url'])
            time.sleep(2)
        except Exception as e:
            current = datetime.today().strftime("%Y{y}%m{m}%d{d}").format(y='年', m='月', d='日')
            error_table.insert({
                'url': url,
                'keyword': group['keyword'],
                'error_message': str(e),
                'crawler_date': current
            })
    return driver
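
# Fetch the urls of the given keyword that have not been detail-crawled yet (_status=0).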
def get_next_job(db, table, query_key):
    result = db.query("select * from gnews.{} where _status=0 and keyword='{}'".format(table, query_key))
    url_pd = pd.DataFrame([dict(i) for i in result])
    return url_pd
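
# Entry point: restart the Selenium docker container for the port given on the
# command line, then loop over location × topic keywords, collecting result URLs
# and crawling their article bodies.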
def main():
    # location_pd = pd.read_csv('location_list.csv')
    # location_pd = location_pd[location_pd['縣市']== '台北']
    # location = location_pd['地區'].to_list()
    location_list = ['台北大安', '台北文山']

    if len(sys.argv) > 1:
        port = int(sys.argv[1])
        print('restart docker pw{}'.format(port))
        os.system('sudo docker container restart pw' + str(port))
        time.sleep(8)

    driver = brower_start(port)
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gnews?charset=utf8mb4')
    url_table_name = 'url_list2'
    url_table = db[url_table_name]
    detail_table = db['gnews_detail2']
    error_table = db['error_list']

    for keyword in location_list:
        # query_key = '{} 政治'.format(keyword)
        # topics: '', politics, pandemic, entertainment, lifestyle, finance
        for topic in ['', '政治', '疫情', '娛樂', '生活', '財經']:
            query_key = '{} {}'.format(keyword, topic)
            logger_.info('{} start...'.format(query_key))

            # find new news urls
            id_cache = build_cache(db, url_table_name)
            driver = get_trends(query_key, url_table, id_cache, driver)
            time.sleep(5)

            # crawl the article body of every not-yet-processed url
            url_pd = get_next_job(db, url_table_name, query_key)
            logger_.info('find {} news...'.format(len(url_pd)))
            driver = detail_crawler(url_pd, detail_table, url_table, error_table, driver)
            logger_.info('{} news description update'.format(query_key))

    db.close()
    driver.close()


if __name__ == "__main__":
    main()
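
# Example invocation (assumed setup, not part of the original file): a Selenium
# standalone-chrome container named pw<port>, published on the same port, e.g.
#   docker run -d --name pw4444 -p 4444:4444 selenium/standalone-chrome
#   python gnews.py 4444
# Note that webdriver.Remote(..., desired_capabilities=...) as used above requires
# a selenium client older than 4.10, where that keyword argument still exists.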