# gnews_old.py
# Crawl Google News for a keyword, store the results in MySQL, and push LINE notifications
# when an article appears to mention a person of interest.

from selenium import webdriver
import time, pickle, sys, os, re, requests
# import dataset
import telegram
import pandas as pd
from datetime import datetime, timedelta
from utility import database_access as DA
from newspaper import Article
from utility import log
from bs4 import BeautifulSoup
from ckiptagger import WS, POS, NER
from utility.connect import *

logger_ = log.init_logging('gnews', 'gnews')
bot = telegram.Bot(token='1661195930:AAG8TNHUuXoghSFf3VC-oKbo_erep7Y1io4')

# Column order must match the url_list / gnews_detail table schemas used by the INSERT statements below.
URL_LIST_COL = ['news_title', 'news_desc', 'news_url', 'search_keyword',
                'company', 'news_day', 'crawler_date', '_status']
GNEWS_DETAIL_COL = ['news_url', 'news_content', 'news_day', 'crawler_date']


def serive_create(profilepath):
    """Create a headless Chrome driver bound to the given Chrome profile directory."""
    option = webdriver.ChromeOptions()
    option.add_argument('--headless')
    option.add_argument('--no-sandbox')
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    option.add_argument('--incognito')
    option.add_argument(
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0')
    # option.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
    option.add_argument(
        "--user-data-dir=/home/noodlesloves/.config/google-chrome/")
    option.add_argument("profile-directory=" + profilepath)
    # driver = webdriver.Chrome('./utility/chromedriver', options=option)
    driver = webdriver.Chrome(executable_path='/usr/bin/chromedriver', chrome_options=option,
                              service_args=['--verbose', '--log-path=/tmp/chromedriver.log'])
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    return driver
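
# The session id / executor URL printed above allow re-attaching to the same browser from
# another process. A rough sketch only (relies on Selenium internals and may differ by version):
#   attached = webdriver.Remote(command_executor=executor_url, desired_capabilities={})
#   attached.session_id = session_id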


def conv_time(t):
    """Convert a relative Google News timestamp (e.g. '3 小時前') into an absolute date string."""
    num = int(re.findall(r'\d+', t)[0])
    if u'秒' in t:
        s = (datetime.now() - timedelta(seconds=num)
             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
    elif u'分鐘' in t:
        s = (datetime.now() - timedelta(minutes=num)
             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
    elif u'小時' in t:
        s = (datetime.now() - timedelta(hours=num)
             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
    elif u'天' in t:
        s = (datetime.now() - timedelta(days=num)
             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
    elif u'週' in t:
        s = (datetime.now() - timedelta(days=num * 7)
             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
    else:
        s = t
    return s
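
# Illustrative behaviour of conv_time (the exact output depends on the current date):
#   conv_time('3 小時前')       -> e.g. '2021年01月15日' (now minus 3 hours, date part only)
#   conv_time('2 週前')         -> the date 14 days ago, same format
#   conv_time('2020年12月1日')  -> returned unchanged (no relative-time unit found)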


def get_trends(q, db, current, profilepath):
    """Search Google News for `q` and insert every result into the url_list table."""
    driver = serive_create(profilepath)
    driver.get("https://www.google.com/search?q={}&tbm=nws".format(q))
    time.sleep(3)
    # soup = BeautifulSoup(driver.page_source, 'html.parser')
    # content = soup.prettify()
    # print(content)
    c = 0
    while True:
        time.sleep(15)
        c += 1
        logger_.info('page {}'.format(c))
        elmts = driver.find_elements_by_xpath("//div[@class='yr3B8d KWQBje']")
        for elmt in elmts:
            try:
                e = elmt.find_element_by_xpath(".//div[@role='heading']")
                title = e.text
                # skip titles that cannot be encoded as Big5, i.e. not traditional Chinese
                title.encode('big5')
                print(e.text)
                e2 = elmt.find_element_by_xpath(".//div[@class='Y3v8qd']")
                # print(e2.text)
                desc = e2.text
                e3 = elmt.find_element_by_xpath("..")
                print(e3.get_attribute('href'))
                url = e3.get_attribute('href')
                e4 = elmt.find_element_by_xpath(
                    ".//div[@class='XTjFC WF4CUc']")
                # print(e4.text)
                company = e4.text
                print(company)
                e5 = elmt.find_element_by_xpath(".//span[@class='WG9SHc']")
                # print(e5.text)
                day = e5.text
                day = conv_time(day)
                tmp = [title, desc, url, q, company, str(day), current, 0]
                insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
                    .format('url_list', str(tuple(URL_LIST_COL)).replace('\'', ''), tuple(tmp))
                DA.mysql_insert_data(db, insert_sql)
                # if DA.check_unique(table, url):
                #     table.insert({'title': title, 'desc': desc, 'url': url,
                #                   'keyword': q, 'company': company, 'day': day,
                #                   'crawler_date': current, '_status': 0})
            except Exception as e:
                print('failed to parse a result element:', e)
        try:
            elmt = driver.find_element_by_xpath("//a[@id='pnnext']")
        except Exception:
            print('done')
            break
        webdriver.ActionChains(driver).move_to_element(elmt).perform()
        webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
    print('done...')
    driver.close()
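
# The INSERT built above expands to something like (illustrative values only):
#   INSERT IGNORE INTO url_list (news_title, news_desc, news_url, search_keyword,
#                                company, news_day, crawler_date, _status)
#   VALUES ('...', '...', 'https://...', '建材', '...', '2021年01月15日', '2021年01月15日', 0)
# INSERT IGNORE skips rows that violate a unique key, presumably one on news_url.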


def our_rule(url, company, driver):
    """Extract article text with a site-specific XPath rule; returns '' when no rule applies."""
    url_domain_list = ['買購不動產新聞台', 'HiNet 新聞社群', '好房網News', '自由時報地產天下', '經濟日報',
                       '台灣醒報 Awakening News Network', '自由時報電子報', '自由電子報市場動態', '自由財經',
                       'Bella儂儂', '康健雜誌', '台灣蘋果日報 娛樂時尚', '台灣蘋果日報', '台灣蘋果日報 動新聞',
                       '公視新聞', '公民新聞', '自由娛樂', 'HiNet生活誌 - 中華電信']
    detail_content = ""
    # skip Hong Kong editions and YouTube links
    if url.find('hk') == -1 and url.find('hongkong') == -1 and url.find('youtube') == -1:
        if company in url_domain_list:
            driver.get(url)
            if company == '買購不動產新聞台':
                e = driver.find_elements_by_xpath(
                    ".//div[@class='content-font']")
            elif company == 'HiNet 新聞社群':
                e = driver.find_elements_by_xpath(".//div[@id='detail']")
            elif company == '好房網News':
                e = driver.find_elements_by_xpath(
                    ".//div[@itemprop='articleBody']")
            elif company == '自由時報地產天下':
                e = driver.find_elements_by_xpath(".//div[@data-desc='內文']")
            elif company == '經濟日報':
                e = driver.find_elements_by_xpath(".//div[@id='article_body']")
            elif company == '台灣醒報 Awakening News Network':
                e = driver.find_elements_by_xpath(
                    ".//div[@class='markdown-body']")
            elif company in ('自由時報電子報', '自由電子報市場動態', '自由財經'):
                e = driver.find_elements_by_xpath(".//div[@class='text']")
            elif company == 'Bella儂儂':
                e = driver.find_elements_by_xpath(".//div[@id='content_div']")
            elif company == '康健雜誌':
                e = driver.find_elements_by_xpath(
                    ".//div[@class='limitContent']")
            elif company in ('台灣蘋果日報', '台灣蘋果日報 動新聞'):
                e = driver.find_elements_by_xpath(
                    ".//div[@class='text--desktop text--mobile article-text-size_md tw-max_width']")
            elif company == '台灣蘋果日報 娛樂時尚':
                e = driver.find_elements_by_xpath(
                    ".//p[@class='text--desktop text--mobile article-text-size_md tw-max_width']")
            elif company == '公視新聞':
                e = driver.find_elements_by_xpath(
                    ".//article[@class='post-article']")
            elif company == 'udn 房地產':
                e = driver.find_elements_by_xpath(
                    ".//div[@id='story_body_content']")
            elif company == '公民新聞':
                e = driver.find_elements_by_xpath(
                    ".//div[@class='field-items']")
            elif company == '自由娛樂':
                e = driver.find_elements_by_xpath(".//div[@class='text']")
            elif company == 'HiNet生活誌 - 中華電信':
                e = driver.find_elements_by_xpath(".//div[@id='detail']")
            for i in e:
                detail_content += i.text
    return detail_content


def content_download(url):
    """Generic fallback: download and parse the article with newspaper3k."""
    article = Article(url)
    article.download()
    article.parse()
    return article.text, article.publish_date
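
# Typical extraction flow (sketch; url and company are placeholders):
#   text = our_rule(url, company, driver)              # site-specific XPath rule first
#   if text == '':
#       text, publish_date = content_download(url)     # generic newspaper3k fallback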


def SendCheck(detail_content, url, title, ws, pos, ner):
    """Push a LINE Notify message when the article text appears to name a person."""
    # A common Taiwanese surname followed by two Chinese characters, excluding photo credits ('攝').
    pattern = r'(?:陳|林|黃|張|李|王|吳|劉|蔡|楊|許|鄭|謝|郭|洪|曾|邱|廖|賴|周|徐|蘇|葉|莊|呂|江|何|蕭|羅|高|簡|朱|鍾|施|游|詹|沈|彭|胡|余|盧|潘|顏|梁|趙|柯|翁|魏|方|孫|張簡|戴|范|歐陽|宋|鄧|杜|侯|曹|薛|傅|丁|溫|紀|范姜|蔣|歐|藍|連|唐|馬|董|石|卓|程|姚|康|馮|古|姜|湯|汪|白|田|涂|鄒|巫|尤|鐘|龔|嚴|韓|黎|阮|袁|童|陸|金|錢|邵)[\u4E00-\u9fa5]{2}[^攝]'
    pattern = re.compile(pattern)
    push = 0
    content_ = re.sub(r'[^\w\s]', '', detail_content)
    # print(content_)
    for i in pattern.findall(content_):
        index_ = content_.find(i)
        pre = content_[index_ - 10:index_]
        after = content_[index_:index_ + 10]
        ws_results = ws([pre + after])
        skip = 0
        # ignore matches that are only bylines or photo credits
        for word in ['記者', '報導', '攝影', '攝', '新聞']:
            if word in ws_results[0]:
                skip = 1
        if skip:
            continue
        pos_results = pos(ws_results)
        ner_results = ner(ws_results, pos_results)
        c = 0
        for entity in ner_results[0]:
            if 'PERSON' in list(entity):
                # print(ner_results)
                push = 1
                c = 1
            if c == 1:
                break
        # also push when the match is preceded by a job title
        if content_[index_ - 3:index_] in ('設計師', '發明者', '建築師', '總經理'):
            push = 1
            break
        elif content_[index_ - 2:index_] in ('會長', '副總'):
            push = 1
            break
        elif content_[index_ - 4:index_] == '專案經理':
            push = 1
            break
    if push == 1:
        # bot.sendMessage(chat_id=1605426233, text=url)
        params = {"message": '[ ' + title + ' ] ' + url}
        r = requests.post("https://notify-api.line.me/api/notify",
                          headers=LINE_HEADERS, params=params)
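
# LINE_HEADERS is expected to come from utility.connect (star-imported above); LINE Notify
# authenticates with an 'Authorization: Bearer <access token>' header, so it is presumably
# something like {'Authorization': 'Bearer ' + <notify token>}.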


def detail_content(data, current, profilepath, db, ws, pos, ner):
    """Fetch the full text for every pending url_list row, store it, and run SendCheck."""
    driver = serive_create(profilepath)
    error_list = []
    for key, group in data.iterrows():
        url = group['news_url']
        company = group['company']
        print(url)
        print(group['news_title'])
        date = None
        try:
            detail_content = our_rule(url, company, driver)
            if detail_content == '':
                detail_content, date = content_download(url)
            if detail_content == '':
                logger_.warning('{} : cannot find content'.format(url))
                error_list += [url]
            tmp = [url, detail_content, str(date), current]
            insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
                .format('gnews_detail', str(tuple(GNEWS_DETAIL_COL)).replace('\'', ''), tuple(tmp))
            DA.mysql_insert_data(db, insert_sql)
            update_sql = "UPDATE url_list SET _status = 1 WHERE news_url = '{}'".format(url)
            DA.mysql_insert_data(db, update_sql)
            # tmp = dict(url=group['url'],
            #            detail_content=detail_content,
            #            day2=date,
            #            crawler_date=current,
            #            _status=1)
            # table.update(tmp, ['url'])
            SendCheck(detail_content, url, group['news_title'], ws, pos, ner)
            time.sleep(15)
        except Exception as e:
            print(url)
            logger_.error('{} / {} : cannot find content'.format(url, str(e)))
            update_sql = "UPDATE url_list SET _status = -1 WHERE news_url = '{}'".format(url)
            DA.mysql_insert_data(db, update_sql)
            error_list += [url]
    driver.close()


def main():
    query_key = u'建材'
    profilepath = 'Profile 1'
    logger_.info('start...')
    current = datetime.today().strftime(
        "%Y{y}%m{m}%d{d}").format(y='年', m='月', d='日')
    db = DA.mysql_connect(MYSQL_CONFIG, 'gnews')
    # db = DA.DBconnect()
    # table = DA.Tabelconnect(db, 'gnews3')
    get_trends(query_key, db, current, profilepath)
    logger_.info('{} news list update'.format(query_key))
    time.sleep(120)

    # load the CKIP tagger models (word segmentation, part-of-speech, named entities)
    ws = WS("./data")
    pos = POS("./data")
    ner = NER("./data")
    # pos, ner = '', ''
    # table = DA.Tabelconnect(db, 'gnews3')
    # data = pd.DataFrame([dict(i) for i in table.find(_status=0)])
    query_sql = "SELECT * FROM url_list WHERE _status=0"
    results = DA.mysql_select_data(db, query_sql)
    results = pd.DataFrame(results, columns=['id'] + URL_LIST_COL)
    logger_.info('find {} news...'.format(len(results)))
    detail_content(results, current, profilepath, db, ws, pos, ner)
    logger_.info('{} news description update'.format(query_key))
    db.close()


if __name__ == "__main__":
    main()