# Google News crawler: collects search results with Selenium, stores them in
# MySQL, extracts article bodies, and pushes person-related articles to LINE Notify.
from selenium import webdriver
import time
import pickle
import sys
import os
import re
import requests
# import dataset
import telegram
import pandas as pd
from datetime import datetime, timedelta
from utility import database_access as DA
from newspaper import Article
from utility import log
from bs4 import BeautifulSoup
from ckiptagger import WS, POS, NER
from utility.connect import *

logger_ = log.init_logging('gnews', 'gnews')

bot = telegram.Bot(token='1661195930:AAG8TNHUuXoghSFf3VC-oKbo_erep7Y1io4')

URL_LIST_COL = ['news_title', 'news_desc', 'news_url', 'search_keyword',
                'company', 'news_day', 'crawler_date', '_status']
GNEWS_DETAIL_COL = ['news_url', 'news_content', 'news_day', 'crawler_date']


def serive_create(profilepath):
    """Create a headless Chrome driver bound to the given Chrome profile."""
    option = webdriver.ChromeOptions()
    option.add_argument('--headless')
    option.add_argument('--no-sandbox')
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    option.add_argument('--incognito')
    option.add_argument(
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0')
    # option.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
    option.add_argument("--user-data-dir=/home/noodlesloves/.config/google-chrome/")
    option.add_argument("profile-directory=" + profilepath)
    # driver = webdriver.Chrome('./utility/chromedriver', options=option)
    driver = webdriver.Chrome(executable_path='/usr/bin/chromedriver',
                              chrome_options=option,
                              service_args=['--verbose', '--log-path=/tmp/chromedriver.log'])
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    return driver


def conv_time(t):
    """Convert a relative time string from Google News (e.g. '3 小時前')
    into an absolute date formatted as %Y年%m月%d日."""
    num = int(re.findall(r'\d+', t)[0])
    if u'秒' in t:
        s = (datetime.now() - timedelta(seconds=num)
             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
    elif u'分鐘' in t:
        s = (datetime.now() - timedelta(minutes=num)
             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
    elif u'小時' in t:
        s = (datetime.now() - timedelta(hours=num)
             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
    elif u'天' in t:
        s = (datetime.now() - timedelta(days=num)
             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
    elif u'週' in t:
        s = (datetime.now() - timedelta(days=num * 7)
             ).strftime('%Y{y}%m{m}%d{d}').format(y='年', m='月', d='日')
    else:
        s = t
    return s
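# Illustrative conv_time behaviour (a sketch; actual output depends on the date
# the crawler runs, the example dates below are hypothetical):
#   conv_time('5 秒前')        -> today's date, e.g. '2021年01月15日'
#   conv_time('3 小時前')      -> today's date in most cases (3 hours earlier)
#   conv_time('2 天前')        -> the date two days ago
#   conv_time('1 週前')        -> the date seven days ago
#   conv_time('2020年12月1日') -> returned unchanged (no relative-time keyword)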
def get_trends(q, db, current, profilepath):
    """Crawl Google News search results for keyword q and insert them into url_list."""
    driver = serive_create(profilepath)
    driver.get("https://www.google.com/search?q={}&tbm=nws".format(q))
    time.sleep(3)
    # soup = BeautifulSoup(driver.page_source, 'html.parser')
    # content = soup.prettify()
    # print(content)
    c = 0
    while True:
        time.sleep(15)
        c += 1
        logger_.info('page {}'.format(c))
        elmts = driver.find_elements_by_xpath("//div[@class='yr3B8d KWQBje']")
        for elmt in elmts:
            try:
                e = elmt.find_element_by_xpath(".//div[@role='heading']")
                title = e.text
                # Raises UnicodeEncodeError for non-Big5 text, so non-traditional
                # Chinese titles are skipped by the except below.
                title.encode('big5')
                print(e.text)
                e2 = elmt.find_element_by_xpath(".//div[@class='Y3v8qd']")
                # print(e2.text)
                desc = e2.text
                e3 = elmt.find_element_by_xpath("..")
                print(e3.get_attribute('href'))
                url = e3.get_attribute('href')
                e4 = elmt.find_element_by_xpath(".//div[@class='XTjFC WF4CUc']")
                # print(e4.text)
                company = e4.text
                print(company)
                e5 = elmt.find_element_by_xpath(".//span[@class='WG9SHc']")
                # print(e5.text)
                day = e5.text
                day = conv_time(day)
                tmp = [title, desc, url, q, company, str(day), current, 0]
                insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
                    .format('url_list', str(tuple(URL_LIST_COL)).replace('\'', ''), tuple(tmp))
                DA.mysql_insert_data(db, insert_sql)
                # if DA.check_unique(table, url):
                #     table.insert({'title': title,
                #                   'desc': desc,
                #                   'url': url,
                #                   'keyword': q,
                #                   'company': company,
                #                   'day': day,
                #                   'crawler_date': current,
                #                   '_status': 0})
            except Exception:
                print(title, url)
        try:
            elmt = driver.find_element_by_xpath("//a[@id='pnnext']")
        except Exception:
            print('done')
            break
        webdriver.ActionChains(driver).move_to_element(elmt).perform()
        webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
    print('done...')
    driver.close()


def our_rule(url, company, driver):
    """Site-specific extraction rules: return the article body for known publishers."""
    url_domain_list = ['買購不動產新聞台', 'HiNet 新聞社群', '好房網News', '自由時報地產天下',
                       '經濟日報', '台灣醒報 Awakening News Network', '自由時報電子報',
                       '自由電子報市場動態', '自由財經', 'Bella儂儂', '康健雜誌',
                       '台灣蘋果日報 娛樂時尚', '台灣蘋果日報', '台灣蘋果日報 動新聞',
                       '公視新聞', '公民新聞', '自由娛樂', 'HiNet生活誌 - 中華電信']
    detail_content = ""
    if url.find('hk') == -1 and url.find('hongkong') == -1 and url.find('youtube') == -1:
        if company in url_domain_list:
            driver.get(url)
            if company == '買購不動產新聞台':
                e = driver.find_elements_by_xpath(".//div[@class='content-font']")
            elif company == 'HiNet 新聞社群':
                e = driver.find_elements_by_xpath(".//div[@id='detail']")
            elif company == '好房網News':
                e = driver.find_elements_by_xpath(".//div[@itemprop='articleBody']")
            elif company == '自由時報地產天下':
                e = driver.find_elements_by_xpath(".//div[@data-desc='內文']")
            elif company == '經濟日報':
                e = driver.find_elements_by_xpath(".//div[@id='article_body']")
            elif company == '台灣醒報 Awakening News Network':
                e = driver.find_elements_by_xpath(".//div[@class='markdown-body']")
            elif company in ('自由時報電子報', '自由電子報市場動態', '自由財經'):
                e = driver.find_elements_by_xpath(".//div[@class='text']")
            elif company == 'Bella儂儂':
                e = driver.find_elements_by_xpath(".//div[@id='content_div']")
            elif company == '康健雜誌':
                e = driver.find_elements_by_xpath(".//div[@class='limitContent']")
            elif company in ('台灣蘋果日報', '台灣蘋果日報 動新聞'):
                e = driver.find_elements_by_xpath(
                    ".//div[@class='text--desktop text--mobile article-text-size_md tw-max_width']")
            elif company == '台灣蘋果日報 娛樂時尚':
                e = driver.find_elements_by_xpath(
                    ".//p[@class='text--desktop text--mobile article-text-size_md tw-max_width']")
            elif company == '公視新聞':
                e = driver.find_elements_by_xpath(".//article[@class='post-article']")
            elif company == 'udn 房地產':
                # Note: 'udn 房地產' is not in url_domain_list, so this branch is
                # currently unreachable.
                e = driver.find_elements_by_xpath(".//div[@id='story_body_content']")
            elif company == '公民新聞':
                e = driver.find_elements_by_xpath(".//div[@class='field-items']")
            elif company == '自由娛樂':
                e = driver.find_elements_by_xpath(".//div[@class='text']")
            elif company == 'HiNet生活誌 - 中華電信':
                e = driver.find_elements_by_xpath(".//div[@id='detail']")
            for i in e:
                detail_content += i.text
    return detail_content


def content_download(url):
    """Fallback extraction with newspaper's Article when no site-specific rule matches."""
    article = Article(url)
    article.download()
    article.parse()
    return article.text, article.publish_date


def SendCheck(detail_content, url, title, ws, pos, ner):
    """Heuristically detect person names in the article and push a LINE notification."""
    # Candidate pattern: a common Chinese surname followed by two CJK characters,
    # not followed by '攝' (photo credits). Compound surnames such as 張簡, 歐陽
    # and 范姜 only contribute their individual characters inside a character class.
    pattern = r'[陳林黃張李王吳劉蔡楊許鄭謝郭洪曾邱廖賴周徐蘇葉莊呂江何蕭羅高簡朱鍾施游詹沈彭胡余盧潘顏梁趙柯翁魏方孫張簡戴范歐陽宋鄧杜侯曹薛傅丁溫紀范姜蔣歐藍連唐馬董石卓程姚康馮古姜湯汪白田涂鄒巫尤鐘龔嚴韓黎阮袁童陸金錢邵][\u4E00-\u9fa5]{2}[^攝]'
    pattern = re.compile(pattern)
    push = 0
    content_ = re.sub(r'[^\w\s]', '', detail_content)
    # print(content_)
    for i in pattern.findall(content_):
        index_ = content_.find(i)
        pre = content_[index_ - 10:index_]
        after = content_[index_:index_ + 10]
        ws_results = ws([pre + after])
        skip = 0
        for word in ['記者', '報導', '攝影', '攝', '新聞']:
            if word in ws_results[0]:
                skip = 1
        if skip:
            continue
        pos_results = pos(ws_results)
        ner_results = ner(ws_results, pos_results)
        c = 0
        for entity in ner_results[0]:
            if 'PERSON' in list(entity):
                # print(ner_results)
                push = 1
                c = 1
        if c == 1:
            break
        if content_[index_ - 3:index_] in ('設計師', '發明者', '建築師', '總經理'):
            # print(content_[index_-3:index_] + '-' + content_[index_:3])
            push = 1
            break
        elif content_[index_ - 2:index_] in ('會長', '副總'):
            # print(content_[index_-2:index_] + '-' + content_[index_:3])
            push = 1
            break
        elif content_[index_ - 4:index_] == '專案經理':
            # print(content_[index_-4:index_] + '-' + content_[index_:3])
            push = 1
            break
    if push == 1:
        # pass
        # bot.sendMessage(chat_id=1605426233, text=url)
        params = {"message": '[ ' + title + ' ] ' + url}
        requests.post("https://notify-api.line.me/api/notify",
                      headers=LINE_HEADERS, params=params)
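# Sketch of the MySQL tables this crawler writes to, inferred from URL_LIST_COL,
# GNEWS_DETAIL_COL and the SQL built in this file; the column types and the UNIQUE
# constraint are assumptions, not taken from an actual schema dump:
#
#   CREATE TABLE url_list (
#       id INT AUTO_INCREMENT PRIMARY KEY,
#       news_title TEXT, news_desc TEXT, news_url VARCHAR(768) UNIQUE,
#       search_keyword VARCHAR(64), company VARCHAR(128),
#       news_day VARCHAR(32), crawler_date VARCHAR(32),
#       _status INT   -- 0: pending, 1: detail fetched, -1: fetch failed
#   );
#
#   CREATE TABLE gnews_detail (
#       news_url VARCHAR(768), news_content TEXT,
#       news_day VARCHAR(32), crawler_date VARCHAR(32)
#   );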
def detail_content(data, current, profilepath, db, ws, pos, ner):
    """Fetch the article body for each pending url_list row, store it in
    gnews_detail, update _status and run the person-name check."""
    driver = serive_create(profilepath)
    error_list = []
    for key, group in data.iterrows():
        url = group['news_url']
        company = group['company']
        print(url)
        print(group['news_title'])
        date = None
        try:
            content = our_rule(url, company, driver)
            if content == '':
                content, date = content_download(url)
            if content == '':
                logger_.warning('{} : cannot find content'.format(url))
                error_list += [url]
            tmp = [url, content, str(date), current]
            insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
                .format('gnews_detail', str(tuple(GNEWS_DETAIL_COL)).replace('\'', ''), tuple(tmp))
            DA.mysql_insert_data(db, insert_sql)
            update_sql = "UPDATE url_list SET _status = 1 WHERE news_url = '{}'".format(url)
            DA.mysql_insert_data(db, update_sql)
            # tmp = dict(url=group['url'],
            #            detail_content=content,
            #            day2=date,
            #            crawler_date=current,
            #            _status=1)
            # table.update(tmp, ['url'])
            SendCheck(content, url, group['news_title'], ws, pos, ner)
            time.sleep(15)
        except Exception as e:
            print(url)
            logger_.error('{} / {} : cannot find content'.format(url, str(e)))
            update_sql = "UPDATE url_list SET _status = -1 WHERE news_url = '{}'".format(url)
            DA.mysql_insert_data(db, update_sql)
            error_list += [url]
    driver.close()


def main():
    query_key = u'建材'
    profilepath = 'Profile 1'
    logger_.info('start...')
    current = datetime.today().strftime("%Y{y}%m{m}%d{d}").format(y='年', m='月', d='日')
    db = DA.mysql_connect(MYSQL_CONFIG, 'gnews')
    # db = DA.DBconnect()
    # table = DA.Tabelconnect(db, 'gnews3')
    get_trends(query_key, db, current, profilepath)
    logger_.info('{} news list update'.format(query_key))
    time.sleep(120)

    ws = WS("./data")
    pos = POS("./data")
    ner = NER("./data")
    # pos, ner = '', ''

    # table = DA.Tabelconnect(db, 'gnews3')
    # data = pd.DataFrame([dict(i) for i in table.find(_status=0)])
    query_sql = "SELECT * FROM url_list WHERE _status=0"
    results = DA.mysql_select_data(db, query_sql)
    results = pd.DataFrame(results, columns=['id'] + URL_LIST_COL)
    logger_.info('find {} news...'.format(len(results)))
    detail_content(results, current, profilepath, db, ws, pos, ner)
    logger_.info('{} news description update'.format(query_key))
    db.close()


if __name__ == "__main__":
    main()
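# Runtime assumptions, inferred from the code above (adjust to the actual deployment):
#   - chromedriver at /usr/bin/chromedriver and a Chrome user profile named
#     'Profile 1' under /home/noodlesloves/.config/google-chrome/
#   - CKIP tagger model files downloaded into ./data (used by WS/POS/NER)
#   - utility.connect providing MYSQL_CONFIG and LINE_HEADERS (assumed to carry
#     the LINE Notify 'Authorization: Bearer <token>' header)
#   - a 'gnews' MySQL database containing the url_list and gnews_detail tables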