@@ -1,45 +1,35 @@
 from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.support.wait import WebDriverWait
 import time, pickle, sys, os, re, time, requests
-# import dataset
-import telegram
+import dataset
 import pandas as pd
 from datetime import datetime, timedelta
-from utility import database_access as DA
 from newspaper import Article
 from utility import log
 from bs4 import BeautifulSoup
-from ckiptagger import WS, POS, NER
-from utility.connect import *
+# from ckiptagger import WS, POS, NER
+
 logger_ = log.init_logging('gnews', 'gnews')
-bot = telegram.Bot(token='1661195930:AAG8TNHUuXoghSFf3VC-oKbo_erep7Y1io4')
-
-URL_LIST_COL = ['news_title', 'news_desc', 'news_url', 'search_keyword', 'company', 'news_day','crawler_date','_status']
-GNEWS_DETAIL_COL = ['news_url', 'news_content', 'news_day', 'crawler_date']
-
-def serive_create(profilepath):
-    option = webdriver.ChromeOptions()
-    option.add_argument('--headless')
-    option.add_argument('--no-sandbox')
-    option.add_argument('--disable-web-security')
-    option.add_argument('--allow-running-insecure-content')
-    option.add_argument('--incognito')
-    option.add_argument(
-        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0')
-    # option.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
-    option.add_argument(
-        "--user-data-dir=/home/noodlesloves/.config/google-chrome/")
-    option.add_argument("profile-directory="+profilepath)
-    # driver = webdriver.Chrome('./utility/chromedriver', options=option)
-    driver = webdriver.Chrome(executable_path='/usr/bin/chromedriver', chrome_options=option,
-                              service_args=['--verbose', '--log-path=/tmp/chromedriver.log'])
-
-    executor_url = driver.command_executor._url
-    session_id = driver.session_id
-    print(session_id)
-    print(executor_url)
-
-    return driver
+
+def brower_start(port):
+    options = webdriver.ChromeOptions()
+    browser = webdriver.Remote(
+        command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
+        desired_capabilities=options.to_capabilities()
+    )
+    return browser
+
+
+def build_cache(db, table):
+    id_dict=[]
+    cursor = db.query('SELECT url FROM gnews.{};'.format(table))
+
+    for c in cursor:
+        id_dict += [c['url']]
+    return id_dict
 
 
 def conv_time(t):
@@ -68,70 +58,79 @@ def conv_time(t):
     return s
 
 
-def get_trends(q, db, current, profilepath):
-    driver = serive_create(profilepath)
-
+def get_trends(q, url_table, id_cache, driver):
+
     driver.get("https://www.google.com/search?q={}&tbm=nws".format(q))
     time.sleep(3)
-    # soup = BeautifulSoup(driver.page_source, 'html.parser')
-    # content = soup.prettify()
-    # print(content)
+
+    # click tool
+    element = driver.find_element(By.ID, "hdtb-tls")
+    driver.implicitly_wait(5)
+    ActionChains(driver).move_to_element(element).click(element).perform()
+
+    # click time
+    element = driver.find_elements(By.XPATH, "//div[@class='KTBKoe']")
+    driver.implicitly_wait(5)
+    ActionChains(driver).move_to_element(element[1]).click(element[1]).perform()
+
+    # select the time range ('過去 1 個月' = past month)
+    element = driver.find_element(By.XPATH,"//div[@id='lb']")
+    ele = element.find_elements(By.XPATH,"//g-menu-item[@class='ErsxPb']")
+    for e in ele:
+        if e.text == '過去 1 個月':
+            print(e.text)
+            driver.implicitly_wait(5)
+            ActionChains(driver).move_to_element(e).click(e).perform()
+            break
+
     c = 0
     while True:
-        time.sleep(15)
+        time.sleep(3)
         c += 1
         logger_.info('page {}'.format(c))
-        elmts = driver.find_elements_by_xpath("//div[@class='yr3B8d KWQBje']")
+        elmts = driver.find_elements_by_xpath("//g-card[@class='ftSUBd']")
+
         for elmt in elmts:
-            try:
+            # try:
+            title, url, company = '', '', ''
             e = elmt.find_element_by_xpath(".//div[@role='heading']")
             title = e.text
-                # check whether it is traditional
-                title.encode('big5')
-                print(e.text)
-                e2 = elmt.find_element_by_xpath(".//div[@class='Y3v8qd']")
-                # print(e2.text)
-                desc = e2.text
-                e3 = elmt.find_element_by_xpath("..")
-                print(e3.get_attribute('href'))
-                url = e3.get_attribute('href')
-                e4 = elmt.find_element_by_xpath(
-                    ".//div[@class='XTjFC WF4CUc']")
-                # print(e4.text)
-                company = e4.text
-                print(company)
-                e5 = elmt.find_element_by_xpath(".//span[@class='WG9SHc']")
-                # print(e5.text)
-                day = e5.text
+            # print(title)
+
+            url = elmt.find_element_by_xpath(".//a[@class='WlydOe']").get_attribute('href')
+            # print(url)
+
+            company = elmt.find_element_by_xpath(".//div[@class='CEMjEf NUnG9d']").text
+            # print(company)
+
+            day = elmt.find_element_by_xpath(".//div[@class='OSrXXb ZE0LJd']").text
             day = conv_time(day)
+            # print(day)
 
-                tmp = [title, desc, url, q, company, str(day), current, 0]
-                insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
-                    .format('url_list', str(tuple(URL_LIST_COL)).replace('\'',''), tuple(tmp))
+            current = datetime.today().strftime("%Y{y}%m{m}%d{d}").format(y='年', m='月', d='日')
+            print(title, url, company, day)
 
-                DA.mysql_insert_data(db, insert_sql)
-                # if DA.check_unique(table, url):
-                #     table.insert({'title': title,
-                #                   'desc': desc,
-                #                   'url': url,
-                #                   'keyword': q,
-                #                   'company': company,
-                #                   'day': day,
-                #                   'crawler_date': current,
-                #                   '_status': 0})
-            except:
-                print(title, url)
+            if url not in id_cache:
+                url_table.insert({
+                    'title': title,
+                    'url': url,
+                    'keyword': q,
+                    'company': company,
+                    'day': str(day),
+                    'crawler_date': current,
+                    'page': c,
+                    '_status': 0
+                })
+            # except:
+            #     print('***',title, url)
         try:
-            elmt = driver.find_element_by_xpath("//a[@id='pnnext']")
+            element = driver.find_element_by_xpath("//a[@id='pnnext']")
+            driver.implicitly_wait(5)
+            ActionChains(driver).move_to_element(element).click(element).perform()
         except:
             print('done')
             break
-        webdriver.ActionChains(driver).move_to_element(elmt).perform()
-        webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
-
-
-    print('done...')
-    driver.close()
+    logger_.info('{} news list update'.format(q))
 
 
 def our_rule(url, company, driver):
@@ -198,69 +197,14 @@ def content_download(url):
     return article.text, article.publish_date
 
 
-def SendCheck(detail_content, url, title, ws, pos, ner):
-    pattern = r'[陳|林|黃|張|李|王|吳|劉|蔡|楊|許|鄭|謝|郭|洪|曾|邱|廖|賴|周|徐|蘇|葉|莊|呂|江|何|蕭|羅|高|簡|朱|鍾|施|游|詹|沈|彭|胡|余|盧|潘|顏|梁|趙|柯|翁|魏|方|孫|張簡|戴|范|歐陽|宋|鄧|杜|侯|曹|薛|傅|丁|溫|紀|范姜|蔣|歐|藍|連|唐|馬|董|石|卓|程|姚|康|馮|古|姜|湯|汪|白|田|涂|鄒|巫|尤|鐘|龔|嚴|韓|黎|阮|袁|童|陸|金|錢|邵][\u4E00-\u9fa5]{2}[^攝]'
-    pattern = re.compile(pattern)
-
-    push = 0
-    content_ = re.sub(r'[^\w\s]', '', detail_content)
-    # print(content_)
-
-    for i in pattern.findall(content_):
-        index_ = content_.find(i)
-        pre = content_[index_-10:index_]
-        after = content_[index_:index_+10]
-        ws_results = ws([pre + after])
-
-        skip = 0
-        for word in ['記者', '報導', '攝影', '攝', '新聞']:
-            if word in ws_results[0]:
-                skip = 1
-        if skip:
-            continue
-
-        pos_results = pos(ws_results)
-        ner_results = ner(ws_results, pos_results)
-        c = 0
-        for i in ner_results[0]:
-            if 'PERSON' in list(i):
-                # print(ner_results)
-                push = 1
-                c = 1
-            if c == 1:
-                break
-
-        if (content_[index_-3:index_]) == '設計師' or (content_[index_-3:index_]) == '發明者' or (content_[index_-3:index_]) == '建築師' or (content_[index_-3:index_]) == '總經理':
-            # print(content_[index_-3:index_] + '-' + content_[index_:3])
-            push = 1
-            break
-
-        elif (content_[index_-2:index_]) == '會長' or (content_[index_-2:index_]) == '副總':
-            # print(content_[index_-2:index_] + '-' + content_[index_:3])
-            push = 1
-            break
-
-        elif (content_[index_-4:index_]) == '專案經理':
-            # print(content_[index_-4:index_] + '-' + content_[index_:3])
-            push = 1
-            break
-
-    if push == 1:
-        # pass
-        # bot.sendMessage(chat_id=1605426233, text=url)
-        params = {"message": '[ '+ title + ' ] ' + url}
-        r = requests.post("https://notify-api.line.me/api/notify",headers=LINE_HEADERS, params=params)
-
-
-def detail_content(data, current, profilepath, db, ws, pos, ner):
-    driver = serive_create(profilepath)
+def detail_crawler(data, detail_table, url_table, error_table, driver):
     error_list = []
     for key, group in data.iterrows():
-        url = group['news_url']
-        company = group['company']
+        url = group['url']
         print(url)
-        print(group['news_title'])
-        date = None
+        company = group['company']
+        date = group['day']
+
         try:
             detail_content = our_rule(url, company, driver)
             if detail_content == '':
@@ -269,62 +213,83 @@ def detail_content(data, current, profilepath, db, ws, pos, ner):
             if detail_content == '':
                 logger_.warning('{} : cannot find content'.format(url))
                 error_list += [url]
+                error_table.insert({
+                    'url':url,
+                    'keyword': group['keyword'],
+                    'error_message': 'cannot find content',
+                    'crawler_date': current
+                })
+
+            current = datetime.today().strftime("%Y{y}%m{m}%d{d}").format(y='年', m='月', d='日')
+            detail_table.insert({
+                'url': url,
+                'keyword': group['keyword'],
+                'detail_content': detail_content,
+                'date': str(date),
+                'company': company,
+                'page': group['page'],
+                'crawler_date': current
+            })
+
+            url_table.upsert({'url':url,'_status':1},['url'])
+            time.sleep(2)
 
-            tmp = [url, detail_content, str(date), current]
-            insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
-                .format('gnews_detail', str(tuple(GNEWS_DETAIL_COL)).replace('\'',''), tuple(tmp))
-            DA.mysql_insert_data(db, insert_sql)
-
-            update_sql = "UPDATE url_list SET _status = 1 WHERE news_url = '{}'".format(url)
-            DA.mysql_insert_data(db, update_sql)
-            # tmp = dict(url=group['url'],
-            #            detail_content=detail_content,
-            #            day2=date,
-            #            crawler_date=current,
-            #            _status=1)
-            # table.update(tmp, ['url'])
-
-            SendCheck(detail_content, url, group['news_title'], ws, pos, ner)
-            time.sleep(15)
         except Exception as e:
-            print(url)
-            logger_.error('{} / {} : cannot find content'.format(url, str(e)))
-            update_sql = "UPDATE url_list SET _status = -1 WHERE news_url = '{}'".format(url)
-            DA.mysql_insert_data(db, update_sql)
-            error_list += [url]
+            current = datetime.today().strftime("%Y{y}%m{m}%d{d}").format(y='年', m='月', d='日')
+            error_table.insert({
+                'url':url,
+                'keyword': group['keyword'],
+                'error_message': str(e),
+                'crawler_date': current
+            })
+
     driver.close()
 
 
+def get_next_job(db, table, query_key):
+    result = db.query("select * from gnews.{} where _status=0 and keyword='{}'".format(table, query_key))
+    url_pd = pd.DataFrame([dict(i) for i in result])
+
+    return url_pd
+
+
 def main():
-    query_key = u'建材'
-    profilepath = 'Profile 1'
-    logger_.info('start...')
-    current = datetime.today().strftime(
-        "%Y{y}%m{m}%d{d}").format(y='年', m='月', d='日')
-
-    db = DA.mysql_connect(MYSQL_CONFIG, 'gnews')
-    # db = DA.DBconnect()
-    # table = DA.Tabelconnect(db, 'gnews3')
-    get_trends(query_key, db, current, profilepath)
-
-    logger_.info('{} news list update'.format(query_key))
-    time.sleep(120)
-
-    ws = WS("./data")
-    pos = POS("./data")
-    ner = NER("./data")
-    # pos,ner='',''
-    # table = DA.Tabelconnect(db, 'gnews3')
-    # data = pd.DataFrame([dict(i) for i in table.find(_status=0)])
-    query_sql = "SELECT * FROM url_list WHERE _status=0"
-    results = DA.mysql_select_data(db, query_sql)
-    results = pd.DataFrame(results, columns=['id'] + URL_LIST_COL)
-
-    logger_.info('find {} news...'.format(len(results)))
-    detail_content(results, current, profilepath, db, ws, pos, ner)
-
-    logger_.info('{} news description update'.format(query_key))
+    location_pd = pd.read_csv('location_list.csv')
+    location_pd = location_pd[location_pd['縣市']== '台北']
+    location = location_pd['地區'].to_list()
+
+    if len(sys.argv) > 1 :
+        port=int(sys.argv[1])
+        print('restart docker pw{}'.format(port))
+        os.system('sudo docker container restart pw'+str(port))
+        time.sleep(8)
+
+    driver = brower_start(port)
+
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gnews?charset=utf8mb4')
+    url_table_name = 'url_list2'
+    url_table = db[url_table_name]
+    detail_table = db['gnews_detail2']
+    error_table = db['error_list']
+
+    for keyword in location:
+        if keyword == '文山' or keyword == '大安' or keyword == '中正': continue
+        query_key = '{} 政治'.format(keyword)
+        logger_.info('{} start...'.format(query_key))
+
+        # find new news url
+        id_cache = build_cache(db, url_table_name)
+        get_trends(query_key, url_table, id_cache, driver)
+        time.sleep(5)
+
+        url_pd = get_next_job(db, url_table_name, query_key)
+        logger_.info('find {} news...'.format(len(url_pd)))
+
+        detail_crawler(url_pd, detail_table, url_table, error_table, driver)
+        logger_.info('{} news description update'.format(query_key))
+
     db.close()
+    driver.close()
 
 
 if __name__ == "__main__":
     main()