from datetime import date, datetime
import pymysql
pymysql.install_as_MySQLdb()  # let `dataset` use pymysql as its MySQLdb driver
import dataset
import pandas as pd
import os
import random
import re
import json
import requests
from summarizer import Summarizer
from transformers import AutoConfig, AutoTokenizer, AutoModel
from summarizer.sentence_handler import SentenceHandler
from spacy.lang.zh import Chinese
from random import sample
import csv
from gn3 import GoogleNews3

googlenews = GoogleNews3(lang="zh-tw", period="7d",
                         encode="utf-8", region="tw")

# from fastapi import FastAPI
# app = FastAPI()
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # silence the HuggingFace tokenizers fork warning
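# --- Load the scraped weather-news table from MySQL ---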
db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/hhh?charset=utf8mb4')
result = db.query('SELECT * FROM gnews.weather_detail2')
# next(iter(result)) peeks the first row to obtain the column names (that row is consumed by the peek)
data = pd.DataFrame(result, columns=next(iter(result)).keys())
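# --- Chinese extractive summarizer: bert-extractive-summarizer on top of bert-base-chinese ---
# output_hidden_states=True exposes the hidden layers the Summarizer pools sentence embeddings from.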
custom_config = AutoConfig.from_pretrained('bert-base-chinese')
custom_config.output_hidden_states = True
custom_tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
custom_model = AutoModel.from_pretrained('bert-base-chinese', config=custom_config)
summarizer = Summarizer(custom_model=custom_model,
                        custom_tokenizer=custom_tokenizer,
                        sentence_handler=SentenceHandler(language=Chinese))

df_test = pd.read_csv('./news_text.txt')
df = pd.read_csv('./image.csv')
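# sentence2img: choose an illustration for a sentence by matching the comma-separated tags
# in image.csv against the sentence text; falls back to a random image when nothing matches.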
def sentence2img(news_text: str):
    # collect every tag that appears in image.csv
    tags = []
    for i in range(len(df['tags'])):
        tags.extend(df['tags'][i].split(','))
    # gather the image URLs whose tags occur in the sentence
    images = []
    for tag in set(tags):
        if tag in news_text:
            images.extend(list(df[df['tags'].str.contains(tag, regex=False) == True]['img_url']))
    if images:
        return images[random.randint(0, len(images) - 1)]
    else:
        return list(df['img_url'])[random.randint(0, len(list(df['img_url'])) - 1)]
def remove_redundancy(text, words):
    # cut the article at the first occurrence of any trailing-boilerplate marker
    for w in words:
        if w in text:
            text = text.split(w)[0]
    # drop line breaks and normalise full-width parentheses to half-width
    text = text.replace('\n', '').replace('\r', '').replace('（', '(').replace('）', ')')
    return text
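# news_clean: strip reporter bylines, photo credits, "read more" footers and other
# boilerplate that news sites embed in the article body before summarization.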
def news_clean(text):

    # remove ideographic and zero-width spaces left over from scraping
    text = text.replace('\u3000', '')
    text = text.replace('\u200b', '')

    text = remove_redundancy(text, ['原始連結', '更多內容', '看更多', '參考資料', '本文獲', 'Source', '註', '點擊瀏覽',
                                    '【延伸閱讀】', '更多三立', '延伸閱讀', '新聞來源', '更多 TVBS 報導', '【更多中天快點TV報導】',
                                    '今日各地紫外線預報概況。 【', '國際都市氣象', '亞洲、大洋洲美洲歐洲、非洲中國各地體感溫度預報'])

    # drop bracketed asides, bylines, photo credits and other site-specific boilerplate
    text = re.sub(r"[\(\[].*?[\)\]]", "", text)
    text = re.sub(r'〔.*?/.*?〕', '', text)
    # earthquake bulletins: insert a marker; everything after it is cut at the end of this function
    text = re.sub(r'【 第.*?號.*?地震報告 】', '後面都不要了', text)
    text = re.sub(r'\w\w\w接受《.*?》訪問指出,', '', text)
    text = re.sub(r'\w\w\w「.*?」.*?,', '', text)
    text = re.sub(r'對此,.*?指出,', '', text)
    text = re.sub(r'臉書.*?|.*?表示,', '', text)
    text = re.sub(r'氣象粉專.*?提醒,', '', text)
    text = re.sub(r'請繼續往下閱讀...', '', text)
    text = re.sub(r'\w\w\w說,', '', text)
    text = re.sub(r'\w\w\w也提醒,', '', text)
    text = re.sub(r'\w\w\w分析,', '', text)
    text = re.sub(r'圖:.*?/攝', '', text)
    text = re.sub(r'圖:.*?/提供', '', text)
    text = re.sub(r'圖/.*?中心', '', text)
    text = re.sub(r'三立氣象主播\w\w\w', '', text)
    text = re.sub(r'記者.*?/攝影', '', text)
    text = re.sub(r'三立.*?/.*?報導', '', text)
    text = re.sub(r'生活中心/.*?報導', '', text)
    text = re.sub(r'\w\w\w / .*?報導', '', text)
    text = re.sub(r'記者.*?/.*?報導', '', text)
    text = re.sub(r'(.*?提供)', '', text)
    text = re.sub(r'(圖/.*?)', '', text)
    text = re.sub(r'\w在《.*?》.*?撰文,', '', text)
    text = re.sub(r'氣象專家.*?認為,', '', text)
    text = re.sub(r'\w\w\w表示,', '', text)
    text = re.sub(r'※點圖放大', '', text)
    text = re.sub(r'綜合報導 / \w\w\w', '', text)
    text = re.sub(r'【.*?-.*?】', '', text)
    text = re.sub(r'不用抽.*?活動辦法', '', text)
    text = re.sub(r'◤.*?◢?.*?酒店', '', text)
    text = re.sub(r'\d+/\d+/\d+ \d+:\d+', '', text)
    text = re.sub(r'▼(\w*\w*。)', '', text)
    text = re.sub(r'▲(\w*\w*。)', '', text).replace('▲', '').replace('▼', '')
    text = text.replace('\xa0', '')
    # discard everything after the earthquake-bulletin marker
    text = remove_redundancy(text, ['後面都不要了'])
    return text
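# gen_daily_news: for a given date, clean and (if long) pre-summarize each article, keep the ones
# between 180 and 800 characters, then repeatedly sample up to four of them and re-summarize the
# concatenation into a short script; finally pick one illustration per sentence of the script.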
def gen_daily_news(data, date, summarizer):
    text_content_list = []
    image_urls_list = []

    news_date = data.groupby("news_date")
    print("-" * 100)
    print(date)

    news_content_list = []
    news_imgs_list = []

    for idx, news in enumerate(news_date.get_group(date).iterrows()):

        # clean news content
        news_content = news_clean(news[1]['news_content'])

        # compress content
        if len(news_content) > 1000:
            news_content = summarizer(news_content, ratio=0.8)
        if len(news_content) > 180 and len(news_content) < 800:
            news_content_list.append(news_content)
            news_imgs_list.append(news[1]['news_imgs'])
    # build the final script: resample articles until the summary is long enough
    error_count = 0
    zero_count = 0
    news = ''
    while True:
        try:
            text = ''.join(sample(news_content_list, min(4, len(news_content_list))))
            text = text.replace('.', '·')
            # print(text)
            ratio = 400 / len(text)
            # print(ratio)
            news = news_clean(summarizer(text, ratio=ratio))
            print('字數:', len(news))  # character count of the generated script
            if zero_count > 5:
                print('So sad! Something wrong!')
                print('-' * 80)
                break
            if len(news) >= 120:
                news_split = [n.strip() + '。' for n in news.split('。')]

                image_urls_list = sample(image_urls_list, min(len(image_urls_list), len(news_split[:-1])))
                text_content_list = news_split[:-1]
                break
            # summary empty or too short: resample and try again
            print('Regenerating...')
            zero_count = zero_count + 1
        except Exception:
            print('Regenerating...')
            error_count = error_count + 1
            if error_count > 5:
                print('So sad! Something wrong!')
                print('-' * 80)
                break
    # map every sentence of the final script to an illustration
    image_urls_list = []
    for t in text_content_list:
        # print(t)
        image_urls_list.append(sentence2img(t))
    return text_content_list, image_urls_list
# name_date = '2021年11月01日'
# t, i = gen_daily_news(data, name_date, summarizer)
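# --- Alternative input: fetch recent headlines for a user-supplied keyword (關鍵字) via GoogleNews ---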
kw = input("關鍵字:")
googlenews.clear()
googlenews.get_news(kw)
trdresult = googlenews.result()
- """ for i in range(25,26):
- if i <10:
- name_date = '2021年11月0' + str(i) + '日'
- else:
- name_date = '2021年11月' + str(i) + '日' """
# t, i = gen_daily_news(data, '2021年11月25日', summarizer)
t, i = [trdresult[0]['title']], [trdresult[0]['img']]
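# --- Submit the script and image list to the make_anchor_video API ---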
url = 'https://www.choozmo.com:8887/make_anchor_video'
headers = {'Authorization': "Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ5dWt5byIsImlhdCI6MTYzNzkwNjc0NCwibmJmIjoxNjM3OTA2NzQ0LCJqdGkiOiIwZmM2MjQ5MS1kNzFmLTRjNzktYTk1MC1iZmVjOGYyNGI1ODUiLCJleHAiOjE2MzgzMzg3NDQsInR5cGUiOiJhY2Nlc3MiLCJmcmVzaCI6ZmFsc2UsImNzcmYiOiIxYmEwYjU4MC03OTU1LTQ4MmMtYjNiYi0yZjhmYmMyZjg5YjYifQ.MUT8tXk2v_M2nI5fxJewSh2kIi2-uh-vWAj9Y3N2niE",
           'Content-Type': 'application/json'}
payload = {
    "name": "Demo_1_" + kw + '_' + datetime.now().strftime('%Y%m%d%H%M%S'),
    "text_content": t,
    "image_urls": i,
    "avatar": "7",
    "client_id": "-1",
    "multiLang": 0
}
# print(t)
# print(i)
r = requests.request("POST", url, data=json.dumps(payload), headers=headers)
print(r.text)
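# A minimal follow-up check (sketch only; the endpoint's response schema is not documented here,
# so just the HTTP status is inspected):
# if not r.ok:
#     raise RuntimeError(f'make_anchor_video failed: {r.status_code} {r.text}')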
# @app.get('/')
# async def genlist(date: date, flag: int):
#     return {
#         "text": [
#             "中央氣象局指出,今天天氣與昨天類似,降雨範圍以基隆北海岸與東半部為主,颱風消息方面,第20號輕位於台灣東方約1910公里的海面上,未來朝日本東方海面前進,對台灣天氣無直接影響,但基隆北海岸、東半部及恆春半島沿海易有長浪發生。",
#             "氣象局預估,週五至週六清晨東北季風影響,北部、東北部早晚較涼,降雨方面在迎風面的基隆北海岸、東半部及大台北地區有局部短暫雨,其他地區為多雲到晴,午後南部地區有局部短暫陣雨。",
#             "天氣風險公司總經理彭啟未來7-10天都維持類似的天氣,下一波變化有些預測落在11月中上旬,約11/10-11/12,北方冷高壓有機會再度南下,是否如某些預測大降溫,但這變數仍很多,保持關注就可以了。",
#             "氣象局說明,今東北季風稍微增強,然而水氣沒有顯著增加,因此天氣表現與昨日類似,只是早晚稍微又涼些,清晨中部以北及東北部約20、21度,花東及南部地區約23度,降雨範圍依舊以迎風面的基隆北海岸及東半部的局部短暫雨為主,且大台北地區雨較零星,其他地方則為多雲到晴,午後南部地區有局部短暫陣雨的機率,但範圍跟程度都影響不大。"
#         ],
#         "imgurl": [
#             "https://cdn2.ettoday.net/images/5930/d5930739.jpg",
#             "https://cdn2.ettoday.net/images/5917/d5917494.jpg",
#             "https://cdn2.ettoday.net/images/5915/d5915515.jpg",
#             "https://cdn2.ettoday.net/images/5931/d5931569.jpg"
#         ]
#     }