"""Generate a short Chinese weather-news digest and submit it to the
make_anchor_video rendering service.

Pipeline:
  1. Load weather articles from MySQL and an image/tag catalogue from CSV.
  2. Clean the article text and compress it with a BERT extractive summarizer.
  3. Pick one tag-matched image per output sentence.
  4. POST the content to the video-rendering API.

The current entry point (``main``) bypasses steps 1-3 and uses the top
Google News hit for a user-supplied keyword instead; ``gen_daily_news``
is kept for the DB-driven flow.
"""
from datetime import date, datetime
import csv  # kept: may be used by other chunks of this project
import json
import os
import random
import re
from random import sample

import pymysql

# Must run before `dataset` opens any MySQL connection.
pymysql.install_as_MySQLdb()

import dataset
import pandas as pd
import requests
from spacy.lang.zh import Chinese
from summarizer import Summarizer
from summarizer.sentence_handler import SentenceHandler
from transformers import AutoConfig, AutoTokenizer, AutoModel

from gn3 import GoogleNews3

googlenews = GoogleNews3(lang="zh-tw", period="7d", encode="utf-8", region="tw")

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# SECURITY(review): database credentials are hard-coded in source; move them
# to environment variables or a config file outside version control.
db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/hhh?charset=utf8mb4')
result = db.query('SELECT * FROM gnews.weather_detail2')
# BUG FIX: the query result is a one-shot iterator; the original
# `pd.DataFrame(result, columns=next(iter(result)).keys())` consumed and
# silently dropped the first row. Materialize once instead — the row dicts
# already carry the column names.
data = pd.DataFrame(list(result))

# Extractive summarizer backed by Chinese BERT; hidden states are required
# by the Summarizer's sentence-embedding step.
custom_config = AutoConfig.from_pretrained('bert-base-chinese')
custom_config.output_hidden_states = True
custom_tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
custom_model = AutoModel.from_pretrained('bert-base-chinese', config=custom_config)
summarizer = Summarizer(
    custom_model=custom_model,
    custom_tokenizer=custom_tokenizer,
    sentence_handler=SentenceHandler(language=Chinese),
)

df_test = pd.read_csv('./news_text.txt')  # NOTE(review): loaded but unused here
# Image catalogue: one row per image with a comma-separated `tags` column
# and an `img_url` column (presumably — verify against image.csv).
df = pd.read_csv('./image.csv')


def sentence2img(news_text: str) -> str:
    """Return an image URL whose tags appear in *news_text*.

    Scans the catalogue's comma-separated ``tags`` column; every image whose
    tag occurs in the sentence is a candidate. Falls back to a random image
    from the whole catalogue when nothing matches.
    """
    # Flatten the tag column into one pool of unique tags.
    tags = []
    for cell in df['tags']:
        tags.extend(cell.split(','))

    candidates = []
    for tag in set(tags):
        if tag in news_text:
            # na=False mirrors the original `== True` (NaN rows excluded).
            mask = df['tags'].str.contains(tag, regex=False, na=False)
            candidates.extend(df[mask]['img_url'].tolist())

    if candidates:
        return random.choice(candidates)
    # No tag matched: pick any image at random.
    return random.choice(list(df['img_url']))


def remove_redundancy(text, words):
    """Truncate *text* at the first occurrence of any marker in *words*.

    Also strips newlines and normalizes half-width parentheses to the
    full-width forms used in Chinese copy.
    """
    for marker in words:
        if marker in text:
            text = text.split(marker)[0]
    return (text.replace('\n', '')
                .replace('\r', '')
                .replace('(', '(')
                .replace(')', ')'))


def news_clean(text):
    """Strip boilerplate, bylines, photo credits and ads from article text."""
    # BUG FIX: the original removed the literal substrings 'u3000'/'u200b'
    # (backslashes lost somewhere) — the Unicode whitespace characters were
    # clearly intended. Remove both forms for safety.
    for junk in ('\u3000', 'u3000', '\u200b', 'u200b'):
        text = text.replace(junk, '')
    # Truncate at "read more" / source-attribution markers.
    text = remove_redundancy(text, ['原始連結','更多內容','看更多','參考資料','本文獲','Source','註','點擊瀏覽', '【延伸閱讀】','更多三立','延伸閱讀','新聞來源','更多 TVBS 報導','【更多中天快點TV報導】', '今日各地紫外線預報概況。 【','國際都市氣象','亞洲、大洋洲美洲歐洲、非洲中國各地體感溫度預報'])
    # Parenthesized/bracketed asides and dateline markers.
    text = re.sub(r"[\(\[].*?[\)\]]", "", text)
    text = re.sub(r'〔.*?/.*?〕', '', text)
    # Earthquake bulletins: replace with a sentinel, truncated at the end.
    text = re.sub(r'【 第.*?號.*?地震報告 】', '後面都不要了', text)
    # Reporter/expert attribution phrases.
    text = re.sub(r'\w\w\w接受《.*?》訪問指出,', '', text)
    text = re.sub(r'\w\w\w「.*?」.*?,', '', text)
    text = re.sub(r'對此,.*?指出,', '', text)
    text = re.sub(r'臉書.*?|.*?表示,', '', text)
    text = re.sub(r'氣象粉專.*?提醒,', '', text)
    text = re.sub(r'請繼續往下閱讀...', '', text)
    text = re.sub(r'\w\w\w說,', '', text)
    text = re.sub(r'\w\w\w也提醒,', '', text)
    text = re.sub(r'\w\w\w分析,', '', text)
    # Photo credits and bylines.
    text = re.sub(r'圖:.*?/攝', '', text)
    text = re.sub(r'圖:.*?/提供', '', text)
    text = re.sub(r'圖/.*?中心', '', text)
    text = re.sub(r'三立氣象主播\w\w\w', '', text)
    text = re.sub(r'記者.*?/攝影', '', text)
    text = re.sub(r'三立.*?/.*?報導', '', text)
    text = re.sub(r'生活中心/.*?報導', '', text)
    text = re.sub(r'\w\w\w / .*?報導', '', text)
    text = re.sub(r'記者.*?/.*?報導', '', text)
    text = re.sub(r'(.*?提供)', '', text)
    text = re.sub(r'(圖/.*?)', '', text)
    text = re.sub(r'\w在《.*?》.*?撰文,', '', text)
    text = re.sub(r'氣象專家.*?認為,', '', text)
    text = re.sub(r'\w\w\w表示,', '', text)
    # UI hints, ads, timestamps, image-caption markers.
    text = re.sub(r'※點圖放大', '', text)
    text = re.sub(r'綜合報導 / \w\w\w', '', text)
    text = re.sub(r'【.*?-.*?】', '', text)
    text = re.sub(r'不用抽.*?活動辦法', '', text)
    text = re.sub(r'◤.*?◢?.*?酒店', '', text)
    text = re.sub(r'\d+/\d+/\d+ \d+:\d+', '', text)
    text = re.sub(r'▼(\w*\w*。)', '', text)
    text = re.sub(r'▲(\w*\w*。)', '', text).replace('▲', '').replace('▼', '')
    # Same backslash-loss fix as above for the non-breaking space.
    text = text.replace('\xa0', '').replace('xa0', '')
    # Drop everything after the earthquake-bulletin sentinel.
    text = remove_redundancy(text, ['後面都不要了'])
    return text


def gen_daily_news(data, date, summarizer):
    """Build a roughly 120-400 character digest for one day of news.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``news_date``, ``news_content`` and ``news_imgs``
        columns (per the weather_detail2 table read above).
    date : str
        Group key for ``news_date`` (e.g. '2021年11月25日').
        NOTE: shadows ``datetime.date`` — kept for caller compatibility.
    summarizer : Summarizer
        Callable extractive summarizer, ``summarizer(text, ratio=...)``.

    Returns ``(text_content_list, image_urls_list)`` — one sentence and one
    matching image URL per entry; both empty when generation fails.
    """
    print('-' * 100)
    print(date)

    grouped = data.groupby("news_date")
    news_content_list = []
    for _, row in grouped.get_group(date).iterrows():
        content = news_clean(row['news_content'])
        # Compress very long articles before applying the length filter.
        if len(content) > 1000:
            content = summarizer(content, ratio=0.8)
        if 180 < len(content) < 800:
            news_content_list.append(content)

    text_content_list = []
    if not news_content_list:
        # Nothing usable for this day — the retry loop below could only
        # fail repeatedly (division by zero on empty text).
        print('So sad! Something wrong!')
        print('-' * 80)
        return [], []

    error_count = 0
    short_count = 0
    while True:
        try:
            # Summarize a random bundle of up to four articles down to
            # roughly 400 characters.
            text = ''.join(sample(news_content_list, min(4, len(news_content_list))))
            # '.' confuses the sentence splitter; swap for a middle dot.
            text = text.replace('.', '·')
            ratio = 400 / len(text)
            news = news_clean(summarizer(text, ratio=ratio))
            print('字數:', len(news))
            # BUG FIX: the original broke out of the loop on ANY non-empty
            # summary, which made its `len(news) < 120` retry branch
            # unreachable; enforce the intended minimum length here.
            if len(news) >= 120:
                # Split into sentences; the final split fragment is the
                # (empty) remainder after the last '。', so drop it.
                sentences = [s.strip() + '。' for s in news.split('。')]
                text_content_list = sentences[:-1]
                break
            print('Regenerating...')
            short_count += 1
            if short_count > 5:
                print('So sad! Something wrong!')
                print('-' * 80)
                break
        except Exception:
            # BUG FIX: was a bare `except:` (also caught KeyboardInterrupt).
            print('Regenerating...')
            error_count += 1
            if error_count > 5:
                print('So sad! Something wrong!')
                print('-' * 80)
                break

    # One tag-matched image per output sentence. (The original also sampled
    # an always-empty image_urls_list inside the loop — dead code, removed.)
    image_urls_list = [sentence2img(t) for t in text_content_list]
    return text_content_list, image_urls_list


def main():
    """Fetch the top Google News hit for a keyword and submit a render job."""
    kw = input("關鍵字:")
    googlenews.clear()
    googlenews.get_news(kw)
    trdresult = googlenews.result()

    # Alternative DB-driven flow:
    # t, i = gen_daily_news(data, '2021年11月25日', summarizer)
    t, i = [trdresult[0]['title']], [trdresult[0]['img']]

    url = 'https://www.choozmo.com:8887/make_anchor_video'
    # SECURITY(review): bearer token hard-coded in source (and it expires);
    # load it from the environment or a secrets store instead.
    headers = {
        'Authorization': "Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ5dWt5byIsImlhdCI6MTYzNzkwNjc0NCwibmJmIjoxNjM3OTA2NzQ0LCJqdGkiOiIwZmM2MjQ5MS1kNzFmLTRjNzktYTk1MC1iZmVjOGYyNGI1ODUiLCJleHAiOjE2MzgzMzg3NDQsInR5cGUiOiJhY2Nlc3MiLCJmcmVzaCI6ZmFsc2UsImNzcmYiOiIxYmEwYjU4MC03OTU1LTQ4MmMtYjNiYi0yZjhmYmMyZjg5YjYifQ.MUT8tXk2v_M2nI5fxJewSh2kIi2-uh-vWAj9Y3N2niE",
        'Content-Type': 'application/json',
    }
    payload = {
        "name": "Demo_1_" + kw + '_' + datetime.now().strftime('%Y%m%d%H%M%S'),
        "text_content": t,
        "image_urls": i,
        "avatar": "7",
        "client_id": "-1",
        "multiLang": 0,
    }
    r = requests.post(url, data=json.dumps(payload), headers=headers)
    print(r.text)


if __name__ == '__main__':
    main()