# gen_video.py (9.4 KB)


  1. from datetime import date, datetime
  2. import pymysql
  3. pymysql.install_as_MySQLdb()
  4. import dataset
  5. import pandas as pd
  6. import os
  7. import random
  8. import re
  9. import json
  10. import requests
  11. from summarizer import Summarizer
  12. from transformers import AutoConfig, AutoTokenizer, AutoModel
  13. from summarizer.sentence_handler import SentenceHandler
  14. from spacy.lang.zh import Chinese
  15. from random import sample
  16. import csv
  17. from gn3 import GoogleNews3
  18. googlenews = GoogleNews3(lang="zh-tw", period="7d",
  19. encode="utf-8", region="tw")
  20. # from fastapi import FastAPI
  21. # app = FastAPI()
  22. os.environ["TOKENIZERS_PARALLELISM"] = "false"
  23. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/hhh?charset=utf8mb4')
  24. result = db.query('SELECT * FROM gnews.weather_detail2')
  25. data = pd.DataFrame(result, columns=next(iter(result)).keys())
  26. custom_config = AutoConfig.from_pretrained('bert-base-chinese')
  27. custom_config.output_hidden_states=True
  28. custom_tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
  29. custom_model = AutoModel.from_pretrained('bert-base-chinese', config=custom_config)
  30. summarizer = Summarizer(custom_model=custom_model,
  31. custom_tokenizer=custom_tokenizer,
  32. sentence_handler = SentenceHandler(language=Chinese))
  33. df_test = pd.read_csv('./news_text.txt')
  34. df = pd.read_csv('./image.csv')
  35. def sentence2img(news_text: str):
  36. tags = []
  37. for i in range(len(df['tags'])):
  38. tags.extend(df['tags'][i].split(','))
  39. images = []
  40. for i in list(set(tags)):
  41. if i in news_text:
  42. images.extend(list(df[df['tags'].str.contains(i,regex=False)==True]['img_url']))
  43. if images:
  44. return images[random.randint(0, len(images)-1)]
  45. else:
  46. return list(df['img_url'])[random.randint(0, len(list(df['img_url']))-1)]
  47. def remove_redundancy(text, words):
  48. for w in words:
  49. if w in text:
  50. text = text.split(w)[0]
  51. text = text.replace('\n','').replace('\r','').replace('(','(').replace(')',')')
  52. return text
  53. def news_clean(text):
  54. text = text.replace('u3000','')
  55. text = text.replace('u200b','')
  56. text = remove_redundancy(text, ['原始連結','更多內容','看更多','參考資料','本文獲','Source','註','點擊瀏覽',
  57. '【延伸閱讀】','更多三立','延伸閱讀','新聞來源','更多 TVBS 報導','【更多中天快點TV報導】',
  58. '今日各地紫外線預報概況。 【','國際都市氣象','亞洲、大洋洲美洲歐洲、非洲中國各地體感溫度預報'])
  59. text = re.sub("[\(\[].*?[\)\]]", "", text)
  60. text = re.sub(r'〔.*?/.*?〕', '', text)
  61. text = re.sub(r'【 第.*?號.*?地震報告 】', '後面都不要了', text)
  62. text = re.sub(r'\w\w\w接受《.*?》訪問指出,', '', text)
  63. text = re.sub(r'\w\w\w「.*?」.*?,', '', text)
  64. text = re.sub(r'對此,.*?指出,', '', text)
  65. text = re.sub(r'臉書.*?|.*?表示,', '', text)
  66. text = re.sub(r'氣象粉專.*?提醒,', '', text)
  67. text = re.sub(r'請繼續往下閱讀...', '', text)
  68. text = re.sub(r'\w\w\w說,', '', text)
  69. text = re.sub(r'\w\w\w也提醒,', '', text)
  70. text = re.sub(r'\w\w\w分析,', '', text)
  71. text = re.sub(r'圖:.*?/攝', '', text)
  72. text = re.sub(r'圖:.*?/提供', '', text)
  73. text = re.sub(r'圖/.*?中心', '', text)
  74. text = re.sub(r'三立氣象主播\w\w\w', '', text)
  75. text = re.sub(r'記者.*?/攝影', '', text)
  76. text = re.sub(r'三立.*?/.*?報導', '', text)
  77. text = re.sub(r'生活中心/.*?報導', '', text)
  78. text = re.sub(r'\w\w\w / .*?報導', '', text)
  79. text = re.sub(r'記者.*?/.*?報導', '', text)
  80. text = re.sub(r'(.*?提供)', '', text)
  81. text = re.sub(r'(圖/.*?)', '', text)
  82. text = re.sub(r'\w在《.*?》.*?撰文,', '', text)
  83. text = re.sub(r'氣象專家.*?認為,', '', text)
  84. text = re.sub(r'\w\w\w表示,', '', text)
  85. text = re.sub(r'※點圖放大', '', text)
  86. text = re.sub(r'綜合報導 / \w\w\w', '', text)
  87. text = re.sub(r'【.*?-.*?】', '', text)
  88. text = re.sub(r'不用抽.*?活動辦法', '', text)
  89. text = re.sub(r'◤.*?◢?.*?酒店', '', text)
  90. text = re.sub(r'\d+/\d+/\d+ \d+:\d+', '', text)
  91. text = re.sub(r'▼(\w*\w*。)', '', text)
  92. text = re.sub(r'▲(\w*\w*。)', '', text).replace('▲','').replace('▼','')
  93. text = text.replace('xa0','')
  94. text = remove_redundancy(text, ['後面都不要了'])
  95. return text
  96. def gen_daily_news(data, date, summarizer):
  97. text_content_list = []
  98. image_urls_list = []
  99. news_date = data.groupby("news_date")
  100. print("-"*100)
  101. print(date)
  102. news_content_list = []
  103. news_imgs_list = []
  104. for idx, news in enumerate(news_date.get_group(date).iterrows()):
  105. # clean news content
  106. news_content = news_clean(news[1]['news_content'])
  107. # compress content
  108. if len(news_content)>1000:
  109. news_content = summarizer(news_content, ratio=0.8)
  110. if len(news_content) > 180 and len(news_content) < 800:
  111. news_content_list.append(news_content)
  112. news_imgs_list.append(news[1]['news_imgs'])
  113. error_count = 0
  114. zero_count = 0
  115. news = ''
  116. while True:
  117. try:
  118. text = ''.join(sample(news_content_list,min(4,len(news_content_list))))
  119. text = text.replace('.','·')
  120. # print(text)
  121. ratio = 400 / len(text)
  122. # print(ratio)
  123. news = news_clean(summarizer(text, ratio=ratio))
  124. print('字數:',len(news))
  125. if zero_count > 5:
  126. print('So sad! Something wrong!')
  127. print('-'*80)
  128. break
  129. if len(news) != 0:
  130. news_split = [n.strip()+'。' for n in news.split('。')]
  131. image_urls_list = sample(image_urls_list, min(len(image_urls_list),len(news_split[:-1])))
  132. text_content_list = news_split[:-1]
  133. break
  134. if len(news) <120:
  135. print('Regenerating...')
  136. zero_count = zero_count+1
  137. except:
  138. print('Regenerating...')
  139. error_count = error_count+1
  140. if error_count>5:
  141. print('So sad! Something wrong!')
  142. print('-'*80)
  143. break
  144. image_urls_list = []
  145. for t in text_content_list:
  146. # print(t)
  147. image_urls_list.append(sentence2img(t))
  148. return text_content_list, image_urls_list
  149. # name_date = '2021年11月01日'
  150. # t, i = gen_daily_news(data, name_date, summarizer)
  151. kw = input("關鍵字:")
  152. googlenews.clear()
  153. googlenews.get_news(kw)
  154. trdresult = googlenews.result()
  155. """ for i in range(25,26):
  156. if i <10:
  157. name_date = '2021年11月0' + str(i) + '日'
  158. else:
  159. name_date = '2021年11月' + str(i) + '日' """
  160. #t, i = gen_daily_news(data, '2021年11月25日', summarizer)
  161. t, i = [trdresult[0]['title']], [trdresult[0]['img']]
  162. url = 'https://www.choozmo.com:8887/make_anchor_video'
  163. headers = {'Authorization':"Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ5dWt5byIsImlhdCI6MTYzNzkwNjc0NCwibmJmIjoxNjM3OTA2NzQ0LCJqdGkiOiIwZmM2MjQ5MS1kNzFmLTRjNzktYTk1MC1iZmVjOGYyNGI1ODUiLCJleHAiOjE2MzgzMzg3NDQsInR5cGUiOiJhY2Nlc3MiLCJmcmVzaCI6ZmFsc2UsImNzcmYiOiIxYmEwYjU4MC03OTU1LTQ4MmMtYjNiYi0yZjhmYmMyZjg5YjYifQ.MUT8tXk2v_M2nI5fxJewSh2kIi2-uh-vWAj9Y3N2niE", 'Content-Type': 'application/json'}
  164. payload = {
  165. "name": "Demo_1_" + kw + '_' + datetime.now().strftime('%Y%m%d%H%M%S'),
  166. "text_content": t,
  167. "image_urls": i,
  168. "avatar": "7",
  169. "client_id": "-1",
  170. "multiLang": 0
  171. }
  172. #print(t)
  173. #print(i)
  174. r = requests.request("POST", url, data=json.dumps(payload), headers=headers)
  175. print(r.text)
# @app.get('/')
# async def genlist(date:date, flag: int):
#     return {
#         "text" : [
#             "中央氣象局指出,今天天氣與昨天類似,降雨範圍以基隆北海岸與東半部為主,颱風消息方面,第20號輕位於台灣東方約1910公里的海面上,未來朝日本東方海面前進,對台灣天氣無直接影響,但基隆北海岸、東半部及恆春半島沿海易有長浪發生。",
#             "氣象局預估,週五至週六清晨東北季風影響,北部、東北部早晚較涼,降雨方面在迎風面的基隆北海岸、東半部及大台北地區有局部短暫雨,其他地區為多雲到晴,午後南部地區有局部短暫陣雨。",
#             "天氣風險公司總經理彭啟未來7-10天都維持類似的天氣,下一波變化有些預測落在11月中上旬,約11/10-11/12,北方冷高壓有機會再度南下,是否如某些預測大降溫,但這變數仍很多,保持關注就可以了。",
#             "氣象局說明,今東北季風稍微增強,然而水氣沒有顯著增加,因此天氣表現與昨日類似,只是早晚稍微又涼些,清晨中部以北及東北部約20、21度,花東及南部地區約23度,降雨範圍依舊以迎風面的基隆北海岸及東半部的局部短暫雨為主,且大台北地區雨較零星,其他地方則為多雲到晴,午後南部地區有局部短暫陣雨的機率,但範圍跟程度都影響不大。"
#         ],
#         "imgurl" : [
#             "https://cdn2.ettoday.net/images/5930/d5930739.jpg",
#             "https://cdn2.ettoday.net/images/5917/d5917494.jpg",
#             "https://cdn2.ettoday.net/images/5915/d5915515.jpg",
#             "https://cdn2.ettoday.net/images/5931/d5931569.jpg"
#         ]
#     }