ai-anchor
/
video-maker


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154
							import pandas as pd
from pathlib import Path
import subprocess
import shutil
import os
import chardet
import zipfile
from io import BytesIO
from translate import Translator
from langdetect import detect
from chardet.universaldetector import UniversalDetector
import numpy as np
from openai import OpenAI
from iso639 import Lang

# Codec returned by guess_codec() when chardet reports no encoding.
DEFAULT_ENCODING = "utf-8"
# OpenAI-compatible client used by translate() for chat completions.
# NOTE(review): the endpoint URL and API key are hard-coded in source —
# consider loading them from configuration or the environment instead.
client = OpenAI(base_url="http://192.168.192.84:8080/v1", api_key='choozmo9')
# System prompt template for translate(); the `{from_lang}` and `{to_lang}`
# placeholders are filled in with str.format() before each request.
system_prompt = (
     "You are a precise and literal translator. "
     "Translate the user's input from {from_lang} to {to_lang} as faithfully and literally as possible. "
     "Preserve the original structure and vocabulary. "
     "Avoid paraphrasing, interpretation, or creative rewrites. "
     "Do not add explanations. Just return the translated text only."
)

def guess_codec(filenames: list) -> str:
    """Guess the legacy codec that ZIP member names were stored in.

    ``zipfile`` decodes member names that lack the UTF-8 flag as cp437, so
    re-encoding such a name with cp437 recovers the raw bytes stored in the
    archive. Those bytes are fed to chardet's ``UniversalDetector``.

    Args:
        filenames: Member names as returned by ``ZipFile.namelist()``.

    Returns:
        The encoding name reported by chardet, or ``DEFAULT_ENCODING`` when
        chardet reports none (for example when no name could be fed).
    """
    codec_detector = UniversalDetector()
    for filename in filenames:
        try:
            raw_name = filename.encode('cp437')
        except UnicodeEncodeError:
            # A name that cannot be expressed in cp437 was not produced by
            # zipfile's cp437 fallback (it is already real Unicode), so it
            # carries no information about the legacy codec. Skip it instead
            # of crashing the whole detection.
            continue
        codec_detector.feed(raw_name)
        if codec_detector.done:
            break

    result = codec_detector.close()
    return result.get("encoding") or DEFAULT_ENCODING
  
def _decode_member_names(raw_names):
    """Undo zipfile's cp437 fallback decoding for a list of member names."""
    raw_bytes = []
    for name in raw_names:
        try:
            raw_bytes.append(name.encode('cp437'))
        except UnicodeEncodeError:
            # Already real Unicode (not a cp437 fallback decode): keep as is.
            raw_bytes.append(None)

    legacy_names = [name for name, raw in zip(raw_names, raw_bytes) if raw is not None]
    if all(name.isascii() for name in legacy_names):
        # Pure-ASCII names decode to themselves; no detection is needed.
        return list(raw_names)

    codec = guess_codec(legacy_names)
    return [
        name if raw is None else raw.decode(codec)
        for name, raw in zip(raw_names, raw_bytes)
    ]


def check_zip(zip_filepath: str):
    """Validate that a scenario ZIP is self-consistent.

    The archive must contain exactly one scenario table (``.xlsx`` or
    ``.csv``; AppleDouble ``._*`` files and files whose stem is ``style`` are
    ignored). For every row of that table:

    * each comma-separated name in the ``素材`` (media) column must match the
      stem of exactly one file in the archive;
    * when ``字幕`` (subtitle) is empty, ``音檔`` (audio file) must be set and
      must match the stem of exactly one file in the archive.

    Args:
        zip_filepath: Path of the ZIP archive to check.

    Raises:
        ValueError: If the archive has no or several scenario tables, if a
            referenced media/voice file is missing or ambiguous, or if a row
            has neither subtitle text nor a voice file.
        KeyError: If the table has rows but lacks one of the columns above.
    """
    with zipfile.ZipFile(str(Path(zip_filepath))) as zf:
        raw_names = [name for name in zf.namelist() if not name.endswith('/')]
        true_filenames = _decode_member_names(raw_names)

        scenario_files = [
            (raw, true)
            for raw, true in zip(raw_names, true_filenames)
            if Path(true).suffix in (".xlsx", ".csv")
            and not Path(true).name.startswith("._")
            and Path(true).stem != "style"
        ]
        if len(scenario_files) == 0:
            raise ValueError("no excel or csv file in zip.")
        if len(scenario_files) > 1:
            raise ValueError("too many excel or csv file in zip.")

        # The archive must be read with the name zipfile knows (raw), while
        # the suffix test uses the corrected (true) name.
        raw_name, true_name = scenario_files[0]
        payload = zf.read(raw_name)

    if Path(true_name).suffix == ".xlsx":
        table = pd.read_excel(BytesIO(payload), dtype=object)
    else:
        table = pd.read_csv(BytesIO(payload), dtype=object)

    stems = [Path(name).stem for name in true_filenames]
    for scene_no, (_, row) in enumerate(table.iterrows(), start=1):
        # Media names listed in the table must match files in the zip.
        # Only rows that actually list media are checked; an empty cell is
        # NaN and has nothing to validate.
        media = row['素材']
        if not pd.isna(media):
            # str() guards against non-text cells (dtype=object keeps
            # numbers as numbers when reading Excel).
            for media_name in (part.strip() for part in str(media).split(',')):
                n = stems.count(media_name)
                if n == 0:
                    raise ValueError(f"{media_name}: no such media file in zip.")
                elif n > 1:
                    raise ValueError(f'too many same name media files as {media_name} in zip')

        # Each scene needs either TTS text or a voice file.
        if pd.isna(row['字幕']):
            voice_file = row['音檔']
            if pd.isna(voice_file):
                raise ValueError(f'text or voice file is needed at scene {scene_no}.')
            if stems.count(voice_file) != 1:
                raise ValueError(f"voice file is can't find is zip at scene {scene_no}.")
      
def _is_scenario_table(member_name):
    """Tell whether a ZIP member is a scenario table that must be rewritten.

    Uses the same filter check_zip() applies when it looks for the scenario
    file: ``.xlsx``/``.csv`` members, excluding AppleDouble ``._*`` files and
    files whose stem is ``style``.
    """
    if member_name.endswith('/'):
        return False
    path = Path(member_name)
    return (
        path.suffix in (".xlsx", ".csv")
        and not path.name.startswith("._")
        and path.stem != "style"
    )


def _prepare_table(table, lang, voice):
    """Add the voice columns to a scenario table and translate its text."""
    # Create the column with object dtype so that storing the (string) voice
    # does not have to upcast a float column.
    table['聲音'] = pd.Series(np.nan, index=table.index, dtype=object)
    table.loc[0, '聲音'] = voice
    table['發音'] = np.nan
    return translate_table(table, lang)


def update_zip(zip_path, lang, new_filename, voice):
    """Write a translated copy of a scenario ZIP, then delete the original.

    Every scenario table in ``zip_path`` gets a ``聲音`` column (``voice`` in
    the first row, empty elsewhere) and an empty ``發音`` column, is passed
    through translate_table(), and is stored under its original member name
    in ``new_filename``. All other members are copied unchanged.

    Args:
        zip_path: Source archive; it is removed once the new archive has been
            written.
        lang: Target language, forwarded to translate_table().
        new_filename: Path of the archive to create.
        voice: Value stored in the first row of the ``聲音`` column.

    Raises:
        ValueError: If ``new_filename`` points at ``zip_path`` itself; opening
            it for writing would truncate the source before it is read.
    """
    if os.path.abspath(zip_path) == os.path.abspath(new_filename):
        raise ValueError("new_filename must differ from zip_path.")

    with zipfile.ZipFile(zip_path, 'r') as zip_in, zipfile.ZipFile(new_filename, 'w') as zip_out:
        for item in zip_in.infolist():
            if _is_scenario_table(item.filename):
                source = BytesIO(zip_in.read(item))
                # The table is rewritten in memory: no temporary file is
                # dropped into the working directory.
                if Path(item.filename).suffix == ".xlsx":
                    table = _prepare_table(pd.read_excel(source, dtype=object), lang, voice)
                    rewritten = BytesIO()
                    table.to_excel(rewritten, sheet_name='Sheet_name_1')
                    zip_out.writestr(item.filename, rewritten.getvalue())
                else:
                    table = _prepare_table(pd.read_csv(source, dtype=object), lang, voice)
                    # A .csv member has to be written as CSV (to_excel has no
                    # engine for that extension). The index is kept, as in
                    # the xlsx branch.
                    zip_out.writestr(item.filename, table.to_csv())
            else:
                # Any other member is copied as is.
                with zip_in.open(item) as src_file, zip_out.open(item.filename, 'w') as dst_file:
                    shutil.copyfileobj(src_file, dst_file)

    # Remove the old ZIP; the new archive lives at new_filename.
    os.remove(zip_path)
    
def translate_table(table, lang):
    """Translate the ``大標`` (title) and ``字幕`` (subtitle) cells in place.

    Each non-empty cell of those two columns is replaced by the result of
    translate() for that cell's own text. Empty cells (NaN or an empty
    string) are left untouched.

    Args:
        table: Scenario DataFrame containing the ``大標`` and ``字幕`` columns.
        lang: Target language, forwarded to translate().

    Returns:
        The same DataFrame object, modified in place.

    Raises:
        KeyError: If the table has rows but lacks one of the two columns.
    """
    print(f"translate to {lang}")
    for row_label in table.index:
        for column in ('大標', '字幕'):
            # Read the text from the column being translated, so a title is
            # never replaced by the translation of the subtitle.
            text = table.at[row_label, column]
            if pd.isna(text) or not text:
                continue
            print(f"{column}:", text)
            translation = translate(text, lang)
            print(f"{column}翻譯:", translation)
            table.at[row_label, column] = translation
    return table
        
def _language_name(tag):
    """Return the ISO 639 language name for a tag such as ``en`` or ``zh-tw``."""
    # Lang() is given the primary subtag only; region suffixes such as the
    # ``-tw`` in ``zh-tw`` are dropped before the lookup.
    return Lang(tag.split("-")[0]).name


def translate(text, to_lang: str):
    """Translate ``text`` into ``to_lang`` with the configured chat model.

    The source language is detected from ``text`` with langdetect. Both
    languages are converted to their ISO 639 names and substituted into
    ``system_prompt``; a target of "Chinese" is requested as
    "Traditional Chinese".

    Args:
        text: Text to translate.
        to_lang: Target language tag; an optional ``-region`` suffix is
            ignored (e.g. ``en``, ``zh-TW``).

    Returns:
        The content of the first completion choice returned by the model.
    """
    from_lang = _language_name(detect(text))
    target_lang = _language_name(to_lang)
    if target_lang == "Chinese":
        target_lang = "Traditional Chinese"
    completion = client.chat.completions.create(
        model="gemma",
        messages=[
            {
                "role": "system",
                "content": system_prompt.format(from_lang=from_lang, to_lang=target_lang)
            },
            {
                "role": "user",
                "content": text
            }
        ]
    )
    return completion.choices[0].message.content