hai 11 meses · 821b0cc05d
--- a/dictionary.txt
+++ b/dictionary.txt
@@ -1,4 +0,0 @@
 
				-請問 100 v
			
 
				-碳排放 100 n
			
 
				-是什麼 100 v
			
 
				-什麼意思 100 n
			
--- a/dictionary/dictionary_reviewed.txt
+++ b/dictionary/dictionary_reviewed.txt
--- a/dictionary/dictionary_reviewed_rows.txt
+++ b/dictionary/dictionary_reviewed_rows.txt
@@ -0,0 +1,618 @@
 
				+2050
			
 
				+CBAM
			
 
				+CDM
			
 
				+CO2e
			
 
				+H型
			
 
				+LED
			
 
				+PDCA
			
 
				+一貫式
			
 
				+一貫煉
			
 
				+三氟化氮
			
 
				+三氯乙烯
			
 
				+三氯乙烷
			
 
				+三氯甲烷
			
 
				+上市櫃
			
 
				+不銹鋼
			
 
				+世界
			
 
				+中央
			
 
				+中小
			
 
				+中華民國
			
 
				+中長期
			
 
				+主管
			
 
				+之
			
 
				+乙苯
			
 
				+事宜
			
 
				+事項
			
 
				+二極體
			
 
				+二氧化碳
			
 
				+二氯乙烷
			
 
				+二氯甲烷
			
 
				+二甲苯
			
 
				+交易
			
 
				+交易所
			
 
				+交流
			
 
				+交通部
			
 
				+京都
			
 
				+以
			
 
				+企業
			
 
				+估算
			
 
				+低
			
 
				+低位
			
 
				+低碳
			
 
				+住商
			
 
				+作業
			
 
				+使用
			
 
				+供需
			
 
				+係
			
 
				+係數
			
 
				+保存
			
 
				+保育
			
 
				+保護署
			
 
				+修正
			
 
				+倡議
			
 
				+價值
			
 
				+元件
			
 
				+先期
			
 
				+內政部
			
 
				+內部
			
 
				+全國
			
 
				+全氟
			
 
				+全球
			
 
				+公共
			
 
				+公司
			
 
				+公告
			
 
				+公噸
			
 
				+公定
			
 
				+公正
			
 
				+公約
			
 
				+六氟化硫
			
 
				+其他
			
 
				+具備
			
 
				+再
			
 
				+再生能源
			
 
				+出口
			
 
				+出口國
			
 
				+分公司
			
 
				+分析
			
 
				+初始
			
 
				+利用
			
 
				+制度
			
 
				+功能
			
 
				+化學
			
 
				+化學家
			
 
				+化石
			
 
				+匯量
			
 
				+區域
			
 
				+半導體
			
 
				+協定
			
 
				+協會
			
 
				+協議
			
 
				+原住民
			
 
				+原住民族
			
 
				+原則
			
 
				+原料
			
 
				+原物料
			
 
				+去除
			
 
				+參與
			
 
				+及
			
 
				+及國際會議
			
 
				+取得
			
 
				+取樣
			
 
				+台灣
			
 
				+各細節
			
 
				+各部門
			
 
				+合作
			
 
				+含氟
			
 
				+含量
			
 
				+吸收
			
 
				+和
			
 
				+品保
			
 
				+品牌
			
 
				+品質
			
 
				+商
			
 
				+商業
			
 
				+商標法
			
 
				+四氯乙烯
			
 
				+四氯化碳
			
 
				+回收
			
 
				+因應
			
 
				+因應法
			
 
				+固定
			
 
				+固定式
			
 
				+國土
			
 
				+國家
			
 
				+國家永續
			
 
				+國情
			
 
				+國際
			
 
				+國際驗證
			
 
				+圖
			
 
				+土地利用
			
 
				+土壤
			
 
				+在
			
 
				+地方
			
 
				+型
			
 
				+執行
			
 
				+培訓
			
 
				+基本
			
 
				+基板
			
 
				+基準
			
 
				+基礎
			
 
				+基線
			
 
				+基金
			
 
				+基金會
			
 
				+報告
			
 
				+增氧設備
			
 
				+增量
			
 
				+外交部
			
 
				+外加性
			
 
				+外購
			
 
				+多樣性
			
 
				+多邊
			
 
				+大眾
			
 
				+天然氣
			
 
				+太平洋
			
 
				+契約
			
 
				+好算
			
 
				+委員會
			
 
				+威爾斯
			
 
				+安全
			
 
				+宣告
			
 
				+宣導
			
 
				+實施
			
 
				+實驗室
			
 
				+審查
			
 
				+審核
			
 
				+專案
			
 
				+專章
			
 
				+對
			
 
				+對策
			
 
				+展開
			
 
				+層級
			
 
				+層面
			
 
				+工具
			
 
				+工業
			
 
				+工業局
			
 
				+巴黎
			
 
				+市場
			
 
				+平台
			
 
				+平衡
			
 
				+年
			
 
				+廢棄
			
 
				+廢棄物
			
 
				+廢水
			
 
				+建立
			
 
				+建築
			
 
				+強制性
			
 
				+強化
			
 
				+強度
			
 
				+形象
			
 
				+彩色
			
 
				+影響
			
 
				+徵收
			
 
				+德國萊因
			
 
				+情境
			
 
				+情形
			
 
				+意見
			
 
				+憲章
			
 
				+成本
			
 
				+成長
			
 
				+我國
			
 
				+或
			
 
				+承認
			
 
				+承諾
			
 
				+承諾
			
 
				+技術
			
 
				+投資
			
 
				+抵換
			
 
				+抵減
			
 
				+指引
			
 
				+指數
			
 
				+指標
			
 
				+排放
			
 
				+排放源
			
 
				+排放量
			
 
				+排碳金
			
 
				+推動
			
 
				+推動產品
			
 
				+推廣
			
 
				+措施
			
 
				+提供
			
 
				+提升
			
 
				+換購
			
 
				+揭露
			
 
				+揮發性
			
 
				+撰寫
			
 
				+擴散區
			
 
				+改善
			
 
				+改用
			
 
				+改造
			
 
				+政府
			
 
				+政策
			
 
				+效果
			
 
				+效率
			
 
				+效益
			
 
				+效能
			
 
				+教育
			
 
				+教育部
			
 
				+整體
			
 
				+數
			
 
				+數據
			
 
				+文件
			
 
				+新南
			
 
				+新設
			
 
				+斷路器
			
 
				+方式
			
 
				+方案型
			
 
				+方法
			
 
				+旋轉式
			
 
				+既有
			
 
				+日本
			
 
				+時尚產業
			
 
				+晶圓
			
 
				+書
			
 
				+會
			
 
				+會計
			
 
				+有機物
			
 
				+有限公司
			
 
				+服務
			
 
				+服務業
			
 
				+期程
			
 
				+期間
			
 
				+材料
			
 
				+東北
			
 
				+林業
			
 
				+查證
			
 
				+查驗
			
 
				+校驗
			
 
				+核發
			
 
				+核配
			
 
				+條
			
 
				+條文
			
 
				+森林
			
 
				+業
			
 
				+標準
			
 
				+標準化
			
 
				+標的
			
 
				+標籤
			
 
				+機制
			
 
				+機會
			
 
				+機構
			
 
				+機組
			
 
				+機車
			
 
				+機關
			
 
				+檢測
			
 
				+檢驗
			
 
				+檢驗局
			
 
				+權證
			
 
				+歐盟
			
 
				+比率
			
 
				+氣候
			
 
				+氣渦
			
 
				+氣相沉積
			
 
				+氣體
			
 
				+氧化亞氮
			
 
				+氧化物
			
 
				+氧化矽
			
 
				+氧化鈣
			
 
				+氧化鋁
			
 
				+氧化鐵
			
 
				+氫氟碳化物
			
 
				+氮氧化物
			
 
				+水泥
			
 
				+水質
			
 
				+永續
			
 
				+污染物
			
 
				+汰換
			
 
				+汰舊換新
			
 
				+決策者
			
 
				+汽力
			
 
				+汽機車
			
 
				+汽車
			
 
				+沼氣
			
 
				+法
			
 
				+法律
			
 
				+活動
			
 
				+流程
			
 
				+海洋
			
 
				+液晶
			
 
				+涵蓋
			
 
				+淘汰
			
 
				+淨零排放
			
 
				+清冊
			
 
				+清潔
			
 
				+減碳
			
 
				+減緩
			
 
				+減量
			
 
				+減量事項
			
 
				+減量
			
 
				+測定
			
 
				+測試
			
 
				+源
			
 
				+準則
			
 
				+溫室
			
 
				+溫室氣體
			
 
				+溫暖化
			
 
				+漁船
			
 
				+潛勢
			
 
				+澳洲
			
 
				+濾光片
			
 
				+為
			
 
				+煉製
			
 
				+煉鋼
			
 
				+照明
			
 
				+熟料
			
 
				+熱值
			
 
				+熱軋
			
 
				+燃料
			
 
				+燃氣
			
 
				+燃油
			
 
				+燃煤
			
 
				+燃燒
			
 
				+燒成
			
 
				+燒成爐
			
 
				+獎勵
			
 
				+現況
			
 
				+環保署
			
 
				+環境
			
 
				+生命
			
 
				+生料
			
 
				+生物
			
 
				+生產
			
 
				+產品
			
 
				+產業
			
 
				+產業淨
			
 
				+產業界
			
 
				+產生熱
			
 
				+用量
			
 
				+甲烷
			
 
				+甲苯
			
 
				+界定
			
 
				+當量
			
 
				+登錄
			
 
				+發光
			
 
				+發展
			
 
				+發電
			
 
				+發電業
			
 
				+的
			
 
				+監測法
			
 
				+監督
			
 
				+監護
			
 
				+盤查
			
 
				+目標
			
 
				+目的
			
 
				+直接
			
 
				+相互
			
 
				+相關
			
 
				+省電
			
 
				+石油
			
 
				+研擬
			
 
				+研析
			
 
				+研發
			
 
				+研磨
			
 
				+研究
			
 
				+研究所
			
 
				+研究院
			
 
				+破壞
			
 
				+硫
			
 
				+碳
			
 
				+碳中和
			
 
				+碳價
			
 
				+碳化物
			
 
				+碳匯
			
 
				+碳定價
			
 
				+碳排
			
 
				+碳標籤
			
 
				+碳權
			
 
				+碳洩漏
			
 
				+碳盤查
			
 
				+碳稅
			
 
				+碳費
			
 
				+碳費費率
			
 
				+碳足跡
			
 
				+碳邊境
			
 
				+碳鋼
			
 
				+確保事項
			
 
				+確認
			
 
				+科學
			
 
				+科技
			
 
				+移動
			
 
				+移轉
			
 
				+移除
			
 
				+稅費
			
 
				+程序
			
 
				+種類
			
 
				+積體
			
 
				+穩定
			
 
				+空氣
			
 
				+空調
			
 
				+立恩威
			
 
				+第
			
 
				+第三方
			
 
				+等
			
 
				+管制
			
 
				+管理
			
 
				+管理法
			
 
				+管理系統
			
 
				+管理
			
 
				+節約
			
 
				+範疇
			
 
				+粒狀
			
 
				+糧食
			
 
				+系統
			
 
				+組織
			
 
				+組織型
			
 
				+結合型
			
 
				+給
			
 
				+統計
			
 
				+統計及
			
 
				+經
			
 
				+經濟
			
 
				+經濟部
			
 
				+經驗
			
 
				+綠色
			
 
				+綱要
			
 
				+總碳
			
 
				+總量
			
 
				+績效
			
 
				+繳付
			
 
				+美國
			
 
				+老舊
			
 
				+聯合
			
 
				+聯合國
			
 
				+聲明
			
 
				+股份
			
 
				+胚生產
			
 
				+能力
			
 
				+能源
			
 
				+自主
			
 
				+自然
			
 
				+自然碳匯
			
 
				+自願
			
 
				+自願性
			
 
				+與
			
 
				+芝加哥
			
 
				+苯
			
 
				+苯乙烯
			
 
				+英國
			
 
				+蒸汽
			
 
				+薄膜
			
 
				+薄膜區
			
 
				+藉由
			
 
				+處
			
 
				+處理
			
 
				+處理階段
			
 
				+蝕刻
			
 
				+蝕刻區
			
 
				+行政院
			
 
				+行銷
			
 
				+衛理
			
 
				+衛生
			
 
				+衝擊
			
 
				+裂解
			
 
				+製程
			
 
				+製造
			
 
				+製造業
			
 
				+複循環
			
 
				+要點
			
 
				+規則
			
 
				+規劃
			
 
				+規定
			
 
				+規格
			
 
				+規模
			
 
				+規範
			
 
				+觀測
			
 
				+觸媒
			
 
				+計入期
			
 
				+計畫
			
 
				+計畫型
			
 
				+計算
			
 
				+設備
			
 
				+設定
			
 
				+設施
			
 
				+評估
			
 
				+評比
			
 
				+試算
			
 
				+試驗
			
 
				+認可
			
 
				+認定
			
 
				+認證
			
 
				+誘因
			
 
				+調整
			
 
				+調適
			
 
				+證明
			
 
				+證碳
			
 
				+議定
			
 
				+議定書
			
 
				+變化
			
 
				+變壓器
			
 
				+變更
			
 
				+變遷
			
 
				+負
			
 
				+財務
			
 
				+財政部
			
 
				+貢獻
			
 
				+資料
			
 
				+資源
			
 
				+資訊
			
 
				+質量
			
 
				+購買
			
 
				+趨勢
			
 
				+路徑
			
 
				+車輛
			
 
				+軋造
			
 
				+軋鋼
			
 
				+輔導
			
 
				+輪機
			
 
				+輸配線路
			
 
				+輻射
			
 
				+轉型
			
 
				+辦法
			
 
				+農委會
			
 
				+農業
			
 
				+農機
			
 
				+週期
			
 
				+逸散
			
 
				+運具
			
 
				+運輸
			
 
				+道瓊
			
 
				+選擇
			
 
				+邊境
			
 
				+邊界
			
 
				+部門
			
 
				+配售
			
 
				+配送
			
 
				+配額型
			
 
				+重組
			
 
				+量化
			
 
				+量測
			
 
				+金屬
			
 
				+金管會
			
 
				+金融
			
 
				+銷售
			
 
				+鋼生產
			
 
				+鋼胚
			
 
				+鋼鋼
			
 
				+鋼鐵業
			
 
				+鍋爐
			
 
				+鏈
			
 
				+鑑別
			
 
				+長期
			
 
				+開發行
			
 
				+間接
			
 
				+關稅
			
 
				+闡釋
			
 
				+附
			
 
				+陣列
			
 
				+階段
			
 
				+集魚燈
			
 
				+零排放
			
 
				+電力
			
 
				+電動
			
 
				+電工
			
 
				+電弧爐
			
 
				+電晶體
			
 
				+電氣
			
 
				+電路
			
 
				+需求
			
 
				+非
			
 
				+非政府
			
 
				+項
			
 
				+項目
			
 
				+預測
			
 
				+額
			
 
				+額度
			
 
				+類別
			
 
				+類型
			
 
				+顧問
			
 
				+顯示器
			
 
				+顯示器業
			
 
				+風險
			
 
				+飲食
			
 
				+香港
			
 
				+驅動力
			
 
				+驗證
			
 
				+高效率
			
 
				+黃金
			
 
				+論壇
			
 
				+量化
			
 
				+行政院
			
--- a/dictionary/dictionary_to_rows.py
+++ b/dictionary/dictionary_to_rows.py
@@ -0,0 +1,25 @@
 
				+
			
 
				+def process_file(input_file, output_file):
			
 
				+    unique_terms = set()
			
 
				+
			
 
				+    # Read the input file and collect unique terms
			
 
				+    with open(input_file, 'r', encoding='utf-8') as f:
			
 
				+        for line in f:
			
 
				+            terms = line.strip().split()
			
 
				+            unique_terms.update(terms)
			
 
				+
			
 
				+    # Sort the unique terms alphabetically
			
 
				+    sorted_terms = sorted(unique_terms)
			
 
				+
			
 
				+    # Write the unique terms to the output file
			
 
				+    with open(output_file, 'w', encoding='utf-8') as f:
			
 
				+        for term in sorted_terms:
			
 
				+            f.write(f"{term}\n")
			
 
				+
			
 
				+    print(f"Processed {len(unique_terms)} unique terms.")
			
 
				+
			
 
				+# Process the file
			
 
				+input_file = 'dictionary_reviewed.txt'  # Replace with your input file name
			
 
				+output_file = 'dictionary_reviewed_rows.txt'
			
 
				+
			
 
				+process_file(input_file, output_file)
			
--- a/dictionary/generate_dictionary.py
+++ b/dictionary/generate_dictionary.py
@@ -0,0 +1,48 @@
 
				+import csv
			
 
				+import re
			
 
				+import jieba
			
 
				+from pathlib import Path
			
 
				+
			
 
				+def is_chinese(text):
			
 
				+    return bool(re.search('[\u4e00-\u9fff]', text))
			
 
				+
			
 
				+def extract_and_save(input_file, output_file):
			
 
				+    unique_rows = set()
			
 
				+
			
 
				+    with open(input_file, 'r', encoding='utf-8') as csvfile:
			
 
				+        first_line = csvfile.readline().strip()
			
 
				+        column_names = first_line.split(',')
			
 
				+        print(f"Columns found in CSV: {column_names}")
			
 
				+        
			
 
				+        csvfile.seek(0)
			
 
				+        reader = csv.DictReader(csvfile)
			
 
				+        
			
 
				+        source_column = next((col for col in column_names if 'source' in col.lower()), None)
			
 
				+        target_column = next((col for col in column_names if 'target' in col.lower()), None)
			
 
				+        
			
 
				+        if not source_column or not target_column:
			
 
				+            raise ValueError(f"Required columns not found. Looking for 'source' and 'target'. Found: {column_names}")
			
 
				+        
			
 
				+        for row in reader:
			
 
				+            for column in [source_column, target_column]:
			
 
				+                text = row[column]
			
 
				+                if is_chinese(text):
			
 
				+                    tokens = list(jieba.cut(text))
			
 
				+                    if tokens:  # Only process non-empty lines
			
 
				+                        unique_rows.add(' '.join(tokens))
			
 
				+
			
 
				+    with open(output_file, 'w', encoding='utf-8') as txtfile:
			
 
				+        for unique_row in sorted(unique_rows):
			
 
				+            txtfile.write(unique_row + '\n')
			
 
				+
			
 
				+    print(f"Extraction complete. Results saved to {output_file}")
			
 
				+
			
 
				+# Specify input and output file paths
			
 
				+input_csv = 'knowledge_graph_test_rows.csv'
			
 
				+output_txt = 'dictionary.txt'
			
 
				+
			
 
				+try:
			
 
				+    extract_and_save(input_csv, output_txt)
			
 
				+except Exception as e:
			
 
				+    print(f"An error occurred: {e}")
			
 
				+    print("Please check your CSV file structure and ensure it contains 'source' and 'target' columns.")
			
--- a/whisper.py
+++ b/whisper.py
@@ -204,7 +204,7 @@ def main():
 
				     parser = argparse.ArgumentParser(description="處理音頻文件使用 Whisper")
			
 
				     parser.add_argument("--file", help="要處理的單個音頻文件的路徑")
			
 
				     parser.add_argument("--folder", default="data", help="包含音頻文件的文件夾路徑（默認：data）")
			
 
				-    parser.add_argument("--dict", default="dictionary.txt", help="jieba 字典的路徑（默認：dictionary.txt")
			
 
				+    parser.add_argument("--dict", default="dictionary_reviewed_rows.txt", help="jieba dictionary_reviewed_rows.txt")
			
 
				     args = parser.parse_args()
			
 
				 
			
 
				     set_jieba_dictionary(args.dict)