sherry
/
ASR


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
							import csv
import re
import jieba
from pathlib import Path

def is_chinese(text):
    """Report whether *text* contains at least one Chinese character.

    Only code points in the range U+4E00 through U+9FFF are considered.
    """
    match = re.search('[\u4e00-\u9fff]', text)
    return match is not None

def extract_and_save(input_file, output_file):
    """Tokenize Chinese cells of a CSV's source/target columns into a text file.

    The header row of ``input_file`` is searched for the first column whose
    name contains "source" and the first whose name contains "target"
    (case-insensitive). Every cell in those two columns that contains at
    least one Chinese character is segmented with ``jieba.cut``; the tokens
    are joined with single spaces, de-duplicated, sorted, and written to
    ``output_file`` one entry per line.

    Args:
        input_file: Path of the UTF-8 encoded CSV file to read.
        output_file: Path of the UTF-8 text file to write (overwritten).

    Raises:
        ValueError: If the header has no column matching "source" or
            "target" (this includes an empty input file).
    """
    unique_rows = set()

    # newline='' lets the csv module do its own line-ending handling, which is
    # required for quoted fields that contain embedded newlines.
    with open(input_file, 'r', encoding='utf-8', newline='') as csvfile:
        reader = csv.DictReader(csvfile)

        # Take the header from the CSV parser itself so the names used for the
        # lookup below are exactly the keys DictReader puts in each row
        # (a manual split(',') disagrees for quoted or padded header names).
        column_names = reader.fieldnames or []
        print(f"Columns found in CSV: {column_names}")

        source_column = next((col for col in column_names if 'source' in col.lower()), None)
        target_column = next((col for col in column_names if 'target' in col.lower()), None)

        if source_column is None or target_column is None:
            raise ValueError(f"Required columns not found. Looking for 'source' and 'target'. Found: {column_names}")

        for row in reader:
            for column in (source_column, target_column):
                text = row[column]
                # Rows shorter than the header yield None for the missing
                # cells; skip those (and empty cells) instead of crashing.
                if not text or not is_chinese(text):
                    continue
                tokens = list(jieba.cut(text))
                if tokens:  # Only keep cells that produced at least one token
                    unique_rows.add(' '.join(tokens))

    with open(output_file, 'w', encoding='utf-8') as txtfile:
        txtfile.writelines(f"{entry}\n" for entry in sorted(unique_rows))

    print(f"Extraction complete. Results saved to {output_file}")

# Locations of the CSV to read and the dictionary text file to produce
input_csv = 'knowledge_graph_test_rows.csv'
output_txt = 'dictionary.txt'

# Run the extraction; any failure is reported on stdout rather than propagated.
try:
    extract_and_save(input_csv, output_txt)
except Exception as exc:
    print(f"An error occurred: {exc}")
    print("Please check your CSV file structure and ensure it contains 'source' and 'target' columns.")