import csv
import re
from pathlib import Path

import jieba

# Matches any character in the CJK Unified Ideographs range U+4E00-U+9FFF.
# Compiled once at import time so per-cell checks do not repeat the lookup.
_CHINESE_CHAR_RE = re.compile('[\u4e00-\u9fff]')


def is_chinese(text):
    """Return True if *text* contains at least one character in U+4E00-U+9FFF."""
    return bool(_CHINESE_CHAR_RE.search(text))


def _find_column(column_names, keyword):
    """Return the first column name containing *keyword* (case-insensitive), or None."""
    return next((col for col in column_names if keyword in col.lower()), None)


def extract_and_save(input_file: "str | Path", output_file: "str | Path") -> None:
    """Segment the Chinese cells of a CSV with jieba and write the unique results.

    The source and target columns are the first header names containing
    ``'source'`` and ``'target'`` respectively (case-insensitive). Every cell
    in those two columns that contains a Chinese character is tokenized with
    ``jieba.cut``; the tokens are joined with single spaces, de-duplicated,
    sorted, and written one per line to *output_file*.

    Args:
        input_file: Path of the UTF-8 encoded CSV file to read.
        output_file: Path of the UTF-8 text file to (over)write.

    Raises:
        ValueError: If no header name contains 'source' or none contains
            'target'.
        OSError: If either file cannot be opened.
    """
    unique_rows = set()

    # newline='' lets the csv module handle line endings inside quoted fields;
    # utf-8-sig drops a leading BOM so it does not end up in the first header.
    with open(input_file, 'r', encoding='utf-8-sig', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        # Take the header from the CSV parser itself so quoted names or names
        # containing commas match the keys of the rows the reader yields.
        column_names = list(reader.fieldnames or [])
        print(f"Columns found in CSV: {column_names}")

        source_column = _find_column(column_names, 'source')
        target_column = _find_column(column_names, 'target')
        if not source_column or not target_column:
            raise ValueError(f"Required columns not found. Looking for 'source' and 'target'. Found: {column_names}")

        for row in reader:
            for column in (source_column, target_column):
                # Rows shorter than the header give None for missing cells.
                text = row.get(column) or ''
                if is_chinese(text):
                    unique_rows.add(' '.join(jieba.cut(text)))

    with open(output_file, 'w', encoding='utf-8') as txtfile:
        txtfile.writelines(unique_row + '\n' for unique_row in sorted(unique_rows))

    print(f"Extraction complete. Results saved to {output_file}")


# Specify input and output file paths
input_csv = 'knowledge_graph_test_rows.csv'
output_txt = 'dictionary.txt'

# Script entry: run the extraction with the paths above and report any failure
# to the user instead of surfacing a traceback.
try:
    extract_and_save(input_csv, output_txt)
except Exception as e:
    print(f"An error occurred: {e}")
    print("Please check your CSV file structure and ensure it contains 'source' and 'target' columns.")