import csv
import re

import jieba


def is_chinese(text):
    """Return True if the text contains at least one CJK character."""
    return bool(re.search(r'[\u4e00-\u9fff]', text))


def extract_and_save(input_file, output_file):
    """Read a CSV, segment the Chinese text found in its source/target
    columns with jieba, and write the unique segmented lines to a file."""
    unique_rows = set()

    with open(input_file, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        column_names = reader.fieldnames or []
        print(f"Columns found in CSV: {column_names}")

        # Locate the columns whose names contain 'source' and 'target'
        source_column = next((col for col in column_names if 'source' in col.lower()), None)
        target_column = next((col for col in column_names if 'target' in col.lower()), None)

        if not source_column or not target_column:
            raise ValueError(f"Required columns not found. Looking for 'source' and 'target'. Found: {column_names}")

        for row in reader:
            for column in [source_column, target_column]:
                text = row[column]
                if text and is_chinese(text):
                    tokens = list(jieba.cut(text))
                    if tokens:  # Only keep non-empty segmentations
                        unique_rows.add(' '.join(tokens))

    with open(output_file, 'w', encoding='utf-8') as txtfile:
        for unique_row in sorted(unique_rows):
            txtfile.write(unique_row + '\n')

    print(f"Extraction complete. Results saved to {output_file}")


# Specify input and output file paths
input_csv = 'knowledge_graph_test_rows.csv'
output_txt = 'dictionary.txt'

try:
    extract_and_save(input_csv, output_txt)
except Exception as e:
    print(f"An error occurred: {e}")
    print("Please check your CSV file structure and ensure it contains 'source' and 'target' columns.")
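To see what actually ends up in dictionary.txt, here is a minimal sketch of the segmentation step in isolation. The sample sentence is made up for illustration, and the exact token boundaries shown are an assumption; jieba's output depends on its version and any custom dictionaries it has loaded.

# Illustrative only: what jieba segmentation produces, and therefore
# what a single line written to dictionary.txt looks like.
# The sample sentence is hypothetical; token boundaries may differ
# across jieba versions and dictionaries.
import jieba

sample = '知识图谱用于组织实体之间的关系'
tokens = list(jieba.cut(sample))
print(tokens)            # e.g. ['知识', '图谱', '用于', '组织', '实体', '之间', '的', '关系']
print(' '.join(tokens))  # the space-joined form that is added to unique_rows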