@@ -0,0 +1,48 @@
+import csv
+import re
+import jieba
+from pathlib import Path
def is_chinese(text):
    """Return True if *text* contains at least one CJK Unified Ideograph.

    Only the basic block U+4E00..U+9FFF is considered; a single matching
    character anywhere in the string is enough.
    """
    ideograph = re.compile('[\u4e00-\u9fff]')
    return ideograph.search(text) is not None
def extract_and_save(input_file, output_file):
    """Write the tokenized Chinese cells of a CSV's source/target columns to a text file.

    The first header whose lowercase name contains ``'source'`` and the first
    whose lowercase name contains ``'target'`` are selected. Every cell of
    those two columns that ``is_chinese`` accepts is passed to ``jieba.cut``
    and its tokens are joined with single spaces. The distinct joined
    strings are written to *output_file* in sorted order, one per line.

    Args:
        input_file: Path of the CSV file to read (UTF-8, optional BOM).
        output_file: Path of the text file to create or overwrite (UTF-8).

    Raises:
        ValueError: If no header contains 'source' or none contains 'target'.
    """
    unique_rows = set()

    # newline='' is what the csv module requires so that quoted fields with
    # embedded line breaks are parsed correctly; utf-8-sig drops a leading
    # BOM so it does not end up glued to the first column name.
    with open(input_file, 'r', encoding='utf-8-sig', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        # Take the header from the csv parser itself so the names are exactly
        # the keys of each row, even when the header is quoted. An empty file
        # yields None, hence the fallback to an empty list.
        column_names = list(reader.fieldnames or [])
        print(f"Columns found in CSV: {column_names}")

        source_column = next((col for col in column_names if 'source' in col.lower()), None)
        target_column = next((col for col in column_names if 'target' in col.lower()), None)
        if not source_column or not target_column:
            raise ValueError(f"Required columns not found. Looking for 'source' and 'target'. Found: {column_names}")

        for row in reader:
            for column in (source_column, target_column):
                text = row[column]
                # Rows shorter than the header give None for missing cells;
                # skip those and empty cells instead of crashing the regex.
                if not text:
                    continue
                if is_chinese(text):
                    unique_rows.add(' '.join(jieba.cut(text)))

    with open(output_file, 'w', encoding='utf-8') as txtfile:
        txtfile.writelines(f"{unique_row}\n" for unique_row in sorted(unique_rows))

    print(f"Extraction complete. Results saved to {output_file}")
# Specify input and output file paths
input_csv = 'knowledge_graph_test_rows.csv'
output_txt = 'dictionary.txt'

# Run the extraction; any failure is reported to the user together with a
# hint about the expected CSV layout instead of a raw traceback.
try:
    extract_and_save(input_csv, output_txt)
except Exception as e:
    print(f"An error occurred: {e}")
    print("Please check your CSV file structure and ensure it contains 'source' and 'target' columns.")