|
@@ -0,0 +1,48 @@
|
|
|
+import csv
|
|
|
+import re
|
|
|
+import jieba
|
|
|
+from pathlib import Path
|
|
|
+
|
|
|
def is_chinese(text):
    """Return True if *text* contains at least one Chinese character.

    Only the range U+4E00..U+9FFF is checked, so characters outside that
    range are not detected.

    Args:
        text: The string to inspect. Non-string values (for example the
            ``None`` that ``csv.DictReader`` yields for a missing cell)
            are accepted and reported as not Chinese.

    Returns:
        bool: True when a character in U+4E00..U+9FFF is present.
    """
    # re.search raises TypeError on None / non-str input; treat it as "no match".
    if not isinstance(text, str):
        return False
    return bool(re.search('[\u4e00-\u9fff]', text))
|
|
|
+
|
|
|
def extract_and_save(input_file, output_file):
    """Tokenize Chinese cells of a CSV's source/target columns into a text file.

    The header is searched for the first column whose name contains
    ``source`` and the first whose name contains ``target`` (both
    case-insensitive). Every cell in those two columns that ``is_chinese``
    accepts is segmented with ``jieba.cut``; the tokens are joined with single
    spaces, de-duplicated, sorted, and written one entry per line.

    Args:
        input_file: Path of the UTF-8 encoded CSV file to read.
        output_file: Path of the text file to write; it is overwritten.

    Raises:
        ValueError: If no column name contains ``source`` or none contains
            ``target`` (this includes a file with no header row at all).
    """
    unique_rows = set()

    # newline='' is what the csv module expects from the file object, and
    # 'utf-8-sig' drops an optional byte-order mark so it does not end up
    # glued to the first column name.
    with open(input_file, 'r', encoding='utf-8-sig', newline='') as csvfile:
        reader = csv.DictReader(csvfile)

        # Take the header from the reader itself so the names are exactly the
        # keys of every row, even when header fields are quoted or padded.
        column_names = list(reader.fieldnames or [])
        print(f"Columns found in CSV: {column_names}")

        source_column = next((col for col in column_names if 'source' in col.lower()), None)
        target_column = next((col for col in column_names if 'target' in col.lower()), None)

        if source_column is None or target_column is None:
            raise ValueError(f"Required columns not found. Looking for 'source' and 'target'. Found: {column_names}")

        for row in reader:
            for column in (source_column, target_column):
                # Rows shorter than the header carry None for the missing cells.
                text = row.get(column)
                if not text or not is_chinese(text):
                    continue
                tokens = list(jieba.cut(text))
                if tokens:  # Only record cells that produced at least one token
                    unique_rows.add(' '.join(tokens))

    with open(output_file, 'w', encoding='utf-8') as txtfile:
        txtfile.writelines(f"{unique_row}\n" for unique_row in sorted(unique_rows))

    print(f"Extraction complete. Results saved to {output_file}")
|
|
|
+
|
|
|
# Locations of the CSV to read and the word-list file to produce.
input_csv = 'knowledge_graph_test_rows.csv'
output_txt = 'dictionary.txt'

# Run the extraction right away; any failure is reported on stdout instead of
# propagating.
# NOTE(review): the column hint is printed for every exception type, including
# ones unrelated to the CSV header (e.g. a missing input file).
try:
    extract_and_save(input_csv, output_txt)
except Exception as exc:
    print(
        f"An error occurred: {exc}",
        "Please check your CSV file structure and ensure it contains 'source' and 'target' columns.",
        sep='\n',
    )
|