generate_dictionary.py 1.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
  1. import csv
  2. import re
  3. import jieba
  4. from pathlib import Path
  5. def is_chinese(text):
  6. return bool(re.search('[\u4e00-\u9fff]', text))
  7. def extract_and_save(input_file, output_file):
  8. unique_rows = set()
  9. with open(input_file, 'r', encoding='utf-8') as csvfile:
  10. first_line = csvfile.readline().strip()
  11. column_names = first_line.split(',')
  12. print(f"Columns found in CSV: {column_names}")
  13. csvfile.seek(0)
  14. reader = csv.DictReader(csvfile)
  15. source_column = next((col for col in column_names if 'source' in col.lower()), None)
  16. target_column = next((col for col in column_names if 'target' in col.lower()), None)
  17. if not source_column or not target_column:
  18. raise ValueError(f"Required columns not found. Looking for 'source' and 'target'. Found: {column_names}")
  19. for row in reader:
  20. for column in [source_column, target_column]:
  21. text = row[column]
  22. if is_chinese(text):
  23. tokens = list(jieba.cut(text))
  24. if tokens: # Only process non-empty lines
  25. unique_rows.add(' '.join(tokens))
  26. with open(output_file, 'w', encoding='utf-8') as txtfile:
  27. for unique_row in sorted(unique_rows):
  28. txtfile.write(unique_row + '\n')
  29. print(f"Extraction complete. Results saved to {output_file}")
  30. # Specify input and output file paths
  31. input_csv = 'knowledge_graph_test_rows.csv'
  32. output_txt = 'dictionary.txt'
  33. try:
  34. extract_and_save(input_csv, output_txt)
  35. except Exception as e:
  36. print(f"An error occurred: {e}")
  37. print("Please check your CSV file structure and ensure it contains 'source' and 'target' columns.")