# add_weights.py
  1. import jieba
  2. import jieba.posseg as pseg
  3. from sklearn.feature_extraction.text import TfidfVectorizer
  4. import numpy as np
  5. def get_term_type(term):
  6. # 使用jieba进行词性标注
  7. words = pseg.cut(term)
  8. pos_list = [w.flag for w in words]
  9. # 如果词组只有一个词,直接返回其词性
  10. if len(pos_list) == 1:
  11. return pos_list[0]
  12. # 如果是多词组合,返回最后一个词的词性(通常是核心词)
  13. return pos_list[-1]
  14. def process_file(input_file, output_file):
  15. with open(input_file, 'r', encoding='utf-8') as f:
  16. terms = [line.strip() for line in f.readlines()]
  17. # 创建每个词条的"文档"
  18. documents = [term.replace(' ', '') for term in terms]
  19. # 计算TF-IDF
  20. vectorizer = TfidfVectorizer()
  21. tfidf_matrix = vectorizer.fit_transform(documents)
  22. feature_names = vectorizer.get_feature_names_out()
  23. # 获取权重
  24. weights = {}
  25. for term_idx, term in enumerate(terms):
  26. term_without_spaces = term.replace(' ', '')
  27. if term_without_spaces in feature_names:
  28. weight = tfidf_matrix[term_idx, feature_names == term_without_spaces].toarray()[0][0]
  29. weights[term] = weight
  30. # 将权重归一化到0-100范围
  31. max_weight = max(weights.values())
  32. min_weight = min(weights.values())
  33. weight_range = max_weight - min_weight
  34. normalized_weights = {term: int((weight - min_weight) / weight_range * 100) if weight_range > 0 else 50
  35. for term, weight in weights.items()}
  36. # 按权重排序词条
  37. sorted_terms = sorted(normalized_weights.items(), key=lambda x: x[1], reverse=True)
  38. # 写入输出文件
  39. cnt = 0
  40. with open(output_file, 'w', encoding='utf-8') as f:
  41. if cnt != 10:
  42. for term, weight in sorted_terms:
  43. term_type = get_term_type(term.replace(' ', ''))
  44. f.write(f"{term} {weight} {term_type}\n")
  45. cnt += 1
  46. print(f"处理了 {len(sorted_terms)} 个词条。")
  47. # 处理文件
  48. input_file = 'dictionary_reviewed_rows.txt' # 替换为您的输入文件名
  49. output_file = 'dictioanry_reviewed_rows_weighted.txt'
  50. process_file(input_file, output_file)