import jieba
import jieba.posseg as pseg
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


def get_term_type(term):
    # Part-of-speech tagging with jieba
    words = pseg.cut(term)
    pos_list = [w.flag for w in words]
    # Single-word term: return its POS tag directly
    if len(pos_list) == 1:
        return pos_list[0]
    # Multi-word term: return the POS tag of the last word (usually the head word)
    return pos_list[-1]


def process_file(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        terms = [line.strip() for line in f if line.strip()]

    # Treat each term (spaces removed) as its own "document"
    documents = [term.replace(' ', '') for term in terms]

    # Compute TF-IDF over the term list
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()

    # Look up each term's TF-IDF weight in its own document row
    weights = {}
    for term_idx, term in enumerate(terms):
        term_without_spaces = term.replace(' ', '')
        col = np.where(feature_names == term_without_spaces)[0]
        if col.size > 0:
            weights[term] = tfidf_matrix[term_idx, col[0]]

    # Normalize weights to the 0-100 range
    max_weight = max(weights.values())
    min_weight = min(weights.values())
    weight_range = max_weight - min_weight
    normalized_weights = {
        term: int((weight - min_weight) / weight_range * 100) if weight_range > 0 else 50
        for term, weight in weights.items()
    }

    # Sort terms by weight, highest first
    sorted_terms = sorted(normalized_weights.items(), key=lambda x: x[1], reverse=True)

    # Write one "term weight pos_tag" line per term
    with open(output_file, 'w', encoding='utf-8') as f:
        for term, weight in sorted_terms:
            term_type = get_term_type(term.replace(' ', ''))
            f.write(f"{term} {weight} {term_type}\n")

    print(f"Processed {len(sorted_terms)} terms.")


# Process the file
input_file = 'dictionary_reviewed_rows.txt'    # replace with your input file name
output_file = 'dictionary_reviewed_rows_weighted.txt'
process_file(input_file, output_file)
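
# Illustrative usage sketch. This assumes the input file holds one term per
# line, with spaces allowed inside a term, as the reading code above implies.
# The weight and POS tag shown are placeholders, not real output: actual
# weights depend on the whole term list, and the tag comes from jieba.
#
#   input line (dictionary_reviewed_rows.txt):
#       机器 学习
#
#   output line (dictionary_reviewed_rows_weighted.txt):
#       机器 学习 <weight 0-100> <POS tag of the last word>
#
# Note that because each "document" here is a single term, the TF-IDF score
# is driven mainly by how often the same term recurs across the list.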