import jieba.posseg as pseg
from sklearn.feature_extraction.text import TfidfVectorizer
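
# Both dependencies are installable from PyPI: pip install jieba scikit-learn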

def get_term_type(term):
    # Run jieba's part-of-speech tagger over the term
    pos_list = [w.flag for w in pseg.cut(term)]

    # A single-word term: return its POS tag directly
    if len(pos_list) == 1:
        return pos_list[0]

    # A multi-word term: return the tag of the last word,
    # which is usually the head word of the phrase
    return pos_list[-1]
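
# Illustrative call (hypothetical input; the exact flag depends on jieba's
# dictionary): get_term_type('机器学习') returns the tag of the final
# segment, e.g. a noun flag such as 'n'.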

def process_file(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        terms = [line.strip() for line in f if line.strip()]

    # Treat each term, with internal spaces removed, as its own "document"
    documents = [term.replace(' ', '') for term in terms]

    # Compute TF-IDF. lowercase=False keeps feature names identical to the
    # terms; the token pattern also admits single-character words; norm=None
    # keeps the raw tf-idf score (under the default L2 normalization, every
    # single-token document would score exactly 1.0, so all weights would
    # collapse to the same value)
    vectorizer = TfidfVectorizer(lowercase=False,
                                 token_pattern=r'(?u)\b\w+\b',
                                 norm=None)
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()
    feature_index = {name: i for i, name in enumerate(feature_names)}

    # Look up each term's tf-idf weight in its own document row
    weights = {}
    for term_idx, term in enumerate(terms):
        term_without_spaces = term.replace(' ', '')
        if term_without_spaces in feature_index:
            weights[term] = tfidf_matrix[term_idx, feature_index[term_without_spaces]]

    # Min-max normalize the weights to the 0-100 range
    max_weight = max(weights.values())
    min_weight = min(weights.values())
    weight_range = max_weight - min_weight
    normalized_weights = {term: int((weight - min_weight) / weight_range * 100) if weight_range > 0 else 50
                          for term, weight in weights.items()}
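
    # For illustration: raw weights of 0.2, 0.5 and 0.8 would map to
    # 0, 50 and 100 respectively; if every weight is identical, all
    # terms fall back to the midpoint value 50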

    # Sort terms by weight, highest first
    sorted_terms = sorted(normalized_weights.items(), key=lambda x: x[1], reverse=True)

    # Write one "term weight POS" line per term
    with open(output_file, 'w', encoding='utf-8') as f:
        for term, weight in sorted_terms:
            term_type = get_term_type(term.replace(' ', ''))
            f.write(f"{term} {weight} {term_type}\n")

    print(f"Processed {len(sorted_terms)} terms.")

# Process the file
input_file = 'dictionary_reviewed_rows.txt'  # replace with your input file name
output_file = 'dictionary_reviewed_rows_weighted.txt'
process_file(input_file, output_file)
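
# Expected file formats, inferred from the code above:
#   input:  one dictionary term per line, e.g. '机器 学习'
#   output: 'term weight POS' per line, e.g. '机器 学习 73 v'
#           (the weight 73 and flag 'v' are illustrative, not actual output)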