# intermediate.py
import os

import jieba
from pypinyin import lazy_pinyin
from sklearn.feature_extraction.text import TfidfVectorizer


def load_stop_words(stop_words_file):
    """Load stop words: one word per line; blank lines and '#' comment lines are skipped."""
    with open(stop_words_file, 'r', encoding='utf-8') as f:
        return set(line.strip() for line in f if line.strip() and not line.startswith('#'))


def read_and_preprocess_documents(input_dir, keyword_pinyin, stop_words):
    """Read the raw documents for one keyword, tokenize them with jieba, and drop stop words."""
    input_file_path = os.path.join(input_dir, f'word_result_{keyword_pinyin}.txt')
    documents = []
    if os.path.exists(input_file_path):
        with open(input_file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Tokenize with jieba and remove stop words.
                clean_line = ' '.join(word for word in jieba.cut(line.strip()) if word not in stop_words)
                if clean_line:
                    documents.append(clean_line)
    else:
        print(f"File {input_file_path} does not exist.")
    return documents


def calculate_tfidf(documents):
    """Compute a binary TF-IDF matrix, keeping terms that occur in 1%-95% of the documents."""
    tfidf_vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.95, binary=True)
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    return tfidf_matrix, tfidf_vectorizer.get_feature_names_out()


def write_tfidf_to_file(output_dir, keyword_pinyin, tfidf_matrix, feature_names):
    """Write per-document word/score pairs to a text file, one pair per line."""
    output_file_path = os.path.join(output_dir, f'tfidf_{keyword_pinyin}.txt')
    with open(output_file_path, 'w', encoding='utf-8') as file:
        for doc_index in range(tfidf_matrix.shape[0]):
            for word_idx in tfidf_matrix[doc_index].indices:
                word = feature_names[word_idx]
                score = tfidf_matrix[doc_index, word_idx]
                if score < 1.0:  # Only write scores below 1.0.
                    file.write(f"{word}: {score}\n")


if __name__ == "__main__":
    input_dir = 'word_output'
    output_dir = 'tfidf_output'
    stop_words_file = 'data/stop_words.txt'
    keywords = ['郭德纲', '女']

    # Load the stop words once and pass them explicitly (the original code read
    # them from a global, which breaks if the functions are imported elsewhere).
    stop_words = load_stop_words(stop_words_file)

    # Make sure the output directory exists.
    os.makedirs(output_dir, exist_ok=True)

    # Process each keyword; filenames use the keyword's pinyin,
    # e.g. '郭德纲' -> 'guodegang'.
    for keyword in keywords:
        keyword_pinyin = ''.join(lazy_pinyin(keyword))
        documents = read_and_preprocess_documents(input_dir, keyword_pinyin, stop_words)
        if documents:
            tfidf_matrix, feature_names = calculate_tfidf(documents)
            write_tfidf_to_file(output_dir, keyword_pinyin, tfidf_matrix, feature_names)
            print(f"Finished TF-IDF for keyword '{keyword}' and saved the result to file.")