-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmapper_tfidf.py
More file actions
93 lines (74 loc) · 2.63 KB
/
mapper_tfidf.py
File metadata and controls
93 lines (74 loc) · 2.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python3
"""
TF-IDF Mapper Module
Improved version with better error handling and configuration
"""
import sys
import logging
from sklearn.feature_extraction.text import TfidfVectorizer
from utils import handle_broken_pipe, setup_logging
from config import Config
# Configure project-wide logging handlers/format once at import time.
setup_logging()
# Module-level logger named after this module, per logging convention.
logger = logging.getLogger(__name__)
def mapper_tfidf(tokens, doc_id):
    """
    Map function for TF-IDF calculation.

    Joins *tokens* into one document string, fits a TfidfVectorizer on it,
    and prints one tab-separated record per extracted feature to stdout:
    ``<word>\\t<doc_id>\\t<tfidf_score>``.

    Args:
        tokens: List of token strings to process.
        doc_id: Document identifier echoed into every output record.
    """
    try:
        if not tokens:
            logger.warning("No tokens provided for document %s", doc_id)
            return
        # Configure TF-IDF vectorizer from project-level settings.
        vectorizer = TfidfVectorizer(
            stop_words='english',
            max_features=Config.TFIDF_MAX_FEATURES,
            min_df=Config.TFIDF_MIN_DF,
            max_df=Config.TFIDF_MAX_DF
        )
        # TfidfVectorizer expects raw document strings, not token lists.
        document_text = ' '.join(tokens)
        # NOTE(review): fitting on a single document makes every IDF
        # identical, so scores reduce to normalized term frequencies —
        # confirm this is what the downstream reducer expects.
        tfidf_matrix = vectorizer.fit_transform([document_text])
        feature_names = vectorizer.get_feature_names_out()
        # Output <word, doc_id, tfidf_score> tuples, one per line.
        for i, word in enumerate(feature_names):
            tfidf_score = tfidf_matrix[0, i]
            print(f'{word}\t{doc_id}\t{tfidf_score}')
        # Lazy %-style args: no string formatting cost when INFO is disabled.
        logger.info("Processed %d features for document %s",
                    len(feature_names), doc_id)
    except Exception:
        # logger.exception records the full traceback (logger.error with an
        # f-string dropped it), so mapper failures are diagnosable from job
        # logs; output for this document is skipped (best-effort, as before).
        logger.exception("Error in TF-IDF mapper for document %s", doc_id)
def main():
    """Entry point: parse CLI args and stdin, then run the TF-IDF mapper.

    Expects exactly one command-line argument (the document id) and a
    comma-separated token list on stdin; exits non-zero on bad usage or
    unexpected errors.
    """
    try:
        # Guard clause: exactly one positional argument (the document id).
        if len(sys.argv) != 2:
            logger.error("Usage: python mapper_tfidf.py <doc_id>")
            sys.exit(1)
        doc_id = sys.argv[1]

        # Consume all of stdin up front; nothing to do on empty input.
        input_text = sys.stdin.read().strip()
        if not input_text:
            logger.warning("No input provided to TF-IDF mapper")
            return

        # Tokens arrive comma-separated; keep only non-blank pieces.
        tokens = [piece for raw in input_text.split(',') if (piece := raw.strip())]
        if not tokens:
            logger.warning("No valid tokens found for TF-IDF processing")
            return

        mapper_tfidf(tokens, doc_id)
    except KeyboardInterrupt:
        logger.info("TF-IDF mapper interrupted by user")
        sys.exit(0)
    except Exception as e:
        logger.error(f"Error in TF-IDF mapper: {e}")
        sys.exit(1)
if __name__ == "__main__":
try:
main()
except BrokenPipeError:
handle_broken_pipe()