-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathreducer_tfidf.py
More file actions
107 lines (86 loc) · 3.02 KB
/
reducer_tfidf.py
File metadata and controls
107 lines (86 loc) · 3.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env python3
"""
TF-IDF Reducer Module
Improved version with better error handling and output formatting
"""
import sys
import logging
from typing import List, Tuple
from utils import safe_parse_line, handle_broken_pipe, setup_logging, format_output
# Configure logging through the project's utils.setup_logging() helper, then
# create this module's logger. The helper's exact configuration lives in
# utils and is not visible here; it is called first, presumably so handlers
# are in place before any message is emitted — confirm against utils.
setup_logging()
logger = logging.getLogger(__name__)
def reduce_tfidf_scores() -> List[Tuple[str, float]]:
    """
    Average the TF-IDF values of each word read from stdin.

    Every input line is handed to safe_parse_line(line, expected_parts=3) and
    is expected to unpack into (word, doc_id, tfidf). Lines for which the
    parser returns a falsy value, or whose TF-IDF field is not a valid float,
    are skipped (the latter with a warning).

    Only consecutive lines carrying the same word are merged into one average,
    so the input is presumably already sorted/grouped by word, as a streaming
    shuffle phase would deliver it -- confirm against the mapper/driver.
    A falsy word (e.g. an empty string) is never emitted.

    Returns:
        List of (word, avg_tfidf) tuples sorted by average score, highest
        first. An empty list is returned when there is no usable input or
        when any unexpected exception occurs (the error is logged).
    """
    averages = []
    active_word = None
    running_total = 0.0
    occurrences = 0

    try:
        for raw_line in sys.stdin:
            fields = safe_parse_line(raw_line, expected_parts=3)
            if not fields:
                continue

            word, _doc_id, tfidf_str = fields
            try:
                score = float(tfidf_str)
            except ValueError:
                logger.warning(f"Invalid TF-IDF value: {tfidf_str}")
                continue

            # Still inside the current word's run: keep accumulating.
            if active_word == word:
                running_total += score
                occurrences += 1
                continue

            # The word changed: emit the finished run, then start a new one.
            if active_word:
                averages.append((active_word, running_total / occurrences))
            active_word, running_total, occurrences = word, score, 1

        # The final run has no following line to trigger its emission.
        if active_word:
            averages.append((active_word, running_total / occurrences))

        # Stable sort: words with equal averages keep their input order.
        return sorted(averages, key=lambda pair: pair[1], reverse=True)
    except Exception as e:
        logger.error(f"Error in TF-IDF reducer: {e}")
        return []
def print_header() -> None:
    """Print the two-line table header: column titles, then a dashed rule."""
    # Titles are left-aligned in 20- and 15-character columns.
    titles = "{:<20} {:<15}".format("Word", "Avg_TF-IDF")
    rule = "-" * 35
    print(titles)
    print(rule)
def main() -> None:
    """
    Run the TF-IDF reducer and print the ranked results to stdout.

    Calls reduce_tfidf_scores() to aggregate stdin, then prints a header
    followed by one format_output(word, avg_tfidf, width=20) line per word.
    When there are no results, a warning is logged and the function returns
    without printing anything.

    Raises:
        BrokenPipeError: propagated unchanged so the script entry point's
            dedicated handler can deal with a closed stdout pipe.
        SystemExit: status 0 on KeyboardInterrupt, status 1 on any other
            unexpected exception (after logging it).
    """
    try:
        word_tfidfs = reduce_tfidf_scores()
        if not word_tfidfs:
            logger.warning("No TF-IDF scores to process")
            return

        print_header()
        for word, avg_tfidf in word_tfidfs:
            print(format_output(word, avg_tfidf, width=20))

        logger.info(
            "Processed %d unique words with TF-IDF scores", len(word_tfidfs)
        )
    except KeyboardInterrupt:
        logger.info("TF-IDF reducer interrupted by user")
        sys.exit(0)
    except BrokenPipeError:
        # BrokenPipeError is an OSError and therefore an Exception. Without
        # this clause the generic handler below would swallow it, log a
        # spurious error and exit 1, and the BrokenPipeError handler around
        # the main() call at the script entry point could never run.
        raise
    except Exception as e:
        logger.error("Error in TF-IDF reducer: %s", e)
        sys.exit(1)
if __name__ == "__main__":
    # Script entry point: run the reducer only when executed directly,
    # not when this module is imported.
    try:
        main()
    except BrokenPipeError:
        # stdout's reader went away (e.g. output piped into `head`);
        # delegate to the project helper imported from utils.
        handle_broken_pipe()