Skip to content

Commit 4bebb64

Browse files
committed
Add metrics tracking to dashboard in evaluator.py
- Enhanced the dashboard functionality by adding support for displaying metrics during task evaluation.
- Updated the evaluate function to include metrics if available, improving real-time monitoring capabilities.
1 parent dd69655 commit 4bebb64

2 files changed

Lines changed: 108 additions & 0 deletions

File tree

lmms_eval/evaluator.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -647,6 +647,9 @@ def evaluate(
647647
processed_docs += 1
648648
if dashboard:
649649
dashboard.update_task_progress(task_output.task_name, processed_docs)
650+
# Add metrics to dashboard if available
651+
if metrics:
652+
dashboard.add_task_metrics(task_output.task_name, metrics)
650653

651654
pbar.update(1)
652655

test_ui_fixes.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
#!/usr/bin/env python3
2+
"""Test script to verify UI fixes."""
3+
4+
import os
5+
import time
6+
7+
from lmms_eval.ui import create_dashboard
8+
9+
10+
def test_dashboard_without_blocking():
    """Check that showing final results on a UI-enabled dashboard returns promptly.

    Sets ``ENABLE_UI`` to ``"true"`` for the duration of the check, drives a
    single task ("task1") through ten progress updates (attaching metrics on
    every third step), ends the task, and then times the
    ``show_final_results`` call.

    Returns:
        bool: True if ``show_final_results`` returned in under five seconds,
        False otherwise.
    """
    print("Testing dashboard without blocking...")

    max_seconds = 5  # anything slower is treated as "still blocking"

    # Remember the caller's setting so the override below does not leak into
    # whatever runs after this check.
    previous_ui_flag = os.environ.get("ENABLE_UI")
    os.environ["ENABLE_UI"] = "true"
    try:
        dashboard = create_dashboard()

        # Test initialization
        dashboard.start_evaluation("test_model", "test_args", ["task1"])

        # Test task operations
        dashboard.start_task("task1", 10)

        # Simulate progress updates with metrics
        for i in range(1, 11):
            dashboard.update_task_progress("task1", i)
            # Add metrics periodically
            if i % 3 == 0:
                metrics = {"accuracy": 0.8 + i * 0.01, "score": 80 + i}
                dashboard.add_task_metrics("task1", metrics)
            time.sleep(0.1)

        # End task
        dashboard.end_task("task1")

        # Test final results - should not block
        results = {"results": {"task1": {"accuracy": 0.85}}, "task1": {"accuracy": 0.85}}

        print("Showing final results...")
        # perf_counter is monotonic, so a system clock adjustment during the
        # call cannot distort (or make negative) the measured duration.
        start_time = time.perf_counter()
        dashboard.show_final_results(results)
        elapsed = time.perf_counter() - start_time
    finally:
        if previous_ui_flag is None:
            os.environ.pop("ENABLE_UI", None)
        else:
            os.environ["ENABLE_UI"] = previous_ui_flag

    print(f"Final results display took {elapsed:.2f} seconds")

    # Should not block indefinitely
    if elapsed < max_seconds:
        print("✓ Dashboard doesn't block on final results")
        return True

    print("✗ Dashboard still blocks on final results")
    return False
54+
55+
56+
def test_minimal_dashboard():
    """Smoke-test the dashboard created with the UI disabled.

    Sets ``ENABLE_UI`` to ``"false"`` for the duration of the check, creates a
    dashboard, and calls each lifecycle method once. Any exception raised by
    those calls propagates to the caller.

    Returns:
        bool: Always True when every call completes without raising.
    """
    print("\nTesting minimal dashboard...")

    # Remember the caller's setting so the override below does not leak into
    # whatever runs after this check.
    previous_ui_flag = os.environ.get("ENABLE_UI")
    os.environ["ENABLE_UI"] = "false"
    try:
        dashboard = create_dashboard()

        # Test all methods work without errors
        dashboard.start_evaluation("test_model", "test_args", ["task1"])
        dashboard.start_task("task1", 10)
        dashboard.update_task_progress("task1", 5)
        dashboard.add_task_metrics("task1", {"accuracy": 0.85})
        dashboard.end_task("task1")
        dashboard.show_final_results({"results": {"task1": {"accuracy": 0.85}}})
    finally:
        if previous_ui_flag is None:
            os.environ.pop("ENABLE_UI", None)
        else:
            os.environ["ENABLE_UI"] = previous_ui_flag

    print("✓ Minimal dashboard works correctly")
    return True
74+
75+
76+
def main():
    """Run every UI check and translate the outcome into an exit status.

    Returns:
        int: 0 when all checks report success; 1 when any check reports
        failure or raises an exception.
    """
    print("=== Testing UI Fixes ===\n")

    # Rich dashboard check first, then the minimal dashboard check.
    checks = (test_dashboard_without_blocking, test_minimal_dashboard)

    try:
        # Build the full list before judging it so a failing first check
        # does not prevent the second one from running.
        outcomes = [check() for check in checks]

        if not all(outcomes):
            print("\n❌ Some UI fixes failed!")
            return 1

        print("\n✅ All UI fixes working correctly!")
        return 0

    except Exception as exc:
        print(f"\n❌ Test failed with error: {exc}")
        import traceback

        traceback.print_exc()
        return 1
102+
103+
104+
if __name__ == "__main__":
    # Raise SystemExit directly instead of calling the `exit` helper: that
    # helper is injected by the `site` module for interactive sessions and is
    # not available when the interpreter is started without it (python -S).
    raise SystemExit(main())

0 commit comments

Comments
 (0)