Commit 62c4440
Add human-readable test failure summarization to CI/CD
Creates tools/summarize_test_failures.py to parse and summarize:

- Judge evaluation logs (JSONL format)
- API server logs (error patterns)

Outputs a human-readable summary showing:

- Overall pass/fail counts
- Failed tests grouped by agent
- Criterion scores with pass/fail indicators
- Judge reasoning for failures
- System responses (truncated)
- API error patterns and counts

The workflow step runs on test failure to help debug issues quickly.
1 parent 3db6a1e commit 62c4440
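
For reference, a minimal sketch of the evaluation record the summarizer expects on each JSONL line. The field names are the ones the parsing code below reads; the sample values and criterion names are hypothetical:

    import json

    # Hypothetical judge evaluation record; one JSON object per JSONL line.
    sample = {
        "test_case_id": "hr-001",                       # hypothetical test case ID
        "input": "How many vacation days do I have?",   # prompt sent to the system
        "response": "You have 12 vacation days left.",  # system response (truncated in the summary)
        "reasoning": "Accurate and well grounded.",     # judge's explanation
        "scores": {
            "overall": 0.85,                            # pass threshold is 0.6
            "criteria": {"accuracy": 0.9, "tone": 0.8}, # per-criterion scores
        },
        "context": {"agent": "hr"},                     # used to group failures by agent
    }
    print(json.dumps(sample))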

File tree

2 files changed: +211 -0 lines changed

.github/workflows/ai-config-validation.yml

Lines changed: 6 additions & 0 deletions

@@ -326,6 +326,12 @@ jobs:
 
           exit $TEST_EXIT_CODE
 
+      - name: Summarize test failures
+        if: failure()
+        run: |
+          echo "📊 Generating human-readable failure summary..."
+          .venv/bin/python tools/summarize_test_failures.py || true
+
       - name: Cleanup secrets
         if: always()
         run: |
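
Two details of this step are easy to miss: if: failure() makes the summarizer run only after the test step has already failed, and the trailing || true discards the script's own exit status, which is non-zero whenever it finds failed evaluations (see main() in the new file below), so the summary step never stacks a second failure on top of the original one.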

tools/summarize_test_failures.py

Lines changed: 205 additions & 0 deletions

@@ -0,0 +1,205 @@
#!/usr/bin/env python3
"""
Summarize test failures from judge evaluation logs and API server logs.
Used in GitHub Actions to provide human-readable failure summaries.
"""

import json
import re
import sys
from pathlib import Path
from typing import Any, Dict, List


def parse_judge_logs(logs_dir: Path) -> List[Dict[str, Any]]:
    """Parse all judge evaluation JSONL files in logs directory."""
    evaluations = []

    if not logs_dir.exists():
        print(f"⚠️ Judge logs directory not found: {logs_dir}")
        return []

    # Find all JSONL files
    jsonl_files = list(logs_dir.glob("*.jsonl"))

    if not jsonl_files:
        print(f"⚠️ No JSONL files found in {logs_dir}")
        return []

    for jsonl_file in jsonl_files:
        print(f"📄 Reading {jsonl_file.name}")
        with open(jsonl_file, 'r') as f:
            for line in f:
                if line.strip():
                    try:
                        evaluations.append(json.loads(line))
                    except json.JSONDecodeError as e:
                        print(f"⚠️ Failed to parse line in {jsonl_file.name}: {e}")

    return evaluations


def parse_api_logs(log_file: Path) -> List[str]:
    """Extract relevant error messages from API server logs."""
    errors = []

    if not log_file.exists():
        print(f"⚠️ API log file not found: {log_file}")
        return []

    print(f"📄 Reading {log_file.name}")

    # Patterns to look for
    error_patterns = [
        r"ERROR:.*",
        r"Exception:.*",
        r"Traceback.*",
        r"Failed.*",
        r"Connection error.*",
        r"PII PRE-SCREENING ERROR:.*",
        r"SEARCH ERROR:.*",
    ]

    combined_pattern = re.compile('|'.join(error_patterns), re.IGNORECASE)

    with open(log_file, 'r') as f:
        for line in f:
            if combined_pattern.search(line):
                errors.append(line.strip())

    return errors


def summarize_failures(evaluations: List[Dict[str, Any]]) -> None:
    """Print human-readable summary of test failures."""

    print("\n" + "="*80)
    print("🔍 TEST FAILURE SUMMARY")
    print("="*80 + "\n")

    # Separate passed and failed tests
    passed = [e for e in evaluations if e.get('scores', {}).get('overall', 0) >= 0.6]
    failed = [e for e in evaluations if e.get('scores', {}).get('overall', 0) < 0.6]

    print("📊 Overall Results:")
    print(f"   ✅ Passed: {len(passed)}/{len(evaluations)}")
    print(f"   ❌ Failed: {len(failed)}/{len(evaluations)}")
    print()

    if not failed:
        print("🎉 All tests passed!")
        return

    print("❌ Failed Tests:\n")

    # Group failures by agent
    by_agent = {}
    for eval_result in failed:
        agent = eval_result.get('context', {}).get('agent', 'unknown')
        if agent not in by_agent:
            by_agent[agent] = []
        by_agent[agent].append(eval_result)

    # Print failures grouped by agent
    for agent, results in sorted(by_agent.items()):
        print(f"\n{'─'*80}")
        print(f"Agent: {agent.upper()}")
        print(f"{'─'*80}\n")

        for result in results:
            test_id = result.get('test_case_id', 'unknown')
            test_input = result.get('input', 'N/A')
            overall_score = result.get('scores', {}).get('overall', 0)

            print(f"Test ID: {test_id}")
            print(f"Score: {overall_score:.2f}")
            print(f"Input: {test_input[:100]}..." if len(test_input) > 100 else f"Input: {test_input}")
            print()

            # Show criterion scores
            criteria_scores = result.get('scores', {}).get('criteria', {})
            if criteria_scores:
                print("Criterion Scores:")
                for criterion, score in criteria_scores.items():
                    emoji = "✅" if score >= 0.6 else "❌"
                    print(f"  {emoji} {criterion}: {score:.2f}")
                print()

            # Show judge reasoning
            reasoning = result.get('reasoning', 'No reasoning provided')
            print("Judge Reasoning:")
            print(f"  {reasoning[:300]}..." if len(reasoning) > 300 else f"  {reasoning}")
            print()

            # Show actual response (truncated)
            response = result.get('response', 'N/A')
            print("System Response:")
            print(f"  {response[:200]}..." if len(response) > 200 else f"  {response}")
            print()

    print("="*80)


def summarize_api_errors(errors: List[str]) -> None:
    """Print summary of API server errors."""

    if not errors:
        print("\n✅ No API errors detected in server logs\n")
        return

    print("\n" + "="*80)
    print("🔧 API SERVER ERRORS")
    print("="*80 + "\n")

    # Group similar errors
    error_counts = {}
    for error in errors:
        # Extract error type
        error_type = error.split(':')[0] if ':' in error else 'Unknown'
        error_counts[error_type] = error_counts.get(error_type, 0) + 1

    print("Error Types:")
    for error_type, count in sorted(error_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"  • {error_type}: {count} occurrences")
    print()

    print("Recent Errors (last 10):")
    for error in errors[-10:]:
        print(f"  {error}")

    print("\n" + "="*80)


def main():
    """Main entry point for log summarization."""

    # Paths
    judge_logs_dir = Path("logs/judge_evaluations")
    api_log_file = Path("/tmp/agents-demo-api.log")

    print("\n🔍 Analyzing test failures...\n")

    # Parse logs
    evaluations = parse_judge_logs(judge_logs_dir)
    api_errors = parse_api_logs(api_log_file)

    # Print summaries
    if evaluations:
        summarize_failures(evaluations)
    else:
        print("⚠️ No judge evaluation logs found")

    if api_errors:
        summarize_api_errors(api_errors)

    print("\n✨ Summary complete\n")

    # Exit with error code if there were failures
    if evaluations:
        failed_count = sum(1 for e in evaluations if e.get('scores', {}).get('overall', 0) < 0.6)
        if failed_count > 0:
            sys.exit(1)


if __name__ == "__main__":
    main()
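
To reproduce the CI summary locally, run the script from the repository root, since the log locations are hardcoded in main() (logs/judge_evaluations/ for judge output, /tmp/agents-demo-api.log for the API server):

    .venv/bin/python tools/summarize_test_failures.py

Because it exits 1 whenever any evaluation falls below the 0.6 pass threshold, the script can also double as a standalone gate outside CI.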
