Skip to content

Commit 3c0b017

Browse files
committed
WIP: Improve error report
1 parent 6676167 commit 3c0b017

2 files changed

Lines changed: 209 additions & 72 deletions

File tree

dev_scripts/large_tests/report.py

Lines changed: 205 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -2,27 +2,51 @@
22

33
import re
44
import sys
5+
import xml.etree.ElementTree as ET
56
from collections import Counter
7+
from pathlib import Path
68
from typing import Dict, List, Tuple
79

8-
import xml.etree.ElementTree as ET
10+
# Marker strings looked up by extract_container_output(): when both are
# present in a test's captured output, only the text between them is kept.
DOC_TO_PIXELS_LOG_START = "----- DOC TO PIXELS LOG START -----"
11+
DOC_TO_PIXELS_LOG_END = "----- DOC TO PIXELS LOG END -----"
12+
13+
# Patterns tested by is_expected_line() with re.match (anchored at the start
# of the line). generate_report() scrubs each line with
# scrub_container_line() first, which is why numbers appear here as "X", and
# then drops every line that matches one of these patterns.
EXPECTED_PATTERNS = [
14+
re.compile(r"^Converting page X/X to pixels$"),
15+
re.compile(r"^Converting page X/X from pixels to searchable PDF$"),
16+
re.compile(r"^Converting to PDF using LibreOffice$"),
17+
re.compile(r"^Converted document to pixels$"),
18+
re.compile(r"^Safe PDF created$"),
19+
re.compile(r"^Compressing PDF$"),
20+
re.compile(r"^Merging X pages into a single PDF$"),
21+
re.compile(r"^Calculating number of pages$"),
22+
re.compile(r"^\[COMMAND\].*$"),
23+
re.compile(r"^Result: (SUCCESS|FAILURE)$"),
24+
re.compile(r"^pdfinfo:$"),
25+
re.compile(r"^pdftoppm: Syntax Error.*$"),
26+
re.compile(r"^convert /tmp/input_file as a .*$"),
27+
re.compile(r"^time=.*msg=\"forwarding signal.*"),
28+
re.compile(r"^time=.*msg=\"Waiting for container.*"),
29+
re.compile(r"^Installing LibreOffice extension.*$"),
30+
re.compile(r"^Archive:.*$"),
31+
# NOTE(review): the two patterns below require exactly one leading space;
# confirm this matches the real tool output.
re.compile(r"^ extracting:.*$"),
32+
re.compile(r"^ inflating:.*$"),
33+
# Blank lines are treated as expected as well.
re.compile(r"^$"),
34+
]
35+
36+
37+
def scrub_container_line(line: str) -> str:
    """Replace variable data in a log line with ``X`` so similar lines group.

    Three passes are applied, in this order:

    1. ISO-8601 timestamps (``YYYY-MM-DDTHH:MM:SS``), including an optional
       fractional-seconds part and an optional ``Z`` / ``+HH:MM`` zone.
    2. Hexadecimal identifiers of six or more characters that contain at
       least one digit.
    3. Any remaining run of decimal digits.

    Args:
        line: A single log line.

    Returns:
        The line with every match replaced by the literal ``X``.
    """
    # Timestamps are scrubbed first and consumed whole. Otherwise a long
    # fractional part would be eaten by the hex pass and the zone suffix
    # would be left behind, so the same kind of timestamp would collapse
    # into several different shapes ("X", "XZ", "X.X", "X+X:X").
    line = re.sub(
        r"\b\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})?",
        "X",
        line,
    )
    # The lookahead demands at least one digit so that ordinary words spelled
    # only with the letters a-f (e.g. "facade", "decade") are left intact.
    line = re.sub(r"\b(?=[0-9a-fA-F]*\d)[0-9a-fA-F]{6,}\b", "X", line)
    return re.sub(r"\d+", "X", line)
942

1043

11-
# Pattern to scrub variable data (dates, hex IDs, numbers) for grouping
12-
VARIABLE_PATTERN = re.compile(
13-
r"\b\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})?\b"
14-
r"|\b[0-9a-f]{8,}\b"
15-
r"|\b\d{4}/\d{2}/\d{2}\b"
16-
r"|\b\d{2}:\d{2}:\d{2}\b"
17-
r'|(?<=file\s)\S+\.pdf'
18-
r"|\bpage\s+\d+"
19-
r"|\bpages\s+\d+"
20-
)
44+
def is_expected_line(line: str) -> bool:
    """Tell whether ``line`` matches one of the ``EXPECTED_PATTERNS``.

    Each pattern is tried with ``match``, i.e. anchored at the start of the
    line; the first hit is enough.
    """
    for pattern in EXPECTED_PATTERNS:
        if pattern.match(line):
            return True
    return False
2146

2247

23-
def scrub_text(text: str) -> str:
24-
"""Replace variable data with placeholders for better grouping."""
25-
return VARIABLE_PATTERN.sub("X", text)
48+
def is_blank_line(line: str) -> bool:
    """Return True only when ``line`` is exactly the empty string.

    Whitespace-only lines are not considered blank.
    """
    empty = ""
    return empty == line
2650

2751

2852
def parse_junit(xml_file: str) -> ET.Element:
@@ -31,52 +55,99 @@ def parse_junit(xml_file: str) -> ET.Element:
3155

3256

3357
def count_results(root: ET.Element) -> Dict[str, int]:
    """Sum the result counters of every test suite in a JUnit XML tree.

    Args:
        root: Root element of the parsed report. It may be a wrapper whose
            direct children are ``<testsuite>`` elements, or a single
            ``<testsuite>`` element itself.

    Returns:
        A dict with the keys ``errors``, ``failures``, ``skipped`` and
        ``tests``, each holding the total over all suites. A missing
        attribute counts as 0.
    """
    suites = root.findall("testsuite")
    # A report whose root *is* the <testsuite> has no <testsuite> children;
    # without this fallback every counter would silently be reported as 0.
    if not suites and root.tag == "testsuite":
        suites = [root]

    totals = {"errors": 0, "failures": 0, "skipped": 0, "tests": 0}
    for suite in suites:
        for key in totals:
            totals[key] += int(suite.attrib.get(key, "0"))
    return totals
4373

4474

45-
def get_test_overview(root: ET.Element) -> List[Tuple[str, str]]:
46-
testsuite = root.find("testsuite")
47-
results = []
48-
if testsuite is not None:
49-
for testcase in testsuite.findall("testcase"):
50-
name = testcase.attrib.get("name", "unknown")
51-
classname = testcase.attrib.get("classname", "")
52-
full_name = f"{classname}::{name}" if classname else name
53-
failure = testcase.find("failure")
54-
error = testcase.find("error")
55-
if failure is not None:
56-
status = "FAIL"
57-
elif error is not None:
58-
status = "ERROR"
59-
else:
60-
status = "PASS"
61-
results.append((full_name, status))
62-
return results
63-
64-
65-
def get_container_outputs(root: ET.Element) -> List[str]:
66-
outputs = []
67-
testsuite = root.find("testsuite")
68-
if testsuite is not None:
69-
for testcase in testsuite.findall("testcase"):
70-
for child in ("failure", "error"):
71-
elem = testcase.find(child)
72-
if elem is not None and elem.text:
73-
outputs.append(scrub_text(elem.text.strip()))
74-
return outputs
75+
def get_test_cases(root: ET.Element) -> List[ET.Element]:
    """Collect the ``<testcase>`` elements of every test suite, in order.

    Args:
        root: Root element of the parsed report. It may be a wrapper whose
            direct children are ``<testsuite>`` elements, or a single
            ``<testsuite>`` element itself.

    Returns:
        The direct ``<testcase>`` children of each suite, concatenated in
        document order. Empty when there are none.
    """
    suites = root.findall("testsuite")
    # A report whose root *is* the <testsuite> has no <testsuite> children;
    # without this fallback its test cases would be silently dropped.
    if not suites and root.tag == "testsuite":
        suites = [root]
    return [case for suite in suites for case in suite.findall("testcase")]
80+
81+
82+
def get_test_status(testcase: ET.Element) -> str:
    """Classify a ``<testcase>`` element as ``FAIL``, ``ERROR`` or ``PASS``.

    A direct ``<failure>`` child takes precedence over an ``<error>`` child;
    a test case with neither is reported as ``PASS``.
    """
    for child_tag, status in (("failure", "FAIL"), ("error", "ERROR")):
        if testcase.find(child_tag) is not None:
            return status
    return "PASS"
88+
89+
90+
def get_extension(name: str) -> str:
    """Return the file extension found inside the first ``[...]`` of ``name``.

    The bracketed text is treated as a path and its last suffix is returned
    without the leading dot. ``"none"`` is returned when there is no
    bracketed part or when the path has no suffix.
    """
    bracketed = re.search(r"\[([^\]]+)\]", name)
    if bracketed is None:
        return "none"
    suffix = Path(bracketed.group(1)).suffix.lstrip(".")
    return suffix or "none"
96+
97+
98+
def get_size_bucket(name: str) -> str:
    """Map a test name to a file-size bucket label.

    The name is searched for a known marker substring; markers are tried in
    the order listed below and the first one found decides the label.
    ``"unknown"`` is returned when no marker is present.
    """
    buckets = (
        ("10K_docs", "0KB - 10KB"),
        ("100K_docs", "10KB - 100KB"),
        ("10M_docs", "100KB - 10MB"),
        ("100M_docs", "10MB - 100MB"),
    )
    return next((label for marker, label in buckets if marker in name), "unknown")
108+
109+
110+
def extract_captured_text(testcase: ET.Element, tag: str) -> str:
    """Return the captured text stored under ``tag`` of a test case.

    Only lines that come after the first "Captured" header line (a line
    containing both ``Captured`` and ``---``) are kept; header lines
    themselves, including later ones, are dropped.

    Args:
        testcase: The ``<testcase>`` element to read from.
        tag: Name of the direct child element holding the text.

    Returns:
        The kept lines joined with newlines, or ``""`` when the child is
        missing, has no text, or no content follows a header.
    """
    elem = testcase.find(tag)
    if elem is None or not elem.text:
        return ""

    kept = []
    header_seen = False
    for raw in elem.text.split("\n"):
        if "Captured" in raw and "---" in raw:
            header_seen = True
        elif header_seen:
            kept.append(raw)
    return "\n".join(kept)
126+
127+
128+
def extract_container_output(testcase: ET.Element) -> str:
    """Return the container log recorded in a test case's ``system-out``.

    When the captured output contains ``DOC_TO_PIXELS_LOG_START`` followed
    later by ``DOC_TO_PIXELS_LOG_END``, only the text between the two markers
    is returned. Otherwise the whole captured output is returned.

    Returns:
        The selected text with surrounding whitespace stripped; ``""`` when
        nothing was captured.
    """
    output = extract_captured_text(testcase, "system-out")
    _, start_found, rest = output.partition(DOC_TO_PIXELS_LOG_START)
    if start_found:
        # partition() never raises. The former split()-and-unpack raised
        # ValueError when the END marker appeared only before START.
        log, end_found, _ = rest.partition(DOC_TO_PIXELS_LOG_END)
        if end_found:
            return log.strip()
    return output.strip()
137+
138+
139+
def get_container_lines(testcase: ET.Element) -> List[str]:
    """Split a test case's container output into right-stripped lines.

    Returns an empty list when ``extract_container_output`` yields nothing.
    """
    output = extract_container_output(testcase)
    # Guard first: "".split("\n") would give [""] rather than [].
    if not output:
        return []
    return [raw.rstrip() for raw in output.split("\n")]
75144

76145

77146
def generate_report(xml_file: str) -> str:
78147
root = parse_junit(xml_file)
79148
results = count_results(root)
149+
test_cases = get_test_cases(root)
150+
80151
total = results["tests"]
81152
failures = results["failures"]
82153
errors = results["errors"]
@@ -87,31 +158,43 @@ def generate_report(xml_file: str) -> str:
87158
lines.append("==== RESULTS SUMMARY ===")
88159
lines.append(f" errors: {errors}")
89160
lines.append(f" failures: {failures}")
161+
lines.append(f" successes: {total - errors - failures - skipped}")
90162
lines.append(f" skipped: {skipped}")
91163
lines.append(f" tests: {total}")
92164
lines.append(f" failure rate: {failure_rate}")
93165
lines.append("")
94166
lines.append("")
95167

96-
# Test overview
97-
overview = get_test_overview(root)
98-
pass_count = sum(1 for _, s in overview if s == "PASS")
99-
fail_count = sum(1 for _, s in overview if s in ("FAIL", "ERROR"))
168+
ext_counter = Counter()
169+
size_counter: Dict[str, int] = {}
170+
for tc in test_cases:
171+
name = tc.attrib.get("name", "")
172+
ext = get_extension(name)
173+
size_bucket = get_size_bucket(name)
174+
ext_counter[ext] += 1
175+
size_counter[size_bucket] = size_counter.get(size_bucket, 0) + 1
176+
100177
lines.append("=== TEST OVERVIEW ===")
101-
lines.append(f" Total: {len(overview)} Passed: {pass_count} Failed: {fail_count}")
102-
if fail_count > 0:
103-
lines.append("")
104-
lines.append(" Failures:")
105-
for name, status in overview:
106-
if status in ("FAIL", "ERROR"):
107-
lines.append(f" [{status}] {name}")
178+
lines.append("")
179+
lines.append(" Extensions breakdown (All available tests)")
180+
for ext, count in ext_counter.most_common():
181+
lines.append(f" {count:>8} {ext}")
182+
lines.append("")
183+
lines.append(" File sizes breakdown (All available tests)")
184+
for bucket in ["0KB - 10KB", "10KB - 100KB", "100KB - 10MB", "10MB - 100MB"]:
185+
count = size_counter.get(bucket, 0)
186+
lines.append(f" {bucket} {count}")
108187
lines.append("")
109188
lines.append("")
110189

111-
# Most common container output
112-
outputs = get_container_outputs(root)
113-
if outputs:
114-
counter = Counter(outputs)
190+
all_lines: List[str] = []
191+
for tc in test_cases:
192+
all_lines.extend(get_container_lines(tc))
193+
194+
if all_lines:
195+
scrubbed = [scrub_container_line(line) for line in all_lines]
196+
filtered = [l for l in scrubbed if not is_expected_line(l)]
197+
counter = Counter(filtered)
115198
lines.append("=== MOST COMMON CONTAINER OUTPUT ===")
116199
lines.append("")
117200
lines.append(" Top 30:")
@@ -120,18 +203,69 @@ def generate_report(xml_file: str) -> str:
120203
lines.append("")
121204
lines.append("")
122205

123-
# Failure reasons
206+
fail_lines: List[str] = []
207+
for tc in test_cases:
208+
if get_test_status(tc) in ("FAIL", "ERROR"):
209+
fail_lines.extend(get_container_lines(tc))
210+
211+
if fail_lines:
212+
scrubbed = [scrub_container_line(line) for line in fail_lines]
213+
filtered = [l for l in scrubbed if not is_expected_line(l)]
214+
counter = Counter(filtered)
124215
lines.append("=== FAILURE REASONS ===")
125216
lines.append("")
126217
lines.append(" All failures:")
127218
for output, count in counter.most_common():
128-
lines.append(f" {count:>5} {output[:120]}")
219+
lines.append(f" {count:>5} {output}")
220+
lines.append("")
221+
lines.append("")
129222

130-
# Timeouts (not directly in JUnit, but useful)
223+
timeout_files: List[str] = []
224+
for tc in test_cases:
225+
output = extract_captured_text(tc, "system-out")
226+
if "TIMEOUT EXCEEDED" in output:
227+
m = re.search(r"'(.*?)'", output)
228+
if m:
229+
timeout_files.append(m.group(1))
230+
231+
lines.append("=== TIMEOUTS ===")
232+
lines.append("")
233+
if timeout_files:
234+
lines.append(f" Summary: {len(timeout_files)}")
235+
lines.append("")
236+
lines.append(" Affected files:")
237+
for f in timeout_files:
238+
lines.append(f" - {f}")
239+
else:
240+
lines.append(" Summary: 0")
241+
lines.append("")
242+
lines.append(" Affected files:")
243+
lines.append("")
244+
lines.append("")
245+
246+
failed_entries: List[Tuple[str, List[str]]] = []
247+
for tc in test_cases:
248+
if get_test_status(tc) in ("FAIL", "ERROR"):
249+
name = tc.attrib.get("name", "")
250+
m = re.search(r"\[([^\]]+)\]", name)
251+
fname = m.group(1) if m else name
252+
container_lines = get_container_lines(tc)
253+
scrubbed = [scrub_container_line(l) for l in container_lines]
254+
filtered = [l for l in scrubbed if not is_expected_line(l)]
255+
preview = filtered[:3]
256+
failed_entries.append((fname, preview))
257+
258+
lines.append("=== FAILED FILES ===")
259+
lines.append("")
260+
if failed_entries:
261+
for fname, preview in sorted(failed_entries, key=lambda x: x[0]):
262+
lines.append(f" - {fname}")
263+
for pline in preview:
264+
lines.append(f" {pline}")
265+
else:
266+
lines.append(" (none)")
131267
lines.append("")
132268
lines.append("")
133-
lines.append("=== TIMEOUTS ===")
134-
lines.append(" (Not available from JUnit XML)")
135269

136270
return "\n".join(lines)
137271

tests/test_large_set.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,12 @@ async def run_doc_test(
7272
except asyncio.TimeoutError:
7373
print(f"*** TIMEOUT EXCEEDED FOR DOCUMENT '{doc}' ***")
7474
raise
75+
stderr_str = stderr.decode(errors="replace")
76+
if stderr_str:
77+
print(stderr_str, end="")
7578
assert returncode == 0, (
7679
f"Failed to convert {doc} (exit {returncode}).\n"
77-
f"stderr: {stderr.decode(errors='replace')}"
80+
f"stderr: {stderr_str}"
7881
)
7982

8083
@for_each_10K_doc

0 commit comments

Comments
 (0)