Skip to content

Commit e3f845f

Browse files
jlowin and Claude authored
Improve real-world schema crash test: failure dump, cluster analysis, TypeErrors baseline ratchet (#3958)
Co-authored-by: Claude <noreply@anthropic.com>
1 parent 34313ea commit e3f845f

3 files changed

Lines changed: 198 additions & 6 deletions

File tree

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
"""Cluster real-world schema failure records by exception signature.
2+
3+
Usage
4+
-----
5+
Run the crash test with DUMP_SCHEMA_FAILURES to collect per-failure records,
6+
then run this script to cluster them:
7+
8+
# 1. Run the crash test with failure dumping enabled:
9+
RUN_REAL_WORLD_SCHEMA_TEST=1 \\
10+
OPENAPI_DIRECTORY_PATH=/tmp/openapi-directory \\
11+
DUMP_SCHEMA_FAILURES=/tmp/schema_failures \\
12+
uv run pytest tests/utilities/json_schema_type/test_real_world_schemas.py \\
13+
-m integration -n auto --timeout-method=thread -q
14+
15+
# 2. Cluster the failures:
16+
python tests/utilities/json_schema_type/cluster_failures.py
17+
18+
# 3. (Optional) specify a custom dump directory:
19+
DUMP_SCHEMA_FAILURES=/my/path python tests/utilities/json_schema_type/cluster_failures.py
20+
21+
Each JSONL record written by the test has the shape:
22+
{
23+
"provider": "amazonaws.com",
24+
"name": "TagKey",
25+
"bucket": "schema_errors", # type_errors | schema_errors | timeouts | other_errors
26+
"error_type": "SchemaError",
27+
"error_msg": "...",
28+
"schema": "{...}" # JSON-encoded, truncated to 2000 chars
29+
}
30+
31+
Workflow for fixing a cluster
32+
------------------------------
33+
1. Identify the top cluster(s) by count.
34+
2. Grab the example schema from the cluster output.
35+
3. Reproduce in a unit test: add a test to test_json_schema_type.py that calls
36+
json_schema_to_type() with that schema and asserts the correct type is returned.
37+
4. Fix the root cause in src/fastmcp/utilities/json_schema_type.py.
38+
5. Re-run the crash test — confirm the cluster count drops.
39+
6. Ratchet the baseline in tests/utilities/json_schema_type/conftest.py:
40+
- Lower MAX_TYPE_ERRORS / MAX_SCHEMA_ERRORS to the new actual count.
41+
- Add a comment with the date and what was fixed.
42+
7. Commit.
43+
"""
44+
45+
from __future__ import annotations
46+
47+
import collections
48+
import json
49+
import os
50+
import re
51+
import sys
52+
from pathlib import Path
53+
54+
# Directory the crash test dumped per-failure JSONL records into
# (same default as the test itself).
DUMP_DIR = Path(os.environ.get("DUMP_SCHEMA_FAILURES", "/tmp/schema_failures"))

# Matches the volatile parts of an error message — quoted identifiers of 3+
# chars, hex addresses, and bare integers — so messages differing only in
# those details cluster together.
_NORMALIZE_RE = re.compile(r"'[^']{3,}'|\"[^\"]{3,}\"|0x[0-9a-fA-F]+|\b\d+\b")


def normalize(msg: str) -> str:
    """Collapse an error message into a short, stable cluster signature.

    Keeps only the first three lines, replaces volatile tokens (quoted
    strings, hex addresses, integers) with ``<X>``, and caps the result
    at 300 characters.
    """
    leading_lines = msg.splitlines()[:3]
    collapsed = _NORMALIZE_RE.sub("<X>", "\n".join(leading_lines))
    return collapsed[:300]
62+
63+
64+
def main() -> None:
    """Load dumped failure records, then print bucket and cluster summaries.

    Exits with status 1 when the dump directory is missing; returns early
    when the directory exists but contains no failure records.
    """
    if not DUMP_DIR.exists():
        print(f"No dump directory found at {DUMP_DIR}.")
        print("Run the crash test with DUMP_SCHEMA_FAILURES set first.")
        sys.exit(1)

    # One record per non-blank line across every per-provider JSONL file.
    records = [
        json.loads(raw)
        for jsonl_file in DUMP_DIR.glob("*.jsonl")
        for raw in jsonl_file.read_text().splitlines()
        if raw.strip()
    ]

    if not records:
        print(f"No failure records found in {DUMP_DIR}. All schemas passed!")
        return

    print(f"Total failures: {len(records)}")
    print()

    # Coarse breakdown by failure bucket (type_errors, schema_errors, ...).
    bucket_counts: collections.Counter[str] = collections.Counter(
        rec["bucket"] for rec in records
    )
    for k, v in bucket_counts.most_common():
        print(f" {k}: {v}")
    print()

    # Fine-grained clustering keyed on (bucket, exception type, normalized
    # message signature); keep the first record seen as the cluster example.
    cluster_counts: collections.Counter[tuple[str, str, str]] = collections.Counter()
    first_example: dict[tuple[str, str, str], dict] = {}
    for rec in records:
        sig_key = (rec["bucket"], rec["error_type"], normalize(rec["error_msg"]))
        cluster_counts[sig_key] += 1
        first_example.setdefault(sig_key, rec)

    print(f"=== Top clusters (of {len(cluster_counts)} total) ===")
    for (bucket, etype, sig), count in cluster_counts.most_common():
        print(f"\n[{count:>4}x] {bucket} / {etype}")
        print(f" sig: {sig[:150]}")
        ex = first_example[(bucket, etype, sig)]
        print(f" ex : provider={ex['provider']} name={ex['name']}")
        print(f" msg: {ex['error_msg'][:200]}")


if __name__ == "__main__":
    main()

tests/utilities/json_schema_type/conftest.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,11 @@ def pytest_sessionfinish(session: pytest.Session, exitstatus: int) -> None:
6969
f"Total crashes: {crashes:,} ({crashes / max(totals['schemas'], 1) * 100:.2f}%)"
7070
)
7171

72-
# Snapshot baselines (captured 2026-04-10, openapi-directory@f7207cf0,
73-
# origin/main, with JSON round-trip to strip YAML artifacts).
74-
MAX_TYPE_ERRORS = 420 # was 388 — real json_schema_to_type bugs
72+
# Snapshot baselines (openapi-directory@f7207cf0).
73+
# Ratcheted 2026-04-17: TypeErrors 420→0 (already fixed on main since
74+
# original 2026-04-10 capture). SchemaErrors unchanged — all 279 are
75+
# Pydantic Rust-regex rejections (lookahead, \p{…}, size limits).
76+
MAX_TYPE_ERRORS = 0
7577
MAX_SCHEMA_ERRORS = 300 # was 277 — Pydantic regex rejections (not our code)
7678
MAX_TIMEOUTS = 5 # was 0
7779
MAX_OTHER_ERRORS = 50 # was 0

tests/utilities/json_schema_type/test_real_world_schemas.py

Lines changed: 86 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,49 @@
77
shows progress and can identify which provider caused a hang.
88
99
Marked as an integration test — skipped by default, run with:
10-
uv run pytest tests/utilities/json_schema_type/test_real_world_schemas.py -m integration -v
10+
11+
uv run pytest tests/utilities/json_schema_type/test_real_world_schemas.py -m integration -n auto --timeout-method=thread -q
12+
13+
On a 16-core machine this takes ~3-5 minutes. The CI workflow
14+
(.github/workflows/run-schema-crash-test.yml) runs it on push/PR when
15+
relevant source files change.
16+
17+
## Aggregate baselines
18+
19+
After each run, conftest.py compares aggregate crash counts against baseline
20+
caps in ``conftest.py``. Ratchet those caps DOWN whenever a cluster of errors
21+
is fixed — never up. See ``conftest.py`` for the current caps and comments
22+
on what was fixed.
23+
24+
## Collecting and analysing failures
25+
26+
Set ``DUMP_SCHEMA_FAILURES`` to a directory to write per-failure JSONL records:
27+
28+
RUN_REAL_WORLD_SCHEMA_TEST=1 \\
29+
OPENAPI_DIRECTORY_PATH=/tmp/openapi-directory \\
30+
DUMP_SCHEMA_FAILURES=/tmp/schema_failures \\
31+
uv run pytest tests/utilities/json_schema_type/test_real_world_schemas.py \\
32+
-m integration -n auto --timeout-method=thread -q
33+
34+
Then cluster the results to find root causes:
35+
36+
python tests/utilities/json_schema_type/cluster_failures.py
37+
38+
See ``cluster_failures.py`` for the full fix workflow (reproduce → unit test →
39+
fix → ratchet baseline → commit).
40+
41+
## Cloning the corpus locally
42+
43+
The test auto-clones when ``RUN_REAL_WORLD_SCHEMA_TEST=1``. To pre-clone
44+
manually (~800 MB):
45+
46+
git clone --depth 1 https://github.com/APIs-guru/openapi-directory.git /tmp/openapi-directory
47+
48+
Then point at it:
49+
50+
OPENAPI_DIRECTORY_PATH=/tmp/openapi-directory \\
51+
RUN_REAL_WORLD_SCHEMA_TEST=1 \\
52+
uv run pytest ...
1153
"""
1254

1355
from __future__ import annotations
@@ -223,6 +265,37 @@ def _test_provider(provider: str) -> ProviderResult:
223265
result = ProviderResult()
224266
use_alarm = hasattr(signal, "SIGALRM")
225267

268+
# Optional per-failure dump: when DUMP_SCHEMA_FAILURES is set to a directory,
269+
# write one JSONL record per failure so we can cluster them later.
270+
dump_dir_env = os.environ.get("DUMP_SCHEMA_FAILURES")
271+
dump_path: Path | None = None
272+
if dump_dir_env:
273+
dump_path = Path(dump_dir_env) / f"{provider}.jsonl"
274+
dump_path.parent.mkdir(parents=True, exist_ok=True)
275+
# Truncate any prior content from a previous run of this provider.
276+
dump_path.write_text("")
277+
278+
def _record_failure(
279+
bucket: str, schema_name: str, schema_obj: dict, exc: BaseException
280+
) -> None:
281+
if dump_path is None:
282+
return
283+
# Cap the schema snippet so huge recursive specs don't blow up disk.
284+
try:
285+
schema_repr = json.dumps(schema_obj, default=str)[:2000]
286+
except Exception:
287+
schema_repr = "<unserializable>"
288+
record = {
289+
"provider": provider,
290+
"name": schema_name,
291+
"bucket": bucket,
292+
"error_type": type(exc).__name__,
293+
"error_msg": str(exc)[:500],
294+
"schema": schema_repr,
295+
}
296+
with dump_path.open("a") as fh:
297+
fh.write(json.dumps(record) + "\n")
298+
226299
for spec_file in _spec_files_for_provider(provider):
227300
spec = _load_spec(spec_file)
228301
if spec is None:
@@ -246,16 +319,26 @@ def _test_provider(provider: str) -> ProviderResult:
246319
try:
247320
T = json_schema_to_type(schema)
248321
TypeAdapter(T)
249-
except _SchemaTimeout:
322+
except _SchemaTimeout as e:
323+
if use_alarm:
324+
signal.alarm(0)
250325
result.timeouts += 1
251-
except TypeError:
326+
_record_failure("timeouts", _name, schema, e)
327+
except TypeError as e:
328+
if use_alarm:
329+
signal.alarm(0)
252330
result.type_errors += 1
331+
_record_failure("type_errors", _name, schema, e)
253332
except Exception as e:
333+
if use_alarm:
334+
signal.alarm(0)
254335
err_type = type(e).__name__
255336
if "SchemaError" in err_type or "schema" in str(e).lower()[:50]:
256337
result.schema_errors += 1
338+
_record_failure("schema_errors", _name, schema, e)
257339
else:
258340
result.other_errors += 1
341+
_record_failure("other_errors", _name, schema, e)
259342
finally:
260343
if use_alarm:
261344
signal.alarm(0)

0 commit comments

Comments
 (0)