[evals] Add held-out sample caps for diff/patch sources

dlwh · dlwh · commit 9969ff918a3f · 2026-04-23T01:32:21.000-07:00
diff --git a/docs/explanations/diff_patch_ppl_leakage_checks.md b/docs/explanations/diff_patch_ppl_leakage_checks.md
@@ -19,3 +19,7 @@ Fail the run if any eval row shares a hash with training rows after normalizatio
 4. Keep eval source snapshots immutable.
 Pin source revisions (dataset snapshot or commit hash) in the build logs.
 Do not regenerate eval rows from moving HEAD references.
+
+5. Cap held-out samples per source.
+Use small held-out caps to avoid full-corpus downloads during eval wiring.
+Current caps: SWE-bench issue-to-patch 256, SWE-bench raw git diff 256, CommitPack commit-message-plus-diff 512.
diff --git a/experiments/exp5095_diff_patch_ppl.py b/experiments/exp5095_diff_patch_ppl.py
@@ -43,6 +43,7 @@ class DiffPatchSlice:
     name: str
     relative_path: str
     metrics: tuple[DiffPatchMetric, ...]
+    held_out_sample_cap: int
 
     @property
     def tags(self) -> tuple[str, ...]:
@@ -65,18 +66,21 @@ def to_raw_dataset(self, raw_root: str, metric: DiffPatchMetric) -> RawTextEvalu
         name="issue_to_patch",
         relative_path="swe_bench/issue_to_patch.jsonl.gz",
         metrics=(DiffPatchMetric.PATCH_TEXT, DiffPatchMetric.CONTEXT_PLUS_PATCH),
+        held_out_sample_cap=256,
     ),
     DiffPatchSlice(
         source="swe_bench",
         name="raw_git_diff",
         relative_path="swe_bench/raw_git_diff.jsonl.gz",
         metrics=(DiffPatchMetric.PATCH_TEXT,),
+        held_out_sample_cap=256,
     ),
     DiffPatchSlice(
         source="commitpack",
         name="commit_message_plus_diff",
         relative_path="commitpack/commit_message_plus_diff.jsonl.gz",
         metrics=(DiffPatchMetric.PATCH_TEXT, DiffPatchMetric.CONTEXT_PLUS_PATCH),
+        held_out_sample_cap=512,
     ),
 )
 
@@ -154,6 +158,26 @@ def build_diff_patch_raw_validation_sets(
     return datasets
 
 
+def diff_patch_source_sampling_plan(
+    *,
+    slices: tuple[DiffPatchSlice, ...] = DIFF_PATCH_SLICES,
+) -> dict[str, dict[str, object]]:
+    """Map each ``source/name`` slice key to its held-out sampling metadata.
+
+    Entries hold ``held_out_sample_cap``, ``source``, and split "validation";
+    metadata-only so source integration can cap downloads before ingestion.
+    """
+
+    return {
+        f"{slice_spec.source}/{slice_spec.name}": {
+            "held_out_sample_cap": slice_spec.held_out_sample_cap,
+            "split": "validation",
+            "source": slice_spec.source,
+        }
+        for slice_spec in slices
+    }
+
+
 ACTIVE_DIFF_PATCH_DATASETS: dict[str, RawTextEvaluationDataset] = build_diff_patch_raw_validation_sets()
 
 
diff --git a/tests/evals/test_exp5095_diff_patch_ppl.py b/tests/evals/test_exp5095_diff_patch_ppl.py
@@ -71,3 +71,12 @@ def test_diff_patch_raw_validation_sets_prefixes_namespace() -> None:
     prefixed = diff_patch.diff_patch_raw_validation_sets()
     assert "diff_patch/swe_bench/issue_to_patch_patch_text" in prefixed
     assert "diff_patch/commitpack/commit_message_plus_diff_context_plus_patch" in prefixed
+
+
+def test_diff_patch_sampling_plan_uses_small_held_out_caps() -> None:
+    plan = diff_patch.diff_patch_source_sampling_plan()
+    # Default plan: keys are "<source>/<name>"; caps come from each slice's held_out_sample_cap.
+    assert plan["swe_bench/issue_to_patch"]["held_out_sample_cap"] == 256
+    assert plan["swe_bench/raw_git_diff"]["held_out_sample_cap"] == 256
+    assert plan["commitpack/commit_message_plus_diff"]["held_out_sample_cap"] == 512
+    assert all(entry["split"] == "validation" for entry in plan.values())