Skip to content

Commit 68bf34d

Browse files
Fix e2e-tests dropping highest concurrency benchmark configs (#1020)
* Fix e2e-tests missing highest concurrency benchmark configs: remove the run-eval filter from SINGLE job config generation so all single-node configs are benchmarked regardless of eval marking.
* Prevent regression.
* Fix leak + buff tests.

---------

Co-authored-by: Oseltamivir <bryansg2013@gmail.com>
1 parent 4b655dc commit 68bf34d

File tree

4 files changed

+166
-1
lines changed

4 files changed

+166
-1
lines changed

.github/workflows/e2e-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ jobs:
5454
pip install pydantic
5555
CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \
5656
${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }})
57-
SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and not x.get('run-eval', False)]))")
57+
SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and not x.get('eval-only', False)]))")
5858
MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x]))")
5959
EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('run-eval', False)]))")
6060
echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT

utils/matrix_logic/generate_sweep_configs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -934,6 +934,8 @@ def main():
934934
matrix_values = mark_eval_entries(matrix_values)
935935
if args.evals_only:
936936
matrix_values = [e for e in matrix_values if e.get(Fields.RUN_EVAL.value, False)]
937+
for e in matrix_values:
938+
e[Fields.EVAL_ONLY.value] = True
937939

938940
print(json.dumps(matrix_values))
939941
return matrix_values

utils/matrix_logic/test_generate_sweep_configs.py

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
generate_runner_model_sweep_config,
1010
apply_node_type_defaults,
1111
expand_config_keys,
12+
mark_eval_entries,
1213
)
1314

1415

@@ -1582,3 +1583,163 @@ def test_overlapping_patterns_deduplicate(self):
15821583
"dsr1-fp8-h200-trt",
15831584
"gptoss-fp8-b200-sglang",
15841585
]
1586+
1587+
1588+
# =============================================================================
1589+
# Tests for e2e-tests.yml workflow config splitting
1590+
# =============================================================================
1591+
1592+
def _split_e2e_configs(data):
1593+
"""Replicate the splitting logic from e2e-tests.yml get-jobs step.
1594+
1595+
Returns (SINGLE, MULTI, EVALS) lists matching the workflow filters.
1596+
"""
1597+
single = [x for x in data if 'prefill' not in x and not x.get('eval-only', False)]
1598+
multi = [x for x in data if 'prefill' in x]
1599+
evals = [x for x in data if 'prefill' not in x and x.get('run-eval', False)]
1600+
return single, multi, evals
1601+
1602+
1603+
class TestE2EConfigSplitting:
1604+
"""Verify the e2e-tests.yml config splitting logic handles all flag
1605+
combinations correctly: default, --no-evals, and --evals-only."""
1606+
1607+
@pytest.fixture
1608+
def mixed_entries(self):
1609+
"""Simulates default mode output: single-node (some eval-marked),
1610+
plus multi-node entries."""
1611+
return [
1612+
{'exp-name': 'a', 'isl': 1024, 'osl': 1024, 'conc': 64, 'tp': 2, 'run-eval': False},
1613+
{'exp-name': 'b', 'isl': 1024, 'osl': 1024, 'conc': 128, 'tp': 2, 'run-eval': False},
1614+
{'exp-name': 'c', 'isl': 8192, 'osl': 1024, 'conc': 256, 'tp': 2, 'run-eval': True},
1615+
{'exp-name': 'd', 'isl': 8192, 'osl': 1024, 'conc': 512, 'tp': 2, 'run-eval': True},
1616+
{'exp-name': 'e', 'conc': 64, 'prefill': {'tp': 2, 'num-worker': 1}},
1617+
]
1618+
1619+
def test_default_mode_benchmarks_all_single_node(self, mixed_entries):
1620+
"""Default: all single-node entries (including eval-marked) are benchmarked."""
1621+
single, multi, evals = _split_e2e_configs(mixed_entries)
1622+
assert len(single) == 4
1623+
assert all('prefill' not in x for x in single)
1624+
1625+
def test_default_mode_evals_only_eval_marked(self, mixed_entries):
1626+
"""Default: only eval-marked entries go to EVALS."""
1627+
single, multi, evals = _split_e2e_configs(mixed_entries)
1628+
assert len(evals) == 2
1629+
assert all(x['run-eval'] for x in evals)
1630+
1631+
def test_default_mode_eval_marked_in_both(self, mixed_entries):
1632+
"""Default: eval-marked entries appear in BOTH single and evals."""
1633+
single, multi, evals = _split_e2e_configs(mixed_entries)
1634+
eval_names = {x['exp-name'] for x in evals}
1635+
single_names = {x['exp-name'] for x in single}
1636+
assert eval_names.issubset(single_names)
1637+
1638+
def test_no_evals_all_benchmarked(self):
1639+
"""--no-evals: mark_eval_entries is skipped, no run-eval=True entries."""
1640+
data = [
1641+
{'exp-name': 'a', 'conc': 64, 'tp': 2, 'run-eval': False},
1642+
{'exp-name': 'b', 'conc': 128, 'tp': 2, 'run-eval': False},
1643+
{'exp-name': 'c', 'conc': 256, 'tp': 2, 'run-eval': False},
1644+
]
1645+
single, multi, evals = _split_e2e_configs(data)
1646+
assert len(single) == 3
1647+
assert len(evals) == 0
1648+
1649+
def test_evals_only_no_benchmarks(self):
1650+
"""--evals-only: entries have eval-only flag, SINGLE must be empty."""
1651+
data = [
1652+
{'exp-name': 'c', 'conc': 256, 'tp': 2, 'run-eval': True, 'eval-only': True},
1653+
{'exp-name': 'd', 'conc': 512, 'tp': 2, 'run-eval': True, 'eval-only': True},
1654+
]
1655+
single, multi, evals = _split_e2e_configs(data)
1656+
assert len(single) == 0, "evals-only should not trigger benchmarks"
1657+
assert len(evals) == 2
1658+
1659+
def test_empty_config(self):
1660+
"""Empty input produces empty outputs."""
1661+
single, multi, evals = _split_e2e_configs([])
1662+
assert single == [] and multi == [] and evals == []
1663+
1664+
def test_all_eval_marked_without_eval_only_flag_still_benchmarked(self):
1665+
"""Default mode where mark_eval_entries marks every entry (e.g. only
1666+
8k1k with single conc). Without eval-only flag, SINGLE must still
1667+
include them for benchmarking."""
1668+
data = [
1669+
{'exp-name': 'a', 'conc': 64, 'tp': 2, 'run-eval': True},
1670+
{'exp-name': 'b', 'conc': 64, 'tp': 4, 'run-eval': True},
1671+
]
1672+
single, multi, evals = _split_e2e_configs(data)
1673+
assert len(single) == 2, "all-eval-marked entries must still be benchmarked in default mode"
1674+
assert len(evals) == 2
1675+
1676+
def test_prefill_entries_never_in_single_or_evals(self, mixed_entries):
1677+
"""Prefill (multi-node) entries only appear in MULTI."""
1678+
single, multi, evals = _split_e2e_configs(mixed_entries)
1679+
assert len(multi) == 1
1680+
assert all('prefill' in x for x in multi)
1681+
assert all('prefill' not in x for x in single)
1682+
assert all('prefill' not in x for x in evals)
1683+
1684+
1685+
class TestMarkEvalEntries:
1686+
"""Verify mark_eval_entries only marks highest/median concurrency at 8k1k."""
1687+
1688+
def test_marks_highest_and_median_conc(self):
1689+
"""Should mark highest and median concurrency for 8k1k entries."""
1690+
entries = [
1691+
{'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
1692+
'isl': 8192, 'osl': 1024, 'tp': 2, 'conc': 32,
1693+
'spec-decoding': False, 'dp-attn': False, 'run-eval': False},
1694+
{'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
1695+
'isl': 8192, 'osl': 1024, 'tp': 2, 'conc': 128,
1696+
'spec-decoding': False, 'dp-attn': False, 'run-eval': False},
1697+
{'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
1698+
'isl': 8192, 'osl': 1024, 'tp': 2, 'conc': 512,
1699+
'spec-decoding': False, 'dp-attn': False, 'run-eval': False},
1700+
]
1701+
result = mark_eval_entries(entries)
1702+
# conc values: [32, 128, 512]. median=128 (index 1), highest=512
1703+
assert result[0]['run-eval'] is False # conc=32
1704+
assert result[1]['run-eval'] is True # conc=128 (median)
1705+
assert result[2]['run-eval'] is True # conc=512 (highest)
1706+
1707+
def test_non_8k1k_never_marked(self):
1708+
"""Entries with non-8k1k seq lengths should never be eval-marked."""
1709+
entries = [
1710+
{'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
1711+
'isl': 1024, 'osl': 1024, 'tp': 2, 'conc': 512,
1712+
'spec-decoding': False, 'dp-attn': False, 'run-eval': False},
1713+
]
1714+
result = mark_eval_entries(entries)
1715+
assert result[0]['run-eval'] is False
1716+
1717+
def test_multinode_entries_never_marked(self):
1718+
"""Entries without top-level tp (multi-node) should never be eval-marked."""
1719+
entries = [
1720+
{'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
1721+
'isl': 8192, 'osl': 1024, 'conc': 512,
1722+
'spec-decoding': False, 'dp-attn': False, 'run-eval': False,
1723+
'prefill': {'tp': 2, 'num-worker': 1}},
1724+
]
1725+
result = mark_eval_entries(entries)
1726+
assert result[0]['run-eval'] is False
1727+
1728+
def test_never_marks_all_entries(self):
1729+
"""mark_eval_entries should never mark every single-node entry,
1730+
ensuring the e2e splitting logic can distinguish default from evals-only."""
1731+
entries = [
1732+
{'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
1733+
'isl': 8192, 'osl': 1024, 'tp': 2, 'conc': c,
1734+
'spec-decoding': False, 'dp-attn': False, 'run-eval': False}
1735+
for c in [32, 64, 128, 256, 512]
1736+
] + [
1737+
# Non-8k1k entry that should never be marked
1738+
{'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
1739+
'isl': 1024, 'osl': 1024, 'tp': 2, 'conc': 64,
1740+
'spec-decoding': False, 'dp-attn': False, 'run-eval': False},
1741+
]
1742+
result = mark_eval_entries(entries)
1743+
non_prefill = [x for x in result if 'prefill' not in x]
1744+
assert not all(x['run-eval'] for x in non_prefill), \
1745+
"mark_eval_entries must not mark all entries — would break e2e splitting"

utils/matrix_logic/validation.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ class Fields(Enum):
5353

5454
# Eval
5555
RUN_EVAL = 'run-eval'
56+
EVAL_ONLY = 'eval-only'
5657

5758

5859
"""
@@ -89,6 +90,7 @@ class SingleNodeMatrixEntry(BaseModel):
8990
exp_name: str = Field(alias=Fields.EXP_NAME.value)
9091
disagg: bool
9192
run_eval: bool = Field(alias=Fields.RUN_EVAL.value)
93+
eval_only: bool = Field(alias=Fields.EVAL_ONLY.value, default=False)
9294

9395

9496
class WorkerConfig(BaseModel):

0 commit comments

Comments (0)