Skip to content

Commit 68bf34d

Browse files
Fix e2e-tests dropping highest concurrency benchmark configs (#1020)
* Fix e2e-tests missing highest concurrency benchmark configs: remove the run-eval filter from SINGLE job config generation so all single-node configs are benchmarked regardless of eval marking.
* Prevent regression.
* Fix leak + buff tests.

---------

Co-authored-by: Oseltamivir <bryansg2013@gmail.com>
1 parent 4b655dc commit 68bf34d

File tree

4 files changed

+166
-1
lines changed

4 files changed

+166
-1
lines changed

.github/workflows/e2e-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ jobs:
5454
pip install pydantic
5555
CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \
5656
${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }})
57-
SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and not x.get('run-eval', False)]))")
57+
SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and not x.get('eval-only', False)]))")
5858
MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x]))")
5959
EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('run-eval', False)]))")
6060
echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT

utils/matrix_logic/generate_sweep_configs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -934,6 +934,8 @@ def main():
934934
matrix_values = mark_eval_entries(matrix_values)
935935
if args.evals_only:
936936
matrix_values = [e for e in matrix_values if e.get(Fields.RUN_EVAL.value, False)]
937+
for e in matrix_values:
938+
e[Fields.EVAL_ONLY.value] = True
937939

938940
print(json.dumps(matrix_values))
939941
return matrix_values

utils/matrix_logic/test_generate_sweep_configs.py

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
generate_runner_model_sweep_config,
1010
apply_node_type_defaults,
1111
expand_config_keys,
12+
mark_eval_entries,
1213
)
1314

1415

@@ -1582,3 +1583,163 @@ def test_overlapping_patterns_deduplicate(self):
15821583
"dsr1-fp8-h200-trt",
15831584
"gptoss-fp8-b200-sglang",
15841585
]
1586+
1587+
1588+
# =============================================================================
1589+
# Tests for e2e-tests.yml workflow config splitting
1590+
# =============================================================================
1591+
1592+
def _split_e2e_configs(data):
1593+
"""Replicate the splitting logic from e2e-tests.yml get-jobs step.
1594+
1595+
Returns (SINGLE, MULTI, EVALS) lists matching the workflow filters.
1596+
"""
1597+
single = [x for x in data if 'prefill' not in x and not x.get('eval-only', False)]
1598+
multi = [x for x in data if 'prefill' in x]
1599+
evals = [x for x in data if 'prefill' not in x and x.get('run-eval', False)]
1600+
return single, multi, evals
1601+
1602+
1603+
class TestE2EConfigSplitting:
1604+
"""Verify the e2e-tests.yml config splitting logic handles all flag
1605+
combinations correctly: default, --no-evals, and --evals-only."""
1606+
1607+
@pytest.fixture
1608+
def mixed_entries(self):
1609+
"""Simulates default mode output: single-node (some eval-marked),
1610+
plus multi-node entries."""
1611+
return [
1612+
{'exp-name': 'a', 'isl': 1024, 'osl': 1024, 'conc': 64, 'tp': 2, 'run-eval': False},
1613+
{'exp-name': 'b', 'isl': 1024, 'osl': 1024, 'conc': 128, 'tp': 2, 'run-eval': False},
1614+
{'exp-name': 'c', 'isl': 8192, 'osl': 1024, 'conc': 256, 'tp': 2, 'run-eval': True},
1615+
{'exp-name': 'd', 'isl': 8192, 'osl': 1024, 'conc': 512, 'tp': 2, 'run-eval': True},
1616+
{'exp-name': 'e', 'conc': 64, 'prefill': {'tp': 2, 'num-worker': 1}},
1617+
]
1618+
1619+
def test_default_mode_benchmarks_all_single_node(self, mixed_entries):
1620+
"""Default: all single-node entries (including eval-marked) are benchmarked."""
1621+
single, multi, evals = _split_e2e_configs(mixed_entries)
1622+
assert len(single) == 4
1623+
assert all('prefill' not in x for x in single)
1624+
1625+
def test_default_mode_evals_only_eval_marked(self, mixed_entries):
1626+
"""Default: only eval-marked entries go to EVALS."""
1627+
single, multi, evals = _split_e2e_configs(mixed_entries)
1628+
assert len(evals) == 2
1629+
assert all(x['run-eval'] for x in evals)
1630+
1631+
def test_default_mode_eval_marked_in_both(self, mixed_entries):
1632+
"""Default: eval-marked entries appear in BOTH single and evals."""
1633+
single, multi, evals = _split_e2e_configs(mixed_entries)
1634+
eval_names = {x['exp-name'] for x in evals}
1635+
single_names = {x['exp-name'] for x in single}
1636+
assert eval_names.issubset(single_names)
1637+
1638+
def test_no_evals_all_benchmarked(self):
1639+
"""--no-evals: mark_eval_entries is skipped, no run-eval=True entries."""
1640+
data = [
1641+
{'exp-name': 'a', 'conc': 64, 'tp': 2, 'run-eval': False},
1642+
{'exp-name': 'b', 'conc': 128, 'tp': 2, 'run-eval': False},
1643+
{'exp-name': 'c', 'conc': 256, 'tp': 2, 'run-eval': False},
1644+
]
1645+
single, multi, evals = _split_e2e_configs(data)
1646+
assert len(single) == 3
1647+
assert len(evals) == 0
1648+
1649+
def test_evals_only_no_benchmarks(self):
1650+
"""--evals-only: entries have eval-only flag, SINGLE must be empty."""
1651+
data = [
1652+
{'exp-name': 'c', 'conc': 256, 'tp': 2, 'run-eval': True, 'eval-only': True},
1653+
{'exp-name': 'd', 'conc': 512, 'tp': 2, 'run-eval': True, 'eval-only': True},
1654+
]
1655+
single, multi, evals = _split_e2e_configs(data)
1656+
assert len(single) == 0, "evals-only should not trigger benchmarks"
1657+
assert len(evals) == 2
1658+
1659+
def test_empty_config(self):
1660+
"""Empty input produces empty outputs."""
1661+
single, multi, evals = _split_e2e_configs([])
1662+
assert single == [] and multi == [] and evals == []
1663+
1664+
def test_all_eval_marked_without_eval_only_flag_still_benchmarked(self):
1665+
"""Default mode where mark_eval_entries marks every entry (e.g. only
1666+
8k1k with single conc). Without eval-only flag, SINGLE must still
1667+
include them for benchmarking."""
1668+
data = [
1669+
{'exp-name': 'a', 'conc': 64, 'tp': 2, 'run-eval': True},
1670+
{'exp-name': 'b', 'conc': 64, 'tp': 4, 'run-eval': True},
1671+
]
1672+
single, multi, evals = _split_e2e_configs(data)
1673+
assert len(single) == 2, "all-eval-marked entries must still be benchmarked in default mode"
1674+
assert len(evals) == 2
1675+
1676+
def test_prefill_entries_never_in_single_or_evals(self, mixed_entries):
1677+
"""Prefill (multi-node) entries only appear in MULTI."""
1678+
single, multi, evals = _split_e2e_configs(mixed_entries)
1679+
assert len(multi) == 1
1680+
assert all('prefill' in x for x in multi)
1681+
assert all('prefill' not in x for x in single)
1682+
assert all('prefill' not in x for x in evals)
1683+
1684+
1685+
class TestMarkEvalEntries:
1686+
"""Verify mark_eval_entries only marks highest/median concurrency at 8k1k."""
1687+
1688+
def test_marks_highest_and_median_conc(self):
1689+
"""Should mark highest and median concurrency for 8k1k entries."""
1690+
entries = [
1691+
{'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
1692+
'isl': 8192, 'osl': 1024, 'tp': 2, 'conc': 32,
1693+
'spec-decoding': False, 'dp-attn': False, 'run-eval': False},
1694+
{'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
1695+
'isl': 8192, 'osl': 1024, 'tp': 2, 'conc': 128,
1696+
'spec-decoding': False, 'dp-attn': False, 'run-eval': False},
1697+
{'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
1698+
'isl': 8192, 'osl': 1024, 'tp': 2, 'conc': 512,
1699+
'spec-decoding': False, 'dp-attn': False, 'run-eval': False},
1700+
]
1701+
result = mark_eval_entries(entries)
1702+
# conc values: [32, 128, 512]. median=128 (index 1), highest=512
1703+
assert result[0]['run-eval'] is False # conc=32
1704+
assert result[1]['run-eval'] is True # conc=128 (median)
1705+
assert result[2]['run-eval'] is True # conc=512 (highest)
1706+
1707+
def test_non_8k1k_never_marked(self):
1708+
"""Entries with non-8k1k seq lengths should never be eval-marked."""
1709+
entries = [
1710+
{'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
1711+
'isl': 1024, 'osl': 1024, 'tp': 2, 'conc': 512,
1712+
'spec-decoding': False, 'dp-attn': False, 'run-eval': False},
1713+
]
1714+
result = mark_eval_entries(entries)
1715+
assert result[0]['run-eval'] is False
1716+
1717+
def test_multinode_entries_never_marked(self):
1718+
"""Entries without top-level tp (multi-node) should never be eval-marked."""
1719+
entries = [
1720+
{'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
1721+
'isl': 8192, 'osl': 1024, 'conc': 512,
1722+
'spec-decoding': False, 'dp-attn': False, 'run-eval': False,
1723+
'prefill': {'tp': 2, 'num-worker': 1}},
1724+
]
1725+
result = mark_eval_entries(entries)
1726+
assert result[0]['run-eval'] is False
1727+
1728+
def test_never_marks_all_entries(self):
1729+
"""mark_eval_entries should never mark every single-node entry,
1730+
ensuring the e2e splitting logic can distinguish default from evals-only."""
1731+
entries = [
1732+
{'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
1733+
'isl': 8192, 'osl': 1024, 'tp': 2, 'conc': c,
1734+
'spec-decoding': False, 'dp-attn': False, 'run-eval': False}
1735+
for c in [32, 64, 128, 256, 512]
1736+
] + [
1737+
# Non-8k1k entry that should never be marked
1738+
{'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
1739+
'isl': 1024, 'osl': 1024, 'tp': 2, 'conc': 64,
1740+
'spec-decoding': False, 'dp-attn': False, 'run-eval': False},
1741+
]
1742+
result = mark_eval_entries(entries)
1743+
non_prefill = [x for x in result if 'prefill' not in x]
1744+
assert not all(x['run-eval'] for x in non_prefill), \
1745+
"mark_eval_entries must not mark all entries — would break e2e splitting"

utils/matrix_logic/validation.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ class Fields(Enum):
5353

5454
# Eval
5555
RUN_EVAL = 'run-eval'
56+
EVAL_ONLY = 'eval-only'
5657

5758

5859
"""
@@ -89,6 +90,7 @@ class SingleNodeMatrixEntry(BaseModel):
8990
exp_name: str = Field(alias=Fields.EXP_NAME.value)
9091
disagg: bool
9192
run_eval: bool = Field(alias=Fields.RUN_EVAL.value)
93+
eval_only: bool = Field(alias=Fields.EVAL_ONLY.value, default=False)
9294

9395

9496
class WorkerConfig(BaseModel):

0 commit comments

Comments (0)