|
9 | 9 | generate_runner_model_sweep_config, |
10 | 10 | apply_node_type_defaults, |
11 | 11 | expand_config_keys, |
| 12 | + mark_eval_entries, |
12 | 13 | ) |
13 | 14 |
|
14 | 15 |
|
@@ -1582,3 +1583,163 @@ def test_overlapping_patterns_deduplicate(self): |
1582 | 1583 | "dsr1-fp8-h200-trt", |
1583 | 1584 | "gptoss-fp8-b200-sglang", |
1584 | 1585 | ] |
| 1586 | + |
| 1587 | + |
| 1588 | +# ============================================================================= |
| 1589 | +# Tests for e2e-tests.yml workflow config splitting |
| 1590 | +# ============================================================================= |
| 1591 | + |
| 1592 | +def _split_e2e_configs(data): |
| 1593 | + """Replicate the splitting logic from e2e-tests.yml get-jobs step. |
| 1594 | +
|
| 1595 | + Returns (SINGLE, MULTI, EVALS) lists matching the workflow filters. |
| 1596 | + """ |
| 1597 | + single = [x for x in data if 'prefill' not in x and not x.get('eval-only', False)] |
| 1598 | + multi = [x for x in data if 'prefill' in x] |
| 1599 | + evals = [x for x in data if 'prefill' not in x and x.get('run-eval', False)] |
| 1600 | + return single, multi, evals |
| 1601 | + |
| 1602 | + |
class TestE2EConfigSplitting:
    """Verify the e2e-tests.yml config splitting logic handles all flag
    combinations correctly: default, --no-evals, and --evals-only."""

    @pytest.fixture
    def mixed_entries(self):
        """Simulates default mode output: single-node (some eval-marked),
        plus multi-node entries."""
        return [
            {'exp-name': 'a', 'isl': 1024, 'osl': 1024, 'conc': 64, 'tp': 2, 'run-eval': False},
            {'exp-name': 'b', 'isl': 1024, 'osl': 1024, 'conc': 128, 'tp': 2, 'run-eval': False},
            {'exp-name': 'c', 'isl': 8192, 'osl': 1024, 'conc': 256, 'tp': 2, 'run-eval': True},
            {'exp-name': 'd', 'isl': 8192, 'osl': 1024, 'conc': 512, 'tp': 2, 'run-eval': True},
            {'exp-name': 'e', 'conc': 64, 'prefill': {'tp': 2, 'num-worker': 1}},
        ]

    def test_default_mode_benchmarks_all_single_node(self, mixed_entries):
        """Default: all single-node entries (including eval-marked) are benchmarked."""
        bench_jobs, _, _ = _split_e2e_configs(mixed_entries)
        assert len(bench_jobs) == 4
        for job in bench_jobs:
            assert 'prefill' not in job

    def test_default_mode_evals_only_eval_marked(self, mixed_entries):
        """Default: only eval-marked entries go to EVALS."""
        _, _, eval_jobs = _split_e2e_configs(mixed_entries)
        assert len(eval_jobs) == 2
        for job in eval_jobs:
            assert job['run-eval']

    def test_default_mode_eval_marked_in_both(self, mixed_entries):
        """Default: eval-marked entries appear in BOTH single and evals."""
        bench_jobs, _, eval_jobs = _split_e2e_configs(mixed_entries)
        # Every eval name must also show up in the benchmark lane.
        assert {j['exp-name'] for j in eval_jobs} <= {j['exp-name'] for j in bench_jobs}

    def test_no_evals_all_benchmarked(self):
        """--no-evals: mark_eval_entries is skipped, no run-eval=True entries."""
        entries = [
            {'exp-name': name, 'conc': conc, 'tp': 2, 'run-eval': False}
            for name, conc in (('a', 64), ('b', 128), ('c', 256))
        ]
        bench_jobs, _, eval_jobs = _split_e2e_configs(entries)
        assert len(bench_jobs) == 3
        assert len(eval_jobs) == 0

    def test_evals_only_no_benchmarks(self):
        """--evals-only: entries have eval-only flag, SINGLE must be empty."""
        entries = [
            {'exp-name': 'c', 'conc': 256, 'tp': 2, 'run-eval': True, 'eval-only': True},
            {'exp-name': 'd', 'conc': 512, 'tp': 2, 'run-eval': True, 'eval-only': True},
        ]
        bench_jobs, _, eval_jobs = _split_e2e_configs(entries)
        assert not bench_jobs, "evals-only should not trigger benchmarks"
        assert len(eval_jobs) == 2

    def test_empty_config(self):
        """Empty input produces empty outputs."""
        assert _split_e2e_configs([]) == ([], [], [])

    def test_all_eval_marked_without_eval_only_flag_still_benchmarked(self):
        """Default mode where mark_eval_entries marks every entry (e.g. only
        8k1k with single conc). Without eval-only flag, SINGLE must still
        include them for benchmarking."""
        entries = [
            {'exp-name': 'a', 'conc': 64, 'tp': 2, 'run-eval': True},
            {'exp-name': 'b', 'conc': 64, 'tp': 4, 'run-eval': True},
        ]
        bench_jobs, _, eval_jobs = _split_e2e_configs(entries)
        assert len(bench_jobs) == 2, "all-eval-marked entries must still be benchmarked in default mode"
        assert len(eval_jobs) == 2

    def test_prefill_entries_never_in_single_or_evals(self, mixed_entries):
        """Prefill (multi-node) entries only appear in MULTI."""
        bench_jobs, multi_jobs, eval_jobs = _split_e2e_configs(mixed_entries)
        assert len(multi_jobs) == 1
        for job in multi_jobs:
            assert 'prefill' in job
        for job in bench_jobs + eval_jobs:
            assert 'prefill' not in job
| 1683 | + |
| 1684 | + |
class TestMarkEvalEntries:
    """Verify mark_eval_entries only marks highest/median concurrency at 8k1k."""

    @staticmethod
    def _mk(conc, isl=8192, osl=1024, tp=2, **extra):
        # Build one config entry; tp=None omits the top-level 'tp' key
        # (the multi-node shape). Key order matches the literal form used
        # elsewhere in this file.
        entry = {'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
                 'isl': isl, 'osl': osl}
        if tp is not None:
            entry['tp'] = tp
        entry['conc'] = conc
        entry['spec-decoding'] = False
        entry['dp-attn'] = False
        entry['run-eval'] = False
        entry.update(extra)
        return entry

    def test_marks_highest_and_median_conc(self):
        """Should mark highest and median concurrency for 8k1k entries."""
        result = mark_eval_entries([self._mk(32), self._mk(128), self._mk(512)])
        # conc values: [32, 128, 512]. median=128 (index 1), highest=512
        assert result[0]['run-eval'] is False  # conc=32
        assert result[1]['run-eval'] is True   # conc=128 (median)
        assert result[2]['run-eval'] is True   # conc=512 (highest)

    def test_non_8k1k_never_marked(self):
        """Entries with non-8k1k seq lengths should never be eval-marked."""
        result = mark_eval_entries([self._mk(512, isl=1024, osl=1024)])
        assert result[0]['run-eval'] is False

    def test_multinode_entries_never_marked(self):
        """Entries without top-level tp (multi-node) should never be eval-marked."""
        result = mark_eval_entries(
            [self._mk(512, tp=None, prefill={'tp': 2, 'num-worker': 1})])
        assert result[0]['run-eval'] is False

    def test_never_marks_all_entries(self):
        """mark_eval_entries should never mark every single-node entry,
        ensuring the e2e splitting logic can distinguish default from evals-only."""
        entries = [self._mk(c) for c in (32, 64, 128, 256, 512)]
        # Non-8k1k entry that should never be marked
        entries.append(self._mk(64, isl=1024, osl=1024))
        result = mark_eval_entries(entries)
        single_node = [x for x in result if 'prefill' not in x]
        assert not all(x['run-eval'] for x in single_node), \
            "mark_eval_entries must not mark all entries — would break e2e splitting"
0 commit comments