From 51e350876519aedbfe985db45f02fc863d107cc8 Mon Sep 17 00:00:00 2001
From: fredricz-20070104 <226039983+fredricz-20070104@users.noreply.github.com>
Date: Tue, 10 Mar 2026 13:07:20 +0800
Subject: [PATCH 1/6] [None][test] Add QA's perf test cases with L0 local mode
 (#12022)

Signed-off-by: Chenfei Zhang
Signed-off-by: Chenfei Zhang
Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
Co-authored-by: Chenfei Zhang
Co-authored-by: Chenfei Zhang
---
 .../integration/defs/perf/test_perf_sanity.py |   1 +
 .../test_lists/qa/llm_perf_multinode.txt      | 162 ++++++++++++++++++
 ...ep16_bs64_eplb0_mtp3_con1024_ccb-NIXL.yaml | 106 ++++++++++++
 ...dep16_bs64_eplb0_mtp3_con1024_ccb-UCX.yaml | 106 ++++++++++++
 ...dep16_bs64_eplb0_mtp3_con512_ccb-NIXL.yaml | 106 ++++++++++++
 ..._dep16_bs64_eplb0_mtp3_con512_ccb-UCX.yaml | 106 ++++++++++++
 ...dep32_bs16_eplb0_mtp3_con512_ccb-NIXL.yaml | 106 ++++++++++++
 ..._dep32_bs16_eplb0_mtp3_con512_ccb-UCX.yaml | 106 ++++++++++++
 ...4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml | 101 +++++++++++
 ...n4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml | 101 +++++++++++
 ...n4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml | 101 +++++++++++
 ...en4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml | 101 +++++++++++
 ...n4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml | 101 +++++++++++
 ...en4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml | 101 +++++++++++
 ...4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml | 101 +++++++++++
 ...n4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml | 101 +++++++++++
 ...n4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml | 101 +++++++++++
 ...en4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml | 101 +++++++++++
 ...n4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml | 101 +++++++++++
 ...en4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml | 101 +++++++++++
 ...p16_bs128_eplb0_mtp1_con2048_ccb-NIXL.yaml | 106 ++++++++++++
 ...ep16_bs128_eplb0_mtp1_con2048_ccb-UCX.yaml | 106 ++++++++++++
 ...1_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml |  91 ++++++++++
 ...n1_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml |  91 ++++++++++
 ...n1_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml |  91 ++++++++++
 ...en1_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml |  91 ++++++++++
 ...n1_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml |  91 ++++++++++
 ...en1_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml |  91 ++++++++++
 ...1_tep8_bs32_eplb0_mtp0_con36_ccb-NIXL.yaml |  91 ++++++++++
 ...n1_tep8_bs32_eplb0_mtp0_con36_ccb-UCX.yaml |  91 ++++++++++
 ...n1_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml |  91 ++++++++++
 ...en1_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml |  91 ++++++++++
 ...n1_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml |  91 ++++++++++
 ...en1_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml |  91 ++++++++++
 ...en13_tep4_bs1_eplb0_mtp0_con1-Default.yaml |  99 +++++++++++
 ...gen5_tep4_bs4_eplb0_mtp0_con4-Default.yaml |  99 +++++++++++
 ...gen6_tep8_bs1_eplb0_mtp3_con1-Default.yaml | 105 ++++++++++++
 ...gen7_tep8_bs1_eplb0_mtp0_con1-Default.yaml |  99 +++++++++++
 ...gen8_tep4_bs2_eplb0_mtp0_con2-Default.yaml |  99 +++++++++++
 ...gen8_tep8_bs1_eplb0_mtp0_con1-Default.yaml |  99 +++++++++++
 ...en11_tep4_bs2_eplb0_mtp0_con2-Default.yaml |  99 +++++++++++
 ...en14_tep4_bs1_eplb0_mtp0_con1-Default.yaml |  99 +++++++++++
 ...en1_dep16_bs1_eplb0_mtp3_con1-Default.yaml | 102 +++++++++++
 ...gen1_dep8_bs4_eplb0_mtp2_con4-Default.yaml | 102 +++++++++++
 ...gen1_tep8_bs1_eplb0_mtp0_con1-Default.yaml |  99 +++++++++++
 ...gen1_tep8_bs1_eplb0_mtp3_con1-Default.yaml | 103 +++++++++++
 ...gen1_tep8_bs2_eplb0_mtp3_con2-Default.yaml | 103 +++++++++++
 ...gen5_tep8_bs2_eplb0_mtp3_con2-Default.yaml | 103 +++++++++++
 ...gen7_tep4_bs2_eplb0_mtp2_con2-Default.yaml | 103 +++++++++++
 ...gen7_tep8_bs1_eplb0_mtp0_con1-Default.yaml |  99 +++++++++++
 ...gen8_tep4_bs4_eplb0_mtp0_con4-Default.yaml |  99 +++++++++++
 ...gen7_tep8_bs2_eplb0_mtp3_con2-Default.yaml | 105 ++++++++++++
 ...en1_dep16_bs8_eplb0_mtp0_con8-Default.yaml |  98 +++++++++++
 ...en1_dep32_bs2_eplb0_mtp0_con2-Default.yaml |  98 +++++++++++
 ...1_dep8_bs16_eplb0_mtp1_con128-Default.yaml | 104 +++++++++++
 ...1_dep16_bs16_eplb0_mtp0_con16-Default.yaml |  98 +++++++++++
 ...en1_dep16_bs8_eplb0_mtp2_con8-Default.yaml | 102 +++++++++++
 ...en1_dep32_bs2_eplb0_mtp3_con2-Default.yaml | 102 +++++++++++
 ...en1_dep32_bs4_eplb0_mtp0_con4-Default.yaml |  98 +++++++++++
 ..._dep16_bs16_eplb0_mtp0_con256-Default.yaml |  98 +++++++++++
 ...1_dep16_bs8_eplb0_mtp3_con128-Default.yaml | 104 +++++++++++
 ...n1_dep32_bs2_eplb0_mtp3_con64-Default.yaml | 104 +++++++++++
 ...1_dep32_bs4_eplb0_mtp0_con128-Default.yaml |  98 +++++++++++
 ..._dep16_bs16_eplb0_mtp1_con256-Default.yaml | 104 +++++++++++
 ..._dep16_bs32_eplb0_mtp0_con512-Default.yaml |  98 +++++++++++
 ..._dep16_bs32_eplb0_mtp1_con512-Default.yaml | 104 +++++++++++
 ...1_dep32_bs4_eplb0_mtp3_con128-Default.yaml | 104 +++++++++++
 ...1_dep32_bs8_eplb0_mtp0_con256-Default.yaml |  98 +++++++++++
 ...1_dep32_bs8_eplb0_mtp3_con256-Default.yaml | 104 +++++++++++
 ...ep32_bs32_eplb0_mtp0_con1024_ccb-NIXL.yaml | 100 +++++++++++
 ...dep32_bs32_eplb0_mtp0_con1024_ccb-UCX.yaml |  99 +++++++++++
 ...4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml | 101 +++++++++++
 ...n4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml | 101 +++++++++++
 ...n4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml | 101 +++++++++++
 ...en4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml | 101 +++++++++++
 ...n4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml | 101 +++++++++++
 ...en4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml | 101 +++++++++++
 ...4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml | 101 +++++++++++
 ...n4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml | 101 +++++++++++
 ...n4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml | 101 +++++++++++
 ...en4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml | 101 +++++++++++
 ...n4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml | 101 +++++++++++
 ...en4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml | 101 +++++++++++
 ...4_tep8_bs32_eplb0_mtp3_con16_ccb-NIXL.yaml | 107 ++++++++++++
 ...n4_tep8_bs32_eplb0_mtp3_con16_ccb-UCX.yaml | 107 ++++++++++++
 ...n4_tep8_bs32_eplb0_mtp3_con1_ccb-NIXL.yaml | 107 ++++++++++++
 ...en4_tep8_bs32_eplb0_mtp3_con1_ccb-UCX.yaml | 107 ++++++++++++
 ...n4_tep8_bs32_eplb0_mtp3_con2_ccb-NIXL.yaml | 107 ++++++++++++
 ...en4_tep8_bs32_eplb0_mtp3_con2_ccb-UCX.yaml | 107 ++++++++++++
 ...4_tep8_bs32_eplb0_mtp3_con32_ccb-NIXL.yaml | 107 ++++++++++++
 ...n4_tep8_bs32_eplb0_mtp3_con32_ccb-UCX.yaml | 107 ++++++++++++
 ...n4_tep8_bs32_eplb0_mtp3_con4_ccb-NIXL.yaml | 107 ++++++++++++
 ...en4_tep8_bs32_eplb0_mtp3_con4_ccb-UCX.yaml | 107 ++++++++++++
 ...n4_tep8_bs32_eplb0_mtp3_con8_ccb-NIXL.yaml | 107 ++++++++++++
 ...en4_tep8_bs32_eplb0_mtp3_con8_ccb-UCX.yaml | 107 ++++++++++++
 ...p16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml | 105 ++++++++++++
 ...ep16_bs128_eplb0_mtp3_con2048_ccb-UCX.yaml | 105 ++++++++++++
 ...ep32_bs128_eplb0_mtp3_con1024_ccb-UCX.yaml | 119 +++++++++++++
 ...3_tep8_bs16_eplb0_mtp3_con16_ccb-NIXL.yaml | 107 ++++++++++++
 ...n3_tep8_bs16_eplb0_mtp3_con16_ccb-UCX.yaml | 107 ++++++++++++
 ...n3_tep8_bs16_eplb0_mtp3_con1_ccb-NIXL.yaml | 107 ++++++++++++
 ...en3_tep8_bs16_eplb0_mtp3_con1_ccb-UCX.yaml | 107 ++++++++++++
 ...n3_tep8_bs16_eplb0_mtp3_con2_ccb-NIXL.yaml | 107 ++++++++++++
 ...en3_tep8_bs16_eplb0_mtp3_con2_ccb-UCX.yaml | 107 ++++++++++++
 ...n3_tep8_bs16_eplb0_mtp3_con4_ccb-NIXL.yaml | 107 ++++++++++++
 ...en3_tep8_bs16_eplb0_mtp3_con4_ccb-UCX.yaml | 107 ++++++++++++
 ...n3_tep8_bs16_eplb0_mtp3_con8_ccb-NIXL.yaml | 107 ++++++++++++
 ...en3_tep8_bs16_eplb0_mtp3_con8_ccb-UCX.yaml | 107 ++++++++++++
 ...3_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml | 101 +++++++++++
 ...n3_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml | 101 +++++++++++
 ...n3_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml | 101 +++++++++++
 ...en3_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml | 101 +++++++++++
 ...n3_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml | 101 +++++++++++
 ...en3_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml | 101 +++++++++++
 ...3_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml | 101 +++++++++++
 ...n3_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml | 101 +++++++++++
 ...n3_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml | 101 +++++++++++
 ...en3_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml | 101 +++++++++++
 ...n3_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml | 101 +++++++++++
 ...en3_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml | 101 +++++++++++
 ...ep16_bs64_eplb0_mtp0_con1024_ccb-NIXL.yaml |  99 +++++++++++
 ...dep16_bs64_eplb0_mtp0_con1024_ccb-UCX.yaml |  99 +++++++++++
 ...dep32_bs16_eplb0_mtp3_con512_ccb-NIXL.yaml | 105 ++++++++++++
 ..._dep32_bs16_eplb0_mtp3_con512_ccb-UCX.yaml | 105 ++++++++++++
 ...32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml | 106 ++++++++++++
 ...plb288_mtp0_con1024_ccb-NIXL_kv-reuse.yaml | 106 ++++++++++++
 ...p32_bs32_eplb288_mtp0_con1024_ccb-UCX.yaml | 106 ++++++++++++
 ...6_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml | 112 ++++++++++++
 ...16_bs128_eplb288_mtp3_con2048_ccb-UCX.yaml | 112 ++++++++++++
 ...s16_eplb288_mtp3_con12288_ccb-DEFAULT.yaml | 106 ++++++++++++
 ...s128_eplb288_mtp3_con1024_ccb-DEFAULT.yaml | 113 ++++++++++++
 ...16_bs64_eplb288_mtp0_con1024_ccb-NIXL.yaml | 106 ++++++++++++
 ...p16_bs64_eplb288_mtp0_con1024_ccb-UCX.yaml | 106 ++++++++++++
 ...p32_bs16_eplb288_mtp3_con512_ccb-NIXL.yaml | 112 ++++++++++++
 ...ep32_bs16_eplb288_mtp3_con512_ccb-UCX.yaml | 112 ++++++++++++
 ...32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml | 114 ++++++++++++
 ...6_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml | 120 +++++++++++++
 ...s16_eplb288_mtp3_con12288_ccb-DEFAULT.yaml | 113 ++++++++++++
 ...s128_eplb288_mtp3_con1024_ccb-DEFAULT.yaml | 121 +++++++++++++
 ...16_bs64_eplb288_mtp0_con1024_ccb-NIXL.yaml | 114 ++++++++++++
 ...p32_bs16_eplb288_mtp3_con512_ccb-NIXL.yaml | 120 +++++++++++++
 ...bs1024_eplb384_mtp0_con16384_ccb-NIXL.yaml | 107 ++++++++++++
 ...2_bs256_eplb416_mtp0_con8192_ccb-NIXL.yaml | 107 ++++++++++++
 143 files changed, 14674 insertions(+)
 create mode 100644 tests/integration/test_lists/qa/llm_perf_multinode.txt
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con1024_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con1024_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con512_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con512_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_con2048_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_con2048_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con36_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con36_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0_con1-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0_con4-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3_con1-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0_con1-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0_con2-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0_con1-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0_con2-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0_con1-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3_con1-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2_con4-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0_con1-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3_con1-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3_con2-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3_con2-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2_con2-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0_con1-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0_con4-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3_con2-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0_con8-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0_con2-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1_con128-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0_con16-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2_con8-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3_con2-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0_con4-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0_con256-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3_con128-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3_con64-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0_con128-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1_con256-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0_con512-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1_con512-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3_con128-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0_con256-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3_con256-Default.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_con1024_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_con1024_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con16_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con16_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con2_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con2_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con32_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con32_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con4_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con4_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con8_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con8_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_con1024_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con16_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con16_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con1_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con1_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con2_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con2_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con4_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con4_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con8_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con8_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_con1024_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_con1024_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL_kv-reuse.yaml
 create mode 100644 tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-DEFAULT.yaml
 create mode 100644 tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-DEFAULT.yaml
 create mode 100644 tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-UCX.yaml
 create mode 100644 tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-DEFAULT.yaml
 create mode 100644 tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-DEFAULT.yaml
 create mode 100644 tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/wideep_kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_con16384_ccb-NIXL.yaml
 create mode 100644 tests/scripts/perf/disaggregated/wideep_kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_con8192_ccb-NIXL.yaml

diff --git a/tests/integration/defs/perf/test_perf_sanity.py b/tests/integration/defs/perf/test_perf_sanity.py
index 58e316d7f2f..88d46bf8596 100644
--- a/tests/integration/defs/perf/test_perf_sanity.py
+++ b/tests/integration/defs/perf/test_perf_sanity.py
@@ -55,6 +55,7 @@
     "gpt_oss_120b_fp4": "gpt_oss/gpt-oss-120b",
     "k2_thinking_fp4": "Kimi-K2-Thinking-NVFP4",
"Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", # Qwen3-235B-A22B-FP4 + "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf", # Qwen3-235B-A22B-FP8 } SUPPORTED_GPU_MAPPING = { diff --git a/tests/integration/test_lists/qa/llm_perf_multinode.txt b/tests/integration/test_lists/qa/llm_perf_multinode.txt new file mode 100644 index 00000000000..b333f36ee34 --- /dev/null +++ b/tests/integration/test_lists/qa/llm_perf_multinode.txt @@ -0,0 +1,162 @@ +# disagg multi-node +# GB200 + GB300 supported cases +# perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con512_ccb-NIXL] +# perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con1024_ccb-NIXL] +# perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con512_ccb-UCX] +# perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con1024_ccb-UCX] +# perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-NIXL] +# perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-UCX] +# perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_con2048_ccb-NIXL] +# perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_con2048_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL] 
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con36_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con1_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con2_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con4_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con8_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con16_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con36_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_con1024_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_con1024_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con2_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con4_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con8_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con16_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con32_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con2_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con4_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con8_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con16_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con32_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_con1024_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con1_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con2_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con4_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con8_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con16_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con1_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con2_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con4_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con8_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con16_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con2_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con4_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con8_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con16_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con32_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_con1024_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_con1024_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-UCX]
+
+# GB200 supported cases
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0_con2-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0_con1-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3_con1-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2_con4-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0_con1-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3_con1-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3_con2-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3_con2-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2_con2-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0_con1-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0_con4-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0_con8-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0_con2-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0_con16-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2_con8-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3_con2-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0_con4-Default]
+
+# GB300 supported cases
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0_con1-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0_con4-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3_con1-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0_con1-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0_con2-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0_con1-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3_con2-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1_con128-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0_con256-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3_con128-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3_con64-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0_con128-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1_con256-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0_con512-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1_con512-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3_con128-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0_con256-Default]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3_con256-Default]
+
+
+
+# wideep multi-node
+# GB200 + GB300 supported cases
+# perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_con512_ccb-NIXL]
+# perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_con1024_ccb-NIXL]
+# perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_con512_ccb-UCX]
+# perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_con1024_ccb-UCX]
+# perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL]
+# perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-UCX]
+# perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_con2048_ccb-NIXL]
+# perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_con2048_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL_kv-reuse]
+perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-DEFAULT]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-wideep_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-DEFAULT]
+perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-UCX]
+perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-DEFAULT]
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-wideep_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-DEFAULT]
+perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_con16384_ccb-NIXL]
+perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_con8192_ccb-NIXL]
+# GB200 supported cases
+# GB300 supported cases
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con1024_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con1024_ccb-NIXL.yaml
new file mode 100644
index 00000000000..d9d0ae88cdb
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con1024_ccb-NIXL.yaml
@@ -0,0 +1,106 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp4
+  precision: fp4
+  model_dir_name: Qwen3-235B-A22B-FP4
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '1024'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 16
+    moe_expert_parallel_size: 16
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 64
+    max_num_tokens: 256
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
+  ctx:
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con1024_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con1024_ccb-UCX.yaml
new file mode 100644
index 00000000000..6a1796bf830
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con1024_ccb-UCX.yaml
@@ -0,0 +1,106 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp4
+  precision: fp4
+  model_dir_name: Qwen3-235B-A22B-FP4
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '1024'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 16
+    moe_expert_parallel_size: 16
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 64
+    max_num_tokens: 256
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
+  ctx:
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: UCX
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con512_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con512_ccb-NIXL.yaml
new file mode 100644
index 00000000000..ab5bd95a7ba
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con512_ccb-NIXL.yaml
@@ -0,0 +1,106 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp4
+  precision: fp4
+  model_dir_name: Qwen3-235B-A22B-FP4
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '512'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 16
+    moe_expert_parallel_size: 16
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 64
+    max_num_tokens: 256
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
+  ctx:
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con512_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con512_ccb-UCX.yaml
new file mode 100644
index 00000000000..02540e48a28
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con512_ccb-UCX.yaml
@@ -0,0 +1,106 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp4
+  precision: fp4
+  model_dir_name: Qwen3-235B-A22B-FP4
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '512'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 16
+    moe_expert_parallel_size: 16
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 64
+    max_num_tokens: 256
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
+  ctx:
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: UCX
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-NIXL.yaml
new file mode 100644
index 00000000000..1183e971925
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-NIXL.yaml
@@ -0,0 +1,106 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp4
+  precision: fp4
+  model_dir_name: Qwen3-235B-A22B-FP4
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '512'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 32
+    moe_expert_parallel_size: 32
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 16
+    max_num_tokens: 64
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.6
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
+  ctx:
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-UCX.yaml
new file mode 100644
index 00000000000..38bd0ddc4e9
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-UCX.yaml
@@ -0,0 +1,106 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp4
+  precision: fp4
+  model_dir_name: Qwen3-235B-A22B-FP4
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '512'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 32
+    moe_expert_parallel_size: 32
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 16
+    max_num_tokens: 64
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.6
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
+  ctx:
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: UCX
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml
new file mode 100644
index 00000000000..a63bb52ec96
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp4
+  precision: fp4
+  model_dir_name: Qwen3-235B-A22B-FP4
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '16'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 4
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 128
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.9
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml
new file mode 100644
index 00000000000..573c0b94cdb
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp4
+  precision: fp4
+  model_dir_name: Qwen3-235B-A22B-FP4
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '16'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 4
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 128
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.9
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: UCX
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml
new file mode 100644
index 00000000000..f8b623a0cad
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp4
+  precision: fp4
+  model_dir_name: Qwen3-235B-A22B-FP4
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '1'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 4
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 128
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.9
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml
new file mode 100644
index 00000000000..13f48032da4
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp4
+  precision: fp4
+  model_dir_name: Qwen3-235B-A22B-FP4
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '1'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 4
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 128
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.9
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: UCX
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml
new file mode 100644
index 00000000000..c59e682d51d
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp4
+  precision: fp4
+  model_dir_name: Qwen3-235B-A22B-FP4
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '2'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 4
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 128
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.9
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml
new file mode 100644
index 00000000000..b3ec30d4232
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp4
+  precision: fp4
+  model_dir_name: Qwen3-235B-A22B-FP4
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '2'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 4
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 128
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.9
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: UCX
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml
new file mode 100644
index 00000000000..29d4120b302
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp4
+  precision: fp4
+  model_dir_name: Qwen3-235B-A22B-FP4
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '32'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 4
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 128
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.9
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml
new file mode 100644
index 00000000000..99cfeacc892
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp4
+  precision: fp4
+  model_dir_name: Qwen3-235B-A22B-FP4
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '32'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 4
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 128
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.9
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: UCX
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml
new file mode 100644
index 00000000000..accda03e6af
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp4
+  precision: fp4
+  model_dir_name: Qwen3-235B-A22B-FP4
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '4'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 4
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 128
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.9
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml
new file mode 100644
index 00000000000..6ea24bb4810
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp4
+  precision: fp4
+  model_dir_name: Qwen3-235B-A22B-FP4
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '4'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 4
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 128
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.9
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: UCX
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml
new file mode 100644
index 00000000000..338c4d0fbbc
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp4
+  precision: fp4
+  model_dir_name: Qwen3-235B-A22B-FP4
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '8'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 4
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 128
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.9
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml
new file mode 100644
index 00000000000..1512c2754bd
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp4
+  precision: fp4
+  model_dir_name: Qwen3-235B-A22B-FP4
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '8'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 4
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 128
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.9
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: UCX
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_con2048_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_con2048_ccb-NIXL.yaml
new file mode 100644
index 00000000000..00415ca3564
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_con2048_ccb-NIXL.yaml
@@ -0,0 +1,106 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp4
+  precision: fp4
+  model_dir_name: Qwen3-235B-A22B-FP4
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '2048'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 2
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 16
+    moe_expert_parallel_size: 16
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 128
+    max_num_tokens: 256
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 1
+  ctx:
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 1
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_con2048_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_con2048_ccb-UCX.yaml
new file mode 100644
index 00000000000..6667a05b753
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_con2048_ccb-UCX.yaml
@@ -0,0 +1,106 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp4
+  precision: fp4
+  model_dir_name: Qwen3-235B-A22B-FP4
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '2048'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 2
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 16
+    moe_expert_parallel_size: 16
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 128
+    max_num_tokens: 256
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 1
+  ctx:
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: UCX
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 1
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml
new file mode 100644
index 00000000000..b9812cd4eaa
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml
@@ -0,0 +1,91 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp8
+  precision: fp8
+  model_dir_name: Qwen3-235B-A22B-FP8
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '16'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 64
+    max_num_tokens: 2048
+    max_seq_len: 2051
+    cuda_graph_config:
+      enable_padding: true
+      max_batch_size: 128
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: true
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 2048
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+    disable_overlap_scheduler: false
+  ctx:
+    max_batch_size: 32
+    max_num_tokens: 2048
+    max_seq_len: 2051
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: true
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 2048
+      backend: NIXL
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml
new file mode 100644
index 00000000000..8b2ef1a5cd9
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml
@@ -0,0 +1,91 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp8
+  precision: fp8
+  model_dir_name: Qwen3-235B-A22B-FP8
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '16'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 64
+    max_num_tokens: 2048
+    max_seq_len: 2051
+    cuda_graph_config:
+      enable_padding: true
+      max_batch_size: 128
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: true
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 2048
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+    disable_overlap_scheduler: false
+  ctx:
+    max_batch_size: 32
+    max_num_tokens: 2048
+    max_seq_len: 2051
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: true
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 2048
+      backend: UCX
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml
new file mode 100644
index 00000000000..56502cbcd9c
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml
@@ -0,0 +1,91 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp8
+  precision: fp8
+  model_dir_name: Qwen3-235B-A22B-FP8
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '1'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 64
+    max_num_tokens: 2048
+    max_seq_len: 2051
+    cuda_graph_config:
+      enable_padding: true
+      max_batch_size: 128
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: true
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 2048
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+    disable_overlap_scheduler: false
+  ctx:
+    max_batch_size: 32
+    max_num_tokens: 2048
+    max_seq_len: 2051
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: true
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 2048
+      backend: NIXL
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml
new file mode 100644
index 00000000000..0cda2dd3633
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml
@@ -0,0 +1,91 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp8
+  precision: fp8
+  model_dir_name: Qwen3-235B-A22B-FP8
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '1'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 64
+    max_num_tokens: 2048
+    max_seq_len: 2051
+    cuda_graph_config:
+      enable_padding: true
+      max_batch_size: 128
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: true
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 2048
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+    disable_overlap_scheduler: false
+  ctx:
+    max_batch_size: 32
+    max_num_tokens: 2048
+    max_seq_len: 2051
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: true
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 2048
+      backend: UCX
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml
new file mode 100644
index 00000000000..11ba1857deb
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml
@@ -0,0 +1,91 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp8
+  precision: fp8
+  model_dir_name: Qwen3-235B-A22B-FP8
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '2'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 64
+    max_num_tokens: 2048
+    max_seq_len: 2051
+    cuda_graph_config:
+      enable_padding: true
+      max_batch_size: 128
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: true
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 2048
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+    disable_overlap_scheduler: false
+  ctx:
+    max_batch_size: 32
+    max_num_tokens: 2048
+    max_seq_len: 2051
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: true
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 2048
+      backend: NIXL
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml
new file mode 100644
index 00000000000..9ebbec0b0f7
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml
@@ -0,0 +1,91 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp8
+  precision: fp8
+  model_dir_name: Qwen3-235B-A22B-FP8
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '2'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 64
+    max_num_tokens: 2048
+    max_seq_len: 2051
+    cuda_graph_config:
+      enable_padding: true
+      max_batch_size: 128
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: true
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 2048
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+    disable_overlap_scheduler: false
+  ctx:
+    max_batch_size: 32
+    max_num_tokens: 2048
+    max_seq_len: 2051
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: true
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 2048
+      backend: UCX
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con36_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con36_ccb-NIXL.yaml
new file mode 100644
index 00000000000..8988a5c74b4
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con36_ccb-NIXL.yaml
@@ -0,0 +1,91 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp8
+  precision: fp8
+  model_dir_name: Qwen3-235B-A22B-FP8
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '36'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 64
+    max_num_tokens: 2048
+    max_seq_len: 2051
+    cuda_graph_config:
+      enable_padding: true
+      max_batch_size: 128
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: true
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 2048
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+    disable_overlap_scheduler: false
+  ctx:
+    max_batch_size: 32
+    max_num_tokens: 2048
+    max_seq_len: 2051
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: true
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 2048
+      backend: NIXL
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con36_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con36_ccb-UCX.yaml
new file mode 100644
index 00000000000..0e8277f268b
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con36_ccb-UCX.yaml
@@ -0,0 +1,91 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp8
+  precision: fp8
+  model_dir_name: Qwen3-235B-A22B-FP8
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '36'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 64
+    max_num_tokens: 2048
+    max_seq_len: 2051
+    cuda_graph_config:
+      enable_padding: true
+      max_batch_size: 128
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: true
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 2048
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+    disable_overlap_scheduler: false
+  ctx:
+    max_batch_size: 32
+    max_num_tokens: 2048
+    max_seq_len: 2051
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: true
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 2048
+      backend: UCX
diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml
new file mode 100644
index 00000000000..a5ab0aa447f
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml
@@ -0,0 +1,91 @@
+metadata:
+  model_name: qwen3_235b_a22b_fp8
+  precision: fp8
+  model_dir_name: Qwen3-235B-A22B-FP8
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '4'
+  input_length: 1024
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 64
+    max_num_tokens: 2048
+    max_seq_len: 2051
+    cuda_graph_config:
+      enable_padding: true
+      max_batch_size: 128
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: true
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 2048
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+    disable_overlap_scheduler: false
+  ctx:
+    max_batch_size: 32
+    max_num_tokens: 2048
+    max_seq_len: 2051
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: true
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 2048 + backend: NIXL diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml new file mode 100644 index 00000000000..52ec72394d5 --- /dev/null +++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml @@ -0,0 +1,91 @@ +metadata: + model_name: qwen3_235b_a22b_fp8 + precision: fp8 + model_dir_name: Qwen3-235B-A22B-FP8 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '4' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 2048 + max_seq_len: 2051 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: true + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 2048 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + disable_overlap_scheduler: false + ctx: + max_batch_size: 32 + max_num_tokens: 2048 + max_seq_len: 2051 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: true + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 2048 + backend: UCX diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml new file mode 100644 index 00000000000..7657581678a --- /dev/null +++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml @@ -0,0 +1,91 @@ +metadata: + model_name: qwen3_235b_a22b_fp8 + precision: fp8 + model_dir_name: Qwen3-235B-A22B-FP8 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '8' + input_length: 1024 + output_length: 1024 + 
dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 2048 + max_seq_len: 2051 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: true + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 2048 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + disable_overlap_scheduler: false + ctx: + max_batch_size: 32 + max_num_tokens: 2048 + max_seq_len: 2051 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: true + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 2048 + backend: NIXL diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml new file mode 100644 index 00000000000..bb5c06392f0 --- /dev/null +++ b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml @@ -0,0 +1,91 @@ +metadata: + model_name: qwen3_235b_a22b_fp8 + precision: fp8 + model_dir_name: Qwen3-235B-A22B-FP8 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '8' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 2048 + max_seq_len: 2051 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: true + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 2048 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + disable_overlap_scheduler: false + ctx: + max_batch_size: 
32 + max_num_tokens: 2048 + max_seq_len: 2051 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: true + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 2048 + backend: UCX diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0_con1-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0_con1-Default.yaml new file mode 100644 index 00000000000..3887effd1e1 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0_con1-Default.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 13 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 128 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 4 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0_con4-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0_con4-Default.yaml new file mode 100644 index 00000000000..70b4dd94a61 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0_con4-Default.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: 
disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '4' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 5 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 128 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 4 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3_con1-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3_con1-Default.yaml new file mode 100644 index 00000000000..327c0ae797d --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3_con1-Default.yaml @@ -0,0 +1,105 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 6 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 4 +
max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 4 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0_con1-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0_con1-Default.yaml new file mode 100644 index 00000000000..e966f0280eb --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0_con1-Default.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 7 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 128 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 4 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: 
DEFAULT + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0_con2-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0_con2-Default.yaml new file mode 100644 index 00000000000..61bee8027a9 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0_con2-Default.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '2' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 8 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 128 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 4 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0_con1-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0_con1-Default.yaml new file mode 100644 index 00000000000..1ae02497361 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0_con1-Default.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1' + input_length: 131072 + output_length: 8192 + dataset_file:
+hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 8 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 128 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 4 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0_con2-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0_con2-Default.yaml new file mode 100644 index 00000000000..7b3066f2868 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0_con2-Default.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '2' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 11 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 128 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + 
num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 8 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0_con1-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0_con1-Default.yaml new file mode 100644 index 00000000000..a0e0717b0e3 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0_con1-Default.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 14 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 128 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 8 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3_con1-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3_con1-Default.yaml new file mode 100644 index 00000000000..c382932be0b --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3_con1-Default.yaml @@ -0,0 +1,102 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 
+ model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 8 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + speculative_config: *id001 + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2_con4-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2_con4-Default.yaml new file mode 100644 index 00000000000..3f4aee7d753 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2_con4-Default.yaml @@ -0,0 +1,102 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '4' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + 
gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 12 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 2 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 8 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + speculative_config: *id001 + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0_con1-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0_con1-Default.yaml new file mode 100644 index 00000000000..7f4394f574b --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0_con1-Default.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 128 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 8 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + 
free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3_con1-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3_con1-Default.yaml new file mode 100644 index 00000000000..a5de96fc505 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3_con1-Default.yaml @@ -0,0 +1,103 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 8 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + speculative_config: *id001 + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3_con2-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3_con2-Default.yaml new file mode 100644 index 00000000000..d447a67b3a5 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3_con2-Default.yaml @@ -0,0 +1,103 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: 
--gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '2' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 8 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 8 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + speculative_config: *id001 + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3_con2-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3_con2-Default.yaml new file mode 100644 index 00000000000..d45c42bc8cc --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3_con2-Default.yaml @@ -0,0 +1,103 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '2' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 5 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 8 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: 
+ - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 8 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + speculative_config: *id001 + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2_con2-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2_con2-Default.yaml new file mode 100644 index 00000000000..0c8a3833548 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2_con2-Default.yaml @@ -0,0 +1,103 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '2' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 7 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 6 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 2 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 8 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + speculative_config: *id001 
+ moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0_con1-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0_con1-Default.yaml new file mode 100644 index 00000000000..9a639194509 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0_con1-Default.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 7 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 128 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 8 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0_con4-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0_con4-Default.yaml new file mode 100644 index 00000000000..d0274f8a00d --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0_con4-Default.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '4' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + 
gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 8 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 128 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 8 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3_con2-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3_con2-Default.yaml new file mode 100644 index 00000000000..9167382bddf --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3_con2-Default.yaml @@ -0,0 +1,105 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '2' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 2 + num_gen_servers: 7 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 8 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 
+ speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 4 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0_con8-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0_con8-Default.yaml new file mode 100644 index 00000000000..2d6760bf7dc --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0_con8-Default.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '8' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 2 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 128 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 8 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0_con2-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0_con2-Default.yaml new file mode 100644 index 00000000000..f88d250e52b --- /dev/null +++ 
b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0_con2-Default.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '2' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 2 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 128 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 8 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1_con128-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1_con128-Default.yaml new file mode 100644 index 00000000000..37abd60ca4d --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1_con128-Default.yaml @@ -0,0 +1,104 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '128' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 3 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: 
TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 32 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 4 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0_con16-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0_con16-Default.yaml new file mode 100644 index 00000000000..458b34c4dcc --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0_con16-Default.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '16' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 3 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 128 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 8 + 
print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2_con8-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2_con8-Default.yaml new file mode 100644 index 00000000000..3cef4212ced --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2_con8-Default.yaml @@ -0,0 +1,102 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '8' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 3 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 24 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 2 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 8 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + speculative_config: *id001 + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3_con2-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3_con2-Default.yaml new file mode 100644 index 00000000000..b921750a006 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3_con2-Default.yaml @@ -0,0 +1,102 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: 
disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '2' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 3 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 8 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 8 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + speculative_config: *id001 + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0_con4-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0_con4-Default.yaml new file mode 100644 index 00000000000..85967f76c16 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0_con4-Default.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '4' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 3 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 4 + 
max_num_tokens: 128 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 8 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0_con256-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0_con256-Default.yaml new file mode 100644 index 00000000000..ea7c356b611 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0_con256-Default.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '256' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 5 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 128 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 4 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3_con128-Default.yaml 
b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3_con128-Default.yaml new file mode 100644 index 00000000000..ac17975c3bc --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3_con128-Default.yaml @@ -0,0 +1,104 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '128' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 5 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 4 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3_con64-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3_con64-Default.yaml new file mode 100644 index 00000000000..9d2a844f5d7 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3_con64-Default.yaml @@ -0,0 +1,104 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '64' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + 
num_ctx_servers: 5 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 8 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 4 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0_con128-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0_con128-Default.yaml new file mode 100644 index 00000000000..37b38f9c1af --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0_con128-Default.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '128' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 5 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 128 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + 
cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 4 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1_con256-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1_con256-Default.yaml new file mode 100644 index 00000000000..c7489d2864d --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1_con256-Default.yaml @@ -0,0 +1,104 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '256' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 7 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 32 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 4 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0_con512-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0_con512-Default.yaml new file mode 100644 index 00000000000..3492ea65c42 --- 
/dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0_con512-Default.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '512' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 7 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 4 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1_con512-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1_con512-Default.yaml new file mode 100644 index 00000000000..7be406bdc39 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1_con512-Default.yaml @@ -0,0 +1,104 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '512' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 8 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + 
server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 64 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 4 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3_con128-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3_con128-Default.yaml new file mode 100644 index 00000000000..7a34ac9edd0 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3_con128-Default.yaml @@ -0,0 +1,104 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '128' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 8 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + 
moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 4 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0_con256-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0_con256-Default.yaml new file mode 100644 index 00000000000..9402160635d --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0_con256-Default.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '256' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 8 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 128 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 4 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3_con256-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3_con256-Default.yaml new file mode 100644 index 00000000000..c19dc3ed6f0 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3_con256-Default.yaml @@ -0,0 +1,104 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: 
disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '256' + input_length: 131072 + output_length: 8192 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 8 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 139296 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + max_batch_size: 1 + max_num_tokens: 131104 + max_seq_len: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + enable_attention_dp: false + pipeline_parallel_size: 4 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: DEFAULT + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + moe_config: + backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_con1024_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_con1024_ccb-NIXL.yaml new file mode 100644 index 00000000000..007c4b33258 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_con1024_ccb-NIXL.yaml @@ -0,0 +1,100 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1024' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + trtllm_wheel_path: '' + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + 
tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_con1024_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_con1024_ccb-UCX.yaml new file mode 100644 index 00000000000..d85ff79d08b --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_con1024_ccb-UCX.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1024' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX diff --git 
a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml new file mode 100644 index 00000000000..e837bd894f6 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml @@ -0,0 +1,101 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '16' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml new file mode 100644 index 00000000000..7ff05e15ed5 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml @@ -0,0 +1,101 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '16' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 
+environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml new file mode 100644 index 00000000000..dcbdf85cb14 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml @@ -0,0 +1,101 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + 
max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml new file mode 100644 index 00000000000..fde80dab2da --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml @@ -0,0 +1,101 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml new file mode 100644 index 00000000000..8fc9a9d1255 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml @@ -0,0 +1,101 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: 
disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '2' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml new file mode 100644 index 00000000000..50d674fb79b --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml @@ -0,0 +1,101 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '2' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 
16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml new file mode 100644 index 00000000000..2adc8e2303b --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml @@ -0,0 +1,101 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '32' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml new file mode 100644 index 
00000000000..0126bc21ace --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml @@ -0,0 +1,101 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '32' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml new file mode 100644 index 00000000000..c718acbe9e3 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml @@ -0,0 +1,101 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '4' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + 
server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml new file mode 100644 index 00000000000..b5fa6ccf3fe --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml @@ -0,0 +1,101 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '4' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + 
enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml new file mode 100644 index 00000000000..0f04f4cbea6 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml @@ -0,0 +1,101 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '8' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml new file mode 100644 index 00000000000..55b2c50f50d --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml @@ -0,0 +1,101 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: 
'8' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con16_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con16_ccb-NIXL.yaml new file mode 100644 index 00000000000..f5cfc62133d --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con16_ccb-NIXL.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '16' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 
4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con16_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con16_ccb-UCX.yaml new file mode 100644 index 00000000000..ab3dcc0f94d --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con16_ccb-UCX.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '16' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-NIXL.yaml new file 
mode 100644 index 00000000000..c315d899767 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-NIXL.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-UCX.yaml new file mode 100644 index 00000000000..6bb1b1e3738 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-UCX.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + 
build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con2_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con2_ccb-NIXL.yaml new file mode 100644 index 00000000000..09da6f76da3 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con2_ccb-NIXL.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '2' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + 
speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con2_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con2_ccb-UCX.yaml new file mode 100644 index 00000000000..e87ca8c7867 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con2_ccb-UCX.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '2' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con32_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con32_ccb-NIXL.yaml new file mode 100644 index 00000000000..33ab0317ec6 --- /dev/null +++ 
b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con32_ccb-NIXL.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '32' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con32_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con32_ccb-UCX.yaml new file mode 100644 index 00000000000..2e57bf2e459 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con32_ccb-UCX.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '32' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: 
TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con4_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con4_ccb-NIXL.yaml new file mode 100644 index 00000000000..49b53a8aa7e --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con4_ccb-NIXL.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '4' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + 
num_nextn_predict_layers: 3 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con4_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con4_ccb-UCX.yaml new file mode 100644 index 00000000000..64100cb5be9 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con4_ccb-UCX.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '4' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con8_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con8_ccb-NIXL.yaml new file mode 100644 index 00000000000..ec664924f8f --- /dev/null +++ 
b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con8_ccb-NIXL.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '8' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con8_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con8_ccb-UCX.yaml new file mode 100644 index 00000000000..bec518b6cdf --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con8_ccb-UCX.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '8' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO 
TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml new file mode 100644 index 00000000000..7bf0861937d --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml @@ -0,0 +1,105 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '2048' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 2 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + 
num_nextn_predict_layers: 3 + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-UCX.yaml new file mode 100644 index 00000000000..9e6eda54593 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-UCX.yaml @@ -0,0 +1,105 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '2048' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 2 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_con1024_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_con1024_ccb-UCX.yaml new file mode 100644 index 00000000000..c8f368acfcc --- /dev/null +++ 
b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_con1024_ccb-UCX.yaml @@ -0,0 +1,119 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k + config_index: -1 +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 9256 + tensor_parallel_size: 4 + context_parallel_size: 1 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con16_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con16_ccb-NIXL.yaml new file mode 100644 index 00000000000..8ac786dd975 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con16_ccb-NIXL.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '16' + input_length: 8192 + output_length: 1024
+ dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 3 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 9419 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 1 + max_num_tokens: 8448 + max_seq_len: 9419 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con16_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con16_ccb-UCX.yaml new file mode 100644 index 00000000000..d89f0fb2d40 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con16_ccb-UCX.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '16' + input_length: 8192 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 3 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 9419 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 
+ dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 1 + max_num_tokens: 8448 + max_seq_len: 9419 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con1_ccb-NIXL.yaml new file mode 100644 index 00000000000..7415a85c499 --- /dev/null +++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con1_ccb-NIXL.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1' + input_length: 8192 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 3 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 9419 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + allreduce_strategy: MNNVL + ctx: + max_batch_size: 1 + max_num_tokens: 8448 + max_seq_len: 9419 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con1_ccb-UCX.yaml 
new file mode 100644
index 00000000000..aac703968ad
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con1_ccb-UCX.yaml
@@ -0,0 +1,107 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '1'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 3
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 16
+    max_num_tokens: 64
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con2_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con2_ccb-NIXL.yaml
new file mode 100644
index 00000000000..5aabc9772ea
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con2_ccb-NIXL.yaml
@@ -0,0 +1,107 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '2'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 3
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 16
+    max_num_tokens: 64
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con2_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con2_ccb-UCX.yaml
new file mode 100644
index 00000000000..b3f644b5ae4
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con2_ccb-UCX.yaml
@@ -0,0 +1,107 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '2'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 3
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 16
+    max_num_tokens: 64
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con4_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con4_ccb-NIXL.yaml
new file mode 100644
index 00000000000..0a48fac6f35
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con4_ccb-NIXL.yaml
@@ -0,0 +1,107 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '4'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 3
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 16
+    max_num_tokens: 64
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con4_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con4_ccb-UCX.yaml
new file mode 100644
index 00000000000..bbfe945b8f3
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con4_ccb-UCX.yaml
@@ -0,0 +1,107 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '4'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 3
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 16
+    max_num_tokens: 64
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con8_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con8_ccb-NIXL.yaml
new file mode 100644
index 00000000000..c59b6d28e6b
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con8_ccb-NIXL.yaml
@@ -0,0 +1,107 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '8'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 3
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 16
+    max_num_tokens: 64
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con8_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con8_ccb-UCX.yaml
new file mode 100644
index 00000000000..47037ef0c18
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con8_ccb-UCX.yaml
@@ -0,0 +1,107 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '8'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 3
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 16
+    max_num_tokens: 64
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml
new file mode 100644
index 00000000000..ef0890f0e01
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '16'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 3
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 32
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml
new file mode 100644
index 00000000000..831c08f2516
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '16'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 3
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 32
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml
new file mode 100644
index 00000000000..9af25336a77
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '1'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 3
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 32
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml
new file mode 100644
index 00000000000..6fe180ca053
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '1'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 3
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 32
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml
new file mode 100644
index 00000000000..c5d27e0e4bf
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '2'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 3
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 32
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml
new file mode 100644
index 00000000000..69b6e98e389
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '2'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 3
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 32
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml
new file mode 100644
index 00000000000..8f9aecb808e
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '32'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 3
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 32
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml
new file mode 100644
index 00000000000..48dceec3c08
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '32'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 3
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 32
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml
new file mode 100644
index 00000000000..1d16b831739
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '4'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 3
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 32
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml
new file mode 100644
index 00000000000..c15166fc119
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '4'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 3
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 32
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml
new file mode 100644
index 00000000000..25f9e4045a5
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '8'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 3
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 32
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml
new file mode 100644
index 00000000000..93f024b65fc
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml
@@ -0,0 +1,101 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '8'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 3
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: false
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 32
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: TRTLLM
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    allreduce_strategy: MNNVL
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_con1024_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_con1024_ccb-NIXL.yaml
new file mode 100644
index 00000000000..0b37895f1e6
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_con1024_ccb-NIXL.yaml
@@ -0,0 +1,99 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: "--gres=gpu:4"
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '1024'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 6
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
+  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 16
+    moe_expert_parallel_size: 16
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 64
+    max_num_tokens: 64
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_con1024_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_con1024_ccb-UCX.yaml
new file mode 100644
index 00000000000..856de14f2f7
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_con1024_ccb-UCX.yaml
@@ -0,0 +1,99 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: "--gres=gpu:4"
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '1024'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 6
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
+  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 16
+    moe_expert_parallel_size: 16
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 64
+    max_num_tokens: 64
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-NIXL.yaml
new file mode 100644
index 00000000000..690afbff78b
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-NIXL.yaml
@@ -0,0 +1,105 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: "--gres=gpu:4"
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '512'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 8
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
+  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 32
+    moe_expert_parallel_size: 32
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 16
+    max_num_tokens: 64
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-UCX.yaml
new file mode 100644
index 00000000000..115af8642dd
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-UCX.yaml
@@ -0,0 +1,105 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: "--gres=gpu:4"
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '512'
+  input_length: 8192
+  output_length: 1024
+  dataset_file:
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 8
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
+  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    tensor_parallel_size: 32
+    moe_expert_parallel_size: 32
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 16
+    max_num_tokens: 64
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
+  ctx:
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: UCX
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml
new file mode 100644
index 00000000000..ff139ca6b1e
--- /dev/null
+++ b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml
@@ -0,0 +1,106 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+slurm:
+  script_file: disaggr_torch.slurm
+  partition:
+  account:
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: --gres=gpu:4
+  numa_bind: true
+benchmark:
+  mode: gen_only
+  use_nv_sa_benchmark: false
+  multi_round: 1
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '1024'
+  input_length: 1024
+  output_length: 1024
+  dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 1
+environment:
+  container_mount:
+  container_image:
+  model_path:
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir:
+  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
+    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
+  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+worker_config:
+  gen:
+    enable_layerwise_nvtx_marker: true
tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL_kv-reuse.yaml b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL_kv-reuse.yaml new file mode 100644 index 00000000000..60157103167 --- /dev/null +++ b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL_kv-reuse.yaml @@ -0,0 +1,106 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1024' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: true + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 4 + max_num_tokens: 
4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: true + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-UCX.yaml new file mode 100644 index 00000000000..079871ddbc8 --- /dev/null +++ b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-UCX.yaml @@ -0,0 +1,106 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1024' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml new file mode 100644 index 00000000000..cacafb92d98 --- /dev/null +++ 
b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml @@ -0,0 +1,112 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '2048' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 2 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-UCX.yaml new file mode 100644 index 00000000000..d61f116a617 --- /dev/null +++ b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-UCX.yaml @@ -0,0 +1,112 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '2048' + 
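# 2048 in-flight requests, i.e. 16x the gen worker's max_batch_size of 128 configured below. +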
input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 2 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-DEFAULT.yaml b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-DEFAULT.yaml new file mode 100644 index 00000000000..9a1cb1ac23a --- /dev/null +++ b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-DEFAULT.yaml @@ -0,0 +1,106 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k + config_index: 7 +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '12288' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 2 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + enable_layerwise_nvtx_marker: true + 
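# dep48: a single gen server spanning 48 GPUs (12 nodes x 4 GPUs per node) with attention DP. +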
tensor_parallel_size: 48 + moe_expert_parallel_size: 48 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 1024 + max_num_tokens: 1024 + max_seq_len: 2176 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 8320 + backend: DEFAULT + stream_interval: 20 + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 4 + max_num_tokens: 4480 + max_seq_len: 2176 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8320 + backend: DEFAULT diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-DEFAULT.yaml b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-DEFAULT.yaml new file mode 100644 index 00000000000..9e1996f596b --- /dev/null +++ b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-DEFAULT.yaml @@ -0,0 +1,113 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: disaggr-test + extra_args: --gres=gpu:4 + numa_bind: true +hardware: + gpus_per_node: 4 + num_ctx_servers: 2 + num_gen_servers: 1 +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 9423 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + enable_layerwise_nvtx_marker: 
true + max_batch_size: 1 + max_num_tokens: 8448 + max_seq_len: 9423 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: DEFAULT + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-NIXL.yaml new file mode 100644 index 00000000000..31fcc8ebd03 --- /dev/null +++ b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-NIXL.yaml @@ -0,0 +1,106 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 6 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 9419 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 1 + max_num_tokens: 8448 + max_seq_len: 9419 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-UCX.yaml new file mode 100644 index 00000000000..3433196c31d 
--- /dev/null +++ b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-UCX.yaml @@ -0,0 +1,106 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 6 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 9419 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 1 + max_num_tokens: 8448 + max_seq_len: 9419 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL.yaml new file mode 100644 index 00000000000..0e6bf4e7d98 --- /dev/null +++ b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL.yaml @@ -0,0 +1,112 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '512' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +hardware: + 
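# 8 TP4 context servers (32 prefill GPUs) feed one dep32 (32-GPU) generation server. +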
gpus_per_node: 4 + num_ctx_servers: 8 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 9419 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 1 + max_num_tokens: 8448 + max_seq_len: 9419 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-UCX.yaml new file mode 100644 index 00000000000..b7743c9bd81 --- /dev/null +++ b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-UCX.yaml @@ -0,0 +1,112 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '512' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 8 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 16 + 
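# With MTP3 each request can submit up to 4 tokens per decode step, so max_num_tokens below is presumably 16 x 4 = 64. +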
max_num_tokens: 64 + max_seq_len: 9419 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 1 + max_num_tokens: 8448 + max_seq_len: 9419 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml new file mode 100644 index 00000000000..d194178ac64 --- /dev/null +++ b/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml @@ -0,0 +1,114 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k + config_index: 1 +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 04:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1024' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + ctx: + max_batch_size: 4 
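+ # Presumably sized so four 1k-token prefills (4096 tokens) fit within the 4608-token budget below.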
+ max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml new file mode 100644 index 00000000000..8e4780400a4 --- /dev/null +++ b/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml @@ -0,0 +1,120 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k + config_index: 0 +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 04:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '2048' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 2 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git 
a/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-DEFAULT.yaml b/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-DEFAULT.yaml new file mode 100644 index 00000000000..691ffadfd32 --- /dev/null +++ b/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-DEFAULT.yaml @@ -0,0 +1,113 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k + config_index: 7 +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 04:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '12288' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 2 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 48 + moe_expert_parallel_size: 48 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 1024 + max_num_tokens: 1024 + max_seq_len: 2176 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 8320 + backend: DEFAULT + stream_interval: 20 + ctx: + max_batch_size: 4 + max_num_tokens: 4480 + max_seq_len: 2176 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8320 + backend: DEFAULT diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-DEFAULT.yaml b/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-DEFAULT.yaml new file mode 100644 index 00000000000..d2df7e4574f --- /dev/null +++ b/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-DEFAULT.yaml @@ -0,0 +1,121 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k + config_index: 14 +slurm: + script_file: 
disaggr_torch.slurm + partition: + account: + job_time: 04:00:00 + job_name: disaggr-test + extra_args: --gres=gpu:4 + numa_bind: true +hardware: + gpus_per_node: 4 + num_ctx_servers: 2 + num_gen_servers: 1 +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 9423 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + max_batch_size: 1 + max_num_tokens: 8448 + max_seq_len: 9423 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: DEFAULT + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-NIXL.yaml new file mode 100644 index 00000000000..a7723ba302f --- /dev/null +++ b/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-NIXL.yaml @@ -0,0 +1,114 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k + config_index: 5 +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 04:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 6 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + 
trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 9419 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + ctx: + max_batch_size: 1 + max_num_tokens: 8448 + max_seq_len: 9419 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL.yaml new file mode 100644 index 00000000000..a2df3a17555 --- /dev/null +++ b/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL.yaml @@ -0,0 +1,120 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k + config_index: 4 +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 04:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '512' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 8 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 9419 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: 
true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + max_batch_size: 1 + max_num_tokens: 8448 + max_seq_len: 9419 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/wideep_kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_con16384_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/wideep_kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_con16384_ccb-NIXL.yaml new file mode 100644 index 00000000000..1d2e6f73d5e --- /dev/null +++ b/tests/scripts/perf/disaggregated/wideep_kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_con16384_ccb-NIXL.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: k2_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2-Thinking-NVFP4 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 00:45:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 1 + benchmark_ratio: 1.0 + streaming: true + concurrency_list: '16384' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/k2_thinking-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 3 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1024 + max_num_tokens: 1024 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + - 1024 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + load_balancer: + num_slots: 384 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + stream_interval: 100 + num_postprocess_workers: 4 + trust_remote_code: true + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 8 + max_num_tokens: 8448 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true 
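+ # Prefill-only context worker: CUDA graphs stay off (null) and the overlap scheduler is disabled, matching the other ctx configs in this PR.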
+ kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + trust_remote_code: true diff --git a/tests/scripts/perf/disaggregated/wideep_kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_con8192_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/wideep_kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_con8192_ccb-NIXL.yaml new file mode 100644 index 00000000000..d421bcd7ebe --- /dev/null +++ b/tests/scripts/perf/disaggregated/wideep_kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_con8192_ccb-NIXL.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: k2_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2-Thinking-NVFP4 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 00:45:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 1 + benchmark_ratio: 1.0 + streaming: true + concurrency_list: '8192' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 8 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + load_balancer: + num_slots: 416 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + stream_interval: 100 + num_postprocess_workers: 4 + trust_remote_code: true + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 1 + max_num_tokens: 8448 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + trust_remote_code: true From b411e149bff3ba75a20c2f498b257e03657d60c7 Mon Sep 17 00:00:00 2001 From: JunyiXu-nv <219237550+JunyiXu-nv@users.noreply.github.com> Date: Tue, 10 Mar 2026 13:24:53 +0800 Subject: [PATCH 2/6] [TRTLLM-11246][feat] Add tool parser support for GLM-4 models (#11986) Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com> --- tensorrt_llm/serve/tool_parser/glm4_parser.py | 488 ++++++++++++++++++ .../serve/tool_parser/tool_parser_factory.py | 2 + tensorrt_llm/serve/tool_parser/utils.py | 81 ++- .../unittest/llmapi/apps/test_tool_parsers.py | 236 +++++++++ 4 files changed, 806 insertions(+), 1 deletion(-) create mode 100644 
tensorrt_llm/serve/tool_parser/glm4_parser.py
diff --git a/tensorrt_llm/serve/tool_parser/glm4_parser.py b/tensorrt_llm/serve/tool_parser/glm4_parser.py
new file mode 100644
index 00000000000..540f3d84230
--- /dev/null
+++ b/tensorrt_llm/serve/tool_parser/glm4_parser.py
@@ -0,0 +1,488 @@
+# Adapted from https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/function_call/glm4_moe_detector.py
+import ast
+import json
+import re
+from enum import Enum
+from typing import Any, Dict, List, Optional, Tuple
+
+from tensorrt_llm.logger import logger
+from tensorrt_llm.serve.openai_protocol import ChatCompletionToolsParam as Tool
+from tensorrt_llm.serve.tool_parser.base_tool_parser import BaseToolParser
+from tensorrt_llm.serve.tool_parser.core_types import (
+    StreamingParseResult,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+
+from .utils import infer_type_from_json_schema
+
+
+class StreamState(str, Enum):
+    """State machine states for XML to JSON streaming conversion."""
+
+    INIT = "INIT"
+    BETWEEN = "BETWEEN"
+    IN_KEY = "IN_KEY"
+    WAITING_VALUE = "WAITING_VALUE"
+    IN_VALUE = "IN_VALUE"
+
+
+def get_argument_type(func_name: str, arg_key: str, defined_tools: List[Tool]) -> Optional[str]:
+    """Get the expected type of a function argument from tool definitions."""
+    name2tool = {tool.function.name: tool for tool in defined_tools}
+    if func_name not in name2tool:
+        return None
+    tool = name2tool[func_name]
+    properties = (tool.function.parameters or {}).get("properties", {})
+    if not isinstance(properties, dict):
+        properties = {}
+    if arg_key not in properties:
+        return None
+    return infer_type_from_json_schema(properties[arg_key])
+
+
+def _convert_to_number(value: str) -> Any:
+    """Convert string to appropriate number type (int or float)."""
+    try:
+        if "." in value or "e" in value.lower():
+            return float(value)
+        else:
+            return int(value)
+    except (ValueError, AttributeError):
+        return value
+
+
+def parse_arguments(json_value: str, arg_type: Optional[str] = None) -> Tuple[Any, bool]:
+    """Parse argument value with multiple fallback strategies.
+
+    Returns:
+        Tuple of (parsed_value, is_valid_json)
+    """
+    try:
+        parsed_value = json.loads(json_value)
+        if arg_type == "number" and isinstance(parsed_value, str):
+            parsed_value = _convert_to_number(parsed_value)
+        return parsed_value, True
+    except (json.JSONDecodeError, ValueError):
+        pass
+
+    try:
+        wrapped = json.loads('{"tmp": "' + json_value + '"}')
+        parsed_value = json.loads(wrapped["tmp"])
+        if arg_type == "number" and isinstance(parsed_value, str):
+            parsed_value = _convert_to_number(parsed_value)
+        return parsed_value, True
+    except (json.JSONDecodeError, ValueError, KeyError):
+        pass
+
+    try:
+        parsed_value = ast.literal_eval(json_value)
+        return parsed_value, True
+    except (ValueError, SyntaxError):
+        pass
+
+    try:
+        quoted_value = json.dumps(str(json_value))
+        return json.loads(quoted_value), True
+    except (json.JSONDecodeError, ValueError):
+        return json_value, False
+
+
+class Glm4ToolParser(BaseToolParser):
+    r"""Tool parser for GLM-4.5 and GLM-4.6 models.
+
+    Assumes function call format (with actual newlines):
+    <tool_call>get_weather
+    <arg_key>city</arg_key>
+    <arg_value>北京</arg_value>
+    <arg_key>date</arg_key>
+    <arg_value>2024-06-27</arg_value>
+    </tool_call>
+
+    Or with literal \n characters (escaped as \\n in the output):
+    <tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>北京</arg_value>\n</tool_call>
+
+    Uses a streaming state machine to convert XML to JSON incrementally.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<tool_call>"  # nosec B105
+        self.eot_token = "</tool_call>"  # nosec B105
+        self.func_call_regex = r"<tool_call>.*?</tool_call>"
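+        # func_detail_regex below splits one complete <tool_call> block into
+        # (function name, raw argument text); func_arg_regex then extracts the
+        # <arg_key>/<arg_value> pairs from that raw text.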
+        self.func_detail_regex = re.compile(
+            r"<tool_call>(.*?)(?:\\n|\n)(.*)</tool_call>", re.DOTALL
+        )
+        self.func_arg_regex = re.compile(
+            r"<arg_key>(.*?)</arg_key>(?:\\n|\s)*<arg_value>(.*?)</arg_value>",
+            re.DOTALL,
+        )
+        self._last_arguments = ""
+        self.current_tool_id = -1
+        self.current_tool_name_sent = False
+        self._streamed_raw_length = 0
+        self._reset_streaming_state()
+
+    def _reset_streaming_state(self) -> None:
+        """Reset the streaming state machine for a new tool call."""
+        self._stream_state = StreamState.INIT
+        self._current_key = ""
+        self._current_value = ""
+        self._xml_tag_buffer = ""
+        self._is_first_param = True
+        self._value_started = False
+        self._cached_value_type: Optional[str] = None
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a GLM-4 format tool call."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """One-time parsing: Detects and parses tool calls in the provided text."""
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+        match_result_list = re.findall(self.func_call_regex, text, re.DOTALL)
+        calls = []
+        try:
+            for match_result in match_result_list:
+                func_detail = self.func_detail_regex.search(match_result)
+                if func_detail is None:
+                    continue
+                func_name = func_detail.group(1) if func_detail.group(1) else ""
+                func_args = func_detail.group(2) if func_detail.group(2) else ""
+                pairs = self.func_arg_regex.findall(func_args)
+
+                arguments = self._parse_argument_pairs(pairs, func_name, tools)
+
+                match_result = {"name": func_name, "parameters": arguments}
+                calls.extend(self.parse_base_json(match_result, tools))
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
+        except Exception as e:
+            logger.error(f"Error in detect_and_parse: {e}")
+            return StreamingParseResult(normal_text=text)
+
+    def _get_value_type(self, func_name: str, key: str, tools: List[Tool]) -> str:
+        """Get parameter type from tool definition, with fallback to auto-detection."""
+        arg_type = get_argument_type(func_name, key, tools)
+        if arg_type:
+            return arg_type
+
+        value_content = self._current_value.strip() if self._current_value else ""
+
+        if not value_content:
+            return "string"
+
+        try:
+            parsed = json.loads(value_content)
+            if isinstance(parsed, dict):
+                return "object"
+            elif isinstance(parsed, list):
+                return "array"
+            elif isinstance(parsed, bool):
+                return "boolean"
+            elif isinstance(parsed, (int, float)):
+                return "number"
+            elif isinstance(parsed, str):
+                if parsed.isdigit() or (parsed.startswith("-") and parsed[1:].isdigit()):
+                    return "number"
+                return "string"
+        except json.JSONDecodeError:
+            first_char = value_content[0] if value_content else ""
+            if first_char.isdigit() or first_char in ["-", "."]:
+                return "number"
+            elif first_char in ["{", "["]:
+                return "object"
+            elif first_char in ['"', "'"]:
+                return "string"
+
+        return "string"
+
+    def _format_value_complete(self, value: str, value_type: str) -> str:
+        """Format complete value based on type."""
+        if value_type == "string":
+            return json.dumps(value, ensure_ascii=False)
+        elif value_type == "number":
+            try:
+                num = _convert_to_number(value.strip())
+                return str(num)
+            except (ValueError, AttributeError):
+                logger.warning(f"Failed to parse '{value}' as number, treating as string")
+                return json.dumps(str(value), ensure_ascii=False)
+        else:
+            return value
+
+    def _process_xml_to_json_streaming(
+        self, raw_increment: str, func_name: str, tools: List[Tool]
+        func_name: str, tools: List[Tool]
+    ) -> str:
+        """Convert XML increment to JSON streaming output using state machine.
+
+        Processes XML fragments character by character and converts them
+        to JSON format incrementally, maintaining state across calls.
+        """
+        json_output = ""
+
+        for char in raw_increment:
+            self._xml_tag_buffer += char
+
+            if self._stream_state in [StreamState.INIT, StreamState.BETWEEN]:
+                if self._xml_tag_buffer.endswith("<arg_key>"):
+                    self._stream_state = StreamState.IN_KEY
+                    self._current_key = ""
+                    self._xml_tag_buffer = ""
+                    json_output += "{" if self._is_first_param else ", "
+                    self._is_first_param = False
+
+            elif self._stream_state == StreamState.IN_KEY:
+                if self._xml_tag_buffer.endswith("</arg_key>"):
+                    self._current_key = self._xml_tag_buffer[:-10].strip()
+                    self._xml_tag_buffer = ""
+                    self._stream_state = StreamState.WAITING_VALUE
+                    json_output += json.dumps(self._current_key, ensure_ascii=False) + ": "
+
+            elif self._stream_state == StreamState.WAITING_VALUE:
+                if self._xml_tag_buffer.endswith("<arg_value>"):
+                    self._stream_state = StreamState.IN_VALUE
+                    self._current_value = ""
+                    self._xml_tag_buffer = ""
+                    self._value_started = False
+                    self._cached_value_type = self._get_value_type(
+                        func_name, self._current_key, tools
+                    )
+
+            elif self._stream_state == StreamState.IN_VALUE:
+                if self._xml_tag_buffer.endswith("</arg_value>"):
+                    final_value = self._xml_tag_buffer[:-12]
+                    self._current_value += final_value
+
+                    value_type = self._cached_value_type or "string"
+
+                    if self._value_started:
+                        if final_value:
+                            if value_type == "string":
+                                json_output += json.dumps(final_value, ensure_ascii=False)[1:-1]
+                            else:
+                                json_output += final_value
+                        if value_type == "string":
+                            json_output += '"'
+                    else:
+                        json_output += self._format_value_complete(self._current_value, value_type)
+
+                    self._xml_tag_buffer = ""
+                    self._stream_state = StreamState.BETWEEN
+                    self._current_value = ""
+                    self._value_started = False
+                    self._cached_value_type = None
+                else:
+                    closing_tag = "</arg_value>"
+                    is_potential_closing = len(self._xml_tag_buffer) <= len(
+                        closing_tag
+                    ) and closing_tag.startswith(self._xml_tag_buffer)
+
+                    if not is_potential_closing:
+                        content = self._xml_tag_buffer
+                        value_type = self._cached_value_type or "string"
+
+                        if value_type == "string":
+                            if not self._value_started:
+                                json_output += '"'
+                                self._value_started = True
+                            if content:
+                                json_output += json.dumps(content, ensure_ascii=False)[1:-1]
+                            self._current_value += content
+                            self._xml_tag_buffer = ""
+                        elif value_type == "number":
+                            if content:
+                                if not self._value_started:
+                                    self._value_started = True
+                                json_output += content
+                                self._current_value += content
+                                self._xml_tag_buffer = ""
+                        else:
+                            if content:
+                                if not self._value_started:
+                                    self._value_started = True
+                                json_output += content
+                                self._current_value += content
+                                self._xml_tag_buffer = ""
+
+        return json_output
+
+    def parse_streaming_increment(self, new_text: str, tools: List[Tool]) -> StreamingParseResult:
+        """Streaming incremental parsing for GLM-4 format.
+
+        Uses a state machine to convert XML to JSON incrementally for
+        true character-by-character streaming.
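+
+        Illustrative sketch (hypothetical chunk boundaries; real increments
+        may split anywhere, including mid-tag):
+
+            parser.parse_streaming_increment("<tool_call>get_weather\n", tools)
+            # emits ToolCallItem(name="get_weather", parameters="")
+            parser.parse_streaming_increment("<arg_key>location</arg_key>\n", tools)
+            # emits parameters='{"location": '
+            parser.parse_streaming_increment("<arg_value>NYC</arg_value>\n</tool_call>", tools)
+            # emits parameters='"NYC"', then '}' to close the JSON object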
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        has_tool_call = self.bot_token in current_text
+
+        if not has_tool_call:
+            is_potential_start = any(
+                self.bot_token.startswith(current_text[-i:])
+                for i in range(1, min(len(current_text), len(self.bot_token)) + 1)
+            )
+
+            if not is_potential_start:
+                output_text = current_text
+                self._buffer = ""
+                if self.eot_token in output_text:
+                    output_text = output_text.replace(self.eot_token, "")
+                return StreamingParseResult(normal_text=output_text)
+            else:
+                return StreamingParseResult(normal_text="", calls=[])
+
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+
+        calls: list[ToolCallItem] = []
+        try:
+            partial_match = re.search(
+                pattern=r"<tool_call>(.*?)(?:\\n|\n)(.*?)(</tool_call>|$)",
+                string=current_text,
+                flags=re.DOTALL,
+            )
+            if partial_match:
+                func_name_raw = partial_match.group(1)
+                func_args_raw = partial_match.group(2)
+                is_tool_end = partial_match.group(3)
+
+                if func_name_raw is None or not func_name_raw.strip():
+                    return StreamingParseResult(normal_text="", calls=[])
+
+                func_name = func_name_raw.strip()
+                func_args_raw = func_args_raw.strip() if func_args_raw else ""
+
+                if self.current_tool_id == -1:
+                    self.current_tool_id = 0
+                    self.prev_tool_call_arr = []
+                    self.streamed_args_for_tool = [""]
+                    self._streamed_raw_length = 0
+                    self.current_tool_name_sent = False
+                    self._reset_streaming_state()
+
+                while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                    self.prev_tool_call_arr.append({})
+                while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                    self.streamed_args_for_tool.append("")
+
+                if not self.current_tool_name_sent:
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            name=func_name,
+                            parameters="",
+                        )
+                    )
+                    self.current_tool_name_sent = True
+                    self._streamed_raw_length = 0
+                    self._reset_streaming_state()
+                    self.prev_tool_call_arr[self.current_tool_id] = {
+                        "name": func_name,
+                        "arguments": {},
+                    }
+                else:
+                    current_raw_length = len(func_args_raw)
+
+                    if current_raw_length > self._streamed_raw_length:
+                        raw_increment = func_args_raw[self._streamed_raw_length:]
+
+                        json_increment = self._process_xml_to_json_streaming(
+                            raw_increment, func_name, tools
+                        )
+
+                        self._streamed_raw_length = current_raw_length
+
+                        if json_increment:
+                            calls.append(
+                                ToolCallItem(
+                                    tool_index=self.current_tool_id,
+                                    name=None,
+                                    parameters=json_increment,
+                                )
+                            )
+                            self._last_arguments += json_increment
+                            self.streamed_args_for_tool[self.current_tool_id] += json_increment
+
+                if is_tool_end == self.eot_token:
+                    if self._is_first_param:
+                        empty_object = "{}"
+                        calls.append(
+                            ToolCallItem(
+                                tool_index=self.current_tool_id,
+                                name=None,
+                                parameters=empty_object,
+                            )
+                        )
+                        self._last_arguments += empty_object
+                    elif not self._last_arguments.endswith("}"):
+                        closing_brace = "}"
+                        calls.append(
+                            ToolCallItem(
+                                tool_index=self.current_tool_id,
+                                name=None,
+                                parameters=closing_brace,
+                            )
+                        )
+                        self._last_arguments += closing_brace
+                        self.streamed_args_for_tool[self.current_tool_id] += closing_brace
+
+                    try:
+                        pairs = self.func_arg_regex.findall(func_args_raw)
+                        if pairs:
+                            arguments = self._parse_argument_pairs(pairs, func_name, tools)
+                            self.prev_tool_call_arr[self.current_tool_id]["arguments"] = (
+                                arguments
+                            )
+                    except Exception as e:
+                        logger.debug(f"Failed to parse arguments: {e}")
+
+                    self._buffer = current_text[partial_match.end(3):]
+
+                    result = StreamingParseResult(normal_text="", calls=calls)
+                    self.current_tool_id += 1
+                    self._last_arguments = ""
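+                    # Remaining per-call bookkeeping is cleared here (tool_index
+                    # was advanced above) so a subsequent <tool_call> in the same
+                    # stream starts from a fresh state machine.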
+                    self.current_tool_name_sent = False
+                    self._streamed_raw_length = 0
+                    self._reset_streaming_state()
+                    return result
+
+            return StreamingParseResult(normal_text="", calls=calls)
+
+        except Exception as e:
+            logger.error(f"Error in parse_streaming_increment: {e}")
+            return StreamingParseResult(normal_text=current_text)
+
+    def _parse_argument_pairs(
+        self, pairs: List[Tuple[str, str]], func_name: str, tools: List[Tool]
+    ) -> Dict[str, Any]:
+        """Parse argument key-value pairs with type coercion."""
+        arguments = {}
+        for arg_key, arg_value in pairs:
+            arg_key = arg_key.strip()
+            arg_value = arg_value.strip()
+            arg_type = get_argument_type(func_name, arg_key, tools)
+            parsed_value, is_good_json = parse_arguments(arg_value, arg_type)
+
+            if arg_type == "string":
+                if isinstance(parsed_value, str):
+                    arguments[arg_key] = parsed_value
+                elif isinstance(parsed_value, (dict, list)):
+                    arguments[arg_key] = json.dumps(parsed_value, ensure_ascii=False)
+                else:
+                    arguments[arg_key] = str(parsed_value)
+            elif arg_type is None:
+                arguments[arg_key] = parsed_value if is_good_json else arg_value
+            else:
+                arguments[arg_key] = parsed_value if is_good_json else arg_value
+
+        return arguments
+
+    def supports_structural_tag(self) -> bool:
+        return False
+
+    def structure_info(self) -> _GetInfoFunc:
+        raise NotImplementedError()
diff --git a/tensorrt_llm/serve/tool_parser/tool_parser_factory.py b/tensorrt_llm/serve/tool_parser/tool_parser_factory.py
index c76246cf39e..f3bf95cd941 100644
--- a/tensorrt_llm/serve/tool_parser/tool_parser_factory.py
+++ b/tensorrt_llm/serve/tool_parser/tool_parser_factory.py
@@ -4,6 +4,7 @@
 from .deepseekv3_parser import DeepSeekV3Parser
 from .deepseekv31_parser import DeepSeekV31Parser
 from .deepseekv32_parser import DeepSeekV32Parser
+from .glm4_parser import Glm4ToolParser
 from .kimi_k2_tool_parser import KimiK2ToolParser
 from .qwen3_coder_parser import Qwen3CoderToolParser
 from .qwen3_tool_parser import Qwen3ToolParser
@@ -17,6 +18,7 @@ class ToolParserFactory:
         "deepseek_v3": DeepSeekV3Parser,
         "deepseek_v31": DeepSeekV31Parser,
         "deepseek_v32": DeepSeekV32Parser,
+        "glm4": Glm4ToolParser,
     }
 
     @staticmethod
diff --git a/tensorrt_llm/serve/tool_parser/utils.py b/tensorrt_llm/serve/tool_parser/utils.py
index 7666036be50..2c143bae51e 100644
--- a/tensorrt_llm/serve/tool_parser/utils.py
+++ b/tensorrt_llm/serve/tool_parser/utils.py
@@ -2,7 +2,7 @@
 import json
 from json import JSONDecodeError, JSONDecoder
 from json.decoder import WHITESPACE
-from typing import Any
+from typing import Any, Dict, Optional
 
 import partial_json_parser
 from partial_json_parser.core.options import Allow
@@ -54,3 +54,82 @@ def is_complete_json(input_str: str) -> bool:
         return True
     except JSONDecodeError:
         return False
+
+
+# Adapted from https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/function_call/utils.py
+def infer_type_from_json_schema(schema: Dict[str, Any]) -> Optional[str]:
+    """Infer the primary type of a parameter from JSON Schema.
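+
+    For example (hypothetical schemas, shown only to illustrate the
+    inference rules listed below):
+
+        >>> infer_type_from_json_schema({"type": ["string", "null"]})
+        'string'
+        >>> infer_type_from_json_schema({"anyOf": [{"type": "integer"}, {"type": "null"}]})
+        'integer'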
+
+    Supports complex JSON Schema structures including:
+    - Direct type field (including type arrays)
+    - anyOf/oneOf: parameter can be any of multiple types
+    - enum: parameter must be one of enum values
+    - allOf: parameter must satisfy all type definitions
+    - properties: inferred as object type
+    - items: inferred as array type
+    """
+    if not isinstance(schema, dict):
+        return None
+
+    if "type" in schema:
+        type_value = schema["type"]
+        if isinstance(type_value, str):
+            return type_value
+        elif isinstance(type_value, list) and type_value:
+            non_null_types = [t for t in type_value if t != "null"]
+            if non_null_types:
+                return non_null_types[0]
+            return "string"
+
+    if "anyOf" in schema or "oneOf" in schema:
+        schemas = schema.get("anyOf") or schema.get("oneOf")
+        types = []
+        if isinstance(schemas, list):
+            for sub_schema in schemas:
+                inferred_type = infer_type_from_json_schema(sub_schema)
+                if inferred_type:
+                    types.append(inferred_type)
+        if types:
+            if len(set(types)) == 1:
+                return types[0]
+            if "string" in types:
+                return "string"
+            return types[0]
+
+    if "enum" in schema and isinstance(schema["enum"], list):
+        if not schema["enum"]:
+            return "string"
+        enum_types = set()
+        for value in schema["enum"]:
+            if value is None:
+                enum_types.add("null")
+            elif isinstance(value, bool):
+                enum_types.add("boolean")
+            elif isinstance(value, int):
+                enum_types.add("integer")
+            elif isinstance(value, float):
+                enum_types.add("number")
+            elif isinstance(value, str):
+                enum_types.add("string")
+            elif isinstance(value, list):
+                enum_types.add("array")
+            elif isinstance(value, dict):
+                enum_types.add("object")
+        if len(enum_types) == 1:
+            return enum_types.pop()
+        return "string"
+
+    if "allOf" in schema and isinstance(schema["allOf"], list):
+        for sub_schema in schema["allOf"]:
+            inferred_type = infer_type_from_json_schema(sub_schema)
+            if inferred_type and inferred_type != "string":
+                return inferred_type
+        return "string"
+
+    if "properties" in schema:
+        return "object"
+
+    if "items" in schema:
+        return "array"
+
+    return None
diff --git a/tests/unittest/llmapi/apps/test_tool_parsers.py b/tests/unittest/llmapi/apps/test_tool_parsers.py
index b2032d1fbac..4d1252e011a 100644
--- a/tests/unittest/llmapi/apps/test_tool_parsers.py
+++ b/tests/unittest/llmapi/apps/test_tool_parsers.py
@@ -26,6 +26,7 @@
 from tensorrt_llm.serve.tool_parser.deepseekv3_parser import DeepSeekV3Parser
 from tensorrt_llm.serve.tool_parser.deepseekv31_parser import DeepSeekV31Parser
 from tensorrt_llm.serve.tool_parser.deepseekv32_parser import DeepSeekV32Parser
+from tensorrt_llm.serve.tool_parser.glm4_parser import Glm4ToolParser
 from tensorrt_llm.serve.tool_parser.kimi_k2_tool_parser import KimiK2ToolParser
 from tensorrt_llm.serve.tool_parser.qwen3_coder_parser import \
     Qwen3CoderToolParser
@@ -1315,6 +1316,241 @@ def test_deepseek_v32_format_compliance(self, sample_tools, parser):
         assert json.loads(result.calls[0].parameters) == {"location": "NYC"}
 
 
+# ============================================================================
+# Glm4ToolParser Tests
+# ============================================================================
+
+
+class TestGlm4ToolParser(BaseToolParserTestClass):
+    """Test suite for Glm4ToolParser class."""
+
+    def make_parser(self):
+        return Glm4ToolParser()
+
+    def make_tool_parser_test_cases(self):
+        single_text = ("Normal text<tool_call>get_weather\n"
+                       "<arg_key>location</arg_key>\n"
+                       "<arg_value>NYC</arg_value>\n"
+                       "</tool_call>")
+        single_expected_normal = "Normal text"
+        single_expected_name = "get_weather"
+        single_expected_params = {"location": "NYC"}
"NYC"} + + multiple_text = ("get_weather\n" + "location\n" + "LA\n" + "" + "search_web\n" + "query\n" + "AI\n" + "") + multiple_names = ("get_weather", "search_web") + + malformed_text = ("get_weather" + "MALFORMED_NO_NEWLINE") + + with_parameters_text = ("search_web\n" + "query\n" + "test\n" + "") + with_parameters_name = "search_web" + with_parameters_params = {"query": "test"} + + partial_bot_token = "undefined_func\n" + "arg\n" + "value\n" + "") + + return ToolParserTestCases( + has_tool_call_true= + "Some text get_weather\nlocation\nNYC\n", + detect_and_parse_single_tool=( + single_text, + single_expected_normal, + single_expected_name, + single_expected_params, + ), + detect_and_parse_multiple_tools=(multiple_text, multiple_names), + detect_and_parse_malformed_tool=malformed_text, + detect_and_parse_with_parameters_key=( + with_parameters_text, + with_parameters_name, + with_parameters_params, + ), + parse_streaming_increment_partial_bot_token=partial_bot_token, + undefined_tool=undefined_tool_text, + ) + + def test_initialization(self, parser): + """Test that Glm4ToolParser initializes correctly.""" + assert parser.bot_token == "" + assert parser.eot_token == "" + + def test_parse_streaming_increment_complete_tool_call( + self, sample_tools, parser): + """Test streaming parser with complete tool call in chunks.""" + + # Send bot token with function name + result = parser.parse_streaming_increment("get_weather\n", + sample_tools) + + # Should send tool name + assert len(result.calls) == 1 + assert result.calls[0].name == "get_weather" + assert result.calls[0].parameters == "" + + # Send arguments + result = parser.parse_streaming_increment( + "location\n" + "SF\n" + "", sample_tools) + + # Should stream arguments and complete the tool call + all_params = "".join(call.parameters for call in result.calls + if call.parameters) + assert "location" in all_params + assert "SF" in all_params + + def test_parse_streaming_increment_multiple_tools_streaming( + self, sample_tools, parser): + """Test streaming parser handles multiple tool calls.""" + + # First tool + parser.parse_streaming_increment("get_weather\n", + sample_tools) + parser.parse_streaming_increment( + "location\n" + "NYC\n" + "", sample_tools) + + # Second tool + result = parser.parse_streaming_increment("search_web\n", + sample_tools) + + # Should have started second tool + assert len(result.calls) == 1 + assert result.calls[0].name == "search_web" + assert result.calls[0].parameters == "" + assert result.calls[0].tool_index == 1 + + def test_parse_streaming_multiple_params(self, sample_tools, parser): + """Test streaming parser handles multiple parameters.""" + + # Send function name + parser.parse_streaming_increment("get_weather\n", + sample_tools) + + # Send first parameter + result1 = parser.parse_streaming_increment( + "location\n" + "NYC\n", sample_tools) + + params1 = "".join(call.parameters for call in result1.calls + if call.parameters) + assert "location" in params1 + + # Send second parameter and close + result2 = parser.parse_streaming_increment( + "unit\n" + "celsius\n" + "", sample_tools) + + params2 = "".join(call.parameters for call in result2.calls + if call.parameters) + assert "unit" in params2 + + def test_detect_and_parse_multiple_params(self, sample_tools): + """Test one-shot parsing with multiple parameters.""" + parser = Glm4ToolParser() + text = ("get_weather\n" + "location\n" + "Tokyo\n" + "unit\n" + "celsius\n" + "") + + result = parser.detect_and_parse(text, sample_tools) + + assert 
+        assert result.calls[0].name == "get_weather"
+        params = json.loads(result.calls[0].parameters)
+        assert params == {"location": "Tokyo", "unit": "celsius"}
+
+    def test_detect_and_parse_with_number_type(self):
+        """Test parsing with number type coercion."""
+        parser = Glm4ToolParser()
+        tools = [
+            ChatCompletionToolsParam(
+                type="function",
+                function=FunctionDefinition(
+                    name="set_temperature",
+                    description="Set temperature",
+                    parameters={
+                        "type": "object",
+                        "properties": {
+                            "value": {
+                                "type": "number",
+                            },
+                            "label": {
+                                "type": "string",
+                            },
+                        },
+                        "required": ["value"],
+                    },
+                ),
+            )
+        ]
+
+        text = ("<tool_call>set_temperature\n"
+                "<arg_key>value</arg_key>\n"
+                "<arg_value>72.5</arg_value>\n"
+                "<arg_key>label</arg_key>\n"
+                "<arg_value>room temp</arg_value>\n"
+                "</tool_call>")
+
+        result = parser.detect_and_parse(text, tools)
+
+        assert len(result.calls) == 1
+        params = json.loads(result.calls[0].parameters)
+        assert params["value"] == 72.5
+        assert params["label"] == "room temp"
+
+    def test_glm4_format_compliance(self, sample_tools, parser):
+        """Test that Glm4ToolParser follows the documented format structure."""
+
+        text = ("<tool_call>get_weather\n"
+                "<arg_key>location</arg_key>\n"
+                "<arg_value>Tokyo</arg_value>\n"
+                "</tool_call>")
+
+        result = parser.detect_and_parse(text, sample_tools)
+
+        assert len(result.calls) == 1
+        assert result.calls[0].name == "get_weather"
+        assert json.loads(result.calls[0].parameters) == {"location": "Tokyo"}
+
+    def test_streaming_no_args(self, sample_tools, parser):
+        """Test streaming a tool call with no arguments."""
+
+        # First increment sends the tool name
+        result1 = parser.parse_streaming_increment("<tool_call>get_weather\n",
+                                                   sample_tools)
+        names = [c.name for c in result1.calls if c.name]
+        assert "get_weather" in names
+
+        # Second increment closes the tool call with empty args
+        result2 = parser.parse_streaming_increment("</tool_call>", sample_tools)
+        params = "".join(c.parameters for c in result2.calls)
+        assert "{}" in params
+
+    def test_supports_structural_tag(self, parser):
+        """Test that supports_structural_tag returns False."""
+        assert parser.supports_structural_tag() is False
+
+
 # ============================================================================
 # Integration Tests
 # ============================================================================

From a20de8832494a2b9d15061fc635b0b7070233b3c Mon Sep 17 00:00:00 2001
From: JunyiXu-nv <219237550+JunyiXu-nv@users.noreply.github.com>
Date: Tue, 10 Mar 2026 13:36:03 +0800
Subject: [PATCH 3/6] [https://nvbugs/5937478][fix] Fix DS v32 tool calling type and parse error (#11935)

Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com>
---
 tensorrt_llm/serve/openai_server.py                  | 7 +++++++
 tensorrt_llm/serve/tool_parser/base_tool_parser.py   | 2 ++
 tensorrt_llm/serve/tool_parser/deepseekv32_parser.py | 8 +++++++-
 tensorrt_llm/tokenizer/deepseek_v32/encoding.py      | 3 ++-
 4 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/tensorrt_llm/serve/openai_server.py b/tensorrt_llm/serve/openai_server.py
index 4ab06057210..17f85b2da34 100644
--- a/tensorrt_llm/serve/openai_server.py
+++ b/tensorrt_llm/serve/openai_server.py
@@ -81,6 +81,7 @@
 from tensorrt_llm.serve.responses_utils import get_steady_clock_now_in_seconds
 from tensorrt_llm.serve.responses_utils import \
     request_preprocess as responses_api_request_preprocess
+from tensorrt_llm.serve.tool_parser.tool_parser_factory import ToolParserFactory
 from tensorrt_llm.serve.visual_gen_utils import (VIDEO_STORE,
                                                  parse_visual_gen_params)
 from tensorrt_llm.version import __version__ as VERSION
@@ -809,6 +810,12 @@ async def chat_stream_generator(
                 gather_generation_logits,
reasoning_parser=self.generator.args.reasoning_parser, backend=self.generator.args.backend) + if self.tool_parser and request.tools: + tool_parser_cls = ToolParserFactory.parsers.get( + self.tool_parser.lower()) + if tool_parser_cls and getattr( + tool_parser_cls, 'needs_raw_special_tokens', False): + sampling_params.skip_special_tokens = False postproc_args = ChatPostprocArgs.from_request(request) disaggregated_params = to_llm_disaggregated_params( request.disaggregated_params) diff --git a/tensorrt_llm/serve/tool_parser/base_tool_parser.py b/tensorrt_llm/serve/tool_parser/base_tool_parser.py index 9fa87ec0bf6..77353f4c6e7 100644 --- a/tensorrt_llm/serve/tool_parser/base_tool_parser.py +++ b/tensorrt_llm/serve/tool_parser/base_tool_parser.py @@ -16,6 +16,8 @@ class BaseToolParser(ABC): """Base class providing two sets of interfaces: one-time and streaming incremental.""" + needs_raw_special_tokens: bool = False + def __init__(self): # Streaming state management # Buffer for accumulating incomplete patterns that arrive across multiple streaming chunks diff --git a/tensorrt_llm/serve/tool_parser/deepseekv32_parser.py b/tensorrt_llm/serve/tool_parser/deepseekv32_parser.py index fc74d46e219..25c49ae2a2c 100644 --- a/tensorrt_llm/serve/tool_parser/deepseekv32_parser.py +++ b/tensorrt_llm/serve/tool_parser/deepseekv32_parser.py @@ -61,6 +61,10 @@ class DeepSeekV32Parser(BaseToolParser): Reference: DeepSeek V3.2 format specification """ + needs_raw_special_tokens = True + + _eos_token = "<|end▁of▁sentence|>" # nosec B105 + def __init__(self): super().__init__() self.bot_token = "<|DSML|function_calls>" # nosec B105 @@ -118,6 +122,8 @@ def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult :param tools: List of available tools. :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls. 
""" + if self._eos_token in text: + text = text.replace(self._eos_token, "") idx = text.find(self.bot_token) normal_text = text[:idx].strip() if idx != -1 else text if self.bot_token not in text: @@ -177,7 +183,7 @@ def parse_streaming_increment(self, new_text: str, tools: List[Tool]) -> Streami if not has_tool_call and not potentially_dsml and not ends_with_prefix: self._buffer = "" - for e_token in [self.eot_token, self.invoke_end_token]: + for e_token in [self.eot_token, self.invoke_end_token, self._eos_token]: if e_token in new_text: new_text = new_text.replace(e_token, "") return StreamingParseResult(normal_text=new_text) diff --git a/tensorrt_llm/tokenizer/deepseek_v32/encoding.py b/tensorrt_llm/tokenizer/deepseek_v32/encoding.py index 24833b7b023..6e901cdfc07 100644 --- a/tensorrt_llm/tokenizer/deepseek_v32/encoding.py +++ b/tensorrt_llm/tokenizer/deepseek_v32/encoding.py @@ -91,7 +91,8 @@ def encode_arguments_to_dsml(tool_call: Dict[str, str]) -> str: ) P_dsml_strs = [] - arguments = json.loads(tool_call["arguments"]) + raw_args = tool_call["arguments"] + arguments = json.loads(raw_args) if isinstance(raw_args, str) else raw_args for k, v in arguments.items(): p_dsml_str = p_dsml_template.format( From 39d294b8590875da837ec91d1debb8579d6cb79c Mon Sep 17 00:00:00 2001 From: Yiqing Yan Date: Tue, 10 Mar 2026 14:41:25 +0800 Subject: [PATCH 4/6] [TRTLLM-11135][fix] Fix vulnerabilities protobuf and aiohttp (#11898) Signed-off-by: Yiqing Yan --- constraints.txt | 4 ++++ jenkins/current_image_tags.properties | 8 ++++---- requirements-dev.txt | 2 +- requirements.txt | 2 +- tests/integration/test_lists/waives.txt | 1 + triton_backend/requirements.txt | 2 +- 6 files changed, 12 insertions(+), 7 deletions(-) diff --git a/constraints.txt b/constraints.txt index 3586deaf81d..25ae35bdecf 100644 --- a/constraints.txt +++ b/constraints.txt @@ -4,3 +4,7 @@ urllib3>=2.6.3 # WAR against https://github.com/advisories/GHSA-8rrh-rw8j-w5fx wheel>=0.46.2 +# WAR against https://github.com/advisories/GHSA-7gcm-g887-7qv7 +protobuf>=6.33.5 +# WAR against https://github.com/advisories/GHSA-6mq8-rvhq-8wgg +aiohttp>=3.13.3 diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index da06d20e717..3f3c623ae70 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -13,7 +13,7 @@ # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead. 
IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm -LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-x86_64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202602011118-10901 -LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-aarch64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202602011118-10901 -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.14.1.48-skip-tritondevel-202602011118-10901 -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.14.1.48-skip-tritondevel-202602011118-10901 +LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-x86_64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202603051044-11898 +LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-aarch64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202603051044-11898 +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.14.1.48-skip-tritondevel-202603051044-11898 +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.14.1.48-skip-tritondevel-202603051044-11898 diff --git a/requirements-dev.txt b/requirements-dev.txt index eae33fa6c92..2450b86f3df 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -36,7 +36,7 @@ opentelemetry-api>=1.26.0 opentelemetry-exporter-otlp>=1.26.0 opentelemetry-semantic-conventions-ai>=0.4.1 fuzzywuzzy==0.18.0 -aiperf==0.3.0 +aiperf==0.4.0 nanobind>=2.9.0 nixl==0.8.0 hf-transfer==0.1.9 diff --git a/requirements.txt b/requirements.txt index df670e33e94..178d3a46f77 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,7 +30,7 @@ nvidia-modelopt[torch]~=0.37.0 # torch 2.9.1+cu130 depends on nvidia-nccl-cu13==2.27.7 nvidia-nccl-cu13>=2.27.7,<=2.28.9 nvidia-cuda-nvrtc -transformers==4.57.1 +transformers==4.57.3 prometheus_client prometheus_fastapi_instrumentator pydantic>=2.9.1 diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 5fa921dfb25..18f0fd16790 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -357,6 +357,7 @@ full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5945081) full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5948435) accuracy/test_llm_api_pytorch.py::TestKimiK25::test_nvfp4[tp8] SKIP (https://nvbugs/5951789) +unittest/_torch/modeling -k "modeling_siglip" SKIP (https://nvbugs/5941242) perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_grace_blackwell-v32_fp4_tep4_mtp3_1k1k] SKIP (https://nvbugspro.nvidia.com/bug/5919026) perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_grace_blackwell-v32_fp4_tep4_mtp3_8k1k] SKIP (https://nvbugspro.nvidia.com/bug/5919026) 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] SKIP (https://nvbugs/5955765) diff --git a/triton_backend/requirements.txt b/triton_backend/requirements.txt index 7daa868ed48..4375447772c 100644 --- a/triton_backend/requirements.txt +++ b/triton_backend/requirements.txt @@ -1,7 +1,7 @@ regex fire tritonclient[all] -transformers==4.57.1 +transformers==4.57.3 pandas tabulate flash_attn From 81350b70450ffe18f8aca61db3296d85088c6f24 Mon Sep 17 00:00:00 2001 From: yingguo-trt <244492186+yingguo-trt@users.noreply.github.com> Date: Tue, 10 Mar 2026 16:09:06 +0800 Subject: [PATCH 5/6] [None][chore] Align perf benchmark output format (#12067) Signed-off-by: yingguo-trt <244492186+yingguo-trt@users.noreply.github.com> --- .../slurm/benchmark/run_benchmark_nv_sa.sh | 14 ++ .../defs/perf/disagg/reporting/report.py | 153 +++++++++--------- 2 files changed, 86 insertions(+), 81 deletions(-) diff --git a/examples/disaggregated/slurm/benchmark/run_benchmark_nv_sa.sh b/examples/disaggregated/slurm/benchmark/run_benchmark_nv_sa.sh index b72a54f8860..09bb78d9b76 100644 --- a/examples/disaggregated/slurm/benchmark/run_benchmark_nv_sa.sh +++ b/examples/disaggregated/slurm/benchmark/run_benchmark_nv_sa.sh @@ -191,6 +191,20 @@ for concurrency in ${concurrency_list}; do --percentile-metrics "ttft,tpot,itl,e2el" \ $([ "${streaming}" = "false" ] && echo "--non-streaming") + # Print failed request count (consistent with non-nv_sa benchmark format) + python - "${output_dir}/result.json" <<-'PYEOF' + import json + import sys + + try: + with open(sys.argv[1], encoding="utf-8") as f: + d = json.load(f) + failed = d["num_prompts"] - d["completed"] + print(f"Total failed requests: {failed}") + except (OSError, json.JSONDecodeError, KeyError) as exc: + print(f"WARNING: failed to read request counts from {sys.argv[1]}: {exc}", file=sys.stderr) + PYEOF + echo "Benchmark with concurrency ${concurrency} done" do_process_all_logs ${log_path}/ ${log_path}/concurrency_${concurrency} "log" done diff --git a/tests/integration/defs/perf/disagg/reporting/report.py b/tests/integration/defs/perf/disagg/reporting/report.py index a16fff01038..aa6f493b4fd 100644 --- a/tests/integration/defs/perf/disagg/reporting/report.py +++ b/tests/integration/defs/perf/disagg/reporting/report.py @@ -1,3 +1,4 @@ +import json import os import re from datetime import datetime @@ -40,7 +41,7 @@ def __init__(self, benchmark_type: str, config, metrics_config, result_dir: str) """ self.benchmark_type = benchmark_type self.config = config - self.metrics_config = metrics_config # 保存 metrics 配置 + self.metrics_config = metrics_config self.result_dir = result_dir def _extract_log(self, pattern: str, metric_names: List[str], log_content: str): @@ -50,26 +51,69 @@ def _extract_log(self, pattern: str, metric_names: List[str], log_content: str): for match in compiled.finditer(log_content): logger.debug(f"Found match: {match.group(0)[:100]}...") logger.debug(f"All groups: {match.groups()}") - logger.debug(f"Number of groups: {len(match.groups())}") if len(match.groups()) < 3: - logger.warning(f"Expected 3 groups but got {len(match.groups())}") + logger.warning(f"Expected at least 3 groups but got {len(match.groups())}") continue try: values = [float(x) for x in match.groups()[:-1]] - concurrency = int(match.groups()[-1]) # Use groups()[-1] instead of group(-1) + concurrency = int(match.groups()[-1]) item = dict(zip(metric_names, 
values)) - item["concurrency"] = concurrency # Concurrency used to make test names + item["concurrency"] = concurrency results.append(item) logger.debug( - f"Successfully extracted: E2EL={values[0]}, TTFT={values[1]}, concurrency={concurrency}" + f"Extracted: concurrency={concurrency}, {dict(zip(metric_names, values))}" ) except (ValueError, IndexError) as e: logger.warning(f"Error processing match: {e}") continue return results + def _extract_request_counts_from_log(self, log_content: str) -> Tuple[int, int]: + """Extract failed/total from log via regex (TRT-LLM benchmark_serving.py format). + + Sums all matches to handle multi-concurrency logs correctly. + """ + failed_requests = 0 + total_requests = 0 + # Match "Failed requests:" (capital F) from summary block, not + # "Total failed requests:" (lowercase f) which can report 0 incorrectly + failed_matches = re.findall(r"Failed requests:\s+(\d+)", log_content) + total_matches = re.findall(r"Total requests:\s+(\d+)", log_content) + if failed_matches: + failed_requests = sum(int(x) for x in failed_matches) + if total_matches: + total_requests = sum(int(x) for x in total_matches) + return failed_requests, total_requests + + def _extract_request_counts_from_json(self, concurrencies: List[int]) -> Tuple[int, int]: + """Extract failed/total from result.json files (bench_serving format). + + Used when use_nv_sa_benchmark is true, since bench_serving logs do not + contain "Total requests" / "Total failed requests" fields. + """ + total_requests = 0 + failed_requests = 0 + for concurrency in concurrencies: + result_json_path = os.path.join( + self.result_dir, f"concurrency_{concurrency}", "result.json" + ) + if not os.path.exists(result_json_path): + logger.warning(f"result.json not found: {result_json_path}") + continue + try: + with open(result_json_path, "r") as f: + data = json.load(f) + num_prompts = data.get("num_prompts", 0) + completed = data.get("completed", 0) + total_requests += num_prompts + failed_requests += num_prompts - completed + except json.JSONDecodeError as e: + logger.warning(f"Error reading result.json: {result_json_path}, {e}") + continue + return failed_requests, total_requests + def parse( self, model_name: str, @@ -77,7 +121,6 @@ def parse( test_name: Optional[str] = None, ): """Parse logs using configured metrics.""" - # Build log file path using metrics_config.log_file log_file_name = os.path.join(self.result_dir, self.metrics_config.log_file) if not os.path.exists(log_file_name): @@ -87,21 +130,21 @@ def parse( with open(log_file_name, "r", encoding="utf-8", errors="replace") as log_file: log_content = log_file.read() - # Extract failed/total request counts from log (for executor to mark failed cases) - # Use findall + last match to handle multi-concurrency logs correctly - failed_requests = 0 - total_requests = 0 - failed_matches = re.findall(r"Total failed requests:\s*(\d+)", log_content) - total_matches = re.findall(r"Total requests:\s*(\d+)", log_content) - if failed_matches: - failed_requests = int(failed_matches[-1]) - if total_matches: - total_requests = int(total_matches[-1]) - - # Use metrics_config for extraction raw_results = self._extract_log( self.metrics_config.extractor_pattern, self.metrics_config.metric_names, log_content ) + + # Determine request count extraction strategy based on benchmark backend + use_nv_sa = False + if isinstance(self.config, dict): + use_nv_sa = self.config.get("benchmark", {}).get("use_nv_sa_benchmark", False) + + if use_nv_sa: + concurrencies = [item.get("concurrency", 0) for 
item in raw_results] + failed_requests, total_requests = self._extract_request_counts_from_json(concurrencies) + else: + failed_requests, total_requests = self._extract_request_counts_from_log(log_content) + if len(raw_results) == 0: logger.warning("No metrics extracted from log file") return { @@ -111,7 +154,6 @@ def parse( "total_requests": total_requests, } - # Convert to perf result format df = self._convert_to_perf_result_format(raw_results, model_name, timestamps, test_name) return { @@ -128,19 +170,9 @@ def _convert_to_perf_result_format( timestamps: Optional[Dict[str, str]] = None, test_name: Optional[str] = None, ): - """Convert raw results to perf result format. - - Each test result is expanded into multiple rows, one row per metric. - - Args: - raw_results: Raw performance results - model_name: Model name - timestamps: Optional timestamps dict - test_name: Optional pytest test name (e.g., "test_benchmark[deepseek-r1_1k1k_...]") - """ + """Convert raw results to perf result format (one row per metric).""" expanded_rows = [] - test_prefix = test_name - # Use provided timestamps or fallback to current time + if timestamps: start_time = timestamps.get( "start_timestamp", datetime.now().strftime("%Y-%m-%d %H:%M:%S") @@ -158,53 +190,40 @@ def _convert_to_perf_result_format( lock_freq_graphics = gpu_config.get("lock_freq_graphics_mhz", 0) or 0 lock_freq_memory = gpu_config.get("lock_freq_memory_mhz", 0) or 0 - # Get precision from YAML config metadata if isinstance(self.config, dict): precision = self.config.get("metadata", {}).get("precision", "unknown") else: - # Fallback if config is not a dict (should not happen in current system) precision = "unknown" for item in raw_results: concurrency = item.get("concurrency", "1") - base_test_name = f"{test_prefix}_con:{concurrency}" + base_test_name = f"{test_name}_con:{concurrency}" - # Create a separate row for each performance metric for metric_name, metric_value in item.items(): if metric_name == "concurrency": continue - # Create new row row = { - # Network related fields (use test_name) "network_name": self._get_network_name(base_test_name), "network_hash": base_test_name, - # Hardware related fields (leave empty) "sm_clk": lock_freq_graphics, "mem_clk": lock_freq_memory, "gpu_idx": np.nan, - # Test related fields "perf_case_name": base_test_name, "test_name": base_test_name, "original_test_name": base_test_name, - # Performance metrics "perf_metric": float(metric_value), "metric_type": metric_name, - # Time related fields - use actual timestamps from TestCaseTracker "total_time__sec": total_time, "start_timestamp": start_time, "end_timestamp": end_time, - # State and configuration "state": "valid", "command": f"disagg_benchmark --model={model_name} --{precision} --concurrency={concurrency}", - # Threshold related fields "threshold": np.nan, "absolute_threshold": np.nan, } - expanded_rows.append(row) - # Create DataFrame and ensure column order expected_columns = [ "network_name", "network_hash", @@ -226,13 +245,9 @@ def _convert_to_perf_result_format( ] df = pd.DataFrame(expanded_rows) - - # Ensure all expected columns exist for col in expected_columns: if col not in df.columns: df[col] = np.nan - - # Rearrange column order df = df[expected_columns] return df @@ -240,57 +255,33 @@ def _convert_to_perf_result_format( def _get_network_name(self, base_test_name: str): """Extract network name from test name. 
- Input format: - test_disagg_simple.py::TestDisaggBenchmark::test_benchmark[deepseek-r1_1k1k_...]-con-1 - Output format: - deepseek-r1_1k1k_...-con-1 + e.g. "...::test_benchmark[deepseek-r1_1k1k_...]-con-1" -> "deepseek-r1_1k1k_...-con-1" """ - # Pattern to extract content inside brackets and the trailing -con-X - # Group 1: content inside [] - # Group 2: -con-X suffix - pattern = r"\[([^\]]+)\](-con-\d+)" - match = re.search(pattern, base_test_name) - + match = re.search(r"\[([^\]]+)\](-con-\d+)", base_test_name) if match: - # Combine the bracket content with -con-X suffix return f"{match.group(1)}{match.group(2)}" - else: - # Fallback: if pattern doesn't match, use original logic - return base_test_name.replace("/", "-") + return base_test_name.replace("/", "-") class ResultSaver(object): - """All of the benchmarks append to the same csv, add header to it each time. - - No matter whether the columns are of the same count. - """ + """Append benchmark results to a shared CSV file.""" def __init__(self, output_path: str): self.output_path = output_path def append_a_df(self, df: pd.DataFrame): - """Seamlessly append DataFrame to CSV without headers or extra line breaks. - - Ideal for unified format data where consistency is maintained across appends. - """ - # Check if file exists and has content + """Append DataFrame to CSV, writing header only on first write.""" file_exists = os.path.exists(self.output_path) and os.path.getsize(self.output_path) > 0 if file_exists: - # File exists, append data only (no header) df.to_csv(self.output_path, mode="a", index=False, header=False) - logger.success(f"Seamlessly appended {len(df)} rows to {self.output_path}") + logger.success(f"Appended {len(df)} rows to {self.output_path}") else: - # First write, include header df.to_csv(self.output_path, mode="w", index=False, header=True) logger.success(f"Created new file with {len(df)} rows: {self.output_path}") def save_all(self, results: List[Tuple[pd.DataFrame, str]]): - """Save in batch manner: Append each dataframe with header. - - The 2nd parameter can print to logs. 
- ex: [(df1, '1k1k'), (df2, '8k1k')] - """ + """Append each (DataFrame, benchmark_type) pair to CSV.""" for df, btype in results: logger.info(f"Writing benchmark type: {btype}") self.append_a_df(df) From 1fef88e95d45b2f3daefe81bd007fdcc93aa5ddc Mon Sep 17 00:00:00 2001 From: Stefan Niebler <82932102+stnie@users.noreply.github.com> Date: Tue, 10 Mar 2026 09:33:30 +0100 Subject: [PATCH 6/6] [None][chore] Improve sampler performance by replacing torch.where with masked_fill_ (#11949) Signed-off-by: Stefan Niebler <82932102+stnie@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/sampler.py | 26 ++++++++++++----------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/sampler.py b/tensorrt_llm/_torch/pyexecutor/sampler.py index 001bf40843d..e6c17a51dcd 100644 --- a/tensorrt_llm/_torch/pyexecutor/sampler.py +++ b/tensorrt_llm/_torch/pyexecutor/sampler.py @@ -1792,7 +1792,8 @@ def _write_finish_reasons( if not single_token_stop_words_only else self._are_stop_words_single_token ) - batched_finish_reasons[:, stop_word_indices] = torch.where( + batched_finish_reasons_stop_words = batched_finish_reasons[:, stop_word_indices] + _ = batched_finish_reasons_stop_words.masked_fill_( stop_words_func( stop_seq_slots, stop_tokens, @@ -1801,18 +1802,17 @@ def _write_finish_reasons( else num_accepted_tokens, ), FinishReason.STOP_WORDS.value, - batched_finish_reasons[:, stop_word_indices], ) + batched_finish_reasons[:, stop_word_indices] = batched_finish_reasons_stop_words - batched_finish_reasons = torch.where( + _ = batched_finish_reasons.masked_fill_( self._are_max_length(seq_lens, store.max_lengths_cuda[seq_slots]), FinishReason.LENGTH.value, - batched_finish_reasons, ) - batched_finish_reasons = torch.where( + + _ = batched_finish_reasons.masked_fill_( self._are_end_id(store.end_ids_cuda[seq_slots], tokens), FinishReason.END_ID.value, - batched_finish_reasons, ) finish_reasons[:, seq_slots] = batched_finish_reasons @@ -1916,7 +1916,7 @@ def _are_stop_words( # Fill in the new tokens at the end of the past tokens buffer full_tokens[-self._max_tokens :] = tokens # short words are padded with _PAD_STOP_WORD_TOKEN_ID, so we need to mask them - mask = stop_words != self._PAD_STOP_WORD_TOKEN_ID + mask = stop_words == self._PAD_STOP_WORD_TOKEN_ID matches = torch.empty( ( self._max_tokens, @@ -1941,15 +1941,15 @@ def _are_stop_words( stop_words_for_match = stop_words.unsqueeze(0) _ = torch.eq(full_tokens_for_match, stop_words_for_match, out=matches) # Mask the padding tokens - matches_after_mask = torch.where( - mask.unsqueeze(0).expand(self._max_tokens, -1, -1, -1, -1), matches, True + _ = matches.masked_fill_( + mask.unsqueeze(0).expand(self._max_tokens, -1, -1, -1, -1), True ) # Update the past tokens storage for the next iteration store.past_tokens_cuda[:, seq_slots] = full_tokens # Return the result word_len_dim = 2 num_words_dim = 1 - return torch.any(matches_after_mask.all(dim=word_len_dim), dim=num_words_dim) + return torch.any(matches.all(dim=word_len_dim), dim=num_words_dim) @nvtx_range("_are_stop_words_single_token") def _are_stop_words_single_token( @@ -3721,8 +3721,10 @@ def _sample_batched_by_strategy( group_logits_indices_for_processed_logprobs_cuda ] current_softmax_cuda = group_softmax_cuda[logit_indices_for_processed_logprobs_cuda] - processed_logits_cuda = torch.where( - current_softmax_cuda > 0, current_logits_cuda, float("-inf") + + # processed_logits_cuda is an alias to current_logits_cuda after this operation + processed_logits_cuda = 
current_logits_cuda.masked_fill_( + current_softmax_cuda == 0, float("-inf") ) temperature_for_processed_logprobs = group_temperature_cuda if isinstance(temperature_for_processed_logprobs, torch.Tensor):