Skip to content

Commit 60091ff

Browse files
JennyLiu-nv (Jenny Liu) authored
[None][Test] Add multinode e2e and accuracy cases on DGX-Spark (NVIDIA#12110)
Signed-off-by: Jenny Liu <JennyLiu-nv+JennyLiu@users.noreply.github.com> Co-authored-by: Jenny Liu <JennyLiu-nv+JennyLiu@users.noreply.github.com>
1 parent 0fc0cbd commit 60091ff

File tree

5 files changed

+318
-79
lines changed

5 files changed

+318
-79
lines changed

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,12 @@ meta-llama/Llama-3.1-8B-Instruct:
3737
- quant_algo: FP8
3838
kv_cache_quant_algo: NVFP4
3939
accuracy: 66.45
40+
meta-llama/Llama-3.1-70B:
41+
- accuracy: 78.58
42+
nvidia/Llama-3.1-405B-Instruct-NVFP4:
43+
- quant_algo: NVFP4
44+
kv_cache_quant_algo: FP8
45+
accuracy: 84.82
4046
meta-llama/Llama-3.2-1B:
4147
- quant_algo: W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN
4248
accuracy: 32.72
@@ -208,6 +214,8 @@ deepseek-ai/DeepSeek-R1:
208214
kv_cache_quant_algo: FP8
209215
spec_dec_algo: MTP
210216
accuracy: 87.573
217+
deepseek-ai/DeepSeek-R1-Distill-Llama-70B:
218+
- accuracy: 78.19
211219
deepseek-ai/DeepSeek-V3.2-Exp:
212220
- quant_algo: FP8_BLOCK_SCALES
213221
accuracy: 88.2

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,50 @@ def _get_default_torch_compile_config(torch_compile):
7878
max_num_streams=3) if torch_compile else None
7979

8080

81+
def _run_multinode_accuracy(model_path,
                            model_name,
                            *,
                            benchmarks,
                            tp_size=2,
                            pp_size=1,
                            ep_size=1,
                            draft_model_path=None,
                            max_draft_len=2,
                            kv_cache_config=None,
                            max_num_tokens=4096,
                            max_batch_size=1,
                            **llm_kwargs):
    """Run one or more accuracy benchmarks against a multi-node LLM.

    Builds an ``LLM`` with the given parallelism settings (and optionally an
    Eagle3 speculative-decoding draft model) and evaluates each requested
    benchmark task against it.

    Args:
        model_path: Path to the target model checkpoint.
        model_name: Model name used to look up accuracy references.
        benchmarks: Iterable of benchmark keys; supported: "mmlu", "gsm8k".
        tp_size / pp_size / ep_size: Tensor / pipeline / MoE-expert
            parallel sizes passed through to the LLM.
        draft_model_path: Optional Eagle3 draft model; enables speculative
            decoding when set.
        max_draft_len: Max draft length for speculative decoding.
        kv_cache_config: Optional KvCacheConfig; a default is built when
            None (block reuse disabled while speculating).
        max_num_tokens / max_batch_size: LLM build limits.
        **llm_kwargs: Extra keyword arguments forwarded to ``LLM``.

    Raises:
        ValueError: If any entry of ``benchmarks`` is not supported.
    """
    benchmark_task_map = {
        "mmlu": MMLU,
        "gsm8k": GSM8K,
    }
    # Fail fast on an unknown benchmark name *before* paying the cost of
    # loading and sharding the model across nodes.
    for benchmark in benchmarks:
        if benchmark not in benchmark_task_map:
            raise ValueError(f"Unsupported benchmark: {benchmark}")
    if kv_cache_config is None:
        # Block reuse is incompatible with speculative decoding here, so it
        # is only enabled when no draft model is configured.
        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
                                        enable_block_reuse=draft_model_path
                                        is None)
    spec_config = None
    if draft_model_path is not None:
        spec_config = Eagle3DecodingConfig(max_draft_len=max_draft_len,
                                           speculative_model=draft_model_path,
                                           eagle3_one_model=True)

    with LLM(model_path,
             tensor_parallel_size=tp_size,
             pipeline_parallel_size=pp_size,
             moe_expert_parallel_size=ep_size,
             max_num_tokens=max_num_tokens,
             max_batch_size=max_batch_size,
             kv_cache_config=kv_cache_config,
             speculative_config=spec_config,
             **llm_kwargs) as llm:
        for benchmark in benchmarks:
            task = benchmark_task_map[benchmark](model_name)
            task.evaluate(llm)
123+
124+
81125
class TestLlama3_1_8B(LlmapiAccuracyTestHarness):
82126
MODEL_NAME = "meta-llama/Llama-3.1-8B"
83127
MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Meta-Llama-3.1-8B"
@@ -740,6 +784,32 @@ def test_fp8_prequantized(self):
740784
task.evaluate(llm)
741785

742786

787+
class TestLlama3_1_70B(LlmapiAccuracyTestHarness):
    """Multi-node accuracy coverage for the base Llama-3.1-70B checkpoint."""

    MODEL_NAME = "meta-llama/Llama-3.1-70B"
    MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Meta-Llama-3.1-70B"

    @skip_pre_hopper
    @pytest.mark.skip_less_mpi_world_size(2)
    def test_auto_dtype_tp2(self):
        # TP=2 across two ranks with the helper's default settings; MMLU only.
        _run_multinode_accuracy(
            self.MODEL_PATH, self.MODEL_NAME, benchmarks=["mmlu"])
797+
798+
799+
class TestLlama3_1_405BInstructFp4(LlmapiAccuracyTestHarness):
    """Multi-node accuracy coverage for the NVFP4-quantized Llama-3.1-405B."""

    MODEL_NAME = "nvidia/Llama-3.1-405B-Instruct-NVFP4"
    MODEL_PATH = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.1-405B-Instruct-fp4"

    @skip_pre_blackwell
    @pytest.mark.skip_less_mpi_world_size(2)
    def test_fp4_tp2(self):
        # 405B needs a smaller KV-cache fraction than the helper default
        # to fit alongside the model weights.
        kv_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
        _run_multinode_accuracy(
            self.MODEL_PATH,
            self.MODEL_NAME,
            benchmarks=["mmlu"],
            kv_cache_config=kv_config)
811+
812+
743813
@pytest.mark.timeout(7200)
744814
@pytest.mark.skip_less_device_memory(80000)
745815
class TestLlama3_3_70BInstruct(LlmapiAccuracyTestHarness):
@@ -757,6 +827,13 @@ def test_auto_dtype_tp8(self):
757827
task.evaluate(llm,
758828
extra_evaluator_kwargs=dict(apply_chat_template=True))
759829

830+
@pytest.mark.skip_less_mpi_world_size(2)
831+
def test_auto_dtype_tp2(self):
832+
_run_multinode_accuracy(
833+
f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct",
834+
self.MODEL_NAME,
835+
benchmarks=["mmlu"])
836+
760837
@skip_pre_hopper
761838
@pytest.mark.skip_less_mpi_world_size(8)
762839
@parametrize_with_ids("torch_compile", [False, True])
@@ -1104,6 +1181,28 @@ def test_fp4_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
11041181
task = GSM8K(self.MODEL_NAME)
11051182
task.evaluate(llm)
11061183

1184+
@skip_pre_hopper
1185+
@pytest.mark.skip_less_mpi_world_size(2)
1186+
def test_auto_dtype_tp2(self):
1187+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
1188+
_run_multinode_accuracy(
1189+
f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct",
1190+
self.MODEL_NAME,
1191+
benchmarks=["mmlu"],
1192+
ep_size=2,
1193+
kv_cache_config=kv_cache_config)
1194+
1195+
@skip_pre_hopper
1196+
@pytest.mark.skip_less_mpi_world_size(2)
1197+
def test_fp8_tp2(self):
1198+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
1199+
_run_multinode_accuracy(
1200+
f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8",
1201+
self.MODEL_NAME,
1202+
benchmarks=["mmlu"],
1203+
ep_size=2,
1204+
kv_cache_config=kv_cache_config)
1205+
11071206

11081207
class TestMistral7B(LlmapiAccuracyTestHarness):
11091208
MODEL_NAME = "mistralai/Mistral-7B-v0.1"
@@ -2831,6 +2930,20 @@ def test_fp8_blockscale_chunked_prefill(self, tp_size, pp_size, ep_size,
28312930
task.evaluate(llm)
28322931

28332932

2933+
class TestDeepSeekR1DistillLlama70B(LlmapiAccuracyTestHarness):
    """Multi-node accuracy coverage for DeepSeek-R1-Distill-Llama-70B."""

    MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
    MODEL_PATH = f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-Distill-Llama-70B"

    @skip_pre_hopper
    @pytest.mark.skip_less_mpi_world_size(2)
    def test_auto_dtype_tp2(self):
        # 70B dense model on two ranks; a lower KV-cache fraction keeps the
        # weights and cache co-resident.
        kv_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
        _run_multinode_accuracy(
            self.MODEL_PATH,
            self.MODEL_NAME,
            benchmarks=["mmlu"],
            kv_cache_config=kv_config)
2945+
2946+
28342947
@pytest.mark.timeout(14400)
28352948
@pytest.mark.skip_less_device(8)
28362949
class TestDeepSeekV3(LlmapiAccuracyTestHarness):
@@ -4450,6 +4563,42 @@ def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
44504563
task = GSM8K(self.MODEL_NAME)
44514564
task.evaluate(llm)
44524565

4566+
@skip_pre_blackwell
4567+
@pytest.mark.skip_less_mpi_world_size(2)
4568+
@pytest.mark.parametrize(
4569+
"tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3",
4570+
[
4571+
(2, 1, 2, False, False, False, "CUTLASS", False),
4572+
(2, 1, 2, False, False, False, "CUTLASS", True),
4573+
],
4574+
ids=[
4575+
"latency_moe_cutlass",
4576+
"latency_moe_cutlass_eagle3",
4577+
],
4578+
)
4579+
def test_nvfp4_2gpus(self, tp_size, pp_size, ep_size, attention_dp,
4580+
cuda_graph, overlap_scheduler, moe_backend, eagle3):
4581+
4582+
pytorch_config = dict(
4583+
disable_overlap_scheduler=not overlap_scheduler,
4584+
cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
4585+
moe_config=MoeConfig(backend=moe_backend))
4586+
4587+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
4588+
enable_block_reuse=not eagle3)
4589+
_run_multinode_accuracy(
4590+
f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
4591+
self.MODEL_NAME,
4592+
benchmarks=["mmlu"],
4593+
tp_size=tp_size,
4594+
pp_size=pp_size,
4595+
ep_size=ep_size,
4596+
draft_model_path=(f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3/"
4597+
if eagle3 else None),
4598+
kv_cache_config=kv_cache_config,
4599+
enable_attention_dp=attention_dp,
4600+
**pytorch_config)
4601+
44534602

44544603
class TestQwen3_30B_A3B_Instruct_2507(LlmapiAccuracyTestHarness):
44554604
MODEL_NAME = "Qwen3/Qwen3-30B-A3B-Instruct-2507"

tests/integration/defs/test_e2e.py

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33
#
44
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -1957,7 +1957,8 @@ def test_ptp_quickstart_advanced_deepseek_multi_nodes(llm_root, llm_venv,
19571957
"--max_num_tokens=2048",
19581958
"--disable_kv_cache_reuse",
19591959
]
1960-
check_call(" ".join(run_cmd), shell=True, env=llm_venv._new_env)
1960+
output = check_output(" ".join(run_cmd), shell=True, env=llm_venv._new_env)
1961+
assert "Generated text:" in output, output[-4000:]
19611962

19621963

19631964
@pytest.mark.parametrize("model_name,model_path,eagle_model_path", [
@@ -3120,6 +3121,55 @@ def test_multi_nodes_eval(model_path, tp_size, pp_size, ep_size, eval_task,
31203121
assert mmlu_accuracy > mmlu_threshold, f"MMLU accuracy {mmlu_accuracy} is less than threshold {mmlu_threshold}"
31213122

31223123

3124+
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.skip_less_mpi_world_size(2)
@pytest.mark.parametrize("tp_size,pp_size", [(2, 1), (1, 2)],
                         ids=["tp2", "pp2"])
@pytest.mark.parametrize("model_path", [
    pytest.param('llama-3.1-model/Meta-Llama-3.1-70B', marks=skip_pre_hopper),
    pytest.param('llama-3.3-models/Llama-3.3-70B-Instruct',
                 marks=skip_pre_hopper),
    pytest.param('Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf',
                 marks=skip_pre_blackwell),
    pytest.param('DeepSeek-R1/DeepSeek-R1-Distill-Llama-70B',
                 marks=skip_pre_hopper),
    pytest.param('llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8',
                 marks=skip_pre_hopper),
    pytest.param('llama4-models/Llama-4-Scout-17B-16E-Instruct',
                 marks=skip_pre_hopper),
    pytest.param('modelopt-hf-model-hub/Llama-3.1-405B-Instruct-fp4',
                 marks=skip_pre_blackwell),
])
def test_ptp_quickstart_advanced_multinode(llm_root, llm_venv, model_path,
                                           tp_size, pp_size):
    """Smoke-test quickstart_advanced.py end-to-end across two MPI ranks.

    Launches the example script for the given model and parallel layout and
    asserts that it produced generated text.
    """
    print(
        f"Testing quickstart {model_path} with tp_size={tp_size}, pp_size={pp_size}."
    )

    example_root = Path(os.path.join(llm_root, "examples", "llm-api"))
    prompt = "Explain why New York is great city to live in, in 1 short paragraph"
    run_cmd = [
        "python3",
        str(example_root / "quickstart_advanced.py"),
        f"--model_dir={llm_models_root()}/{model_path}",
        f"--tp_size={tp_size}",
        f"--pp_size={pp_size}",
        "--max_num_tokens=4096",
        "--max_batch_size=1",
        "--use_cuda_graph",
        f"--kv_cache_fraction={_MEM_FRACTION_50}",
        "--prompt",
        prompt,
    ]

    # MoE models need expert parallelism to match the tensor-parallel width.
    is_moe_model = "Llama-4" in model_path or "Qwen3" in model_path
    if is_moe_model and tp_size > 1:
        run_cmd.append(f"--moe_ep_size={tp_size}")

    output = check_output(run_cmd, env=llm_venv._new_env)
    print(output)
    assert "Generated text:" in output, output[-4000:]
3171+
3172+
31233173
@pytest.mark.skip_less_device_memory(80000)
31243174
@pytest.mark.parametrize("return_generation_logits", [True, False])
31253175
@pytest.mark.parametrize("model_path", [

tests/integration/test_lists/qa/llm_spark_func.txt

Lines changed: 0 additions & 77 deletions
This file was deleted.

0 commit comments

Comments (0)