@@ -78,6 +78,50 @@ def _get_default_torch_compile_config(torch_compile):
7878 max_num_streams = 3 ) if torch_compile else None
7979
8080
def _run_multinode_accuracy(model_path,
                            model_name,
                            *,
                            benchmarks,
                            tp_size=2,
                            pp_size=1,
                            ep_size=1,
                            draft_model_path=None,
                            max_draft_len=2,
                            kv_cache_config=None,
                            max_num_tokens=4096,
                            max_batch_size=1,
                            **llm_kwargs):
    """Run accuracy benchmarks against an LLM deployed across multiple ranks.

    Args:
        model_path: Path to the target model checkpoint.
        model_name: Model name used by the benchmark tasks for reference data.
        benchmarks: Benchmark names to run; each must be "mmlu" or "gsm8k".
        tp_size: Tensor-parallel size.
        pp_size: Pipeline-parallel size.
        ep_size: MoE expert-parallel size.
        draft_model_path: Optional EAGLE3 draft model path; when given,
            speculative decoding is enabled.
        max_draft_len: Maximum draft length for speculative decoding.
        kv_cache_config: Optional KvCacheConfig; a default is built when None.
        max_num_tokens: Max scheduled tokens per iteration for the LLM.
        max_batch_size: Max batch size for the LLM.
        **llm_kwargs: Extra keyword arguments forwarded to the LLM constructor.

    Raises:
        ValueError: If any entry of ``benchmarks`` is unsupported.
    """
    benchmark_task_map = {
        "mmlu": MMLU,
        "gsm8k": GSM8K,
    }
    # Fail fast on a bad benchmark name instead of discovering it only after
    # the (expensive) multi-rank LLM construction below.
    for benchmark in benchmarks:
        if benchmark not in benchmark_task_map:
            raise ValueError(f"Unsupported benchmark: {benchmark}")

    if kv_cache_config is None:
        # Block reuse is enabled only when no draft model is used —
        # presumably it does not combine with speculative decoding here;
        # TODO(review): confirm.
        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
                                        enable_block_reuse=draft_model_path
                                        is None)
    spec_config = None
    if draft_model_path is not None:
        spec_config = Eagle3DecodingConfig(max_draft_len=max_draft_len,
                                           speculative_model=draft_model_path,
                                           eagle3_one_model=True)

    with LLM(model_path,
             tensor_parallel_size=tp_size,
             pipeline_parallel_size=pp_size,
             moe_expert_parallel_size=ep_size,
             max_num_tokens=max_num_tokens,
             max_batch_size=max_batch_size,
             kv_cache_config=kv_cache_config,
             speculative_config=spec_config,
             **llm_kwargs) as llm:
        for benchmark in benchmarks:
            task = benchmark_task_map[benchmark](model_name)
            task.evaluate(llm)
124+
81125class TestLlama3_1_8B (LlmapiAccuracyTestHarness ):
82126 MODEL_NAME = "meta-llama/Llama-3.1-8B"
83127 MODEL_PATH = f"{ llm_models_root ()} /llama-3.1-model/Meta-Llama-3.1-8B"
@@ -740,6 +784,32 @@ def test_fp8_prequantized(self):
740784 task .evaluate (llm )
741785
742786
class TestLlama3_1_70B(LlmapiAccuracyTestHarness):
    """Accuracy tests for the base Llama-3.1-70B checkpoint."""
    MODEL_NAME = "meta-llama/Llama-3.1-70B"
    MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Meta-Llama-3.1-70B"

    @skip_pre_hopper
    @pytest.mark.skip_less_mpi_world_size(2)
    def test_auto_dtype_tp2(self):
        # TP=2 run with the helper's default KV-cache settings; MMLU only.
        selected_benchmarks = ["mmlu"]
        _run_multinode_accuracy(self.MODEL_PATH,
                                self.MODEL_NAME,
                                benchmarks=selected_benchmarks)
798+
class TestLlama3_1_405BInstructFp4(LlmapiAccuracyTestHarness):
    """Accuracy tests for the NVFP4-quantized Llama-3.1-405B-Instruct."""
    MODEL_NAME = "nvidia/Llama-3.1-405B-Instruct-NVFP4"
    MODEL_PATH = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.1-405B-Instruct-fp4"

    @skip_pre_blackwell
    @pytest.mark.skip_less_mpi_world_size(2)
    def test_fp4_tp2(self):
        # Lower free-memory fraction (0.4) to leave headroom for the 405B
        # weights on two ranks.
        _run_multinode_accuracy(
            self.MODEL_PATH,
            self.MODEL_NAME,
            benchmarks=["mmlu"],
            kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4))
812+
743813@pytest .mark .timeout (7200 )
744814@pytest .mark .skip_less_device_memory (80000 )
745815class TestLlama3_3_70BInstruct (LlmapiAccuracyTestHarness ):
@@ -757,6 +827,13 @@ def test_auto_dtype_tp8(self):
757827 task .evaluate (llm ,
758828 extra_evaluator_kwargs = dict (apply_chat_template = True ))
759829
830+ @pytest .mark .skip_less_mpi_world_size (2 )
831+ def test_auto_dtype_tp2 (self ):
832+ _run_multinode_accuracy (
833+ f"{ llm_models_root ()} /llama-3.3-models/Llama-3.3-70B-Instruct" ,
834+ self .MODEL_NAME ,
835+ benchmarks = ["mmlu" ])
836+
760837 @skip_pre_hopper
761838 @pytest .mark .skip_less_mpi_world_size (8 )
762839 @parametrize_with_ids ("torch_compile" , [False , True ])
@@ -1104,6 +1181,28 @@ def test_fp4_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
11041181 task = GSM8K (self .MODEL_NAME )
11051182 task .evaluate (llm )
11061183
1184+ @skip_pre_hopper
1185+ @pytest .mark .skip_less_mpi_world_size (2 )
1186+ def test_auto_dtype_tp2 (self ):
1187+ kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.4 )
1188+ _run_multinode_accuracy (
1189+ f"{ llm_models_root ()} /llama4-models/Llama-4-Scout-17B-16E-Instruct" ,
1190+ self .MODEL_NAME ,
1191+ benchmarks = ["mmlu" ],
1192+ ep_size = 2 ,
1193+ kv_cache_config = kv_cache_config )
1194+
1195+ @skip_pre_hopper
1196+ @pytest .mark .skip_less_mpi_world_size (2 )
1197+ def test_fp8_tp2 (self ):
1198+ kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.4 )
1199+ _run_multinode_accuracy (
1200+ f"{ llm_models_root ()} /llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8" ,
1201+ self .MODEL_NAME ,
1202+ benchmarks = ["mmlu" ],
1203+ ep_size = 2 ,
1204+ kv_cache_config = kv_cache_config )
1205+
11071206
11081207class TestMistral7B (LlmapiAccuracyTestHarness ):
11091208 MODEL_NAME = "mistralai/Mistral-7B-v0.1"
@@ -2831,6 +2930,20 @@ def test_fp8_blockscale_chunked_prefill(self, tp_size, pp_size, ep_size,
28312930 task .evaluate (llm )
28322931
28332932
class TestDeepSeekR1DistillLlama70B(LlmapiAccuracyTestHarness):
    """Accuracy tests for the DeepSeek-R1 distilled Llama-70B checkpoint."""
    MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
    MODEL_PATH = f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-Distill-Llama-70B"

    @skip_pre_hopper
    @pytest.mark.skip_less_mpi_world_size(2)
    def test_auto_dtype_tp2(self):
        # TP=2 MMLU run with a reduced KV-cache memory fraction (0.4) for
        # the 70B weights.
        _run_multinode_accuracy(
            self.MODEL_PATH,
            self.MODEL_NAME,
            benchmarks=["mmlu"],
            kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4))
2946+
28342947@pytest .mark .timeout (14400 )
28352948@pytest .mark .skip_less_device (8 )
28362949class TestDeepSeekV3 (LlmapiAccuracyTestHarness ):
@@ -4450,6 +4563,42 @@ def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
44504563 task = GSM8K (self .MODEL_NAME )
44514564 task .evaluate (llm )
44524565
4566+ @skip_pre_blackwell
4567+ @pytest .mark .skip_less_mpi_world_size (2 )
4568+ @pytest .mark .parametrize (
4569+ "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3" ,
4570+ [
4571+ (2 , 1 , 2 , False , False , False , "CUTLASS" , False ),
4572+ (2 , 1 , 2 , False , False , False , "CUTLASS" , True ),
4573+ ],
4574+ ids = [
4575+ "latency_moe_cutlass" ,
4576+ "latency_moe_cutlass_eagle3" ,
4577+ ],
4578+ )
4579+ def test_nvfp4_2gpus (self , tp_size , pp_size , ep_size , attention_dp ,
4580+ cuda_graph , overlap_scheduler , moe_backend , eagle3 ):
4581+
4582+ pytorch_config = dict (
4583+ disable_overlap_scheduler = not overlap_scheduler ,
4584+ cuda_graph_config = CudaGraphConfig () if cuda_graph else None ,
4585+ moe_config = MoeConfig (backend = moe_backend ))
4586+
4587+ kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.4 ,
4588+ enable_block_reuse = not eagle3 )
4589+ _run_multinode_accuracy (
4590+ f"{ llm_models_root ()} /Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf" ,
4591+ self .MODEL_NAME ,
4592+ benchmarks = ["mmlu" ],
4593+ tp_size = tp_size ,
4594+ pp_size = pp_size ,
4595+ ep_size = ep_size ,
4596+ draft_model_path = (f"{ llm_models_root ()} /Qwen3/qwen3-235B-eagle3/"
4597+ if eagle3 else None ),
4598+ kv_cache_config = kv_cache_config ,
4599+ enable_attention_dp = attention_dp ,
4600+ ** pytorch_config )
4601+
44534602
44544603class TestQwen3_30B_A3B_Instruct_2507 (LlmapiAccuracyTestHarness ):
44554604 MODEL_NAME = "Qwen3/Qwen3-30B-A3B-Instruct-2507"
0 commit comments