@@ -78,6 +78,50 @@ def _get_default_torch_compile_config(torch_compile):
7878 max_num_streams = 3 ) if torch_compile else None
7979
8080
def _run_multinode_accuracy(model_path,
                            model_name,
                            *,
                            benchmarks,
                            tp_size=2,
                            pp_size=1,
                            ep_size=1,
                            draft_model_path=None,
                            max_draft_len=2,
                            kv_cache_config=None,
                            max_num_tokens=4096,
                            max_batch_size=1,
                            **llm_kwargs):
    """Run accuracy benchmarks against an LLM deployed across multiple ranks.

    Args:
        model_path: Path to the target model checkpoint.
        model_name: Model name used by the benchmark tasks for reference data.
        benchmarks: Benchmark names to run; each must be "mmlu" or "gsm8k".
        tp_size: Tensor-parallel size.
        pp_size: Pipeline-parallel size.
        ep_size: MoE expert-parallel size.
        draft_model_path: Optional EAGLE3 draft model path; when given,
            speculative decoding is enabled.
        max_draft_len: Maximum draft length for speculative decoding.
        kv_cache_config: Optional KvCacheConfig; a default is built when None.
        max_num_tokens: Max scheduled tokens per iteration for the LLM.
        max_batch_size: Max batch size for the LLM.
        **llm_kwargs: Extra keyword arguments forwarded to the LLM constructor.

    Raises:
        ValueError: If any entry of ``benchmarks`` is unsupported.
    """
    benchmark_task_map = {
        "mmlu": MMLU,
        "gsm8k": GSM8K,
    }
    # Fail fast on a bad benchmark name instead of discovering it only after
    # the (expensive) multi-rank LLM construction below.
    for benchmark in benchmarks:
        if benchmark not in benchmark_task_map:
            raise ValueError(f"Unsupported benchmark: {benchmark}")

    if kv_cache_config is None:
        # Block reuse is enabled only when no draft model is used —
        # presumably it does not combine with speculative decoding here;
        # TODO(review): confirm.
        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
                                        enable_block_reuse=draft_model_path
                                        is None)
    spec_config = None
    if draft_model_path is not None:
        spec_config = Eagle3DecodingConfig(max_draft_len=max_draft_len,
                                           speculative_model=draft_model_path,
                                           eagle3_one_model=True)

    with LLM(model_path,
             tensor_parallel_size=tp_size,
             pipeline_parallel_size=pp_size,
             moe_expert_parallel_size=ep_size,
             max_num_tokens=max_num_tokens,
             max_batch_size=max_batch_size,
             kv_cache_config=kv_cache_config,
             speculative_config=spec_config,
             **llm_kwargs) as llm:
        for benchmark in benchmarks:
            task = benchmark_task_map[benchmark](model_name)
            task.evaluate(llm)
124+
81125class TestLlama3_1_8B (LlmapiAccuracyTestHarness ):
82126 MODEL_NAME = "meta-llama/Llama-3.1-8B"
83127 MODEL_PATH = f"{ llm_models_root ()} /llama-3.1-model/Meta-Llama-3.1-8B"
@@ -740,6 +784,32 @@ def test_fp8_prequantized(self):
740784 task .evaluate (llm )
741785
742786
class TestLlama3_1_70B(LlmapiAccuracyTestHarness):
    """Accuracy tests for the base Llama-3.1-70B checkpoint."""
    MODEL_NAME = "meta-llama/Llama-3.1-70B"
    MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Meta-Llama-3.1-70B"

    @skip_pre_hopper
    @pytest.mark.skip_less_mpi_world_size(2)
    def test_auto_dtype_tp2(self):
        # TP=2 run with the helper's default KV-cache settings; MMLU only.
        selected_benchmarks = ["mmlu"]
        _run_multinode_accuracy(self.MODEL_PATH,
                                self.MODEL_NAME,
                                benchmarks=selected_benchmarks)
798+
class TestLlama3_1_405BInstructFp4(LlmapiAccuracyTestHarness):
    """Accuracy tests for the NVFP4-quantized Llama-3.1-405B-Instruct."""
    MODEL_NAME = "nvidia/Llama-3.1-405B-Instruct-NVFP4"
    MODEL_PATH = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.1-405B-Instruct-fp4"

    @skip_pre_blackwell
    @pytest.mark.skip_less_mpi_world_size(2)
    def test_fp4_tp2(self):
        # Lower free-memory fraction (0.4) to leave headroom for the 405B
        # weights on two ranks.
        _run_multinode_accuracy(
            self.MODEL_PATH,
            self.MODEL_NAME,
            benchmarks=["mmlu"],
            kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4))
812+
743813@pytest .mark .timeout (7200 )
744814@pytest .mark .skip_less_device_memory (80000 )
745815class TestLlama3_3_70BInstruct (LlmapiAccuracyTestHarness ):
@@ -757,6 +827,13 @@ def test_auto_dtype_tp8(self):
757827 task .evaluate (llm ,
758828 extra_evaluator_kwargs = dict (apply_chat_template = True ))
759829
830+ @pytest .mark .skip_less_mpi_world_size (2 )
831+ def test_auto_dtype_tp2 (self ):
832+ _run_multinode_accuracy (
833+ f"{ llm_models_root ()} /llama-3.3-models/Llama-3.3-70B-Instruct" ,
834+ self .MODEL_NAME ,
835+ benchmarks = ["mmlu" ])
836+
760837 @skip_pre_hopper
761838 @pytest .mark .skip_less_mpi_world_size (8 )
762839 @parametrize_with_ids ("torch_compile" , [False , True ])
@@ -1104,6 +1181,28 @@ def test_fp4_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
11041181 task = GSM8K (self .MODEL_NAME )
11051182 task .evaluate (llm )
11061183
1184+ @skip_pre_hopper
1185+ @pytest .mark .skip_less_mpi_world_size (2 )
1186+ def test_auto_dtype_tp2 (self ):
1187+ kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.4 )
1188+ _run_multinode_accuracy (
1189+ f"{ llm_models_root ()} /llama4-models/Llama-4-Scout-17B-16E-Instruct" ,
1190+ self .MODEL_NAME ,
1191+ benchmarks = ["mmlu" ],
1192+ ep_size = 2 ,
1193+ kv_cache_config = kv_cache_config )
1194+
1195+ @skip_pre_hopper
1196+ @pytest .mark .skip_less_mpi_world_size (2 )
1197+ def test_fp8_tp2 (self ):
1198+ kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.4 )
1199+ _run_multinode_accuracy (
1200+ f"{ llm_models_root ()} /llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8" ,
1201+ self .MODEL_NAME ,
1202+ benchmarks = ["mmlu" ],
1203+ ep_size = 2 ,
1204+ kv_cache_config = kv_cache_config )
1205+
11071206
11081207class TestMistral7B (LlmapiAccuracyTestHarness ):
11091208 MODEL_NAME = "mistralai/Mistral-7B-v0.1"
@@ -2831,6 +2930,20 @@ def test_fp8_blockscale_chunked_prefill(self, tp_size, pp_size, ep_size,
28312930 task .evaluate (llm )
28322931
28332932
class TestDeepSeekR1DistillLlama70B(LlmapiAccuracyTestHarness):
    """Accuracy tests for the DeepSeek-R1 distilled Llama-70B checkpoint."""
    MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
    MODEL_PATH = f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-Distill-Llama-70B"

    @skip_pre_hopper
    @pytest.mark.skip_less_mpi_world_size(2)
    def test_auto_dtype_tp2(self):
        # TP=2 MMLU run with a reduced KV-cache memory fraction (0.4) for
        # the 70B weights.
        _run_multinode_accuracy(
            self.MODEL_PATH,
            self.MODEL_NAME,
            benchmarks=["mmlu"],
            kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4))
2946+
28342947@pytest .mark .timeout (14400 )
28352948@pytest .mark .skip_less_device (8 )
28362949class TestDeepSeekV3 (LlmapiAccuracyTestHarness ):
@@ -4450,6 +4563,42 @@ def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
44504563 task = GSM8K (self .MODEL_NAME )
44514564 task .evaluate (llm )
44524565
4566+ @skip_pre_blackwell
4567+ @pytest .mark .skip_less_mpi_world_size (2 )
4568+ @pytest .mark .parametrize (
4569+ "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3" ,
4570+ [
4571+ (2 , 1 , 2 , False , False , False , "CUTLASS" , False ),
4572+ (2 , 1 , 2 , False , False , False , "CUTLASS" , True ),
4573+ ],
4574+ ids = [
4575+ "latency_moe_cutlass" ,
4576+ "latency_moe_cutlass_eagle3" ,
4577+ ],
4578+ )
4579+ def test_nvfp4_2gpus (self , tp_size , pp_size , ep_size , attention_dp ,
4580+ cuda_graph , overlap_scheduler , moe_backend , eagle3 ):
4581+
4582+ pytorch_config = dict (
4583+ disable_overlap_scheduler = not overlap_scheduler ,
4584+ cuda_graph_config = CudaGraphConfig () if cuda_graph else None ,
4585+ moe_config = MoeConfig (backend = moe_backend ))
4586+
4587+ kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.4 ,
4588+ enable_block_reuse = not eagle3 )
4589+ _run_multinode_accuracy (
4590+ f"{ llm_models_root ()} /Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf" ,
4591+ self .MODEL_NAME ,
4592+ benchmarks = ["mmlu" ],
4593+ tp_size = tp_size ,
4594+ pp_size = pp_size ,
4595+ ep_size = ep_size ,
4596+ draft_model_path = (f"{ llm_models_root ()} /Qwen3/qwen3-235B-eagle3/"
4597+ if eagle3 else None ),
4598+ kv_cache_config = kv_cache_config ,
4599+ enable_attention_dp = attention_dp ,
4600+ ** pytorch_config )
4601+
44534602
44544603class TestQwen3_30B_A3B_Instruct_2507 (LlmapiAccuracyTestHarness ):
44554604 MODEL_NAME = "Qwen3/Qwen3-30B-A3B-Instruct-2507"
0 commit comments