4 changes: 2 additions & 2 deletions examples/disaggregated/slurm/benchmark/submit.py
@@ -287,7 +287,7 @@ def submit_job(config, log_dir, dry_run):
f"--container-image {env_config['container_image']}",
f"--container-name {container_name}",
f"--container-mounts {env_config['container_mount']}",
"--mpi=pmix --overlap",
"--no-container-mount-home --mpi=pmix --overlap",
f"bash {os.path.join(env_config['work_dir'], 'start_worker.sh')}",
server_type,
str(server_id),
@@ -313,7 +313,7 @@ def submit_job(config, log_dir, dry_run):
f"--container-name={container_name}",
f"--container-image={env_config['container_image']}",
f"--container-mounts={env_config['container_mount']}",
f"--mpi=pmix --overlap -N 1 -n 1",
f"--no-container-mount-home --mpi=pmix --overlap -N 1 -n 1",
f"bash {env_config['work_dir']}/start_server.sh {os.path.join(log_dir, 'server_config.yaml')} \"{server_env_var}\"",
f"&> {log_dir}/4_output_server.log &",
]
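Both submit.py hunks add the pyxis `--no-container-mount-home` flag to the srun invocations, so the user's home directory is no longer bind-mounted into the container alongside the explicit `--container-mounts`. A minimal sketch of how the worker command assembles, with illustrative stand-ins for the values the script reads from its config:

```python
import os

# Illustrative stand-ins for values the benchmark script reads from its config.
env_config = {
    "container_image": "nvcr.io/example/tensorrt-llm:latest",  # hypothetical image
    "container_mount": "/lustre:/lustre",
    "work_dir": "/lustre/benchmark",
}
container_name = "worker_0"

cmd = " ".join([
    "srun",
    f"--container-image {env_config['container_image']}",
    f"--container-name {container_name}",
    f"--container-mounts {env_config['container_mount']}",
    # New in this change: do not bind-mount $HOME into the container.
    "--no-container-mount-home --mpi=pmix --overlap",
    f"bash {os.path.join(env_config['work_dir'], 'start_worker.sh')}",
])
print(cmd)
```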
3 changes: 2 additions & 1 deletion jenkins/L0_Test.groovy
@@ -3256,12 +3256,13 @@ def launchTestJobs(pipeline, testFilter)
"DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Ray-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
"DGX_B200-4_GPUs-PyTorch-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
"DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
"DGX_B200-8_GPUs-PyTorch-1": ["b200-x8-lbd", "l0_dgx_b200", 1, 1, 8, 1, true],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 2, 4, 1, true],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-x4-lbd", "l0_dgx_b200", 2, 2, 4, 1, true],
"B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
"DGX_B300-4_GPUs-PyTorch-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
// PerfSanity post-merge tests
5 changes: 3 additions & 2 deletions tensorrt_llm/_torch/attention_backend/sparse/dsa.py
@@ -813,13 +813,14 @@ def on_update_kv_lens(self):
# Expand schedule metadata buffer (only generation)
kv_lens_expanded = self.kv_lens_expanded_cuda[:num_tokens]
scheduler_metadata_buffer_expanded = get_paged_mqa_logits_metadata(
kv_lens_expanded, tokens_per_block, self.num_sms)
kv_lens_expanded, self.kv_cache_manager.tokens_per_block,
self.num_sms)
self.scheduler_metadata_buffer_expanded.copy_(
scheduler_metadata_buffer_expanded, non_blocking=True)
elif self.max_draft_tokens == 3:
scheduler_metadata_buffer_mtp3 = get_paged_mqa_logits_metadata(
self.kv_lens_cuda[self.num_contexts:self.num_seqs],
tokens_per_block, self.num_sms // 2)
self.kv_cache_manager.tokens_per_block, self.num_sms // 2)
self.scheduler_metadata_buffer_mtp3.copy_(
scheduler_metadata_buffer_mtp3, non_blocking=True)
self.prepare_dense_topk_indices(self.kv_lens_cuda, device=True)
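The dsa.py hunk replaces a bare `tokens_per_block` name with the value held by the KV cache manager, so the MQA-logits scheduler metadata is always built with the configured KV block size. A minimal sketch of the pattern, assuming only that the manager exposes a `tokens_per_block` attribute; the stub below stands in for both the manager and the kernel helper:

```python
import torch


class KvCacheManagerStub:
    """Stand-in for the real KV cache manager; only the field used here."""
    tokens_per_block = 64


def get_paged_mqa_logits_metadata_stub(kv_lens, tokens_per_block, num_sms):
    """Stub mirroring the call shape in the diff.

    The real helper builds kernel scheduling metadata; this stub only reports
    how many KV blocks each sequence spans. num_sms is accepted solely to
    mirror the real signature.
    """
    return (kv_lens + tokens_per_block - 1) // tokens_per_block


kv_cache_manager = KvCacheManagerStub()
kv_lens_expanded = torch.tensor([128, 200, 513], dtype=torch.int32)

# After the fix, the block size is read from the cache manager:
blocks_per_seq = get_paged_mqa_logits_metadata_stub(
    kv_lens_expanded, kv_cache_manager.tokens_per_block, num_sms=132)
print(blocks_per_seq)  # tensor([2, 4, 9], dtype=torch.int32)
```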
11 changes: 11 additions & 0 deletions tests/integration/defs/accuracy/references/json_mode_eval.yaml
@@ -8,3 +8,14 @@ deepseek-ai/DeepSeek-V3-Lite:
- accuracy: 77.00
- spec_dec_algo: MTP
accuracy: 77.00
google/gemma-3-1b-it:
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 61.00
GPT-OSS/120B-MXFP4:
- quant_algo: W4A16_MXFP4
spec_dec_algo: Eagle
accuracy: 62.00
- quant_algo: W4A8_MXFP4_MXFP8
spec_dec_algo: Eagle
accuracy: 62.00
140 changes: 140 additions & 0 deletions tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -1105,6 +1105,37 @@ def test_fp8_prequantized(self):
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)

def test_fp8_vswa_reuse(self):
# NOTE: Test with VSWA kv cache config.
kv_cache_config = KvCacheConfig(
enable_block_reuse=True,
max_attention_window=[512, 512, 512, 512, 512, 32768],
)
prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-1b-it-fp8/"
with LLM(prequantized_model_path,
kv_cache_config=kv_cache_config) as llm:
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)

@pytest.mark.parametrize("backend", ["xgrammar"])
def test_fp8_guided_decoding_vswa_reuse(self, backend: str, mocker):
mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-1b-it-fp8/"
kv_cache_config = KvCacheConfig(
enable_block_reuse=True,
max_attention_window=[512, 512, 512, 512, 512, 32768],
)
cuda_graph_config = CudaGraphConfig(enable_padding=True)
llm = LLM(prequantized_model_path,
guided_decoding_backend=backend,
kv_cache_config=kv_cache_config,
cuda_graph_config=cuda_graph_config)
with llm:
task = JsonModeEval(self.MODEL_NAME)
task.evaluate(llm)

def test_auto_dtype_vswa_without_reuse(self):
# NOTE: Test with VSWA kv cache config.
kv_cache_config = KvCacheConfig(
@@ -2269,6 +2300,7 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
torch.cuda.empty_cache()

@skip_pre_blackwell
@pytest.mark.skip_less_device_memory(95000)
@pytest.mark.parametrize(
"tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend",
[
@@ -4460,6 +4492,114 @@ def test_eagle3_4gpus(self, moe_backend, one_model, overlap_scheduler,
sampling_params=sampling_params,
extra_evaluator_kwargs=extra_evaluator_kwargs)

@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize("one_model", [True, False],
ids=["one_model", "two_model"])
def test_eagle3_vswa_reuse_4gpus(self, one_model, mocker):
MAX_OUTPUT_LEN = 128179
MAX_INPUT_LEN = 32768

mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
{"scores_filter": "exact_match,flexible-extract"})

mocker.patch.object(GPQADiamond, "MAX_OUTPUT_LEN", MAX_OUTPUT_LEN)
mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)

pytorch_config = dict(cuda_graph_config=CudaGraphConfig())
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
dtype="auto",
enable_block_reuse=True,
max_attention_window=[128, 32768])

eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
draft_len = 3
spec_config = EagleDecodingConfig(max_draft_len=draft_len,
speculative_model_dir=eagle_model_dir,
eagle3_one_model=one_model,
allow_advanced_sampling=True)

max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN
llm = LLM(self.MODEL_PATH,
tensor_parallel_size=4,
pipeline_parallel_size=1,
moe_expert_parallel_size=1,
kv_cache_config=kv_cache_config,
max_seq_len=max_seq_len,
speculative_config=spec_config,
**pytorch_config,
enable_attention_dp=False)

with llm:
model_name = "GPT-OSS/120B-MXFP4"

# GSM8K
task = GSM8K(model_name)
task.evaluate(llm,
extra_evaluator_kwargs=self.extra_evaluator_kwargs)

# GPQA Medium Reasoning
task = GPQADiamond(model_name)

chat_template_kwargs = dict(reasoning_effort="medium")
extra_evaluator_kwargs = {
**self.extra_evaluator_kwargs, "chat_template_kwargs":
chat_template_kwargs
}

sampling_params = SamplingParams(
temperature=1.0,
top_p=1.0,
max_tokens=MAX_OUTPUT_LEN,
truncate_prompt_tokens=MAX_INPUT_LEN)

task.evaluate(llm,
sampling_params=sampling_params,
extra_evaluator_kwargs=extra_evaluator_kwargs)

@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize("one_model", [True, False],
ids=["one_model", "two_model"])
def test_eagle3_guided_decoding_4gpus(self, one_model, mocker):
MAX_OUTPUT_LEN = 128179
MAX_INPUT_LEN = 32768

mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
{"scores_filter": "exact_match,flexible-extract"})

mocker.patch.object(GPQADiamond, "MAX_OUTPUT_LEN", MAX_OUTPUT_LEN)
mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)

pytorch_config = dict(cuda_graph_config=CudaGraphConfig())
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
dtype="auto")

eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
draft_len = 3
spec_config = EagleDecodingConfig(max_draft_len=draft_len,
speculative_model_dir=eagle_model_dir,
eagle3_one_model=one_model,
allow_advanced_sampling=True)

max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN
llm = LLM(self.MODEL_PATH,
tensor_parallel_size=4,
pipeline_parallel_size=1,
moe_expert_parallel_size=1,
guided_decoding_backend="xgrammar",
kv_cache_config=kv_cache_config,
max_seq_len=max_seq_len,
speculative_config=spec_config,
**pytorch_config,
enable_attention_dp=False)

with llm:
model_name = "GPT-OSS/120B-MXFP4"
task = JsonModeEval(model_name)
task.evaluate(llm)

@pytest.mark.skip_less_device(2)
@pytest.mark.timeout(14400)
@pytest.mark.parametrize("overlap_scheduler", [True, False],
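The new Gemma-3 tests exercise a VSWA (per-layer sliding-window) KV cache with block reuse enabled, and the GPT-OSS tests combine Eagle3 speculative decoding with VSWA reuse and xgrammar guided decoding. A minimal standalone sketch of the VSWA configuration used in test_fp8_vswa_reuse, assuming the tensorrt_llm LLM API as used in this test file; the checkpoint path is illustrative:

```python
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig

# Mirrors test_fp8_vswa_reuse: five 512-token sliding windows plus one
# 32768-token global window, with KV block reuse turned on.
kv_cache_config = KvCacheConfig(
    enable_block_reuse=True,
    max_attention_window=[512, 512, 512, 512, 512, 32768],
)

# Illustrative model path; the test points at an FP8 Gemma-3-1B checkpoint.
with LLM("google/gemma-3-1b-it", kv_cache_config=kv_cache_config) as llm:
    outputs = llm.generate(["The capital of France is"],
                           SamplingParams(max_tokens=16))
    print(outputs[0].outputs[0].text)
```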
6 changes: 6 additions & 0 deletions tests/integration/test_lists/qa/llm_function_core.txt
@@ -430,6 +430,8 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_vswa_reuse
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_guided_decoding_vswa_reuse[xgrammar]
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
@@ -613,6 +615,10 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[one_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
15 changes: 15 additions & 0 deletions tests/integration/test_lists/qa/llm_function_rtx6k.txt
@@ -151,6 +151,9 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_sm120[throughput_tp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler]
@@ -204,6 +207,18 @@ accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]

accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[one_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model]
test_e2e.py::test_ptp_quickstart_advanced_mixed_precision
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]