Pearl #13551 (Draft)

5 changes: 5 additions & 0 deletions .gitignore
@@ -110,3 +110,8 @@ enroot/tensorrt_llm.devel.sqsh
 .claude/agent-memory/
 .claude/agent-tests/perf-test-sync/report.html
 .claude/agent-tests/perf-test-sync/results.json
+
+# Runtime third-party dependencies: symlinks created by cmake build, not part of this repo
+tensorrt_llm/deep_ep
+tensorrt_llm/deep_gemm
+tensorrt_llm/flash_mla
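The three ignored paths are symlinks that the cmake build drops into the source tree rather than files checked into the repo, so ignoring them keeps `git status` clean after a build. A small sanity-check sketch (the paths come from the diff above; the check itself is illustrative, not part of this PR):

# Sketch: confirm the build created the expected third-party symlinks.
from pathlib import Path

for name in ("deep_ep", "deep_gemm", "flash_mla"):
    link = Path("tensorrt_llm") / name
    if link.is_symlink():
        print(link, "->", link.resolve())
    else:
        print(link, "missing (build not run?)")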
3 changes: 2 additions & 1 deletion docker/Makefile
@@ -132,6 +132,7 @@ base_pull:
 	docker pull $(IMAGE_WITH_TAG)

 DOCKER_RUN_OPTS ?= --rm -it --ipc=host --ulimit stack=67108864 $(if $(filter 0,$(IS_ROOTLESS)),--ulimit memlock=-1)
+RDMA_DOCKER_RUN_ARGS ?= $(shell if [ -e /dev/infiniband ]; then printf '%s' '--device=/dev/infiniband --cap-add=IPC_LOCK --cap-add=SYS_RESOURCE --cap-add=NET_RAW --volume /sys/class/infiniband:/sys/class/infiniband:ro --volume /sys/devices:/sys/devices:ro'; fi)
 DOCKER_RUN_ARGS ?=
 # Check if NVIDIA_VISIBLE_DEVICES is set and not empty
 NVIDIA_VISIBLE_DEVICES_VAL = $(shell echo $$NVIDIA_VISIBLE_DEVICES)
@@ -163,7 +164,7 @@ endif
 ifeq ($(LOCAL_USER),1)
 	$(call add_local_user,$(IMAGE_WITH_TAG))
 endif
-	docker run $(DOCKER_RUN_OPTS) $(DOCKER_RUN_ARGS) \
+	docker run $(DOCKER_RUN_OPTS) $(RDMA_DOCKER_RUN_ARGS) $(DOCKER_RUN_ARGS) \
 		$(GPU_OPTS) \
 		--volume $(SOURCE_DIR):$(CODE_DIR) \
 		$(EXTRA_VOLUMES) \
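`RDMA_DOCKER_RUN_ARGS` adds the InfiniBand device node, the capabilities RDMA needs (`IPC_LOCK`, `SYS_RESOURCE`, `NET_RAW`), and read-only sysfs mounts, but only when `/dev/infiniband` exists on the host, so the run target still works on machines without RDMA hardware. A rough Python rendering of the same shell detection (illustrative only; the Makefile fragment above is authoritative):

# Sketch: emit the RDMA docker-run flags only when the host exposes
# an InfiniBand device node, mirroring RDMA_DOCKER_RUN_ARGS above.
import os

def rdma_docker_run_args() -> list[str]:
    if not os.path.exists("/dev/infiniband"):
        return []  # no IB devices: run the container without RDMA flags
    return [
        "--device=/dev/infiniband",
        "--cap-add=IPC_LOCK",
        "--cap-add=SYS_RESOURCE",
        "--cap-add=NET_RAW",
        "--volume", "/sys/class/infiniband:/sys/class/infiniband:ro",
        "--volume", "/sys/devices:/sys/devices:ro",
    ]

print(" ".join(rdma_docker_run_args()))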
7 changes: 7 additions & 0 deletions docker/common/install_base.sh
@@ -69,6 +69,13 @@ init_ubuntu() {
     apt remove -y ibverbs-providers libibverbs1
     apt-get --reinstall install -y libibverbs-dev
     apt-get install -y --no-install-recommends \
+        rdma-core \
+        libibverbs1 \
+        libibverbs-dev \
+        ibverbs-utils \
+        ibverbs-providers \
+        gcc \
+        libc-dev \
         libtool \
         autoconf \
         automake \
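The added packages install the userspace verbs stack (`rdma-core`, the `libibverbs` library and headers, and its providers) inside the image. `ibverbs-utils` also ships the `ibv_devices` CLI, which gives a quick way to confirm the container actually sees the RDMA NICs. A minimal sketch of that check:

# Sketch: verify the ibverbs userspace stack installed above is usable.
# ibv_devices comes from the ibverbs-utils package added in this diff.
import shutil
import subprocess

if shutil.which("ibv_devices") is None:
    raise SystemExit("ibverbs-utils not installed (ibv_devices not on PATH)")
subprocess.run(["ibv_devices"], check=True)  # lists devices such as mlx5_0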
96 changes: 96 additions & 0 deletions examples/llm-api/llm_rdma_draft_offload.py
@@ -0,0 +1,96 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0
"""Minimal target-side RDMA draft offload example.

Start the fake draft server first, then run this script. The target model is a
real TensorRT-LLM LLM; the draft model is temporarily replaced by the fake RDMA
peer.
"""

from __future__ import annotations

import argparse
import os
import sys
from pathlib import Path

_REPO_ROOT = Path(__file__).resolve().parents[2]
if str(_REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(_REPO_ROOT))

DEFAULT_MODEL = "/scratch.trt_llm_data/llm-models/Qwen3/Qwen3-8B"


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.add_argument("--prompt", default="Explain GPUDirect RDMA in one short sentence.")
    parser.add_argument("--max-tokens", type=int, default=16)
    parser.add_argument("--max-draft-len", type=int, default=5)
    parser.add_argument("--draft-host", default="127.0.0.1")
    parser.add_argument("--draft-port", type=int, default=47320)
    parser.add_argument("--ib-dev", default="mlx5_0")
    parser.add_argument(
        "--gpu-id",
        type=int,
        default=0,
        help="Physical GPU ID for RDMA memory registration (target side)",
    )
    parser.add_argument(
        "--cuda-visible-devices",
        default="0",
        help="Set CUDA_VISIBLE_DEVICES before importing TensorRT-LLM.",
    )
    parser.add_argument("--max-batch-size", type=int, default=1)
    parser.add_argument("--max-seq-len", type=int, default=512)
    parser.add_argument("--max-num-tokens", type=int, default=512)
    parser.add_argument("--kv-cache-max-tokens", type=int, default=512)
    parser.add_argument("--kv-cache-free-gpu-memory-fraction", type=float, default=0.05)
    return parser.parse_args()


def main() -> None:
    args = parse_args()
    if args.cuda_visible_devices is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_visible_devices

    from tensorrt_llm import LLM, SamplingParams
    from tensorrt_llm.llmapi import DraftTargetDecodingConfig, KvCacheConfig

    spec_config = DraftTargetDecodingConfig(
        max_draft_len=args.max_draft_len,
        draft_offload_enabled=True,
        draft_offload_nic_name=args.ib_dev,
        draft_offload_server_host=args.draft_host,
        draft_offload_server_port=args.draft_port,
        draft_offload_gpu_id=args.gpu_id,
    )
    # Disable CUDA graphs: RDMA draft calls are Python-side operations and
    # would not be re-executed on each CUDA graph replay.
    # cuda_graph_config=None disables CUDA graphs entirely (an empty
    # CudaGraphConfig does NOT disable them because the validator auto-fills
    # batch sizes).
    llm = LLM(
        model=args.model,
        speculative_config=spec_config,
        disable_overlap_scheduler=True,
        tensor_parallel_size=1,
        cuda_graph_config=None,
        max_batch_size=args.max_batch_size,
        max_seq_len=args.max_seq_len,
        max_num_tokens=args.max_num_tokens,
        kv_cache_config=KvCacheConfig(
            max_tokens=args.kv_cache_max_tokens,
            free_gpu_memory_fraction=args.kv_cache_free_gpu_memory_fraction,
        ),
    )
    output = llm.generate(
        args.prompt,
        SamplingParams(max_tokens=args.max_tokens),
        use_tqdm=False,
    )
    print(output.outputs[0].text)


if __name__ == "__main__":
    main()
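For reference, a hedged usage sketch: the fake draft server must already be listening on `--draft-host`/`--draft-port` before this script starts, and CUDA graphs stay disabled because the RDMA draft calls happen on the Python side. All flags below come from `parse_args()` above; the model path is a placeholder, not a known checkpoint location:

# Usage sketch: launch the target-side example against an already-running
# fake draft server. Flags are the ones defined in parse_args() above.
import subprocess
import sys

subprocess.run(
    [
        sys.executable, "examples/llm-api/llm_rdma_draft_offload.py",
        "--model", "/path/to/Qwen3-8B",  # placeholder local checkpoint
        "--draft-host", "127.0.0.1",
        "--draft-port", "47320",
        "--ib-dev", "mlx5_0",  # RDMA NIC used for memory registration
        "--max-draft-len", "5",
        "--max-tokens", "16",
    ],
    check=True,
)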
9 changes: 7 additions & 2 deletions tensorrt_llm/_torch/models/modeling_speculative.py
@@ -1027,8 +1027,11 @@ def __init__(self, model: TModel, model_config: ModelConfig[TConfig]):
         spec_config = getattr(model_config, 'spec_config', None)
         self.spec_config = spec_config
         if spec_config and spec_config.spec_dec_mode.use_one_engine():
+            draft_offload_enabled = bool(
+                getattr(spec_config, "draft_offload_enabled", False))
             # Only create draft_model for modes MTP, Eagle3 (not SA)
-            if not spec_config.spec_dec_mode.is_sa():
+            if not spec_config.spec_dec_mode.is_sa(
+            ) and not draft_offload_enabled:
                 if spec_config.spec_dec_mode.is_eagle3_one_model():
                     if spec_config.eagle3_model_arch == "mistral_large3":
                         from tensorrt_llm._torch.models.checkpoints.mistral.config_loader import \
@@ -1105,6 +1108,7 @@ def forward(
         return_context_logits: bool = False,
         spec_metadata: Optional[SpecMetadata] = None,
         resource_manager=None,
+        is_warmup: bool = False,
         **kwargs,
     ) -> torch.Tensor:
         hidden_states = self.model(
@@ -1150,7 +1154,8 @@
                 attn_metadata=attn_metadata,
                 spec_metadata=spec_metadata,
                 draft_model=self.draft_model,
-                resource_manager=resource_manager)
+                resource_manager=resource_manager,
+                is_warmup=is_warmup)
         else:
             logits = self.logits_processor.forward(
                 hidden_states,
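The net effect of the `__init__` change: with `draft_offload_enabled` set, the one-engine path no longer instantiates a local draft model, since drafting happens on the remote RDMA peer. A minimal restatement of the new gate (the `spec_config` argument here is a stand-in for the real config object, not the repo's code):

# Sketch of the gating introduced above, with names taken from the diff.
def builds_local_draft_model(spec_config) -> bool:
    if not (spec_config and spec_config.spec_dec_mode.use_one_engine()):
        return False
    draft_offload_enabled = bool(
        getattr(spec_config, "draft_offload_enabled", False))
    # A local draft model is built only for one-engine modes such as
    # MTP/Eagle3 (not SA), and only when drafting is not offloaded.
    return not spec_config.spec_dec_mode.is_sa() and not draft_offload_enabled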
2 changes: 2 additions & 0 deletions tensorrt_llm/_torch/pyexecutor/_util.py
@@ -624,6 +624,8 @@ def _should_create_separate_draft_kv_cache(self) -> bool:
                 "Attention DP is enabled, separate draft KV cache is not supported."
             )
             return False
+        if getattr(self._speculative_config, "draft_offload_enabled", False):
+            return False
         return should_use_separate_draft_kv_cache(self._speculative_config)

     def _get_effective_draft_config(self) -> ModelConfig:
4 changes: 4 additions & 0 deletions tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -2200,6 +2200,7 @@ def _apply_incremental_update_target(
             'inputs_embeds': None,
             "multimodal_params": [],
             'resource_manager': resource_manager,
+            'is_warmup': self.is_warmup,
         }

         if bool(lora_params):
@@ -3052,6 +3053,7 @@ def previous_seq_slots_device():
             'inputs_embeds': None,
             "multimodal_params": multimodal_params_list,
             'resource_manager': resource_manager,
+            'is_warmup': self.is_warmup,
         }

         if bool(lora_params):
@@ -3224,6 +3226,7 @@ def _prepare_tp_inputs_no_cache(
             'inputs_embeds': None,
             "multimodal_params": multimodal_params_list,
             'resource_manager': resource_manager,
+            'is_warmup': self.is_warmup,
         }

         if bool(lora_params):
@@ -3492,6 +3495,7 @@ def _prepare_star_attention_inputs(
             'position_ids': self.position_ids_cuda[:num_tokens].unsqueeze(0),
             'inputs_embeds': None,
             'resource_manager': resource_manager,
+            'is_warmup': self.is_warmup,
         }, gather_ids if is_spec_decode else None

     def _get_lora_params_from_requests(
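All four input-building sites now forward `self.is_warmup`, so anything downstream of `forward` can tell warmup passes from real ones. One hypothetical consumer, assuming (this is not in the diff) that the draft-offload hook skips the remote call during warmup, consistent with the CUDA-graph caveat in the example script:

# Hypothetical consumer of the new is_warmup kwarg; draft_client and
# request_draft_tokens are illustrative names, not part of this PR.
def maybe_request_remote_draft(draft_client, input_ids, is_warmup: bool = False):
    if is_warmup:
        return None  # warmup pass: do not touch the remote draft peer
    return draft_client.request_draft_tokens(input_ids)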
10 changes: 6 additions & 4 deletions tensorrt_llm/_torch/pyexecutor/model_loader.py
@@ -394,8 +394,9 @@ def init_meta_tensor(t: torch.Tensor):
                 self._call_load_weights(model.load_weights, weights,
                                         self.weight_mapper)

-            if self.spec_config is not None and self.spec_config.spec_dec_mode.need_load_draft_weights(
-            ):
+            if (self.spec_config is not None and self.spec_config.
+                    spec_dec_mode.need_load_draft_weights() and not getattr(
+                        self.spec_config, "draft_offload_enabled", False)):
                 weights = checkpoint_loader.load_weights(
                     self.spec_config.speculative_model,
                     mapping=self.mapping)
@@ -414,8 +415,9 @@ def init_meta_tensor(t: torch.Tensor):
             self.weight_mapper = checkpoint_loader.get_initialized_weight_mapper(
                 model, config)
             initialize_dummy_weights(model)
-            if self.spec_config is not None and self.spec_config.spec_dec_mode.need_load_draft_weights(
-            ):
+            if (self.spec_config is not None and self.spec_config.
+                    spec_dec_mode.need_load_draft_weights() and not getattr(
+                        self.spec_config, "draft_offload_enabled", False)):
                 model.draft_model.load_weights_from_target_model(model)

         elif load_format == LoadFormat.VISION_ONLY:
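Both load paths now share the same three-part condition: a spec config exists, its mode needs draft weights, and drafting is not offloaded. Extracted as a helper for readability (illustrative; the diff inlines the expression instead):

# The condition duplicated in both branches above, written out once.
def should_load_local_draft_weights(spec_config) -> bool:
    return (spec_config is not None
            and spec_config.spec_dec_mode.need_load_draft_weights()
            and not getattr(spec_config, "draft_offload_enabled", False))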