diff --git a/docs/source/media/ray_orchestrator_architecture.jpg b/docs/source/media/ray_orchestrator_architecture.jpg new file mode 100644 index 00000000000..482b5f9c3c8 Binary files /dev/null and b/docs/source/media/ray_orchestrator_architecture.jpg differ diff --git a/examples/llm-api/ray/README.md b/examples/llm-api/ray/README.md index 63082c6e3c0..d60484699ea 100644 --- a/examples/llm-api/ray/README.md +++ b/examples/llm-api/ray/README.md @@ -3,3 +3,41 @@ # TensorRT-LLM with Ray orchestrator
+ +This folder contains examples for an experimental **Ray orchestrator** that supports on-demand LLM instance spin-up and flexible GPU placement across single- and multi-node inference. It’s a first step toward making TensorRT-LLM a better fit for reinforcement learning from human feedback (RLHF) workflows. For RLHF, [Ray](https://docs.ray.io/en/latest/index.html) — unlike MPI’s fixed world size and placement — can dynamically spawn and reconnect distributed inference actors, each with its own parallelism strategy. + +This feature is a prototype and under active development. MPI remains the default and will continue to be supported in the long term. + + +## Quick Start + +Run a simple `TP=2` example with a Hugging Face model: + +```shell +cd examples/llm-api/ray +python llm_inference_distributed_ray.py +``` + +This example is the same as in `/examples/llm-api`, with the only change being `orchestrator_type="ray"` on `LLM()`. Other examples can be adapted similarly by toggling this flag. + + +## Features +### Available +- Generate text asynchronously (refer to [llm_inference_async_ray.py](llm_inference_async_ray.py)) +- Multi-node inference (refer to [multi-node README](./multi_nodes/README.md)) +- Disaggregated serving (refer to [disagg README](./disaggregated/README.md)) + +**Initial testing has been focused on LLaMA and DeepSeek variants. Please open an Issue if you encounter problems with other models so we can prioritize support.** + +### Upcoming +- Performance optimization +- Integration with RLHF frameworks, such as [NVIDIA Nemo-RL](https://github.com/NVIDIA-NeMo/RL) and [Verl](https://github.com/volcengine/verl). + +## Architecture +This feature introduces new classes such as [RayExecutor](/tensorrt_llm/executor/ray_executor.py) and [RayGPUWorker](/tensorrt_llm/executor/ray_gpu_worker.py) for Ray actor lifecycle management and distributed inference. 
In Ray mode, collective ops run on [torch.distributed](https://docs.pytorch.org/tutorials/beginner/dist_overview.html) without MPI. We welcome contributions to improve and extend this support. + +![Ray orchestrator architecture](/docs/source/media/ray_orchestrator_architecture.jpg) + + +## Disclaimer +The code is experimental and subject to change. Currently, there are no guarantees regarding functionality, performance, or stability. diff --git a/examples/llm-api/ray/disaggregated/README.md b/examples/llm-api/ray/disaggregated/README.md index d7da62b301b..38811688ea5 100644 --- a/examples/llm-api/ray/disaggregated/README.md +++ b/examples/llm-api/ray/disaggregated/README.md @@ -1,127 +1,28 @@ -TODO: need rewrite +# Disaggregated Serving with Ray orchestrator +TensorRT-LLM supports an experimental [Ray orchestrator](../README.md) as an alternative to MPI. -# TensorRT-LLM OpenAPI Client Examples +Running disaggregated serving with Ray follows [the same workflow as in MPI](/examples/disaggregated/README.md), except that `orchestrator_type="ray"` must be set on the `LLM` class, and `CUDA_VISIBLE_DEVICES` can be omitted since Ray handles GPU placement. -This directory contains simple client examples using the `requests` library to interact with TensorRT-LLM OpenAPI servers. -## Files +## Quick Start +The following script is a shorthand for launching a single-GPU context server and a single-GPU generation server, as well as the disaggregated server, within a single Ray cluster. Please see [this page](/examples/disaggregated/README.md) for details on adjusting parallel settings. -- **`disagg_serving_test.py`** - Comprehensive client with multiple test cases -- **`simple_client_example.py`** - Minimal example showing core usage patterns - -## Prerequisites - -1. 
Start a TensorRT-LLM server manually using one of these methods: - -### Option A: Using trtllm-serve (Recommended) -```bash -trtllm-serve "TinyLlama/TinyLlama-1.1B-Chat-v1.0" -``` - -### Option B: Using the FastAPI server example -```bash -cd ../apps -python fastapi_server.py TinyLlama/TinyLlama-1.1B-Chat-v1.0 -``` - -### Option C: Using any OpenAI-compatible server -Make sure it has a `/v1/chat/completions` endpoint that accepts: -```json -{ - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "messages": [{"role": "user", "content": "Your prompt here"}], - "max_tokens": 100, - "temperature": 0.8, - "stream": false -} -``` - -## Usage - -### Simple Client (Minimal Example) ```bash -python simple_client_example.py +# requires a total of two GPUs +bash -e disagg_serving_local.sh ``` -### Comprehensive Client +Once the disaggregated server is ready, you can send requests to the disaggregated server using curl: ```bash -# Default server URL (http://localhost:8000) -python disagg_serving_test.py - -# Custom server URL -python disagg_serving_test.py --server-url http://localhost:8080 -``` - -## Example Output - -``` -🤖 Testing TensorRT-LLM server at: http://localhost:8000 -================================================== -1. Health check... - ✅ Server healthy: {'status': 'healthy'} - -2. Testing text generation... - -🎯 Test 1: 'Hello, my name is' - Generated: 'John and I am a software engineer...' - -🎯 Test 2: 'The capital of France is' - Generated: 'Paris, the city of lights...' - -3. Testing streaming generation... -🎯 Streaming: 'Write a short story about a robot:' -📡 Response: Once upon a time, there was a robot named... - -✅ All tests completed! -``` - -## API Endpoints Used - -The clients expect these endpoints: - -- `GET /health` - Health check (fallback: simple chat completion request) -- `POST /v1/chat/completions` - OpenAI-compatible chat completions -- Streaming support via Server-Sent Events (SSE) - -## Key Features Demonstrated - -1. 
**Basic text generation** with requests.post() -2. **Streaming response** handling with SSE -3. **Error handling** for connection issues -4. **Session management** for efficient connections -5. **OpenAI-compatible format** with messages array - -## Example Code - -The minimal example shows exactly what you need: - -```python -import requests - -# Basic generation (OpenAI format) -payload = { - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "messages": [{"role": "user", "content": "Hello, my name is"}], - "max_tokens": 50, - "temperature": 0.8 -} -response = requests.post("http://localhost:8000/v1/chat/completions", json=payload) -result = response.json() -text = result["choices"][0]["message"]["content"] - -# Streaming generation -payload = { - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "messages": [{"role": "user", "content": "The future of AI is"}], - "stream": True -} -response = requests.post("http://localhost:8000/v1/chat/completions", json=payload, stream=True) -``` - -## Customization - -You can modify the scripts to: -- Change server URLs -- Adjust generation parameters (temperature, max_tokens, etc.) -- Add new test prompts -- Handle different response formats +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "prompt": "NVIDIA is a great company because", + "max_tokens": 16, + "temperature": 0 + }' -w "\n" +``` + +## Disclaimer +The code is experimental and subject to change. Currently, there are no guarantees regarding functionality, performance, or stability. 
diff --git a/examples/llm-api/ray/multi_nodes/README.md b/examples/llm-api/ray/multi_nodes/README.md index af72c2f3967..06196bfbd78 100644 --- a/examples/llm-api/ray/multi_nodes/README.md +++ b/examples/llm-api/ray/multi_nodes/README.md @@ -1,9 +1,10 @@ # Multi-node inference with Ray orchestrator -TensorRT-LLM supports [Ray](https://docs.ray.io/en/latest/index.html) as an orchestrator with [PyTorch Distributed](https://docs.pytorch.org/tutorials/beginner/dist_overview.html) as an alternative to MPI. This feature is currently experimental and under active development. +TensorRT-LLM supports an experimental [Ray orchestrator](../README.md) as an alternative to MPI. The following example shows how to start a Ray cluster for multi-node inference. -**Prerequisite:** a container image with TensorRT-LLM preinstalled (or suitable for installing it). The examples use Slurm and [Enroot](https://github.com/NVIDIA/enroot); if you use a different setup, adapt the container options and launch commands to your multi-node environment. -## Run multi-node inference with Ray +## Quick Start + +**Prerequisite:** a container image with TensorRT-LLM preinstalled (or suitable for installing it). The examples use Slurm and [Enroot](https://github.com/NVIDIA/enroot). If you use a different setup, adapt the following scripts and commands to your multi-node environment. 1. Allocate nodes and open a shell on the head node: @@ -32,5 +33,9 @@ TensorRT-LLM supports [Ray](https://docs.ray.io/en/latest/index.html) as an orch # Under your work directory: >> pip install -e . # if needed + # You can edit this script to use a model and parallel settings suited to multi-node inference (e.g., TP8 or TP4PP4) >> python examples/ray/llm_inference_async_ray.py ``` + +## Disclaimer +The code is experimental and subject to change. Currently, there are no guarantees regarding functionality, performance, or stability. 
diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py index a830ade11f5..a4ce0092a0b 100644 --- a/tensorrt_llm/_torch/model_config.py +++ b/tensorrt_llm/_torch/model_config.py @@ -1,6 +1,4 @@ import contextlib -import copy -import dataclasses import json import os from dataclasses import dataclass, field @@ -565,17 +563,3 @@ def get_num_attention_layers(self): return self.pretrained_config.hybrid_override_pattern.count("*") else: return self.pretrained_config.num_hidden_layers - - def clone(self) -> "ModelConfig[TConfig]": - """ - Create a clone of the config. - """ - shallow_fields = ["mapping"] - - clone = dataclasses.replace(self, - **{field: None - for field in shallow_fields}) - clone = copy.deepcopy(clone) - for field in shallow_fields: - setattr(clone, field, getattr(self, field)) - return clone diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py index 350d6ab7442..0f886d0cd5c 100644 --- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py +++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py @@ -25,6 +25,7 @@ # SOFTWARE. 
# -------------------------------------------------- +import copy import math import os import warnings @@ -1163,7 +1164,7 @@ class DeepseekV3ForCausalLM(SpecDecOneEngineForCausalLM[DeepseekV3Model, def __init__(self, model_config: ModelConfig[PretrainedConfig]): # Rename some keys of quant_config_dict to support legacy checkpoints if model_config.quant_config_dict is not None: - model_config = model_config.clone() + model_config = copy.deepcopy(model_config) quant_config_dict = {} for key, val in model_config.quant_config_dict.items(): key_split = key.split(".") diff --git a/tensorrt_llm/_torch/models/modeling_gemma3vl.py b/tensorrt_llm/_torch/models/modeling_gemma3vl.py index fb0795b2cb0..37770d2f0d3 100644 --- a/tensorrt_llm/_torch/models/modeling_gemma3vl.py +++ b/tensorrt_llm/_torch/models/modeling_gemma3vl.py @@ -1,3 +1,4 @@ +import copy import dataclasses import os from typing import List, Optional, Tuple @@ -164,7 +165,7 @@ def __init__(self, model_config: ModelConfig[Gemma3Config]): dtype=torch.int32, device=self._device) - model_config_cp = model_config.clone() + model_config_cp = copy.deepcopy(model_config) self.model_config = model_config_cp llm_model_config = self.get_sub_model_config(model_config_cp, diff --git a/tensorrt_llm/_torch/models/modeling_hyperclovax.py b/tensorrt_llm/_torch/models/modeling_hyperclovax.py index 452b6b13e93..a05784b9d8d 100644 --- a/tensorrt_llm/_torch/models/modeling_hyperclovax.py +++ b/tensorrt_llm/_torch/models/modeling_hyperclovax.py @@ -1,3 +1,4 @@ +import copy import math import os from functools import partial @@ -990,7 +991,7 @@ def __init__(self, model_config: ModelConfig): return if not DISAGG: self.mm_encoder = HCXVisionModel(model_config) - llm_model_config = model_config.clone() + llm_model_config = copy.deepcopy(model_config) llm_model_config.pretrained_config = PretrainedConfig.from_dict( llm_model_config.pretrained_config.language_config) self.llm = AutoModelForCausalLM.from_config(llm_model_config) diff 
--git a/tensorrt_llm/_torch/models/modeling_llava_next.py b/tensorrt_llm/_torch/models/modeling_llava_next.py index 73d0867c127..282ec0c5cc7 100644 --- a/tensorrt_llm/_torch/models/modeling_llava_next.py +++ b/tensorrt_llm/_torch/models/modeling_llava_next.py @@ -1,3 +1,4 @@ +import copy import os from typing import Dict, List, Optional, Tuple, Union @@ -424,7 +425,7 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig], *args, if not DISAGG: self.mm_encoder = LlavaNextVisionModel(model_config) - llm_model_config = model_config.clone() + llm_model_config = copy.deepcopy(model_config) llm_model_config.pretrained_config = model_config.pretrained_config.text_config # TODO Remove these when MistralConfig is natively supported diff --git a/tensorrt_llm/_torch/models/modeling_qwen2vl.py b/tensorrt_llm/_torch/models/modeling_qwen2vl.py index 138b387517f..74b41e8c93a 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen2vl.py +++ b/tensorrt_llm/_torch/models/modeling_qwen2vl.py @@ -1,3 +1,4 @@ +import copy import os from typing import Any, Dict, List, Optional, Tuple, Union @@ -475,7 +476,7 @@ def __init__( if hasattr(self, "llm"): return - llm_model_config = model_config.clone() + llm_model_config = copy.deepcopy(model_config) llm_model_config.pretrained_config.architectures = ["Qwen2ForCausalLM"] self.llm = AutoModelForCausalLM.from_config(llm_model_config) self.vocab_size = config.vocab_size