diff --git a/docs/source/media/ray_orchestrator_architecture.jpg b/docs/source/media/ray_orchestrator_architecture.jpg
new file mode 100644
index 00000000000..482b5f9c3c8
Binary files /dev/null and b/docs/source/media/ray_orchestrator_architecture.jpg differ
diff --git a/examples/llm-api/ray/README.md b/examples/llm-api/ray/README.md
index 63082c6e3c0..d60484699ea 100644
--- a/examples/llm-api/ray/README.md
+++ b/examples/llm-api/ray/README.md
@@ -3,3 +3,41 @@
# TensorRT-LLM with Ray orchestrator
+
+This folder contains examples for an experimental **Ray orchestrator** that supports on-demand LLM instance spin-up and flexible GPU placement across single- and multi-node inference. It's a first step toward making TensorRT-LLM a better fit for reinforcement learning from human feedback (RLHF) workflows. For RLHF, [Ray](https://docs.ray.io/en/latest/index.html) — unlike MPI's fixed world size and placement — can dynamically spawn and reconnect distributed inference actors, each with its own parallelism strategy.
+
+This feature is a prototype and under active development. MPI remains the default and will continue to be supported in the long term.
+
+
+## Quick Start
+
+Run a simple `TP=2` example with a Hugging Face model:
+
+```shell
+cd examples/llm-api/ray
+python llm_inference_distributed_ray.py
+```
+
+This example is the same as in `/examples/llm-api`, with the only change being `orchestrator_type="ray"` on `LLM()`. Other examples can be adapted similarly by toggling this flag.
+
+
+## Features
+### Available
+- Generate text asynchronously (refer to [llm_inference_async_ray.py](llm_inference_async_ray.py))
+- Multi-node inference (refer to [multi-node README](./multi_nodes/README.md))
+- Disaggregated serving (refer to [disagg README](./disaggregated/README.md))
+
+**Note:** Initial testing has focused on LLaMA and DeepSeek variants. Please open an issue if you encounter problems with other models so we can prioritize support.
+
+### Upcoming
+- Performance optimization
+- Integration with RLHF frameworks, such as [NVIDIA NeMo-RL](https://github.com/NVIDIA-NeMo/RL) and [verl](https://github.com/volcengine/verl).
+
+## Architecture
+This feature introduces new classes such as [RayExecutor](/tensorrt_llm/executor/ray_executor.py) and [RayGPUWorker](/tensorrt_llm/executor/ray_gpu_worker.py) for Ray actor lifecycle management and distributed inference. In Ray mode, collective ops run on [torch.distributed](https://docs.pytorch.org/tutorials/beginner/dist_overview.html) without MPI. We welcome contributions to improve and extend this support.
+
+
+
+
+## Disclaimer
+The code is experimental and subject to change. Currently, there are no guarantees regarding functionality, performance, or stability.
diff --git a/examples/llm-api/ray/disaggregated/README.md b/examples/llm-api/ray/disaggregated/README.md
index d7da62b301b..38811688ea5 100644
--- a/examples/llm-api/ray/disaggregated/README.md
+++ b/examples/llm-api/ray/disaggregated/README.md
@@ -1,127 +1,28 @@
-TODO: need rewrite
+# Disaggregated Serving with Ray orchestrator
+TensorRT-LLM supports an experimental [Ray orchestrator](../README.md) as an alternative to MPI.
-# TensorRT-LLM OpenAPI Client Examples
+Running disaggregated serving with Ray follows [the same workflow as in MPI](/examples/disaggregated/README.md), except that `orchestrator_type="ray"` must be set on the `LLM` class, and `CUDA_VISIBLE_DEVICES` can be omitted since Ray handles GPU placement.
-This directory contains simple client examples using the `requests` library to interact with TensorRT-LLM OpenAPI servers.
-## Files
+## Quick Start
+The script below is a shortcut that launches a single-GPU context server, a single-GPU generation server, and the disaggregated server within a single Ray cluster. Please see [this page](/examples/disaggregated/README.md) for details on adjusting parallel settings.
-- **`disagg_serving_test.py`** - Comprehensive client with multiple test cases
-- **`simple_client_example.py`** - Minimal example showing core usage patterns
-
-## Prerequisites
-
-1. Start a TensorRT-LLM server manually using one of these methods:
-
-### Option A: Using trtllm-serve (Recommended)
-```bash
-trtllm-serve "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-```
-
-### Option B: Using the FastAPI server example
-```bash
-cd ../apps
-python fastapi_server.py TinyLlama/TinyLlama-1.1B-Chat-v1.0
-```
-
-### Option C: Using any OpenAI-compatible server
-Make sure it has a `/v1/chat/completions` endpoint that accepts:
-```json
-{
- "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
- "messages": [{"role": "user", "content": "Your prompt here"}],
- "max_tokens": 100,
- "temperature": 0.8,
- "stream": false
-}
-```
-
-## Usage
-
-### Simple Client (Minimal Example)
```bash
-python simple_client_example.py
+# requires a total of two GPUs
+bash -e disagg_serving_local.sh
```
-### Comprehensive Client
+Once the disaggregated server is ready, you can send requests to the disaggregated server using curl:
```bash
-# Default server URL (http://localhost:8000)
-python disagg_serving_test.py
-
-# Custom server URL
-python disagg_serving_test.py --server-url http://localhost:8080
-```
-
-## Example Output
-
-```
-π€ Testing TensorRT-LLM server at: http://localhost:8000
-==================================================
-1. Health check...
- β
Server healthy: {'status': 'healthy'}
-
-2. Testing text generation...
-
-π― Test 1: 'Hello, my name is'
- Generated: 'John and I am a software engineer...'
-
-π― Test 2: 'The capital of France is'
- Generated: 'Paris, the city of lights...'
-
-3. Testing streaming generation...
-π― Streaming: 'Write a short story about a robot:'
-π‘ Response: Once upon a time, there was a robot named...
-
-β
All tests completed!
-```
-
-## API Endpoints Used
-
-The clients expect these endpoints:
-
-- `GET /health` - Health check (fallback: simple chat completion request)
-- `POST /v1/chat/completions` - OpenAI-compatible chat completions
-- Streaming support via Server-Sent Events (SSE)
-
-## Key Features Demonstrated
-
-1. **Basic text generation** with requests.post()
-2. **Streaming response** handling with SSE
-3. **Error handling** for connection issues
-4. **Session management** for efficient connections
-5. **OpenAI-compatible format** with messages array
-
-## Example Code
-
-The minimal example shows exactly what you need:
-
-```python
-import requests
-
-# Basic generation (OpenAI format)
-payload = {
- "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
- "messages": [{"role": "user", "content": "Hello, my name is"}],
- "max_tokens": 50,
- "temperature": 0.8
-}
-response = requests.post("http://localhost:8000/v1/chat/completions", json=payload)
-result = response.json()
-text = result["choices"][0]["message"]["content"]
-
-# Streaming generation
-payload = {
- "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
- "messages": [{"role": "user", "content": "The future of AI is"}],
- "stream": True
-}
-response = requests.post("http://localhost:8000/v1/chat/completions", json=payload, stream=True)
-```
-
-## Customization
-
-You can modify the scripts to:
-- Change server URLs
-- Adjust generation parameters (temperature, max_tokens, etc.)
-- Add new test prompts
-- Handle different response formats
+curl http://localhost:8000/v1/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+ "prompt": "NVIDIA is a great company because",
+ "max_tokens": 16,
+ "temperature": 0
+ }' -w "\n"
+```
+
+## Disclaimer
+The code is experimental and subject to change. Currently, there are no guarantees regarding functionality, performance, or stability.
diff --git a/examples/llm-api/ray/multi_nodes/README.md b/examples/llm-api/ray/multi_nodes/README.md
index af72c2f3967..06196bfbd78 100644
--- a/examples/llm-api/ray/multi_nodes/README.md
+++ b/examples/llm-api/ray/multi_nodes/README.md
@@ -1,9 +1,10 @@
# Multi-node inference with Ray orchestrator
-TensorRT-LLM supports [Ray](https://docs.ray.io/en/latest/index.html) as an orchestrator with [PyTorch Distributed](https://docs.pytorch.org/tutorials/beginner/dist_overview.html) as an alternative to MPI. This feature is currently experimental and under active development.
+TensorRT-LLM supports an experimental [Ray orchestrator](../README.md) as an alternative to MPI. The following example shows how to start a Ray cluster for multi-node inference.
-**Prerequisite:** a container image with TensorRT-LLM preinstalled (or suitable for installing it). The examples use Slurm and [Enroot](https://github.com/NVIDIA/enroot); if you use a different setup, adapt the container options and launch commands to your multi-node environment.
-## Run multi-node inference with Ray
+## Quick Start
+
+**Prerequisite:** a container image with TensorRT-LLM preinstalled (or suitable for installing it). The examples use Slurm and [Enroot](https://github.com/NVIDIA/enroot). If you use a different setup, adapt the following scripts and commands to your multi-node environment.
1. Allocate nodes and open a shell on the head node:
@@ -32,5 +33,9 @@ TensorRT-LLM supports [Ray](https://docs.ray.io/en/latest/index.html) as an orch
# Under your work directory:
>> pip install -e . # if needed
+ # You can edit this script to use a model and parallel settings suited to multi-node inference (e.g., TP8 or TP4PP4)
>> python examples/ray/llm_inference_async_ray.py
```
+
+## Disclaimer
+The code is experimental and subject to change. Currently, there are no guarantees regarding functionality, performance, or stability.
diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py
index a830ade11f5..a4ce0092a0b 100644
--- a/tensorrt_llm/_torch/model_config.py
+++ b/tensorrt_llm/_torch/model_config.py
@@ -1,6 +1,4 @@
import contextlib
-import copy
-import dataclasses
import json
import os
from dataclasses import dataclass, field
@@ -565,17 +563,3 @@ def get_num_attention_layers(self):
return self.pretrained_config.hybrid_override_pattern.count("*")
else:
return self.pretrained_config.num_hidden_layers
-
- def clone(self) -> "ModelConfig[TConfig]":
- """
- Create a clone of the config.
- """
- shallow_fields = ["mapping"]
-
- clone = dataclasses.replace(self,
- **{field: None
- for field in shallow_fields})
- clone = copy.deepcopy(clone)
- for field in shallow_fields:
- setattr(clone, field, getattr(self, field))
- return clone
diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
index 350d6ab7442..0f886d0cd5c 100644
--- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py
+++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -25,6 +25,7 @@
# SOFTWARE.
# --------------------------------------------------
+import copy
import math
import os
import warnings
@@ -1163,7 +1164,7 @@ class DeepseekV3ForCausalLM(SpecDecOneEngineForCausalLM[DeepseekV3Model,
def __init__(self, model_config: ModelConfig[PretrainedConfig]):
# Rename some keys of quant_config_dict to support legacy checkpoints
if model_config.quant_config_dict is not None:
- model_config = model_config.clone()
+ model_config = copy.deepcopy(model_config)
quant_config_dict = {}
for key, val in model_config.quant_config_dict.items():
key_split = key.split(".")
diff --git a/tensorrt_llm/_torch/models/modeling_gemma3vl.py b/tensorrt_llm/_torch/models/modeling_gemma3vl.py
index fb0795b2cb0..37770d2f0d3 100644
--- a/tensorrt_llm/_torch/models/modeling_gemma3vl.py
+++ b/tensorrt_llm/_torch/models/modeling_gemma3vl.py
@@ -1,3 +1,4 @@
+import copy
import dataclasses
import os
from typing import List, Optional, Tuple
@@ -164,7 +165,7 @@ def __init__(self, model_config: ModelConfig[Gemma3Config]):
dtype=torch.int32,
device=self._device)
- model_config_cp = model_config.clone()
+ model_config_cp = copy.deepcopy(model_config)
self.model_config = model_config_cp
llm_model_config = self.get_sub_model_config(model_config_cp,
diff --git a/tensorrt_llm/_torch/models/modeling_hyperclovax.py b/tensorrt_llm/_torch/models/modeling_hyperclovax.py
index 452b6b13e93..a05784b9d8d 100644
--- a/tensorrt_llm/_torch/models/modeling_hyperclovax.py
+++ b/tensorrt_llm/_torch/models/modeling_hyperclovax.py
@@ -1,3 +1,4 @@
+import copy
import math
import os
from functools import partial
@@ -990,7 +991,7 @@ def __init__(self, model_config: ModelConfig):
return
if not DISAGG:
self.mm_encoder = HCXVisionModel(model_config)
- llm_model_config = model_config.clone()
+ llm_model_config = copy.deepcopy(model_config)
llm_model_config.pretrained_config = PretrainedConfig.from_dict(
llm_model_config.pretrained_config.language_config)
self.llm = AutoModelForCausalLM.from_config(llm_model_config)
diff --git a/tensorrt_llm/_torch/models/modeling_llava_next.py b/tensorrt_llm/_torch/models/modeling_llava_next.py
index 73d0867c127..282ec0c5cc7 100644
--- a/tensorrt_llm/_torch/models/modeling_llava_next.py
+++ b/tensorrt_llm/_torch/models/modeling_llava_next.py
@@ -1,3 +1,4 @@
+import copy
import os
from typing import Dict, List, Optional, Tuple, Union
@@ -424,7 +425,7 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig], *args,
if not DISAGG:
self.mm_encoder = LlavaNextVisionModel(model_config)
- llm_model_config = model_config.clone()
+ llm_model_config = copy.deepcopy(model_config)
llm_model_config.pretrained_config = model_config.pretrained_config.text_config
# TODO Remove these when MistralConfig is natively supported
diff --git a/tensorrt_llm/_torch/models/modeling_qwen2vl.py b/tensorrt_llm/_torch/models/modeling_qwen2vl.py
index 138b387517f..74b41e8c93a 100644
--- a/tensorrt_llm/_torch/models/modeling_qwen2vl.py
+++ b/tensorrt_llm/_torch/models/modeling_qwen2vl.py
@@ -1,3 +1,4 @@
+import copy
import os
from typing import Any, Dict, List, Optional, Tuple, Union
@@ -475,7 +476,7 @@ def __init__(
if hasattr(self, "llm"):
return
- llm_model_config = model_config.clone()
+ llm_model_config = copy.deepcopy(model_config)
llm_model_config.pretrained_config.architectures = ["Qwen2ForCausalLM"]
self.llm = AutoModelForCausalLM.from_config(llm_model_config)
self.vocab_size = config.vocab_size