Skip to content

Commit cdcbce9

Browse files
Merge pull request #595 from karya0/ai-dynamo-vllm_v1
Update AI Dynamo config to use vLLM_V1 API.
2 parents 8dad179 + 3eddbdc commit cdcbce9

File tree

9 files changed

+173
-200
lines changed

9 files changed

+173
-200
lines changed

doc/ai_dynamo.md

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,9 @@ The path to the downloaded weights should be consistent with the structure expec
5151

5252
---
5353

54-
### Step 2: Configure `HF_HOME` in the Test Schema
54+
### Step 2: Configure `huggingface_home_host_path` in the Test Schema
5555

56-
Set the `HF_HOME` environment variable in the test schema file (e.g., `test.toml`) so that CloudAI can locate the model weights:
56+
Set `huggingface_home_host_path` (and, optionally, `huggingface_home_container_path`) under `[cmd_args]` in the test schema file (e.g., `test.toml`) so that CloudAI can locate the model weights:
5757

5858
```toml
5959
name = "llama3.1_405b_fp8"
@@ -63,41 +63,39 @@ test_template_name = "AIDynamo"
6363
[cmd_args]
6464
docker_image_url = "/path/to/docker/image"
6565
served_model_name = "nvidia/Llama-3.1-405B-Instruct-FP8"
66+
huggingface_home_host_path = "/your/path/to/hf_home"
67+
huggingface_home_container_path = "/root/.cache/huggingface"
6668

67-
[cmd_args.dynamo.processor]
68-
[cmd_args.dynamo.router]
6969
[cmd_args.dynamo.frontend]
7070
[cmd_args.dynamo.prefill_worker]
7171
num_nodes = 1
7272

73-
[cmd_args.dynamo.vllm_worker]
73+
[cmd_args.dynamo.decode_worker]
7474
num_nodes = 0
7575

7676
[cmd_args.genai_perf]
7777
endpoint = "v1/chat/completions"
7878
endpoint_type = "chat"
7979
streaming = true
8080

81-
[extra_env_vars]
82-
HF_HOME = "/your/path/to/hf_home"
8381
```
8482

85-
This environment variable should point to the root directory used with `--local-dir` in the download step. CloudAI will use this directory to locate and load the appropriate model weights.
83+
`huggingface_home_host_path` should point to the root directory used with `--local-dir` in the download step. CloudAI will use this directory to locate and load the appropriate model weights.
8684

8785
---
8886

8987
### Step 3: Node Configuration for AI Dynamo
9088

9189
AI Dynamo jobs use three distinct types of nodes:
9290

93-
- **Frontend node**: Hosts the coordination services (`etcd`, `nats`) as well as the **frontend server** and the **request generator** (`genai-perf`)
91+
- **Frontend node**: Hosts the coordination services (`etcd`, `nats`), the **frontend server**, the **request generator** (`genai-perf`), and the first decode worker.
9492
- **Prefill node(s)**: Handle the prefill stage of inference
9593
- **Decode node(s)**: Handle the decode stage of inference (optional, depending on model and setup)
9694

9795
The total number of nodes required must be:
9896

9997
```
100-
1 (frontend) + num_prefill_nodes + num_decode_nodes
98+
num_prefill_nodes + num_decode_nodes
10199
```
102100

103101
If there is a mismatch in the number of nodes between the schema and the test scenario, CloudAI will use the number of nodes specified in the test schema, ignoring the value in the test scenario.

src/cloudai/workloads/ai_dynamo/__init__.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,12 @@
1818
AIDynamoArgs,
1919
AIDynamoCmdArgs,
2020
AIDynamoTestDefinition,
21+
CommonConfig,
22+
DecodeWorkerArgs,
2123
FrontendArgs,
2224
GenAIPerfArgs,
2325
PrefillWorkerArgs,
24-
ProcessorArgs,
25-
RouterArgs,
26-
VllmWorkerArgs,
26+
SimpleLoadBalancerArgs,
2727
)
2828
from .report_generation_strategy import AIDynamoReportGenerationStrategy
2929
from .slurm_command_gen_strategy import AIDynamoSlurmCommandGenStrategy
@@ -34,10 +34,10 @@
3434
"AIDynamoReportGenerationStrategy",
3535
"AIDynamoSlurmCommandGenStrategy",
3636
"AIDynamoTestDefinition",
37+
"CommonConfig",
38+
"DecodeWorkerArgs",
3739
"FrontendArgs",
3840
"GenAIPerfArgs",
3941
"PrefillWorkerArgs",
40-
"ProcessorArgs",
41-
"RouterArgs",
42-
"VllmWorkerArgs",
42+
"SimpleLoadBalancerArgs",
4343
]

src/cloudai/workloads/ai_dynamo/ai_dynamo.py

Lines changed: 39 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -23,85 +23,70 @@
2323
from cloudai.models.workload import CmdArgs, TestDefinition
2424

2525

26-
class FrontendArgs(BaseModel):
27-
"""Arguments for the frontend node."""
26+
class CommonConfig(BaseModel):
27+
"""Common configuration shared across components."""
2828

2929
model_config = ConfigDict(extra="forbid", populate_by_name=True)
3030

31-
endpoint: str = "dynamo.Processor.chat/completions"
32-
port: int = 8000
33-
port_etcd: int = 2379
34-
port_nats: int = 4222
31+
model: str
32+
kv_transfer_config: str = Field('{"kv_connector":"NixlConnector","kv_role":"kv_both"}', alias="kv-transfer-config")
33+
served_model_name: str
3534

3635

37-
class ProcessorArgs(BaseModel):
38-
"""Arguments for the processor node."""
36+
class FrontendArgs(BaseModel):
37+
"""Arguments for the frontend node."""
3938

4039
model_config = ConfigDict(extra="forbid", populate_by_name=True)
4140

42-
block_size: int = Field(64, alias="block-size")
43-
max_model_len: int = Field(8192, alias="max-model-len")
44-
router: str = "kv"
41+
endpoint: str = "dynamo.SimpleLoadBalancer.generate_disagg"
42+
port: int = 8000
43+
port_etcd: int = 2379
44+
port_nats: int = 4222
4545

4646

47-
class RouterArgs(BaseModel):
48-
"""Arguments for the router."""
47+
class SimpleLoadBalancerArgs(BaseModel):
48+
"""Arguments for the load balancer."""
4949

5050
model_config = ConfigDict(extra="forbid", populate_by_name=True)
5151

52-
min_workers: int = Field(1, alias="min-workers")
52+
enable_disagg: bool = True
5353

5454

55-
class PrefillWorkerArgs(BaseModel):
56-
"""Arguments for the prefill worker node."""
55+
class WorkerBaseArgs(BaseModel):
56+
"""Base arguments for VLLM workers."""
5757

5858
model_config = ConfigDict(extra="forbid", populate_by_name=True)
5959

6060
num_nodes: Union[int, list[int]]
61-
kv_transfer_config: str = Field('{"kv_connector":"DynamoNixlConnector"}', alias="kv-transfer-config")
62-
block_size: int = Field(64, alias="block-size")
63-
max_model_len: int = Field(8192, alias="max-model-len")
64-
max_num_seqs: int = Field(16, alias="max-num-seqs")
65-
gpu_memory_utilization: float = Field(0.95, alias="gpu-memory-utilization")
66-
tensor_parallel_size: Union[int, list[int]] = Field(8, alias="tensor-parallel-size")
67-
pipeline_parallel_size: Union[int, list[int]] = Field(1, alias="pipeline-parallel-size")
68-
quantization: Optional[str] = None
6961
service_args: dict = Field({"workers": 1, "resources": {"gpu": "8"}}, alias="ServiceArgs")
62+
gpu_memory_utilization: float = Field(0.7, alias="gpu-memory-utilization")
63+
tensor_parallel_size: int = Field(8, alias="tensor-parallel-size")
64+
pipeline_parallel_size: int = Field(1, alias="pipeline-parallel-size")
65+
enforce_eager: bool = Field(True, alias="enforce-eager")
7066

7167

72-
class VllmWorkerArgs(BaseModel):
73-
"""Arguments for the VllmWorker node."""
68+
class PrefillWorkerArgs(WorkerBaseArgs):
69+
"""Arguments for the VLLM prefill worker."""
7470

75-
model_config = ConfigDict(extra="forbid", populate_by_name=True)
71+
pass
7672

77-
num_nodes: Union[int, list[int]]
78-
kv_transfer_config: str = Field('{"kv_connector":"DynamoNixlConnector"}', alias="kv-transfer-config")
79-
block_size: int = Field(64, alias="block-size")
80-
max_model_len: int = Field(8192, alias="max-model-len")
81-
max_num_seqs: int = Field(16, alias="max-num-seqs")
82-
remote_prefill: bool = Field(True, alias="remote-prefill")
83-
conditional_disagg: bool = Field(True, alias="conditional-disagg")
84-
max_local_prefill_length: int = Field(10, alias="max-local-prefill-length")
85-
max_prefill_queue_size: int = Field(2, alias="max-prefill-queue-size")
86-
gpu_memory_utilization: float = Field(0.95, alias="gpu-memory-utilization")
87-
tensor_parallel_size: Union[int, list[int]] = Field(8, alias="tensor-parallel-size")
88-
pipeline_parallel_size: Union[int, list[int]] = Field(1, alias="pipeline-parallel-size")
89-
router: str = "kv"
90-
quantization: Optional[str] = None
91-
enable_prefix_caching: bool = Field(True, alias="enable-prefix-caching")
92-
service_args: dict = Field({"workers": 1, "resources": {"gpu": "8"}}, alias="ServiceArgs")
73+
74+
class DecodeWorkerArgs(WorkerBaseArgs):
75+
"""Arguments for the VLLM decode worker."""
76+
77+
pass
9378

9479

9580
class AIDynamoArgs(BaseModel):
9681
"""Arguments for AI Dynamo setup."""
9782

9883
model_config = ConfigDict(extra="forbid")
9984

100-
frontend: FrontendArgs = FrontendArgs(port_etcd=2379, port_nats=4222)
101-
processor: ProcessorArgs = ProcessorArgs(**{"block-size": 64, "max-model-len": 8192, "router": "kv"})
102-
router: RouterArgs = RouterArgs(**{"min-workers": 1})
85+
common: CommonConfig
86+
frontend: FrontendArgs = FrontendArgs()
87+
simple_load_balancer: SimpleLoadBalancerArgs = SimpleLoadBalancerArgs()
10388
prefill_worker: PrefillWorkerArgs
104-
vllm_worker: VllmWorkerArgs
89+
decode_worker: DecodeWorkerArgs
10590

10691

10792
class GenAIPerfArgs(BaseModel):
@@ -132,10 +117,13 @@ class AIDynamoCmdArgs(CmdArgs):
132117
"""Arguments for AI Dynamo."""
133118

134119
docker_image_url: str
135-
served_model_name: str
120+
huggingface_home_host_path: Path = Path.home() / ".cache/huggingface"
121+
huggingface_home_container_path: Path = Path("/root/.cache/huggingface")
136122
dynamo: AIDynamoArgs
137123
sleep_seconds: int = 660
138124
genai_perf: GenAIPerfArgs
125+
node_setup_cmd: str = ""
126+
extra_args: str = ""
139127

140128

141129
class AIDynamoTestDefinition(TestDefinition):
@@ -155,11 +143,8 @@ def installables(self) -> List[Installable]:
155143
return [self.docker_image]
156144

157145
@property
158-
def hugging_face_home_path(self) -> Path:
159-
raw = self.extra_env_vars.get("HF_HOME")
160-
if not isinstance(raw, str) or not raw.strip():
161-
raise ValueError("HF_HOME must be set and non-empty")
162-
path = Path(raw)
146+
def huggingface_home_host_path(self) -> Path:
147+
path = Path(self.cmd_args.huggingface_home_host_path)
163148
if not path.is_dir():
164-
raise FileNotFoundError(f"HF_HOME path not found at {path}")
149+
raise FileNotFoundError(f"HuggingFace home path not found at {path}")
165150
return path

src/cloudai/workloads/ai_dynamo/report_generation_strategy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ def generate_report(self) -> None:
102102

103103
num_frontend_nodes = 1
104104
num_prefill_nodes = self.test_run.test.test_definition.cmd_args.dynamo.prefill_worker.num_nodes
105-
num_decode_nodes = self.test_run.test.test_definition.cmd_args.dynamo.vllm_worker.num_nodes
105+
num_decode_nodes = self.test_run.test.test_definition.cmd_args.dynamo.decode_worker.num_nodes
106106

107107
total_gpus = (num_frontend_nodes + num_prefill_nodes + num_decode_nodes) * gpus_per_node
108108

0 commit comments

Comments
 (0)