NVIDIA
diff --git a/‎doc/ai_dynamo.md‎
Lines changed: 8 additions & 10 deletions b/‎doc/ai_dynamo.md‎
Lines changed: 8 additions & 10 deletions
diff --git a/‎src/cloudai/workloads/ai_dynamo/__init__.py‎
Lines changed: 6 additions & 6 deletions b/‎src/cloudai/workloads/ai_dynamo/__init__.py‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎src/cloudai/workloads/ai_dynamo/ai_dynamo.py‎
Lines changed: 39 additions & 54 deletions b/‎src/cloudai/workloads/ai_dynamo/ai_dynamo.py‎
Lines changed: 39 additions & 54 deletions
diff --git a/‎src/cloudai/workloads/ai_dynamo/report_generation_strategy.py‎
Lines changed: 1 addition & 1 deletion b/‎src/cloudai/workloads/ai_dynamo/report_generation_strategy.py‎
Lines changed: 1 addition & 1 deletion
@@ -51,9 +51,9 @@ The path to the downloaded weights should be consistent with the structure expec
 
 ---
 
-### Step 2: Configure `HF_HOME` in the Test Schema
+### Step 2: Configure the Hugging Face Home Paths in the Test Schema
 
-Set the `HF_HOME` environment variable in the test schema file (e.g., `test.toml`) so that CloudAI can locate the model weights:
+Set `huggingface_home_host_path` (and, if needed, `huggingface_home_container_path`) under `[cmd_args]` in the test schema file (e.g., `test.toml`) so that CloudAI can locate the model weights:
 
 ```toml
 name = "llama3.1_405b_fp8"
@@ -63,41 +63,39 @@ test_template_name = "AIDynamo"
 [cmd_args]
 docker_image_url = "/path/to/docker/image"
 served_model_name = "nvidia/Llama-3.1-405B-Instruct-FP8"
+huggingface_home_host_path = "/your/path/to/hf_home"
+huggingface_home_container_path = "/root/.cache/huggingface"
 
-  [cmd_args.dynamo.processor]
-  [cmd_args.dynamo.router]
   [cmd_args.dynamo.frontend]
   [cmd_args.dynamo.prefill_worker]
   num_nodes = 1
 
-  [cmd_args.dynamo.vllm_worker]
+  [cmd_args.dynamo.decode_worker]
   num_nodes = 0
 
   [cmd_args.genai_perf]
   endpoint = "v1/chat/completions"
   endpoint_type = "chat"
   streaming = true
 
-[extra_env_vars]
-HF_HOME = "/your/path/to/hf_home"
 ```
 
-This environment variable should point to the root directory used with `--local-dir` in the download step. CloudAI will use this directory to locate and load the appropriate model weights.
+`huggingface_home_host_path` should point to the root directory used with `--local-dir` in the download step. CloudAI will use this directory to locate and load the appropriate model weights.
 
 ---
 
 ### Step 3: Node Configuration for AI Dynamo
 
 AI Dynamo jobs use three distinct types of nodes:
 
-- **Frontend node**: Hosts the coordination services (`etcd`, `nats`) as well as the **frontend server** and the **request generator** (`genai-perf`)
+- **Frontend node**: Hosts the coordination services (`etcd`, `nats`), the **frontend server**, the **request generator** (`genai-perf`), and the first decode worker.
 - **Prefill node(s)**: Handle the prefill stage of inference
 - **Decode node(s)**: Handle the decode stage of inference (optional, depending on model and setup)
 
 The total number of nodes required must be:
 
 ```
-1 (frontend) + num_prefill_nodes + num_decode_nodes
+num_prefill_nodes + num_decode_nodes
 ```
 
 If there is a mismatch in the number of nodes between the schema and the test scenario, CloudAI will use the number of nodes specified in the test schema, ignoring the value in the test scenario.
 
@@ -18,12 +18,12 @@
     AIDynamoArgs,
     AIDynamoCmdArgs,
     AIDynamoTestDefinition,
+    CommonConfig,
+    DecodeWorkerArgs,
     FrontendArgs,
     GenAIPerfArgs,
     PrefillWorkerArgs,
-    ProcessorArgs,
-    RouterArgs,
-    VllmWorkerArgs,
+    SimpleLoadBalancerArgs,
 )
 from .report_generation_strategy import AIDynamoReportGenerationStrategy
 from .slurm_command_gen_strategy import AIDynamoSlurmCommandGenStrategy
@@ -34,10 +34,10 @@
     "AIDynamoReportGenerationStrategy",
     "AIDynamoSlurmCommandGenStrategy",
     "AIDynamoTestDefinition",
+    "CommonConfig",
+    "DecodeWorkerArgs",
     "FrontendArgs",
     "GenAIPerfArgs",
     "PrefillWorkerArgs",
-    "ProcessorArgs",
-    "RouterArgs",
-    "VllmWorkerArgs",
+    "SimpleLoadBalancerArgs",
 ]
@@ -23,85 +23,70 @@
 from cloudai.models.workload import CmdArgs, TestDefinition
 
 
-class FrontendArgs(BaseModel):
-    """Arguments for the frontend node."""
+class CommonConfig(BaseModel):
+    """Common configuration shared across components."""
 
     model_config = ConfigDict(extra="forbid", populate_by_name=True)
 
-    endpoint: str = "dynamo.Processor.chat/completions"
-    port: int = 8000
-    port_etcd: int = 2379
-    port_nats: int = 4222
+    model: str
+    kv_transfer_config: str = Field('{"kv_connector":"NixlConnector","kv_role":"kv_both"}', alias="kv-transfer-config")
+    served_model_name: str
 
 
-class ProcessorArgs(BaseModel):
-    """Arguments for the processor node."""
+class FrontendArgs(BaseModel):
+    """Arguments for the frontend node."""
 
     model_config = ConfigDict(extra="forbid", populate_by_name=True)
 
-    block_size: int = Field(64, alias="block-size")
-    max_model_len: int = Field(8192, alias="max-model-len")
-    router: str = "kv"
+    endpoint: str = "dynamo.SimpleLoadBalancer.generate_disagg"
+    port: int = 8000
+    port_etcd: int = 2379
+    port_nats: int = 4222
 
 
-class RouterArgs(BaseModel):
-    """Arguments for the router."""
+class SimpleLoadBalancerArgs(BaseModel):
+    """Arguments for the load balancer."""
 
     model_config = ConfigDict(extra="forbid", populate_by_name=True)
 
-    min_workers: int = Field(1, alias="min-workers")
+    enable_disagg: bool = True
 
 
-class PrefillWorkerArgs(BaseModel):
-    """Arguments for the prefill worker node."""
+class WorkerBaseArgs(BaseModel):
+    """Base arguments for VLLM workers."""
 
     model_config = ConfigDict(extra="forbid", populate_by_name=True)
 
     num_nodes: Union[int, list[int]]
-    kv_transfer_config: str = Field('{"kv_connector":"DynamoNixlConnector"}', alias="kv-transfer-config")
-    block_size: int = Field(64, alias="block-size")
-    max_model_len: int = Field(8192, alias="max-model-len")
-    max_num_seqs: int = Field(16, alias="max-num-seqs")
-    gpu_memory_utilization: float = Field(0.95, alias="gpu-memory-utilization")
-    tensor_parallel_size: Union[int, list[int]] = Field(8, alias="tensor-parallel-size")
-    pipeline_parallel_size: Union[int, list[int]] = Field(1, alias="pipeline-parallel-size")
-    quantization: Optional[str] = None
     service_args: dict = Field({"workers": 1, "resources": {"gpu": "8"}}, alias="ServiceArgs")
+    gpu_memory_utilization: float = Field(0.7, alias="gpu-memory-utilization")
+    tensor_parallel_size: int = Field(8, alias="tensor-parallel-size")
+    pipeline_parallel_size: int = Field(1, alias="pipeline-parallel-size")
+    enforce_eager: bool = Field(True, alias="enforce-eager")
 
 
-class VllmWorkerArgs(BaseModel):
-    """Arguments for the VllmWorker node."""
+class PrefillWorkerArgs(WorkerBaseArgs):
+    """Arguments for the VLLM prefill worker."""
 
-    model_config = ConfigDict(extra="forbid", populate_by_name=True)
+    pass
 
-    num_nodes: Union[int, list[int]]
-    kv_transfer_config: str = Field('{"kv_connector":"DynamoNixlConnector"}', alias="kv-transfer-config")
-    block_size: int = Field(64, alias="block-size")
-    max_model_len: int = Field(8192, alias="max-model-len")
-    max_num_seqs: int = Field(16, alias="max-num-seqs")
-    remote_prefill: bool = Field(True, alias="remote-prefill")
-    conditional_disagg: bool = Field(True, alias="conditional-disagg")
-    max_local_prefill_length: int = Field(10, alias="max-local-prefill-length")
-    max_prefill_queue_size: int = Field(2, alias="max-prefill-queue-size")
-    gpu_memory_utilization: float = Field(0.95, alias="gpu-memory-utilization")
-    tensor_parallel_size: Union[int, list[int]] = Field(8, alias="tensor-parallel-size")
-    pipeline_parallel_size: Union[int, list[int]] = Field(1, alias="pipeline-parallel-size")
-    router: str = "kv"
-    quantization: Optional[str] = None
-    enable_prefix_caching: bool = Field(True, alias="enable-prefix-caching")
-    service_args: dict = Field({"workers": 1, "resources": {"gpu": "8"}}, alias="ServiceArgs")
+
+class DecodeWorkerArgs(WorkerBaseArgs):
+    """Arguments for the VLLM decode worker."""
+
+    pass
 
 
 class AIDynamoArgs(BaseModel):
     """Arguments for AI Dynamo setup."""
 
     model_config = ConfigDict(extra="forbid")
 
-    frontend: FrontendArgs = FrontendArgs(port_etcd=2379, port_nats=4222)
-    processor: ProcessorArgs = ProcessorArgs(**{"block-size": 64, "max-model-len": 8192, "router": "kv"})
-    router: RouterArgs = RouterArgs(**{"min-workers": 1})
+    common: CommonConfig
+    frontend: FrontendArgs = FrontendArgs()
+    simple_load_balancer: SimpleLoadBalancerArgs = SimpleLoadBalancerArgs()
     prefill_worker: PrefillWorkerArgs
-    vllm_worker: VllmWorkerArgs
+    decode_worker: DecodeWorkerArgs
 
 
 class GenAIPerfArgs(BaseModel):
@@ -132,10 +117,13 @@ class AIDynamoCmdArgs(CmdArgs):
     """Arguments for AI Dynamo."""
 
     docker_image_url: str
-    served_model_name: str
+    huggingface_home_host_path: Path = Path.home() / ".cache/huggingface"
+    huggingface_home_container_path: Path = Path("/root/.cache/huggingface")
     dynamo: AIDynamoArgs
     sleep_seconds: int = 660
     genai_perf: GenAIPerfArgs
+    node_setup_cmd: str = ""
+    extra_args: str = ""
 
 
 class AIDynamoTestDefinition(TestDefinition):
@@ -155,11 +143,8 @@ def installables(self) -> List[Installable]:
         return [self.docker_image]
 
     @property
-    def hugging_face_home_path(self) -> Path:
-        raw = self.extra_env_vars.get("HF_HOME")
-        if not isinstance(raw, str) or not raw.strip():
-            raise ValueError("HF_HOME must be set and non-empty")
-        path = Path(raw)
+    def huggingface_home_host_path(self) -> Path:
+        path = Path(self.cmd_args.huggingface_home_host_path)
         if not path.is_dir():
-            raise FileNotFoundError(f"HF_HOME path not found at {path}")
+            raise FileNotFoundError(f"HuggingFace home path not found at {path}")
         return path
@@ -102,7 +102,7 @@ def generate_report(self) -> None:
 
         num_frontend_nodes = 1
         num_prefill_nodes = self.test_run.test.test_definition.cmd_args.dynamo.prefill_worker.num_nodes
-        num_decode_nodes = self.test_run.test.test_definition.cmd_args.dynamo.vllm_worker.num_nodes
+        num_decode_nodes = self.test_run.test.test_definition.cmd_args.dynamo.decode_worker.num_nodes
 
         total_gpus = (num_frontend_nodes + num_prefill_nodes + num_decode_nodes) * gpus_per_node