Skip to content

Commit dac47b8

Browse files
address review comments
1 parent d7fbf7e commit dac47b8

File tree

3 files changed

+182
-105
lines changed

3 files changed

+182
-105
lines changed

keras_remote/cli/infra/program.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ def _create_gpu_node_pool(cluster, gpu: GpuConfig, zone, project_id, pool_name):
232232
guest_accelerators=[
233233
gcp.container.NodePoolNodeConfigGuestAcceleratorArgs(
234234
type=gpu.gke_label,
235-
count=1,
235+
count=gpu.count,
236236
),
237237
],
238238
labels={RESOURCE_NAME_PREFIX: "true"},

keras_remote/core/accelerators.py

Lines changed: 114 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,7 @@ class GpuSpec:
4040
"""Registry entry for a GPU type."""
4141

4242
gke_label: str
43-
machine_type: str
44-
counts: tuple[int, ...]
43+
counts: dict[int, str] # count -> machine_type
4544

4645

4746
@dataclass(frozen=True)
@@ -63,14 +62,53 @@ class TpuSpec:
6362

6463

6564
GPUS: dict[str, GpuSpec] = {
66-
"l4": GpuSpec("nvidia-l4", "g2-standard-4", (1, 2, 4, 8)),
67-
"t4": GpuSpec("nvidia-tesla-t4", "n1-standard-4", (1, 2, 4)),
68-
"v100": GpuSpec("nvidia-tesla-v100", "n1-standard-8", (1, 2, 4, 8)),
69-
"a100": GpuSpec("nvidia-tesla-a100", "a2-highgpu-1g", (1, 2, 4, 8, 16)),
70-
"a100-80gb": GpuSpec("nvidia-a100-80gb", "a2-ultragpu-1g", (1, 2, 4, 8, 16)),
71-
"h100": GpuSpec("nvidia-h100-80gb", "a3-highgpu-1g", (1, 2, 4, 8)),
72-
"p4": GpuSpec("nvidia-tesla-p4", "n1-standard-4", (1, 2, 4)),
73-
"p100": GpuSpec("nvidia-tesla-p100", "n1-standard-4", (1, 2, 4)),
65+
"l4": GpuSpec("nvidia-l4", {
66+
1: "g2-standard-4",
67+
2: "g2-standard-24",
68+
4: "g2-standard-48",
69+
8: "g2-standard-96",
70+
}),
71+
"t4": GpuSpec("nvidia-tesla-t4", {
72+
1: "n1-standard-4",
73+
2: "n1-standard-8",
74+
4: "n1-standard-16",
75+
}),
76+
"v100": GpuSpec("nvidia-tesla-v100", {
77+
1: "n1-standard-8",
78+
2: "n1-standard-16",
79+
4: "n1-standard-32",
80+
8: "n1-standard-64",
81+
}),
82+
"a100": GpuSpec("nvidia-tesla-a100", {
83+
1: "a2-highgpu-1g",
84+
2: "a2-highgpu-2g",
85+
4: "a2-highgpu-4g",
86+
8: "a2-highgpu-8g",
87+
16: "a2-megagpu-16g",
88+
}),
89+
"a100-80gb": GpuSpec("nvidia-a100-80gb", {
90+
1: "a2-ultragpu-1g",
91+
2: "a2-ultragpu-2g",
92+
4: "a2-ultragpu-4g",
93+
8: "a2-ultragpu-8g",
94+
16: "a2-ultragpu-16g",
95+
}),
96+
"h100": GpuSpec("nvidia-h100-80gb", {
97+
1: "a3-highgpu-1g",
98+
2: "a3-highgpu-2g",
99+
4: "a3-highgpu-4g",
100+
8: "a3-highgpu-8g",
101+
}),
102+
"p4": GpuSpec("nvidia-tesla-p4", {
103+
1: "n1-standard-4",
104+
2: "n1-standard-8",
105+
4: "n1-standard-16",
106+
}),
107+
"p100": GpuSpec("nvidia-tesla-p100", {
108+
1: "n1-standard-4",
109+
2: "n1-standard-8",
110+
4: "n1-standard-16",
111+
}),
74112
}
75113

76114
_GPU_ALIASES: dict[str, str] = {
@@ -83,19 +121,6 @@ class TpuSpec:
83121
# Machine-type suffix "-Nt" → N chips per VM (e.g. ct5p-hightpu-4t → 4 chips).
84122
# v5p uses 3-D topologies (AxBxC); v2, v3, v5litepod, v6e use 2-D (AxB).
85123
TPUS: dict[str, TpuSpec] = {
86-
"v2": TpuSpec(
87-
"tpu-v2-podslice",
88-
4,
89-
{
90-
4: TpuTopologySpec("2x2", "ct2-hightpu-4t", 1),
91-
16: TpuTopologySpec("4x4", "ct2-hightpu-4t", 4),
92-
32: TpuTopologySpec("4x8", "ct2-hightpu-4t", 8),
93-
64: TpuTopologySpec("8x8", "ct2-hightpu-4t", 16),
94-
128: TpuTopologySpec("8x16", "ct2-hightpu-4t", 32),
95-
256: TpuTopologySpec("16x16", "ct2-hightpu-4t", 64),
96-
512: TpuTopologySpec("16x32", "ct2-hightpu-4t", 128),
97-
},
98-
),
99124
"v3": TpuSpec(
100125
"tpu-v3-podslice",
101126
4,
@@ -148,6 +173,7 @@ class TpuSpec:
148173
{
149174
8: TpuTopologySpec("2x2x2", "ct5p-hightpu-4t", 2),
150175
16: TpuTopologySpec("2x2x4", "ct5p-hightpu-4t", 4),
176+
32: TpuTopologySpec("2x4x4", "ct5p-hightpu-4t", 8),
151177
},
152178
),
153179
"v6e": TpuSpec(
@@ -165,13 +191,11 @@ class TpuSpec:
165191
}
166192

167193

168-
# ── Parser ────────────────────────────────────────────────────────
169-
170-
_MULTI_GPU_RE = re.compile(r"^(.+?)(?:x|-)(\d+)$") # "a100x4", "l4-2"
171-
_TPU_CHIPS_RE = re.compile(r"^([a-z0-9_]+)-(\d+)$") # "v3-8", "v5litepod-16"
194+
_MULTI_GPU_RE = re.compile(r"^([^x]+)(?:x)(\d+)$") # "a100x4"
195+
_TPU_CHIPS_RE = re.compile(r"^([a-z0-9_]+)-(\d+)$") # "v3-8"
172196
_TPU_TOPO_RE = re.compile(
173197
r"^([a-z0-9_]+)-(\d+x\d+(?:x\d+)?)$"
174-
) # "v5litepod-2x2", "v5p-2x2x2"
198+
) # "v5litepod-2x2"
175199

176200
DEFAULT_GPU = "l4"
177201
DEFAULT_TPU = "v5litepod"
@@ -186,7 +210,7 @@ class TpuSpec:
186210
"p100",
187211
"p4",
188212
]
189-
_PREFERRED_TPUS = ["v6e", "v5p", "v5litepod", "v4", "v3", "v2"]
213+
_PREFERRED_TPUS = ["v6e", "v5p", "v5litepod", "v4", "v3"]
190214

191215

192216
def _resolve_gpu_alias(name: str) -> str:
@@ -203,13 +227,25 @@ def parse_accelerator(accel_str: str) -> Accelerator:
203227
Returns GpuConfig, TpuConfig, or None (for "cpu").
204228
205229
Accepted formats:
206-
GPU: "l4", "gpu", "gpu-4", "a100x4", "l4-2", "a100-80gbx8"
207-
TPU: "v3-8", "tpu", "tpu-8", "v5litepod-2x2", "v5litepod"
208-
CPU: "cpu", "cpu-8"
230+
- Generic: "gpu", "tpu", "cpu" (resolves to defaults)
231+
- Dynamic Count: "gpu:4", "tpu:8", "cpu:8" (assigns most capable hardware matching the count)
232+
- Explicit GPU Name: "gpu:l4", "l4", "gpu:a100-80gb" (resolves to 1 instance of the specified GPU)
233+
- Multi-GPU Name: "gpu:a100x4", "a100x4", "gpu:l4-2" (resolves to N instances of the specified GPU)
234+
- Explicit TPU Name: "tpu:v5litepod", "v5litepod" (resolves to the default topology/chips for the TPU)
235+
- Explicit TPU Topology/Chips: "tpu:v3-8", "tpu:v5litepod-2x2", "v3-8" (resolves to the specified TPU slice)
236+
237+
Note: Prefixes ('gpu:' and 'tpu:') are recommended for unambiguous parsing, but are optional.
238+
239+
Dynamic Resolution:
240+
When using generic formats like "gpu:<N>" or "tpu:<N>", the parser
241+
dynamically assigns the most capable hardware type that supports the
242+
requested device count `N`. Hardware is selected based on an internal
243+
preference hierarchy (e.g., H100 > A100 > L4 for GPUs, and
244+
v6e > v5p > v5litepod for TPUs).
209245
"""
210246
s = accel_str.strip().lower()
211247

212-
if s == "cpu" or (s.startswith("cpu-") and s[4:].isdigit()):
248+
if s == "cpu" or (s.startswith("cpu:") and s[4:].isdigit()):
213249
return None
214250

215251
if s == "gpu":
@@ -218,42 +254,56 @@ def parse_accelerator(accel_str: str) -> Accelerator:
218254
if s == "tpu":
219255
return make_tpu(DEFAULT_TPU, TPUS[DEFAULT_TPU].default_chips)
220256

221-
if s.startswith("gpu-") and s[4:].isdigit():
222-
count = int(s[4:])
223-
search_order = _PREFERRED_GPUS
224-
for gpu_name in search_order:
257+
# 1) Try parsing as GPU
258+
is_gpu_explicit = s.startswith("gpu:")
259+
gpu_str = s[4:] if is_gpu_explicit else s
260+
261+
if gpu_str.isdigit():
262+
count = int(gpu_str)
263+
for gpu_name in _PREFERRED_GPUS:
225264
if gpu_name in GPUS and count in GPUS[gpu_name].counts:
226265
return make_gpu(gpu_name, count)
227-
valid_counts = sorted(set(c for spec in GPUS.values() for c in spec.counts))
228-
raise ValueError(
229-
f"No GPU supports count {count}. Supported counts across all GPUs: {valid_counts}"
230-
)
231-
232-
if s.startswith("tpu-") and s[4:].isdigit():
233-
chips = int(s[4:])
234-
search_order = _PREFERRED_TPUS
235-
for tpu_name in search_order:
236-
if tpu_name in TPUS and chips in TPUS[tpu_name].topologies:
237-
return make_tpu(tpu_name, chips)
238-
valid_chips = sorted(
239-
set(c for spec in TPUS.values() for c in spec.topologies)
240-
)
241-
raise ValueError(
242-
f"No TPU supports {chips} chips. Supported chip counts across all TPUs: {valid_chips}"
243-
)
266+
if is_gpu_explicit:
267+
valid_counts = sorted(set(c for spec in GPUS.values() for c in spec.counts))
268+
raise ValueError(
269+
f"No GPU supports count {count}. Supported counts: {valid_counts}"
270+
)
244271

245-
# Direct GPU name: "l4", "a100-80gb"
246-
name = _resolve_gpu_alias(s)
272+
name = _resolve_gpu_alias(gpu_str)
247273
if name in GPUS:
248274
return make_gpu(name, 1)
249275

250-
# Direct TPU name (bare): "v5litepod" → default chips
251-
name = _resolve_tpu_alias(s)
276+
m = _MULTI_GPU_RE.match(gpu_str)
277+
if m:
278+
name = _resolve_gpu_alias(m.group(1))
279+
if name in GPUS:
280+
return make_gpu(name, int(m.group(2)))
281+
282+
if is_gpu_explicit:
283+
raise ValueError(f"Unknown GPU accelerator: '{accel_str}'")
284+
285+
# 2) Try parsing as TPU
286+
is_tpu_explicit = s.startswith("tpu:")
287+
tpu_str = s[4:] if is_tpu_explicit else s
288+
289+
if tpu_str.isdigit():
290+
chips = int(tpu_str)
291+
for tpu_name in _PREFERRED_TPUS:
292+
if tpu_name in TPUS and chips in TPUS[tpu_name].topologies:
293+
return make_tpu(tpu_name, chips)
294+
if is_tpu_explicit:
295+
valid_chips = sorted(
296+
set(c for spec in TPUS.values() for c in spec.topologies)
297+
)
298+
raise ValueError(
299+
f"No TPU supports {chips} chips. Supported chip counts: {valid_chips}"
300+
)
301+
302+
name = _resolve_tpu_alias(tpu_str)
252303
if name in TPUS:
253304
return make_tpu(name, TPUS[name].default_chips)
254305

255-
# TPU with topology string: "v5litepod-2x2", "v5p-2x2x2"
256-
m = _TPU_TOPO_RE.match(s)
306+
m = _TPU_TOPO_RE.match(tpu_str)
257307
if m:
258308
name = _resolve_tpu_alias(m.group(1))
259309
if name in TPUS:
@@ -267,25 +317,16 @@ def parse_accelerator(accel_str: str) -> Accelerator:
267317
f"Supported: {', '.join(valid)}."
268318
)
269319

270-
# TPU with chip count: "v3-8", "v5litepod-4"
271-
m = _TPU_CHIPS_RE.match(s)
320+
m = _TPU_CHIPS_RE.match(tpu_str)
272321
if m:
273322
name = _resolve_tpu_alias(m.group(1))
274323
if name in TPUS:
275324
return make_tpu(name, int(m.group(2)))
276325

277-
# Multi-GPU: "a100x4", "l4x2"
278-
m = _MULTI_GPU_RE.match(s)
279-
if m:
280-
name = _resolve_gpu_alias(m.group(1))
281-
if name in GPUS:
282-
return make_gpu(name, int(m.group(2)))
283-
284326
raise ValueError(
285327
f"Unknown accelerator: '{accel_str}'. "
286-
f"GPUs: {', '.join(GPUS)} (use 'xN' for multi-GPU, e.g. 'a100x4'). "
287-
f"TPUs: {', '.join(TPUS)} (use '-N' for chips, e.g. 'v3-8', "
288-
f"or '-NxM' for topology, e.g. 'v5litepod-2x2')."
328+
f"GPUs: {', '.join(GPUS)} (use 'gpu:name' or 'gpu:namexN'). "
329+
f"TPUs: {', '.join(TPUS)} (use 'tpu:name' or 'tpu:name-N')."
289330
)
290331

291332

@@ -324,7 +365,7 @@ def make_gpu(name: str, count: int) -> GpuConfig:
324365
name=name,
325366
count=count,
326367
gke_label=spec.gke_label,
327-
machine_type=spec.machine_type,
368+
machine_type=spec.counts[count],
328369
)
329370

330371

0 commit comments

Comments
 (0)