Skip to content

Commit 7765e7a

Browse files
update warning to error
1 parent d0f5bc8 commit 7765e7a

File tree

3 files changed

+46
-14
lines changed

3 files changed

+46
-14
lines changed

keras_remote/backend/gke_client.py

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
import time
44
from contextlib import suppress
5+
import subprocess
6+
import json
57

68
from absl import logging
79
from kubernetes import client, config
@@ -394,6 +396,25 @@ def _print_pod_logs(core_v1, job_name, namespace):
394396
logging.info("Pod %s logs:\n%s", pod.metadata.name, logs)
395397

396398

399+
def _validate_node_pool_exists(selector: dict) -> bool:
400+
"""Use gcloud to verify that a GKE NodePool matches the pod node selector."""
401+
try:
402+
# Requires gcloud CLI and valid credentials.
403+
out = subprocess.check_output([
404+
"gcloud", "container", "node-pools", "list",
405+
"--format", "json"], text=True, stderr=subprocess.DEVNULL)
406+
pools = json.loads(out)
407+
for pool in pools:
408+
config = pool.get("config", {})
409+
labels = config.get("labels", {})
410+
# Check if all keys/values in the selector exist in this pool's labels
411+
if all(labels.get(k) == str(v) for k, v in selector.items()):
412+
return True
413+
return False
414+
except Exception:
415+
# If gcloud is missing or unauthenticated, degrade gracefully and assume pool exists
416+
return True
417+
397418
def _check_pod_scheduling(core_v1, job_name, namespace):
398419
"""Check for pod scheduling issues and raise helpful errors."""
399420
with suppress(ApiException):
@@ -406,22 +427,30 @@ def _check_pod_scheduling(core_v1, job_name, namespace):
406427
if condition.type == "PodScheduled" and condition.status == "False":
407428
msg = condition.message or ""
408429
if "Insufficient nvidia.com/gpu" in msg:
430+
selector = pod.spec.node_selector or {}
431+
if not _validate_node_pool_exists(selector):
432+
selector_str = ", ".join([f"{k}: {v}" for k, v in selector.items()]) if selector else "None"
433+
raise RuntimeError(f"No GKE node pool exists with selector '{selector_str}'. "
434+
"Please use 'keras-remote pool add' to configure this accelerator.")
409435
logging.info(
410436
f"Pod {pod.metadata.name} is Pending: Insufficient nvidia.com/gpu. "
411-
"Waiting for GKE Cluster Autoscaler to provision a new node... (scale-to-zero)"
437+
"Waiting for GKE Cluster Autoscaler to provision a new node... (scale-to-zero)\n"
438+
" Note: If this hangs indefinitely, ensure your GCP project has adequate quota."
412439
)
413440
elif (
414441
"didn't match Pod's node affinity/selector" in msg
415442
or "node selector" in msg.lower()
416443
):
417-
selector = pod.spec.node_selector
418-
selector_str = (
419-
", ".join([f"{k}: {v}" for k, v in selector.items()])
420-
if selector
421-
else "None"
422-
)
444+
selector = pod.spec.node_selector or {}
445+
selector_str = ", ".join([f"{k}: {v}" for k, v in selector.items()]) if selector else "None"
446+
447+
if not _validate_node_pool_exists(selector):
448+
raise RuntimeError(f"No GKE node pool exists with selector '{selector_str}'. "
449+
"Please use 'keras-remote pool add' to configure this accelerator.")
450+
423451
logging.info(
424452
f"Pod {pod.metadata.name} is Pending: No currently running nodes "
425453
f"match accelerator selector '{selector_str}'. "
426-
"Waiting for GKE Cluster Autoscaler to provision a new node... (scale-to-zero)"
454+
"Waiting for GKE Cluster Autoscaler to provision a new node... (scale-to-zero)\n"
455+
" Note: If this hangs indefinitely, ensure your GCP project has adequate quota."
427456
)

keras_remote/cli/infra/program.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -249,9 +249,9 @@ def _create_tpu_node_pool(cluster, tpu: TpuConfig, zone, project_id):
249249
cluster=cluster.name,
250250
location=zone,
251251
project=project_id,
252-
initial_node_count=0,
252+
initial_node_count=tpu.num_nodes if tpu.num_nodes > 1 else 0,
253253
autoscaling=gcp.container.NodePoolAutoscalingArgs(
254-
min_node_count=0,
254+
min_node_count=tpu.num_nodes if tpu.num_nodes > 1 else 0,
255255
max_node_count=tpu.num_nodes,
256256
),
257257
management=gcp.container.NodePoolManagementArgs(

keras_remote/cli/infra/program_test.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,13 +68,14 @@ def test_node_count_matches_config(self, gcp_mock):
6868

6969
program._create_tpu_node_pool(cluster, tpu, "us-central2-b", "my-project")
7070

71-
# Due to scale-to-zero, initial_node_count is 0 and max is stored in autoscaling
71+
# Due to multi-host TPU workaround, initial_node_count is equal to num_nodes
7272
call_kwargs = gcp_mock.container.NodePool.call_args.kwargs
73-
self.assertEqual(call_kwargs.get("initial_node_count"), 0)
73+
self.assertEqual(call_kwargs.get("initial_node_count"), 4)
7474
autoscaling_kwargs = (
7575
gcp_mock.container.NodePoolAutoscalingArgs.call_args.kwargs
7676
)
7777
self.assertEqual(autoscaling_kwargs.get("max_node_count"), 4)
78+
self.assertEqual(autoscaling_kwargs.get("min_node_count"), 4)
7879

7980
@mock.patch.object(program, "gcp")
8081
def test_pool_name_includes_tpu_name(self, gcp_mock):
@@ -300,13 +301,15 @@ def test_node_pool_scale_to_zero(
300301
cluster, accelerator, "us-central2-b", "my-project"
301302
)
302303

304+
is_multi_host = getattr(accelerator, "num_nodes", 1) > 1
305+
303306
call_kwargs = gcp_mock.container.NodePool.call_args.kwargs
304-
self.assertEqual(call_kwargs.get("initial_node_count"), 0)
307+
self.assertEqual(call_kwargs.get("initial_node_count"), expected_max_count if is_multi_host else 0)
305308

306309
autoscaling_kwargs = (
307310
gcp_mock.container.NodePoolAutoscalingArgs.call_args.kwargs
308311
)
309-
self.assertEqual(autoscaling_kwargs.get("min_node_count"), 0)
312+
self.assertEqual(autoscaling_kwargs.get("min_node_count"), expected_max_count if is_multi_host else 0)
310313
self.assertEqual(
311314
autoscaling_kwargs.get("max_node_count"), expected_max_count
312315
)

0 commit comments

Comments
 (0)