import json
import subprocess
import time
from contextlib import suppress

from absl import logging
from kubernetes import client, config
@@ -394,6 +396,25 @@ def _print_pod_logs(core_v1, job_name, namespace):
394396 logging .info ("Pod %s logs:\n %s" , pod .metadata .name , logs )
395397
396398
def _validate_node_pool_exists(selector: dict) -> bool:
    """Best-effort check that some GKE node pool carries all labels in ``selector``.

    Shells out to the gcloud CLI, lists node pools as JSON, and looks for a
    pool whose ``config.labels`` is a superset of the selector. Label values
    are compared as strings since Kubernetes label values are always strings.

    Args:
        selector: Pod node-selector mapping (label key -> value). An empty
            selector trivially matches the first listed pool.

    Returns:
        True if a matching pool is found, or if the check could not be
        performed at all (gcloud missing, unauthenticated, or emitting
        non-JSON). "Unknown" is treated as "exists" so a broken local CLI
        never spuriously blocks job scheduling. Returns False only when
        gcloud succeeded and no listed pool matched.
    """
    try:
        # Requires the gcloud CLI and valid credentials.
        # NOTE(review): `gcloud container node-pools list` normally needs
        # --cluster/--location flags; without them this call may fail and
        # land in the permissive except-branch below -- TODO confirm.
        raw = subprocess.check_output(
            ["gcloud", "container", "node-pools", "list", "--format", "json"],
            text=True,
            stderr=subprocess.DEVNULL,
        )
        for pool in json.loads(raw):
            # Deliberately not named `config`: this module imports `config`
            # from the kubernetes package, and shadowing it invites bugs.
            pool_labels = pool.get("config", {}).get("labels", {})
            # The pool matches if every selector key/value appears in its labels.
            if all(pool_labels.get(key) == str(value) for key, value in selector.items()):
                return True
        return False
    except (OSError, subprocess.CalledProcessError, ValueError):
        # OSError: gcloud binary missing; CalledProcessError: CLI failed
        # (e.g. unauthenticated); ValueError: output was not valid JSON
        # (json.JSONDecodeError subclasses it). Degrade gracefully and
        # assume the pool exists rather than blocking the caller.
        return True
397418def _check_pod_scheduling (core_v1 , job_name , namespace ):
398419 """Check for pod scheduling issues and raise helpful errors."""
399420 with suppress (ApiException ):
@@ -406,22 +427,30 @@ def _check_pod_scheduling(core_v1, job_name, namespace):
406427 if condition .type == "PodScheduled" and condition .status == "False" :
407428 msg = condition .message or ""
408429 if "Insufficient nvidia.com/gpu" in msg :
430+ selector = pod .spec .node_selector or {}
431+ if not _validate_node_pool_exists (selector ):
432+ selector_str = ", " .join ([f"{ k } : { v } " for k , v in selector .items ()]) if selector else "None"
433+ raise RuntimeError (f"No GKE node pool exists with selector '{ selector_str } '. "
434+ "Please use 'keras-remote pool add' to configure this accelerator." )
409435 logging .info (
410436 f"Pod { pod .metadata .name } is Pending: Insufficient nvidia.com/gpu. "
411- "Waiting for GKE Cluster Autoscaler to provision a new node... (scale-to-zero)"
437+ "Waiting for GKE Cluster Autoscaler to provision a new node... (scale-to-zero)\n "
438+ " Note: If this hangs indefinitely, ensure your GCP project has adequate quota."
412439 )
413440 elif (
414441 "didn't match Pod's node affinity/selector" in msg
415442 or "node selector" in msg .lower ()
416443 ):
417- selector = pod .spec .node_selector
418- selector_str = (
419- ", " .join ([f"{ k } : { v } " for k , v in selector .items ()])
420- if selector
421- else "None"
422- )
444+ selector = pod .spec .node_selector or {}
445+ selector_str = ", " .join ([f"{ k } : { v } " for k , v in selector .items ()]) if selector else "None"
446+
447+ if not _validate_node_pool_exists (selector ):
448+ raise RuntimeError (f"No GKE node pool exists with selector '{ selector_str } '. "
449+ "Please use 'keras-remote pool add' to configure this accelerator." )
450+
423451 logging .info (
424452 f"Pod { pod .metadata .name } is Pending: No currently running nodes "
425453 f"match accelerator selector '{ selector_str } '. "
426- "Waiting for GKE Cluster Autoscaler to provision a new node... (scale-to-zero)"
454+ "Waiting for GKE Cluster Autoscaler to provision a new node... (scale-to-zero)\n "
455+ " Note: If this hangs indefinitely, ensure your GCP project has adequate quota."
427456 )
0 commit comments