@@ -23,6 +23,7 @@ def submit_k8s_job(
2323 job_id ,
2424 bucket_name ,
2525 namespace = "default" ,
26+ spot = False ,
2627):
2728 """Submit a Kubernetes Job to GKE cluster.
2829
@@ -42,7 +43,7 @@ def submit_k8s_job(
4243 _load_kube_config ()
4344
4445 # Parse accelerator configuration
45- accel_config = _parse_accelerator (accelerator )
46+ accel_config = _parse_accelerator (accelerator , spot = spot )
4647
4748 # Create job specification
4849 job_name = f"keras-remote-{ job_id } "
@@ -224,9 +225,9 @@ def validate_preflight(
224225 logging .warning ("Preflight check: Failed to query nodes: %s" , e .reason )
225226
226227
227- def _parse_accelerator (accelerator ):
228+ def _parse_accelerator (accelerator , spot = False ):
228229 """Convert accelerator string to GKE pod spec fields."""
229- parsed = accelerators .parse_accelerator (accelerator )
230+ parsed = accelerators .parse_accelerator (accelerator , spot = spot )
230231
231232 if parsed is None :
232233 return {
@@ -241,7 +242,7 @@ def _parse_accelerator(accelerator):
241242 # For TPU Podslices (multi-node), resource requests must be per-node.
242243 # num_nodes is 1 for single-host TPUs (v3-8, v4-8, v5litepod-1/4/8).
243244 chips_per_node = parsed .chips // parsed .num_nodes
244- return {
245+ config = {
245246 "node_selector" : {
246247 "cloud.google.com/gke-tpu-accelerator" : parsed .gke_accelerator ,
247248 "cloud.google.com/gke-tpu-topology" : parsed .topology ,
@@ -254,8 +255,20 @@ def _parse_accelerator(accelerator):
254255 "jax_platform" : "tpu" ,
255256 }
256257
258+ if parsed .spot :
259+ config ["node_selector" ]["cloud.google.com/gke-spot" ] = "true"
260+ config ["tolerations" ].append (
261+ {
262+ "key" : "cloud.google.com/gke-spot" ,
263+ "operator" : "Equal" ,
264+ "value" : "true" ,
265+ "effect" : "NoSchedule" ,
266+ }
267+ )
268+ return config
269+
257270 # GpuConfig
258- return {
271+ config = {
259272 "node_selector" : {"cloud.google.com/gke-accelerator" : parsed .gke_label },
260273 "resource_limits" : {"nvidia.com/gpu" : str (parsed .count )},
261274 "resource_requests" : {"nvidia.com/gpu" : str (parsed .count )},
@@ -264,6 +277,17 @@ def _parse_accelerator(accelerator):
264277 ],
265278 "jax_platform" : "gpu" ,
266279 }
280+ if parsed .spot :
281+ config ["node_selector" ]["cloud.google.com/gke-spot" ] = "true"
282+ config ["tolerations" ].append (
283+ {
284+ "key" : "cloud.google.com/gke-spot" ,
285+ "operator" : "Equal" ,
286+ "value" : "true" ,
287+ "effect" : "NoSchedule" ,
288+ }
289+ )
290+ return config
267291
268292
269293def _load_kube_config ():
@@ -441,6 +465,10 @@ def _check_node_pool_exists_cached(selector_items) -> bool:
441465 config_dict = pool .get ("config" , {})
442466 pool_labels = config_dict .get ("labels" , {}).copy ()
443467
468+ # Spot VM mapping
469+ if config_dict .get ("spot" ):
470+ pool_labels ["cloud.google.com/gke-spot" ] = "true"
471+
444472 # Map GKE injected node labels for accelerators mapping
445473 accel_config_list = config_dict .get ("accelerators" , [])
446474 if accel_config_list :
@@ -450,6 +478,13 @@ def _check_node_pool_exists_cached(selector_items) -> bool:
450478 else :
451479 pool_labels ["cloud.google.com/gke-accelerator" ] = accel_type
452480
481+ # TPU topology mapping from placement policy
482+ placement_policy = pool .get ("placementPolicy" , {})
483+ if placement_policy and placement_policy .get ("tpuTopology" ):
484+ pool_labels ["cloud.google.com/gke-tpu-topology" ] = placement_policy [
485+ "tpuTopology"
486+ ]
487+
453488 # TPU mapping fallback
454489 machine_type = config_dict .get ("machineType" , "" )
455490
@@ -460,7 +495,9 @@ def _check_node_pool_exists_cached(selector_items) -> bool:
460495 "goog-gke-accelerator-type"
461496 ]
462497
463- if machine_type .startswith ("ct" ):
498+ if machine_type .startswith ("ct" ) and not pool_labels .get (
499+ "cloud.google.com/gke-tpu-topology"
500+ ):
464501 # We roughly map TPU topology presence for preflight
465502 pool_labels ["cloud.google.com/gke-tpu-topology" ] = selector .get (
466503 "cloud.google.com/gke-tpu-topology" , ""