Fixes TPU detection logic (#77)

JyotinderSingh · web-flow · commit ef4b74d84499 · 2026-03-08T16:11:11.000-07:00
diff --git a/keras_remote/backend/gke_client.py b/keras_remote/backend/gke_client.py
@@ -461,6 +461,14 @@ def _check_node_pool_exists_cached(selector_items) -> bool:
           "cloud.google.com/gke-tpu-topology", ""
         )
 
+      # Infer the accelerator count from the machine type using the TPU registry.
+      # This is robust because the registry is also the source of truth for Pod spec generation.
+      for tpu_spec in accelerators.TPUS.values():
+        for chips, topo_spec in tpu_spec.topologies.items():
+          if topo_spec.machine_type == machine_type:
+            pool_labels["cloud.google.com/gke-accelerator-count"] = str(chips)
+            break
+
       if all(pool_labels.get(k) == str(v) for k, v in selector.items()):
         return True
     return False