Skip to content

Commit c24041c

Browse files
Fixes pathways integration
1 parent 04fcc53 commit c24041c

File tree

4 files changed

+98
-9
lines changed

4 files changed

+98
-9
lines changed

examples/pathways_example.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,55 @@
1010

1111

1212
# A simple model that will be executed remotely on pathways
13-
@keras_remote.run(accelerator="v5litepod-1", backend="pathways")
13+
@keras_remote.run(
14+
accelerator="v6e-16", backend="pathways", cluster="keras-team-dogfood"
15+
)
1416
def train_simple_model():
17+
import jax
18+
from jax import lax
19+
1520
print("Running Pathways job on JAX Backend!")
1621

22+
# Verify distributed JAX setup (Pathways auto-initialization)
23+
process_count = jax.process_count()
24+
process_index = jax.process_index()
25+
device_count = jax.device_count()
26+
local_device_count = jax.local_device_count()
27+
28+
print("JAX Distributed Environment:")
29+
print(f" Process Count: {process_count}")
30+
print(f" Process Index: {process_index}")
31+
print(f" Total Devices: {device_count}")
32+
print(f" Local Devices: {local_device_count}")
33+
34+
# Fail if not actually running on multiple hosts
35+
if process_count <= 1:
36+
raise RuntimeError(
37+
f"Pathways verification failed: Expected > 1 processes, but found {process_count}. "
38+
"This indicates the job is NOT running in a multi-host Pathways environment."
39+
)
40+
41+
# Verify collective communication (cross-host psum)
42+
try:
43+
# Use jax.pmap to sum values across all devices in the cluster
44+
x = np.ones(local_device_count)
45+
distributed_sum = jax.pmap(lambda val: lax.psum(val, "i"), axis_name="i")(x)
46+
total_sum = distributed_sum[0]
47+
48+
if total_sum != device_count:
49+
raise RuntimeError(
50+
f"Collective verification failed: Expected psum {device_count}, got {total_sum}"
51+
)
52+
print(
53+
f"Successfully verified collective communication across all {total_sum} devices!"
54+
)
55+
except Exception as e:
56+
print(f"Warning: Collective verification failed: {e}")
57+
if isinstance(e, RuntimeError) and "Collective verification failed" in str(
58+
e
59+
):
60+
raise
61+
1762
# Create a simple dataset
1863
x = np.random.rand(1000, 10)
1964
y = np.random.randint(0, 2, size=(1000, 1))

keras_remote/backend/gke_client.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -238,13 +238,16 @@ def _parse_accelerator(accelerator):
238238
}
239239

240240
if isinstance(parsed, TpuConfig):
241+
# For TPU Podslices (multi-node), resource requests must be per-node.
242+
# num_nodes is 1 for single-host TPUs (v3-8, v4-8, v5litepod-1/4/8).
243+
chips_per_node = parsed.chips // parsed.num_nodes
241244
return {
242245
"node_selector": {
243246
"cloud.google.com/gke-tpu-accelerator": parsed.gke_accelerator,
244247
"cloud.google.com/gke-tpu-topology": parsed.topology,
245248
},
246-
"resource_limits": {"google.com/tpu": str(parsed.chips)},
247-
"resource_requests": {"google.com/tpu": str(parsed.chips)},
249+
"resource_limits": {"google.com/tpu": str(chips_per_node)},
250+
"resource_requests": {"google.com/tpu": str(chips_per_node)},
248251
"tolerations": [
249252
{"key": "google.com/tpu", "operator": "Exists", "effect": "NoSchedule"}
250253
],
@@ -330,8 +333,10 @@ def _create_job_spec(
330333
],
331334
env=env_vars,
332335
resources=client.V1ResourceRequirements(
333-
limits=accel_config["resource_limits"],
334-
requests=accel_config["resource_requests"],
336+
limits={k: str(v) for k, v in accel_config["resource_limits"].items()},
337+
requests={
338+
k: str(v) for k, v in accel_config["resource_requests"].items()
339+
},
335340
),
336341
)
337342

@@ -466,7 +471,9 @@ def _check_node_pool_exists_cached(selector_items) -> bool:
466471
for tpu_spec in accelerators.TPUS.values():
467472
for chips, topo_spec in tpu_spec.topologies.items():
468473
if topo_spec.machine_type == machine_type:
469-
pool_labels["cloud.google.com/gke-accelerator-count"] = str(chips)
474+
pool_labels["cloud.google.com/gke-accelerator-count"] = str(
475+
chips // topo_spec.num_nodes
476+
)
470477
break
471478

472479
if all(pool_labels.get(k) == str(v) for k, v in selector.items()):

keras_remote/backend/gke_client_test.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,15 @@ def test_tpu_v3_8(self):
5858
self.assertLen(result["tolerations"], 1)
5959
self.assertEqual(result["tolerations"][0]["key"], "google.com/tpu")
6060

61+
def test_tpu_v3_16_multi_node(self):
62+
# v3-16 has 4 nodes and 16 total chips -> 4 chips per node
63+
result = _parse_accelerator("v3-16")
64+
self.assertEqual(result["resource_limits"], {"google.com/tpu": "4"})
65+
self.assertEqual(result["resource_requests"], {"google.com/tpu": "4"})
66+
self.assertEqual(
67+
result["node_selector"]["cloud.google.com/gke-tpu-topology"], "4x4"
68+
)
69+
6170
def test_tpu_v5litepod_4(self):
6271
result = _parse_accelerator("v5litepod-4")
6372
self.assertEqual(
@@ -421,6 +430,29 @@ def test_tpu_match(self):
421430
)
422431
self.assertTrue(result)
423432

433+
def test_tpu_multi_node_match(self):
434+
"""Test that it correctly identifies a 4-chip-per-node pool for v6e-16."""
435+
self.mock_run.return_value = json.dumps(
436+
[
437+
{
438+
"config": {
439+
"machineType": "ct6e-standard-4t",
440+
"accelerators": [{"acceleratorType": "tpu-v6e-slice"}],
441+
"labels": {},
442+
}
443+
}
444+
]
445+
)
446+
447+
result = _check_node_pool_exists_cached(
448+
(
449+
("cloud.google.com/gke-tpu-accelerator", "tpu-v6e-slice"),
450+
("cloud.google.com/gke-tpu-topology", "4x4"),
451+
("cloud.google.com/gke-accelerator-count", "4"),
452+
)
453+
)
454+
self.assertTrue(result)
455+
424456
def test_no_match(self):
425457
self.mock_run.return_value = json.dumps(
426458
[

keras_remote/backend/pathways_client.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ def wait_for_job(job_id, namespace="default", timeout=3600, poll_interval=10):
137137
# The leader pod is suffixed with '-0' by LWS
138138
leader_pod_name = f"{job_name}-0"
139139

140+
logged_pending = set()
140141
with LogStreamer(core_v1, namespace) as streamer:
141142
while True:
142143
elapsed = time.time() - start_time
@@ -160,7 +161,7 @@ def wait_for_job(job_id, namespace="default", timeout=3600, poll_interval=10):
160161
raise RuntimeError(f"Pathways job {job_name} failed")
161162

162163
elif pod.status.phase == "Pending":
163-
_check_pod_scheduling(core_v1, job_name, namespace)
164+
_check_pod_scheduling(core_v1, job_name, namespace, logged_pending)
164165
logging.debug("Pod is Pending...")
165166

166167
elif pod.status.phase == "Running":
@@ -288,8 +289,12 @@ def _create_lws_spec(
288289
],
289290
"env": env_vars,
290291
"resources": {
291-
"limits": accel_config["resource_limits"],
292-
"requests": accel_config["resource_requests"],
292+
"limits": {
293+
k: str(v) for k, v in accel_config["resource_limits"].items()
294+
},
295+
"requests": {
296+
k: str(v) for k, v in accel_config["resource_requests"].items()
297+
},
293298
},
294299
}
295300
],

0 commit comments

Comments (0)