google
diff --git a/‎checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/restore_and_broadcast/benchmark-config.yaml‎
Lines changed: 59 additions & 0 deletions b/‎checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/restore_and_broadcast/benchmark-config.yaml‎
Lines changed: 59 additions & 0 deletions
diff --git a/‎checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/restore_and_broadcast/kueue-config.yaml‎
Lines changed: 28 additions & 0 deletions b/‎checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/restore_and_broadcast/kueue-config.yaml‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/restore_and_broadcast/llama-70b_replicas_16.yaml‎
Lines changed: 28 additions & 0 deletions b/‎checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/restore_and_broadcast/llama-70b_replicas_16.yaml‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/restore_and_broadcast/llama-70b_replicas_2.yaml‎
Lines changed: 28 additions & 0 deletions b/‎checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/restore_and_broadcast/llama-70b_replicas_2.yaml‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/restore_and_broadcast/llama-70b_replicas_4.yaml‎
Lines changed: 28 additions & 0 deletions b/‎checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/restore_and_broadcast/llama-70b_replicas_4.yaml‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/restore_and_broadcast/llama-70b_replicas_4_no_broadcast.yaml‎
Lines changed: 28 additions & 0 deletions b/‎checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/restore_and_broadcast/llama-70b_replicas_4_no_broadcast.yaml‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/restore_and_broadcast/llama-8b_replicas_16.yaml‎
Lines changed: 31 additions & 0 deletions b/‎checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/restore_and_broadcast/llama-8b_replicas_16.yaml‎
Lines changed: 31 additions & 0 deletions
diff --git a/‎checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/restore_and_broadcast/llama-8b_replicas_16_no_broadcast.yaml‎
Lines changed: 31 additions & 0 deletions b/‎checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/restore_and_broadcast/llama-8b_replicas_16_no_broadcast.yaml‎
Lines changed: 31 additions & 0 deletions
diff --git a/‎checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/restore_and_broadcast/llama-8b_replicas_2.yaml‎
Lines changed: 31 additions & 0 deletions b/‎checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/restore_and_broadcast/llama-8b_replicas_2.yaml‎
Lines changed: 31 additions & 0 deletions
diff --git a/‎checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/restore_and_broadcast/llama-8b_replicas_2_no_broadcast.yaml‎
Lines changed: 31 additions & 0 deletions b/‎checkpoint/orbax/checkpoint/_src/testing/benchmarks/configs/evaluations/restore_and_broadcast/llama-8b_replicas_2_no_broadcast.yaml‎
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,59 @@
+---
+# 1. Headless Service: Required for distributed pods to discover each other
+apiVersion: v1
+kind: Service
+metadata:
+  name: ${JOB_NAME}  # ${...} values are placeholders; presumably substituted before apply - confirm tool
+  namespace: default
+spec:
+  clusterIP: None
+  selector:
+    job-name: ${JOB_NAME}  # Same label the Job's pod template sets below
+---
+# 2. Indexed Job: Manages the distributed workload and queues via Kueue
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ${JOB_NAME}
+  namespace: default
+  labels:
+    kueue.x-k8s.io/queue-name: multislice-queue
+spec:
+  completions: ${TOTAL_PODS}  # Same value is passed to the container as JAX_PROCESS_COUNT
+  parallelism: ${TOTAL_PODS}
+  completionMode: Indexed
+  template:
+    metadata:
+      labels:
+        job-name: ${JOB_NAME}
+    spec:
+      subdomain: ${JOB_NAME}  # Same name as the Service; appears in the coordinator hostname below
+      restartPolicy: Never
+      containers:
+      - name: benchmark
+        image: ${IMAGE}
+
+        # ---> IMPORTANT: UPDATE THIS COMMAND <---
+        command:
+        - "python3"
+        - "/path/to/your/benchmark_script.py"
+        - "--config_file=${FULL_CONFIG_PATH}"
+        - "--output_directory=${OUTPUT_DIR}"
+
+        # 3. Distributed Setup: Injecting JAX environment variables natively
+        env:
+        - name: JAX_COORDINATOR_ADDRESS
+          value: "${JOB_NAME}-0.${JOB_NAME}.default.svc.cluster.local"
+        - name: JAX_COORDINATOR_PORT
+          value: "1234"
+        - name: JAX_PROCESS_COUNT
+          value: "${TOTAL_PODS}"
+        - name: JAX_PROCESS_INDEX
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
+
+        # 4. Resource constraint tailored to your cluster
+        resources:
+          requests:
+            cpu: "1"
@@ -0,0 +1,28 @@
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ResourceFlavor  # Declared with a name only; no spec is set here
+metadata:
+  name: "spot-flavor"
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ClusterQueue
+metadata:
+  name: "xpk-cluster-queue"
+spec:
+  namespaceSelector: {} # Allows jobs from any namespace
+  resourceGroups:
+  - coveredResources: ["cpu", "memory"]
+    flavors:
+    - name: "spot-flavor"  # Refers to the ResourceFlavor declared at the top of this file
+      resources:
+      - name: "cpu"
+        nominalQuota: 1000  # Set artificially high to allow scaling
+      - name: "memory"
+        nominalQuota: 4000Gi
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: LocalQueue
+metadata:
+  name: "multislice-queue" # XPK strictly looks for this name by default
+  namespace: "default"
+spec:
+  clusterQueue: "xpk-cluster-queue"  # Points at the ClusterQueue declared above
@@ -0,0 +1,28 @@
+# Benchmark suite config for the RestoreAndBroadcastBenchmark generator (llama-70b reference checkpoint).
+# Assumes n2-standard-32-32 (32 machines) X 16 replicas
+suite_name: "llama-70b_replicas_16"  # The name for the entire test suite run.
+num_repeats: 1  # NOTE(review): the replicas_4 and llama-8b configs use 20 - confirm 1 is intended
+
+
+mesh_config:
+  mesh_axes: ["replica", "model"]
+  # Should match reference_sharding_path. NOTE(review): that path is named "fsdp-64" but model is 32 here - confirm.
+  ici_parallelism: {"replica": 1, "model": 32}
+  dcn_parallelism: {"replica": 16}
+
+# Note: checkpoint_config field not specified.
+
+benchmarks:
+  - generator: "orbax.checkpoint._src.testing.benchmarks.v1.restore_and_broadcast_benchmark.RestoreAndBroadcastBenchmark"
+    options:
+      # --- Generator Options ---
+      # These keys must match the attributes of the `RestoreAndBroadcastBenchmarkOptions` class
+      # associated with the `RestoreAndBroadcastBenchmark` generator.
+      async_enabled: true
+      use_ocdbt: true
+      use_zarr3: true
+      use_replica_parallel: false
+      use_compression: true
+      reference_checkpoint_path: "gs://orbax-benchmarks/checkpoints/llama-70b_generate_4-8-4_subchunked/ckpt"
+      reference_sharding_path: "gs://orbax-benchmarks/sharding-configs/llama3.1-70b-v5p-128-data-1-fsdp-64-tensor-1/abstract_state.json"
+      use_load_and_broadcast: true
@@ -0,0 +1,28 @@
+# Benchmark suite config for the RestoreAndBroadcastBenchmark generator (llama-70b reference checkpoint).
+# Assumes n2-standard-2-64 (64 machines) X 2 replicas. NOTE(review): sibling configs say "n2-standard-32-..." - confirm.
+suite_name: "llama-70b_replicas_2"  # The name for the entire test suite run.
+num_repeats: 1  # NOTE(review): the replicas_4 and llama-8b configs use 20 - confirm 1 is intended
+
+
+mesh_config:
+  mesh_axes: ["replica", "model"]
+  # Should match reference_sharding_path.
+  ici_parallelism: {"replica": 1, "model": 64}
+  dcn_parallelism: {"replica": 2}
+
+# Note: checkpoint_config field not specified.
+
+benchmarks:
+  - generator: "orbax.checkpoint._src.testing.benchmarks.v1.restore_and_broadcast_benchmark.RestoreAndBroadcastBenchmark"
+    options:
+      # --- Generator Options ---
+      # These keys must match the attributes of the `RestoreAndBroadcastBenchmarkOptions` class
+      # associated with the `RestoreAndBroadcastBenchmark` generator.
+      async_enabled: true
+      use_ocdbt: true
+      use_zarr3: true
+      use_replica_parallel: false
+      use_compression: true
+      reference_checkpoint_path: "gs://orbax-benchmarks/checkpoints/llama-70b_generate_4-8-4_subchunked/ckpt"
+      reference_sharding_path: "gs://orbax-benchmarks/sharding-configs/llama3.1-70b-v5p-128-data-1-fsdp-64-tensor-1/abstract_state.json"
+      use_load_and_broadcast: true
@@ -0,0 +1,28 @@
+# Benchmark suite config for the RestoreAndBroadcastBenchmark generator (llama-70b reference checkpoint).
+# Assumes n2-standard-32-32 (32 machines) X 4 replicas. NOTE(review): mesh below uses model=64, not 32 - confirm.
+suite_name: "llama-70b_replicas_4"  # The name for the entire test suite run.
+num_repeats: 20
+
+
+mesh_config:
+  mesh_axes: ["replica", "model"]
+  # Should match reference_sharding_path.
+  ici_parallelism: {"replica": 1, "model": 64}
+  dcn_parallelism: {"replica": 4}
+
+# Note: checkpoint_config field not specified.
+
+benchmarks:
+  - generator: "orbax.checkpoint._src.testing.benchmarks.v1.restore_and_broadcast_benchmark.RestoreAndBroadcastBenchmark"
+    options:
+      # --- Generator Options ---
+      # These keys must match the attributes of the `RestoreAndBroadcastBenchmarkOptions` class
+      # associated with the `RestoreAndBroadcastBenchmark` generator.
+      async_enabled: true
+      use_ocdbt: true
+      use_zarr3: true
+      use_replica_parallel: false
+      use_compression: true
+      reference_checkpoint_path: "gs://orbax-benchmarks/checkpoints/llama-70b_generate_4-8-4_subchunked/ckpt"
+      reference_sharding_path: "gs://orbax-benchmarks/sharding-configs/llama3.1-70b-v5p-128-data-1-fsdp-64-tensor-1/abstract_state.json"
+      use_load_and_broadcast: true
@@ -0,0 +1,28 @@
+# Benchmark suite config for the RestoreAndBroadcastBenchmark generator (llama-70b, use_load_and_broadcast off).
+# Assumes n2-standard-32-32 (32 machines) X 4 replicas. NOTE(review): mesh below uses model=64, not 32 - confirm.
+suite_name: "llama-70b_replicas_4_no_broadcast"  # The name for the entire test suite run.
+num_repeats: 20
+
+
+mesh_config:
+  mesh_axes: ["replica", "model"]
+  # Should match reference_sharding_path. NOTE(review): that path is named "data-4-fsdp-8-tensor-4" - confirm it fits.
+  ici_parallelism: {"replica": 1, "model": 64}
+  dcn_parallelism: {"replica": 4}
+
+# Note: checkpoint_config field not specified.
+
+benchmarks:
+  - generator: "orbax.checkpoint._src.testing.benchmarks.v1.restore_and_broadcast_benchmark.RestoreAndBroadcastBenchmark"
+    options:
+      # --- Generator Options ---
+      # These keys must match the attributes of the `RestoreAndBroadcastBenchmarkOptions` class
+      # associated with the `RestoreAndBroadcastBenchmark` generator.
+      async_enabled: true
+      use_ocdbt: true
+      use_zarr3: true
+      use_replica_parallel: false
+      use_compression: true
+      reference_checkpoint_path: "gs://orbax-benchmarks/checkpoints/llama-70b_generate_4-8-4/ckpt"
+      reference_sharding_path: "gs://orbax-benchmarks/sharding-configs/llama3.1-70b-v5p-256-data-4-fsdp-8-tensor-4/abstract_state.json"
+      use_load_and_broadcast: false
@@ -0,0 +1,31 @@
+# Benchmark suite config for the RestoreAndBroadcastBenchmark generator (llama-8b reference checkpoint).
+# Assumes 32 devices X 16 replicas
+suite_name: "llama-8b_replicas_16"  # The name for the entire test suite run.
+num_repeats: 20
+
+
+mesh_config:
+  mesh_axes: ["replica", "model"]
+  # Should match reference_sharding_path. NOTE(review): that path is named "fsdp-4" but model is 32 here - confirm.
+  ici_parallelism: {"replica": 1, "model": 32}
+  dcn_parallelism: {"replica": 16}
+
+# Note: checkpoint_config field not specified.
+
+benchmarks:
+  - generator: "orbax.checkpoint._src.testing.benchmarks.v1.restore_and_broadcast_benchmark.RestoreAndBroadcastBenchmark"
+    options:
+      # --- Generator Options ---
+      # These keys must match the attributes of the `RestoreAndBroadcastBenchmarkOptions` class
+      # associated with the `RestoreAndBroadcastBenchmark` generator.
+      async_enabled: true
+      use_ocdbt: true
+      use_zarr3: true
+      use_replica_parallel: false
+      use_compression: true
+      # Note: deliberately reads from a bucket in the EU, assuming the benchmark runs from a cell in
+      # the US. This should increase the storage latency and make the effect of broadcasting more
+      # pronounced, since the scale we can run at is too small to see much difference otherwise.
+      reference_checkpoint_path: "gs://cpgaffney-eu-bucket/checkpoints/llama-8b_generate_8-2-1/ckpt"
+      reference_sharding_path: "gs://orbax-benchmarks/sharding-configs/llama3.1-8b-v5p-8-data-1-fsdp-4-tensor-1/abstract_state.json"
+      use_load_and_broadcast: true
@@ -0,0 +1,31 @@
+# Benchmark suite config for the RestoreAndBroadcastBenchmark generator (llama-8b, use_load_and_broadcast off).
+# Assumes 32 devices X 16 replicas
+suite_name: "llama-8b_replicas_16_no_broadcast"  # The name for the entire test suite run.
+num_repeats: 20
+
+
+mesh_config:
+  mesh_axes: ["replica", "model"]
+  # Should match reference_sharding_path. NOTE(review): that path is named "fsdp-4" but model is 32 here - confirm.
+  ici_parallelism: {"replica": 1, "model": 32}
+  dcn_parallelism: {"replica": 16}
+
+# Note: checkpoint_config field not specified.
+
+benchmarks:
+  - generator: "orbax.checkpoint._src.testing.benchmarks.v1.restore_and_broadcast_benchmark.RestoreAndBroadcastBenchmark"
+    options:
+      # --- Generator Options ---
+      # These keys must match the attributes of the `RestoreAndBroadcastBenchmarkOptions` class
+      # associated with the `RestoreAndBroadcastBenchmark` generator.
+      async_enabled: true
+      use_ocdbt: true
+      use_zarr3: true
+      use_replica_parallel: false
+      use_compression: true
+      # Note: deliberately reads from a bucket in the EU, assuming the benchmark runs from a cell in
+      # the US. This should increase the storage latency and make the effect of broadcasting more
+      # pronounced, since the scale we can run at is too small to see much difference otherwise.
+      reference_checkpoint_path: "gs://cpgaffney-eu-bucket/checkpoints/llama-8b_generate_8-2-1/ckpt"
+      reference_sharding_path: "gs://orbax-benchmarks/sharding-configs/llama3.1-8b-v5p-8-data-1-fsdp-4-tensor-1/abstract_state.json"
+      use_load_and_broadcast: false
@@ -0,0 +1,31 @@
+# Benchmark suite config for the RestoreAndBroadcastBenchmark generator (llama-8b reference checkpoint).
+# Assumes v5p-8 (4 chips) X 2 replicas
+suite_name: "llama-8b_replicas_2"  # The name for the entire test suite run.
+num_repeats: 20
+
+
+mesh_config:
+  mesh_axes: ["replica", "model"]
+  # Should match reference_sharding_path.
+  ici_parallelism: {"replica": 1, "model": 4}
+  dcn_parallelism: {"replica": 2}
+
+# Note: checkpoint_config field not specified.
+
+benchmarks:
+  - generator: "orbax.checkpoint._src.testing.benchmarks.v1.restore_and_broadcast_benchmark.RestoreAndBroadcastBenchmark"
+    options:
+      # --- Generator Options ---
+      # These keys must match the attributes of the `RestoreAndBroadcastBenchmarkOptions` class
+      # associated with the `RestoreAndBroadcastBenchmark` generator.
+      async_enabled: true
+      use_ocdbt: true
+      use_zarr3: true
+      use_replica_parallel: false
+      use_compression: true
+      # Note: deliberately reads from a bucket in the EU, assuming the benchmark runs from a cell in
+      # the US. This should increase the storage latency and make the effect of broadcasting more
+      # pronounced, since the scale we can run at is too small to see much difference otherwise.
+      reference_checkpoint_path: "gs://cpgaffney-eu-bucket/checkpoints/llama-8b_generate_8-2-1/ckpt"
+      reference_sharding_path: "gs://orbax-benchmarks/sharding-configs/llama3.1-8b-v5p-8-data-1-fsdp-4-tensor-1/abstract_state.json"
+      use_load_and_broadcast: true
@@ -0,0 +1,31 @@
+# Benchmark suite config for the RestoreAndBroadcastBenchmark generator (llama-8b, use_load_and_broadcast off).
+# Assumes v5p-8 (4 chips) X 2 replicas
+suite_name: "llama-8b_replicas_2_no_broadcast"  # The name for the entire test suite run.
+num_repeats: 20
+
+
+mesh_config:
+  mesh_axes: ["replica", "model"]
+  # Should match reference_sharding_path.
+  ici_parallelism: {"replica": 1, "model": 4}
+  dcn_parallelism: {"replica": 2}
+
+# Note: checkpoint_config field not specified.
+
+benchmarks:
+  - generator: "orbax.checkpoint._src.testing.benchmarks.v1.restore_and_broadcast_benchmark.RestoreAndBroadcastBenchmark"
+    options:
+      # --- Generator Options ---
+      # These keys must match the attributes of the `RestoreAndBroadcastBenchmarkOptions` class
+      # associated with the `RestoreAndBroadcastBenchmark` generator.
+      async_enabled: true
+      use_ocdbt: true
+      use_zarr3: true
+      use_replica_parallel: false
+      use_compression: true
+      # Note: deliberately reads from a bucket in the EU, assuming the benchmark runs from a cell in
+      # the US. This should increase the storage latency and make the effect of broadcasting more
+      # pronounced, since the scale we can run at is too small to see much difference otherwise.
+      reference_checkpoint_path: "gs://cpgaffney-eu-bucket/checkpoints/llama-8b_generate_8-2-1/ckpt"
+      reference_sharding_path: "gs://orbax-benchmarks/sharding-configs/llama3.1-8b-v5p-8-data-1-fsdp-4-tensor-1/abstract_state.json"
+      use_load_and_broadcast: false