Skip to content

Commit 834c1b1

Browse files
authored
Enable tpu7x daily tests to run on Spot VMs (#5072)
1 parent 1f86318 commit 834c1b1

File tree

2 files changed

+22
-7
lines changed

2 files changed

+22
-7
lines changed

tools/cloud-build/daily-tests/builds/gke-tpu-7x.yaml

Lines changed: 19 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -36,13 +36,26 @@ steps:
3636
env:
3737
- "ANSIBLE_HOST_KEY_CHECKING=false"
3838
- "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg"
39+
- "MACHINE_TYPE=tpu7x-standard-4t"
40+
- "PROJECT_ID=$PROJECT_ID"
41+
- "NUM_NODES=2"
42+
- "INSTANCE_PREFIX=tpu7xsp"
43+
- "BUILD_ID=$BUILD_ID"
44+
- "OPTIONS_GCS_PATH=gs://hpc-ctk1357/tpu7xoptions.txt"
3945
args:
4046
- -c
4147
- |
42-
set -x -e
48+
set -e -u -o pipefail
49+
echo "Sourcing find_available_zone.sh to determine zone."
50+
source /workspace/tools/cloud-build/find_available_zone.sh
51+
if [ -z "$${ZONE:-}" ]; then
52+
echo "ERROR: ZONE not found" >&2
53+
exit 1
54+
fi
55+
set -x
4356
cd /workspace && make
44-
BUILD_ID_FULL=$BUILD_ID
45-
BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6}
57+
REGION="$${ZONE%-*}"
58+
BUILD_ID_SHORT="$${BUILD_ID:0:6}"
4659
EXAMPLE_BP=examples/gke-tpu-7x/gke-tpu-7x.yaml
4760
# adding vm to act as remote node
4861
echo ' - id: remote-node' >> $${EXAMPLE_BP}
@@ -64,6 +77,9 @@ steps:
6477
echo ' - python -c '\''import jax; print(jax.device_count(), "TPU cores")'\''' >> $${EXAMPLE_BP}
6578
echo ' node_count: 1' >> $${EXAMPLE_BP}
6679
echo ' outputs: [instructions]' >> $${EXAMPLE_BP}
80+
sed -i -e '/reservation_affinity:/,+3c\ spot: true' $${EXAMPLE_BP}
81+
sed -i '/reservation/d' $${EXAMPLE_BP}
6782
ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \
6883
--user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \
84+
--extra-vars="region=$${REGION} zone=$${ZONE}" \
6985
--extra-vars="@tools/cloud-build/daily-tests/tests/gke-tpu-7x.yml"

tools/cloud-build/daily-tests/tests/gke-tpu-7x.yml

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -18,11 +18,8 @@ deployment_name: gke-tpu-7x-{{ build }}
1818
workspace: /workspace
1919
blueprint_yaml: "{{ workspace }}/examples/gke-tpu-7x/gke-tpu-7x.yaml"
2020
network: "{{ deployment_name }}-net"
21-
region: us-central1
22-
zone: us-central1-c
2321
remote_node: "{{ deployment_name }}-remote-node-0"
2422
machine_type: tpu7x-standard-4t
25-
extended_reservation: cloudtpu-20251001223000-1287580847
2623
num_slices: 1
2724
tpu_topology: 2x2x1
2825
static_node_count: 1
@@ -34,9 +31,11 @@ cli_deployment_vars:
3431
tpu_topology: "{{ tpu_topology }}"
3532
static_node_count: "{{ static_node_count }}"
3633
authorized_cidr: "{{ build_ip.stdout }}/32"
37-
reservation: "{{ extended_reservation }}"
3834
custom_vars:
3935
project: "{{ project }}"
4036
expected_tpu_count: 8
37+
instance_labels:
38+
tpu7x_onspot: true
39+
enable_spot: true
4140
post_deploy_tests:
4241
- test-validation/test-gke-tpu.yml

0 commit comments

Comments
 (0)