Skip to content

Commit 023b390

Browse files
committed
TPU v6e DWS flex integration tests
Change-Id: I610b93194747bdfba9c58d3b489142f5b289af80
1 parent fac6ef3 commit 023b390

File tree

3 files changed

+296
-0
lines changed

3 files changed

+296
-0
lines changed
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Fail fast when an input interpolated by the tasks in this file is missing,
# so the failure names the missing variable instead of surfacing later as a
# templating error in the middle of the test.
- name: Assert variables are defined
  ansible.builtin.assert:
    that:
      - custom_vars.project is defined
      - deployment_name is defined
      - region is defined
      - workspace is defined
    fail_msg: >-
      custom_vars.project, deployment_name, region and workspace must all be
      defined before running the Flex Start autoscaling validation.
19+
20+
# Writes a kubeconfig entry for the deployed cluster on the controller so the
# kubectl commands in the following tasks can reach it.
- name: Get cluster credentials for kubectl
  ansible.builtin.command:
    cmd: >-
      gcloud container clusters get-credentials {{ deployment_name }}
      --region {{ region }}
      --project {{ custom_vars.project }}
      --verbosity=debug
  delegate_to: localhost
23+
24+
# --- Flex Start Specific Checks (Pre-Job) ---
25+
26+
# Poll until no node carrying the TPU accelerator label remains, so the test
# starts from a scaled-down pool.
# `set -o pipefail` makes a kubectl failure produce a non-zero rc; without it
# `wc -l` prints 0 for a failed kubectl call and the wait would succeed even
# though the cluster was never queried. The rc check in `until` keeps retrying
# through such failures instead of accepting them.
- name: Wait for nodes to scale down to 0 (Pre-test)
  delegate_to: localhost
  ansible.builtin.shell: |
    set -o pipefail
    kubectl get nodes -l cloud.google.com/gke-tpu-accelerator={{ custom_vars.tpu_accelerator_type | default('tpu-v6e-slice') }} --no-headers | wc -l
  args:
    executable: /bin/bash  # pipefail is a bash option, not POSIX sh
  register: pre_test_node_count
  changed_when: false  # read-only query
  retries: 90
  delay: 10  # Wait up to 15 minutes for scale down if needed
  until: pre_test_node_count.rc == 0 and (pre_test_node_count.stdout | int) == 0
34+
35+
# One-shot check that the TPU node pool is empty before the job is submitted.
# With pipefail a failed kubectl call fails the task; previously `wc -l`
# reported 0 for it and the check passed without having queried the cluster.
- name: Verify Initial Node Count (Should be 0 for Flex)
  delegate_to: localhost
  ansible.builtin.shell: |
    set -o pipefail
    kubectl get nodes -l cloud.google.com/gke-tpu-accelerator={{ custom_vars.tpu_accelerator_type | default('tpu-v6e-slice') }} --no-headers | wc -l
  args:
    executable: /bin/bash  # pipefail is a bash option, not POSIX sh
  register: initial_node_count
  changed_when: false  # read-only query
  failed_when: initial_node_count.rc != 0 or (initial_node_count.stdout | int) != 0
41+
42+
# Pick the first generated job manifest matching my-job*.yaml in the
# deployment's primary group directory.
# The pipeline's rc is that of `head`, so a glob with no match would otherwise
# pass with empty stdout and hand an empty path to `kubectl create -f` later.
# Failing on empty output reports the real problem here.
- name: Identify Job File
  delegate_to: localhost
  ansible.builtin.shell: ls {{ workspace }}/{{ deployment_name }}/primary/my-job*.yaml | head -n 1
  register: job_file_result
  changed_when: false
  failed_when: job_file_result.rc != 0 or (job_file_result.stdout | trim | length) == 0
47+
48+
# Derive the facts used by the remaining tasks from the manifest path:
#   job_name      - manifest file name without its extension
#   job_file_path - the manifest path as printed by the previous task
# NOTE(review): later tasks use job_name as the Kubernetes Job name, which
# assumes the manifest's file name matches its metadata.name - confirm.
- name: Register Job Variables
  ansible.builtin.set_fact:
    job_name: '{{ job_file_result.stdout | basename | splitext | first }}'
    job_file_path: '{{ job_file_result.stdout }}'
52+
53+
# Submit the job manifest to the cluster.
# argv passes the manifest path as a single argument, so a path containing
# whitespace is not split the way the free-form command string would split it.
- name: Execute the job
  delegate_to: localhost
  ansible.builtin.command:
    argv:
      - kubectl
      - create
      - "-f"
      - "{{ job_file_path }}"
  changed_when: false
57+
58+
# --- Flex Start Specific Checks (Scale Up) ---
59+
60+
# Runs the scale-up, completion and scale-down checks for the submitted job.
# A failure in `block` jumps to `rescue`, which prints pods, the job
# description and events, then fails the play. `always` deletes the job
# whether or not the checks passed.
- name: Block to wait for job completion, verify scaling, and capture failures
  block:
    # Check for TriggeredScaleUp event (This confirms Flex/Autoscaling kicked in)
    # Best effort: ignore_errors lets the test continue when no matching event
    # shows up within the retry window.
    - name: Wait for TriggeredScaleUp event
      delegate_to: localhost
      ansible.builtin.shell: |
        kubectl get events --sort-by='.lastTimestamp' | grep "TriggeredScaleUp" | grep -E "gke-tpu|instanceGroups"
      register: scale_up_event
      retries: 30  # Wait up to ~5 minutes (30 * 10s)
      delay: 10
      until: scale_up_event.rc == 0
      ignore_errors: true

    # Count nodes whose STATUS column starts with "Ready". The previous
    # `grep "Ready"` also matched "NotReady", so nodes that had registered but
    # were not yet usable satisfied the wait. awk prints 0 on no match, so
    # pipefail only trips on a kubectl failure, which `until` retries.
    - name: Wait for nodes to become Ready
      delegate_to: localhost
      ansible.builtin.shell: |
        set -o pipefail
        kubectl get nodes -l cloud.google.com/gke-tpu-accelerator={{ custom_vars.tpu_accelerator_type | default('tpu-v6e-slice') }} --no-headers | awk '$2 ~ /^Ready/ { ready++ } END { print ready + 0 }'
      args:
        executable: /bin/bash  # pipefail is a bash option, not POSIX sh
      register: ready_node_count
      changed_when: false  # read-only query
      retries: 60  # Wait up to 10 minutes for nodes to boot
      delay: 10
      until: >-
        ready_node_count.rc == 0 and
        (ready_node_count.stdout | int) == (custom_vars.expected_node_count | default(4) | int)

    # At least one node must carry the gke-flex-start=true label.
    - name: Verify Flex Start label on Nodes
      delegate_to: localhost
      ansible.builtin.shell: |
        kubectl get nodes -l cloud.google.com/gke-flex-start=true --no-headers | wc -l
      register: labeled_node_count
      failed_when: labeled_node_count.stdout | int == 0

    - name: Wait for job to complete
      delegate_to: localhost
      ansible.builtin.command: kubectl wait --for=condition=complete "job/{{ job_name }}" --timeout=20m
      register: job_wait_result

    # The number of Succeeded pods must equal the expected node count.
    - name: Verify Expected Pods Completed
      delegate_to: localhost
      ansible.builtin.shell: |
        kubectl get pods --selector=job-name={{ job_name }} --field-selector=status.phase=Succeeded --no-headers | wc -l
      register: succeeded_pod_count
      failed_when: succeeded_pod_count.stdout | int != (custom_vars.expected_node_count | default(4) | int)

    - name: Get job logs
      delegate_to: localhost
      # Using label selector to get logs from all pods in the job
      ansible.builtin.shell: |
        kubectl logs -l job-name={{ job_name }} --all-containers --prefix --tail=10
      register: job_logs
      changed_when: false
      ignore_errors: true
      no_log: true

    - name: Print job output
      ansible.builtin.debug:
        msg: "Job Output (Last 10 lines): {{ job_logs.stdout if (job_logs.stdout is defined and job_logs.stdout | length > 0) else 'No logs found' }}"

    # Same masked-failure guard as the Ready wait: without pipefail a failed
    # kubectl call makes `wc -l` print 0 and the scale-down looks complete.
    - name: Wait for nodes to scale down to 0
      delegate_to: localhost
      ansible.builtin.shell: |
        set -o pipefail
        kubectl get nodes -l cloud.google.com/gke-tpu-accelerator={{ custom_vars.tpu_accelerator_type | default('tpu-v6e-slice') }} --no-headers | wc -l
      args:
        executable: /bin/bash  # pipefail is a bash option, not POSIX sh
      register: final_node_count
      changed_when: false  # read-only query
      retries: 120
      delay: 10  # Wait up to 20 minutes for scale down
      until: final_node_count.rc == 0 and (final_node_count.stdout | int) == 0

  rescue:
    - name: "FAILURE: Job did not complete or verification failed. Capturing debug info."
      ansible.builtin.debug:
        msg: "The Job failed to complete within the expected time or failed verification. See debug output below."

    # Each capture below ignores its own errors so that one failing kubectl
    # call does not abort the rest of the diagnostics; the final
    # "Fail the playbook" task still fails the run.
    - name: Get all pods status on failure
      delegate_to: localhost
      ansible.builtin.shell: |
        kubectl get pods -o wide
      register: pods_on_failure
      changed_when: false
      ignore_errors: true

    - name: Print all pods status on failure
      ansible.builtin.debug:
        msg: "{{ pods_on_failure.stdout_lines | default([]) }}"

    - name: Describe the job on failure
      delegate_to: localhost
      ansible.builtin.shell: |
        kubectl describe job {{ job_name }}
      register: job_describe_on_failure
      changed_when: false
      ignore_errors: true

    - name: Print job description on failure
      ansible.builtin.debug:
        msg: "{{ job_describe_on_failure.stdout_lines | default([]) }}"

    - name: Get events on failure
      delegate_to: localhost
      ansible.builtin.shell: |
        kubectl get events --sort-by='.lastTimestamp'
      register: events_on_failure
      changed_when: false
      ignore_errors: true

    - name: Print events on failure
      ansible.builtin.debug:
        msg: "{{ events_on_failure.stdout_lines | default([]) }}"

    - name: Fail the playbook
      ansible.builtin.fail:
        msg: "GKE DWS Flex Job validation failed."

  always:
    # Best-effort cleanup; errors are ignored (for example when the job was
    # never created).
    - name: Clean up
      delegate_to: localhost
      ansible.builtin.shell: |
        kubectl delete job {{ job_name }}
      ignore_errors: true
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
---
16+
# Build tags: the `m.`-prefixed entries name modules, plus the `gke` tag.
tags:
  - m.vpc
  - m.service-account
  - m.gke-cluster
  - m.gke-node-pool
  - m.kubectl-apply
  - m.cloud-storage-bucket
  - m.gke-job-template
  - m.gke-persistent-volume
  - m.pre-existing-network-storage
  - gke

# Upper bound for the whole build (7200 seconds = 2 hours).
timeout: 7200s
29+
30+
steps:
  # While using static network names we are guarding against more than 1 instance running at a time (for multi-group tests)
  - id: check_for_running_build
    name: gcr.io/cloud-builders/gcloud
    script: "tools/cloud-build/check_running_build.sh tools/cloud-build/daily-tests/builds/gke-tpu-v6e-flex.yaml"

  # Builds the toolkit, appends a remote-node VM module to the example
  # blueprint, then runs the base integration playbook with this test's vars.
  - id: gke-tpu-v6e-flex
    name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner
    entrypoint: /bin/bash
    env:
      - "ANSIBLE_HOST_KEY_CHECKING=false"
      - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg"
      - "PROJECT_ID=$PROJECT_ID"
      - "BUILD_ID=$BUILD_ID"
    args:
      - -c
      - |
        set -e -u -o pipefail
        set -x
        cd /workspace && make
        ZONE=asia-northeast1-b
        REGION=asia-northeast1
        BUILD_ID_SHORT="$${BUILD_ID:0:6}"

        # We use the DWS Flex example blueprint
        EXAMPLE_BP=examples/gke-consumption-options/dws-flex-start/gke-tpu-v6e/gke-tpu-v6e.yaml

        # Append a VM module to the blueprint. The leading spaces inside the
        # quotes are significant: module keys must nest under the `- id:`
        # entry and the settings under `settings:`.
        # NOTE(review): 2/4/6-space nesting assumed; confirm it lines up with
        # the existing module list at the end of the blueprint.
        echo ''
        echo '  - id: remote-node' >> $${EXAMPLE_BP}
        echo '    source: modules/compute/vm-instance' >> $${EXAMPLE_BP}
        echo '    use: [gke-tpu-v6-net-0]' >> $${EXAMPLE_BP}
        echo '    settings:' >> $${EXAMPLE_BP}
        echo '      machine_type: n2-standard-2' >> $${EXAMPLE_BP}
        echo '      name_prefix: remote-node' >> $${EXAMPLE_BP}
        echo '      add_deployment_name_before_prefix: true' >> $${EXAMPLE_BP}

        # IMPORTANT: Ensure auto_repair is FALSE for Flex, and remove conflicting spot/reservation settings if any
        # The base blueprint might have defaults we want to override via sed if CLI vars aren't enough or to be safe
        # But cli_deployment_vars in the test definition should handle most overrides.

        # Run the test
        ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \
          --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \
          --extra-vars="region=$${REGION} zone=$${ZONE}" \
          --extra-vars="@tools/cloud-build/daily-tests/tests/gke-tpu-v6e-flex.yml"
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
---
16+
# Test identity and the blueprint it deploys. `build` is not defined in this
# file and must be supplied by the caller.
test_name: gke-tpu-v6e-flex
deployment_name: "gk-t-v6-flx-{{ build }}"
workspace: /workspace
blueprint_yaml: "{{ workspace }}/examples/gke-consumption-options/dws-flex-start/gke-tpu-v6e/gke-tpu-v6e.yaml"

# Resource names derived from the deployment name.
network: "{{ deployment_name }}-net-0"
remote_node: "{{ deployment_name }}-remote-node-0"

# Machine type, location and slice settings, forwarded through
# cli_deployment_vars below.
machine_type: ct6e-standard-4t
region: asia-northeast1
zone: asia-northeast1-b
num_slices: 1
tpu_topology: "4x4"

# Flex Start with a node pool allowed to scale between 0 and 4 nodes per zone.
enable_flex_start: true
autoscaling_min_node_count_per_zone: 0
autoscaling_max_node_count_per_zone: 4
30+
31+
# Deployment variable overrides for the blueprint. Most entries forward the
# top-level values of this file; the two disk sizes are set here directly.
# NOTE(review): `project` and `build_ip` are not defined in this file and are
# presumably provided by the calling playbook - confirm.
cli_deployment_vars:
  project_id: '{{ project }}'
  region: '{{ region }}'
  zone: '{{ zone }}'
  num_slices: '{{ num_slices }}'
  machine_type: '{{ machine_type }}'
  tpu_topology: '{{ tpu_topology }}'
  enable_flex_start: '{{ enable_flex_start }}'
  autoscaling_min_node_count_per_zone: '{{ autoscaling_min_node_count_per_zone }}'
  autoscaling_max_node_count_per_zone: '{{ autoscaling_max_node_count_per_zone }}'
  authorized_cidr: '{{ build_ip.stdout }}/32'
  system_node_pool_disk_size_gb: 200
  v6e_node_pool_disk_size_gb: 100
44+
45+
# Inputs read by the post-deploy validation tasks as custom_vars.*.
custom_vars:
  project: '{{ project }}'
  expected_node_count: 4
  tpu_accelerator_type: tpu-v6e-slice
49+
50+
# Validation task files listed for this test.
post_deploy_tests:
  - test-validation/test-gke-tpu-flex-autoscaling.yml

0 commit comments

Comments
 (0)