|
| 1 | +# Copyright 2026 Google LLC |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | + |
# Fail fast, with a readable message, when an input referenced by the tasks
# below is missing, instead of hitting an undefined-variable error mid-run.
- name: Assert variables are defined
  ansible.builtin.assert:
    that:
      - custom_vars.project is defined
      # The remaining variables are templated into the gcloud / ls commands
      # that follow, so they are required as well.
      - deployment_name is defined
      - region is defined
      - workspace is defined
    fail_msg: >-
      custom_vars.project, deployment_name, region and workspace must all be
      defined to run the GKE flex-start job test.
| 19 | + |
# Fetch credentials for the deployment's cluster with gcloud, on the control
# node, so the kubectl tasks that follow have a cluster context to talk to.
- name: Get cluster credentials for kubectl
  ansible.builtin.command:
    # Folded scalar: the lines below are joined with single spaces into the
    # same one-line command.
    cmd: >-
      gcloud container clusters get-credentials {{ deployment_name }}
      --region {{ region }}
      --project {{ custom_vars.project }}
      --verbosity=debug
  delegate_to: localhost
| 23 | + |
| 24 | +# --- Flex Start Specific Checks (Pre-Job) --- |
| 25 | + |
# Poll until no node carrying the TPU accelerator label is listed, so the test
# starts from a scaled-down state. The accelerator type defaults to
# 'tpu-v6e-slice' when custom_vars.tpu_accelerator_type is not set.
- name: Wait for nodes to scale down to 0 (Pre-test)
  delegate_to: localhost
  ansible.builtin.shell:
    # pipefail: without it a failing kubectl produces empty output, `wc -l`
    # prints 0 and the wait would pass even though nothing was verified.
    cmd: |
      set -o pipefail
      kubectl get nodes \
        -l cloud.google.com/gke-tpu-accelerator={{ custom_vars.tpu_accelerator_type | default('tpu-v6e-slice') }} \
        --no-headers | wc -l
    executable: /bin/bash  # `set -o pipefail` is not guaranteed in /bin/sh
  register: pre_test_node_count
  changed_when: false  # read-only query
  retries: 90
  delay: 10  # Wait up to 15 minutes for scale down if needed
  # Require a successful kubectl call as well as a zero count, so transient
  # API errors are retried rather than misread as "no nodes".
  until: pre_test_node_count.rc == 0 and (pre_test_node_count.stdout | int) == 0
| 34 | + |
# One-shot check that no TPU-labelled node exists before the job is submitted.
- name: Verify Initial Node Count (Should be 0 for Flex)
  delegate_to: localhost
  ansible.builtin.shell:
    # pipefail: make a kubectl failure visible in rc instead of letting
    # `wc -l` turn the empty output into a passing count of 0.
    cmd: |
      set -o pipefail
      kubectl get nodes \
        -l cloud.google.com/gke-tpu-accelerator={{ custom_vars.tpu_accelerator_type | default('tpu-v6e-slice') }} \
        --no-headers | wc -l
    executable: /bin/bash  # `set -o pipefail` is not guaranteed in /bin/sh
  register: initial_node_count
  changed_when: false  # read-only query
  # Fail when the query itself failed or when any matching node is present.
  failed_when: initial_node_count.rc != 0 or (initial_node_count.stdout | int) != 0
| 41 | + |
# Pick the first my-job*.yaml manifest under the deployment's primary/
# directory; its path is consumed by the following set_fact task.
- name: Identify Job File
  delegate_to: localhost
  ansible.builtin.shell:
    cmd: ls {{ workspace }}/{{ deployment_name }}/primary/my-job*.yaml | head -n 1
  register: job_file_result
  changed_when: false
  # The pipeline's rc is that of `head`, which succeeds even when `ls` finds
  # nothing; treat an empty result as a failure so an empty path is never
  # handed to `kubectl create -f`.
  failed_when: job_file_result.rc != 0 or (job_file_result.stdout | trim | length) == 0
| 47 | + |
# Derive the facts used by the later tasks from the manifest path found above.
- name: Register Job Variables
  ansible.builtin.set_fact:
    # File name without directory or extension.
    # NOTE(review): later tasks use this as the Kubernetes Job name, which
    # assumes the manifest's metadata.name equals its file stem - confirm.
    job_name: '{{ job_file_result.stdout | basename | splitext | first }}'
    # Full path of the manifest as printed by the lookup task.
    job_file_path: '{{ job_file_result.stdout }}'
| 52 | + |
# Submit the Job manifest to the cluster from the control node. The task is
# always reported as "ok" rather than "changed".
- name: Execute the job
  ansible.builtin.command:
    cmd: kubectl create -f {{ job_file_path }}
  delegate_to: localhost
  changed_when: false
| 57 | + |
| 58 | +# --- Flex Start Specific Checks (Scale Up) --- |
| 59 | + |
# Runs after the Job has been submitted.
#   block:  waits for scale-up, Job completion and scale-down.
#   rescue: dumps pods, the Job description and events when any block task
#           fails, then fails the play.
#   always: deletes the Job.
# All kubectl calls run on the control node (delegate_to: localhost).
- name: Block to wait for job completion, verify scaling, and capture failures
  block:
    # Look for a TriggeredScaleUp event mentioning gke-tpu or instanceGroups.
    # Informational only: ignore_errors lets the run continue when no such
    # event is seen; the node-readiness wait below is the enforcing check.
    - name: Wait for TriggeredScaleUp event
      delegate_to: localhost
      ansible.builtin.shell: |
        kubectl get events --sort-by='.lastTimestamp' | grep "TriggeredScaleUp" | grep -E "gke-tpu|instanceGroups"
      register: scale_up_event
      changed_when: false  # read-only query
      retries: 30  # Wait up to ~5 minutes (30 * 10s)
      delay: 10
      until: scale_up_event.rc == 0
      ignore_errors: true

    # Poll until the expected number of TPU-labelled nodes report Ready
    # (custom_vars.expected_node_count, default 4).
    - name: Wait for nodes to become Ready
      delegate_to: localhost
      ansible.builtin.shell:
        # Match on the STATUS column instead of grepping for "Ready", which
        # would also count "NotReady" nodes. "Ready,SchedulingDisabled" is
        # still counted. awk always prints a number, even with no match.
        cmd: |
          set -o pipefail
          kubectl get nodes \
            -l cloud.google.com/gke-tpu-accelerator={{ custom_vars.tpu_accelerator_type | default('tpu-v6e-slice') }} \
            --no-headers | awk '$2 ~ /^Ready(,|$)/ { n++ } END { print n + 0 }'
        executable: /bin/bash  # `set -o pipefail` is not guaranteed in /bin/sh
      register: ready_node_count
      changed_when: false  # read-only query
      retries: 60  # Wait up to 10 minutes for nodes to boot
      delay: 10
      until: >-
        ready_node_count.rc == 0 and
        (ready_node_count.stdout | int) == (custom_vars.expected_node_count | default(4) | int)

    # At least one node must carry the cloud.google.com/gke-flex-start=true
    # label.
    - name: Verify Flex Start label on Nodes
      delegate_to: localhost
      ansible.builtin.shell:
        cmd: |
          set -o pipefail
          kubectl get nodes -l cloud.google.com/gke-flex-start=true --no-headers | wc -l
        executable: /bin/bash
      register: labeled_node_count
      changed_when: false  # read-only query
      failed_when: labeled_node_count.rc != 0 or (labeled_node_count.stdout | int) == 0

    # Blocks until the Job reports the "complete" condition or 20 minutes
    # elapse; a timeout makes kubectl exit non-zero and triggers `rescue`.
    - name: Wait for job to complete
      delegate_to: localhost
      ansible.builtin.command: kubectl wait --for=condition=complete "job/{{ job_name }}" --timeout=20m
      register: job_wait_result
      changed_when: false  # waiting does not modify the cluster

    # The number of Succeeded pods of the Job must equal the expected node
    # count (custom_vars.expected_node_count, default 4).
    - name: Verify Expected Pods Completed
      delegate_to: localhost
      ansible.builtin.shell:
        cmd: |
          set -o pipefail
          kubectl get pods --selector=job-name={{ job_name }} --field-selector=status.phase=Succeeded --no-headers | wc -l
        executable: /bin/bash
      register: succeeded_pod_count
      changed_when: false  # read-only query
      failed_when: >-
        succeeded_pod_count.rc != 0 or
        (succeeded_pod_count.stdout | int) != (custom_vars.expected_node_count | default(4) | int)

    # Using label selector to get logs from all pods in the job.
    # Best effort: a failure here is ignored, and no_log keeps the raw task
    # result out of the Ansible output (the next task prints the text).
    - name: Get job logs
      delegate_to: localhost
      ansible.builtin.command: kubectl logs -l job-name={{ job_name }} --all-containers --prefix --tail=10
      register: job_logs
      changed_when: false
      ignore_errors: true
      no_log: true

    - name: Print job output
      ansible.builtin.debug:
        msg: "Job Output (Last 10 lines): {{ job_logs.stdout if (job_logs.stdout is defined and job_logs.stdout | length > 0) else 'No logs found' }}"

    # Poll until no TPU-labelled node is listed any more.
    - name: Wait for nodes to scale down to 0
      delegate_to: localhost
      ansible.builtin.shell:
        # pipefail: a failing kubectl must not be counted as "0 nodes".
        cmd: |
          set -o pipefail
          kubectl get nodes \
            -l cloud.google.com/gke-tpu-accelerator={{ custom_vars.tpu_accelerator_type | default('tpu-v6e-slice') }} \
            --no-headers | wc -l
        executable: /bin/bash
      register: final_node_count
      changed_when: false  # read-only query
      retries: 120
      delay: 10  # Wait up to 20 minutes for scale down
      until: final_node_count.rc == 0 and (final_node_count.stdout | int) == 0

  rescue:
    - name: "FAILURE: Job did not complete or verification failed. Capturing debug info."
      ansible.builtin.debug:
        msg: "The Job failed to complete within the expected time or failed verification. See debug output below."

    # The diagnostic commands below are best effort (failed_when: false) so
    # that one failing command does not prevent the remaining diagnostics,
    # or the final explicit failure, from running.
    - name: Get all pods status on failure
      delegate_to: localhost
      ansible.builtin.command: kubectl get pods -o wide
      register: pods_on_failure
      changed_when: false
      failed_when: false

    - name: Print all pods status on failure
      ansible.builtin.debug:
        msg: "{{ pods_on_failure.stdout_lines | default([]) }}"

    - name: Describe the job on failure
      delegate_to: localhost
      ansible.builtin.command: kubectl describe job {{ job_name }}
      register: job_describe_on_failure
      changed_when: false
      failed_when: false

    - name: Print job description on failure
      ansible.builtin.debug:
        msg: "{{ job_describe_on_failure.stdout_lines | default([]) }}"

    - name: Get events on failure
      delegate_to: localhost
      ansible.builtin.command: kubectl get events --sort-by='.lastTimestamp'
      register: events_on_failure
      changed_when: false
      failed_when: false

    - name: Print events on failure
      ansible.builtin.debug:
        msg: "{{ events_on_failure.stdout_lines | default([]) }}"

    # Reaching rescue means a block task failed; surface that as a failure.
    - name: Fail the playbook
      ansible.builtin.fail:
        msg: "GKE DWS Flex Job validation failed."

  always:
    # Delete the Job whether the checks passed or failed; errors are ignored.
    - name: Clean up
      delegate_to: localhost
      ansible.builtin.command: kubectl delete job {{ job_name }}
      ignore_errors: true
0 commit comments