Skip to content

Commit cc591ea

Browse files
Merge pull request #3722 from RachaelSTamakloe/rc-cherry-pick-3718
Cherry-pick "A4 GKE integration test"
2 parents 80fc4ca + 9fb64d7 commit cc591ea

File tree

2 files changed

+106
-0
lines changed

2 files changed

+106
-0
lines changed
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
---
# Cloud Build configuration for the A4 High-GPU GKE integration test.
# A single step builds the toolkit in /workspace, appends two extra modules
# (a remote-node VM and a GKE job template) to the example blueprint, and then
# runs the shared base integration-test playbook against that blueprint.
#
# NOTE(review): the script below also appends a modules/compute/vm-instance
# module; confirm whether a matching "m.vm-instance" tag is expected here.
tags:
- m.gke-job-template
- gke
- m.gke-cluster
- m.gke-node-pool
- m.service-account
- m.gpu-rdma-vpc
- m.kubectl-apply
- m.vpc

timeout: 14400s  # 4hr
steps:
- id: gke-a4-highgpu
  name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner
  entrypoint: /bin/bash
  env:
  - "ANSIBLE_HOST_KEY_CHECKING=false"
  - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg"
  args:
  - -c
  # In the script, "$$" is the Cloud Build escape for a literal "$" that bash
  # should see; $BUILD_ID and ${PROJECT_ID} are Cloud Build substitutions.
  - |
    set -x -e
    cd /workspace && make
    BUILD_ID_FULL=$BUILD_ID
    BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6}
    EXAMPLE_BP=examples/gke-a4-highgpu/gke-a4-highgpu.yaml

    # adding vm to act as remote node
    # NOTE(review): the indentation inside the quoted fragments must line up
    # with the module list at the end of the example blueprint - confirm.
    echo '  - id: remote-node'                         >> $${EXAMPLE_BP}
    echo '    source: modules/compute/vm-instance'     >> $${EXAMPLE_BP}
    echo '    use: [gke-a4-high-net-0]'                >> $${EXAMPLE_BP}
    echo '    settings:'                               >> $${EXAMPLE_BP}
    echo '      machine_type: e2-standard-2'           >> $${EXAMPLE_BP}
    echo '      name_prefix: remote-node'              >> $${EXAMPLE_BP}
    echo '      add_deployment_name_before_prefix: true' >> $${EXAMPLE_BP}
    # Blank separator between the two appended modules; it is written to the
    # blueprint like every other fragment rather than to the build log.
    echo ''                                            >> $${EXAMPLE_BP}
    # adding a job template that runs nvidia-smi on the A4 node pool
    echo '  - id: job_template_hostname'               >> $${EXAMPLE_BP}
    echo '    source: modules/compute/gke-job-template' >> $${EXAMPLE_BP}
    echo '    use: [a4-highgpu-pool]'                  >> $${EXAMPLE_BP}
    echo '    settings:'                               >> $${EXAMPLE_BP}
    echo '      image: nvidia/cuda:11.0.3-runtime-ubuntu20.04' >> $${EXAMPLE_BP}
    echo '      command:'                              >> $${EXAMPLE_BP}
    echo '      - nvidia-smi'                          >> $${EXAMPLE_BP}
    echo '      node_count: 1'                         >> $${EXAMPLE_BP}
    echo '    outputs: [instructions]'                 >> $${EXAMPLE_BP}

    ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \
      --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \
      --extra-vars="@tools/cloud-build/daily-tests/tests/gke-a4-highgpu.yml"
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
---

# Variables for the A4 High-GPU GKE integration test.
# region and zone are set in this file. "project" and "build" are not defined
# here and are expected to be passed in with --extra-vars by the build file.
test_name: gke-a4high
deployment_name: "gke-a4high-{{ build }}"
workspace: /workspace
blueprint_yaml: "{{ workspace }}/examples/gke-a4-highgpu/gke-a4-highgpu.yaml"
network: "{{ deployment_name }}-net-0"
region: us-central1
zone: us-central1-b
remote_node: "{{ deployment_name }}-remote-node-0"
extended_reservation: a4-exr-hpc-toolkit-dev
static_node_count: 2
k8s_service_account_name: workload-identity-k8s-sa
# Deployment variables built from the values above.
cli_deployment_vars:
  region: "{{ region }}"
  zone: "{{ zone }}"
  static_node_count: "{{ static_node_count }}"
  extended_reservation: "{{ extended_reservation }}"
  # NOTE(review): build_ip is not defined in this file; presumably it is
  # registered by the calling playbook - confirm.
  authorized_cidr: "{{ build_ip.stdout }}/32"
  gcp_public_cidrs_access_enabled: true
  k8s_service_account_name: "{{ k8s_service_account_name }}"
custom_vars:
  project: "{{ project }}"
post_deploy_tests:
- test-validation/test-gke-job.yml

0 commit comments

Comments
 (0)