Skip to content

Commit c89c3f9

Browse files
committed
Re-enable EKS tests
1 parent b1dd823 commit c89c3f9

File tree

2 files changed

+146
-153
lines changed

2 files changed

+146
-153
lines changed

.github/workflows/_test_nccl.yaml

Lines changed: 139 additions & 144 deletions
Original file line numberDiff line numberDiff line change
@@ -20,147 +20,142 @@ jobs:
2020
JAX_IMAGE: ${{ inputs.CONTAINER }}
2121
secrets: inherit
2222

23-
# EKS cluster offline due to maintenance - to run manually
24-
#
25-
# build-mpi-operator-compatible-base:
26-
# runs-on: [self-hosted, "amd64", "large"]
27-
# steps:
28-
# - name: Login to nvcr.io Container Registry
29-
# uses: docker/login-action@v3
30-
# with:
31-
# registry: nvcr.io
32-
# username: $oauthtoken
33-
# password: ${{ secrets.NVCR_TOKEN }}
34-
# - name: Checkout repository
35-
# uses: actions/checkout@v4
36-
# - name: Build MPI operator compatible base container
37-
# id: build
38-
# uses: ./.github/actions/build-container
39-
# with:
40-
# ARCHITECTURE: amd64
41-
# ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build
42-
# BADGE_FILENAME: badge-mpi-operator-compatible-base-build
43-
# BUILD_DATE: 0000-00-00 # not important; this image is never published
44-
# BASE_IMAGE: ${{ inputs.CONTAINER }}
45-
# CONTAINER_NAME: mpi-operator-compatible-base
46-
# DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base
47-
# RUNNER_SIZE: small
48-
# ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
49-
# ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
50-
# github-token: ${{ secrets.GITHUB_TOKEN }}
51-
# bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
52-
# outputs:
53-
# DOCKER_TAG_MEALKIT: ${{ steps.build.outputs.DOCKER_TAG_MEALKIT }}
54-
# DOCKER_TAG_FINAL: ${{ steps.build.outputs.DOCKER_TAG_FINAL }}
55-
#
56-
#
57-
# EKS cluster offline due to maintenance - to run manually
58-
#
59-
# nccl-test:
60-
# needs: build-mpi-operator-compatible-base
61-
# strategy:
62-
# matrix:
63-
# test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi]
64-
# runs-on: eks
65-
# env:
66-
# BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }}
67-
# TEST_NAME: ${{ matrix.test }}
68-
# steps:
69-
# - name: Checkout repository
70-
# uses: actions/checkout@v4
71-
# - name: Login to GitHub Container Registry
72-
# uses: docker/login-action@v3
73-
# with:
74-
# registry: ghcr.io
75-
# username: ${{ github.repository_owner }}
76-
# password: ${{ secrets.GITHUB_TOKEN }}
77-
# - name: Create env vars
78-
# id: var
79-
# shell: bash
80-
# run: |
81-
# JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}"
82-
# LAUNCHER_NAME="${JOB_NAME}-launcher"
83-
# TOKEN_NAME="${JOB_NAME}-token"
84-
# # Make these available to later steps
85-
# echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV"
86-
# echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV"
87-
# - name: K8s GHCR store and delete token
88-
# id: store-token
89-
# uses: ./.github/actions/store-delete-k8s-ghcr
90-
# - name: Configure Kubernetes job
91-
# run: |
92-
# export WORKER_NAME="${JOB_NAME}-worker"
93-
# yq -i '.metadata.name = strenv(JOB_NAME)
94-
# | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE)
95-
# | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME)
96-
# | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
97-
# | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME)
98-
# | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE)
99-
# | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME)
100-
# | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
101-
# .github/eks-workflow-files/mpi-nccl-test.yml
102-
# git diff .github/eks-workflow-files/mpi-nccl-test.yml
103-
# - name: Submit Kubernetes job
104-
# id: submit_job
105-
# run: |
106-
# echo "Check whether the cluster is under maintenance"
107-
# if kubectl get nodes -o custom-columns=TAINTS:.spec.taints | grep "maintenance"; then
108-
# echo "Cluster is under maintenance, skipping job submission"
109-
# echo "continue-run=false" >> "$GITHUB_OUTPUT"
110-
# else
111-
# kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml
112-
# echo "continue-run=true" >> "$GITHUB_OUTPUT"
113-
# fi
114-
# - name: Wait for Kubernetes job to start
115-
# if: steps.submit_job.outputs.continue-run == 'true'
116-
# # Note that this is *not* using JOB_NAME
117-
# run: |
118-
# # Launcher job is created eagerly, but suspended. Kueue un-suspends it when
119-
# # resources are available, but that is where there can be a long wait if the
120-
# # cluster is busy executing other jobs.
121-
# kubectl wait --for=create job/${LAUNCHER_NAME}
122-
# kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=14400s
123-
# - name: Stream Kubernetes job output
124-
# if: steps.submit_job.outputs.continue-run == 'true'
125-
# # Note that this is *not* JOB_NAME
126-
# run: |
127-
# # Streaming logs will fail if the container/pod is still pending
128-
# while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
129-
# sleep 1
130-
# done
131-
# # TODO: --all-containers=true --all-pods=true could make sense here, but it
132-
# # prefixes lines with a rather verbose tag
133-
# kubectl logs --follow job/${LAUNCHER_NAME}
134-
# - name: Retrieve Kubernetes job status
135-
# if: steps.submit_job.outputs.continue-run == 'true'
136-
# shell: bash -exo pipefail {0}
137-
# run: |
138-
# while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
139-
# failure=${status[0]:-0}
140-
# success=${status[1]:-0}
141-
# total=$((failure+success))
142-
# if [[ ${total} < 1 ]]; then
143-
# sleep 1
144-
# elif [[ ${total} == 1 ]]; then
145-
# break
146-
# else
147-
# # Shouldn't happen, maybe a sign the job being monitored does not have a
148-
# # single launcher pod?
149-
# exit 255
150-
# fi
151-
# done
152-
# exit ${failure}
153-
# # Provide more debug output in case of failure; note that some kinds of launch
154-
# # failure do not produce any log output.
155-
# - name: Debug failed Kubernetes job
156-
# if: failure() && steps.submit_job.outputs.continue-run == 'true'
157-
# run: |
158-
# # Provide better debug in case of launch failures that will not produce log output
159-
# pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)
160-
# if [[ -n "${pods}" ]]; then
161-
# kubectl describe ${pods}
162-
# fi
163-
# # Clean up in case of errors as well as success
164-
# - name: Delete Kubernetes job
165-
# if: always()
166-
# run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml
23+
build-mpi-operator-compatible-base:
24+
runs-on: [self-hosted, "amd64", "large"]
25+
steps:
26+
- name: Login to nvcr.io Container Registry
27+
uses: docker/login-action@v3
28+
with:
29+
registry: nvcr.io
30+
username: $oauthtoken
31+
password: ${{ secrets.NVCR_TOKEN }}
32+
- name: Checkout repository
33+
uses: actions/checkout@v4
34+
- name: Build MPI operator compatible base container
35+
id: build
36+
uses: ./.github/actions/build-container
37+
with:
38+
ARCHITECTURE: amd64
39+
ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build
40+
BADGE_FILENAME: badge-mpi-operator-compatible-base-build
41+
BUILD_DATE: "0000-00-00" # not important; this image is never published (quoted so YAML loaders keep it a string, not a date)
42+
BASE_IMAGE: ${{ inputs.CONTAINER }}
43+
CONTAINER_NAME: mpi-operator-compatible-base
44+
DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base
45+
RUNNER_SIZE: small
46+
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
47+
ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
48+
github-token: ${{ secrets.GITHUB_TOKEN }}
49+
bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
50+
outputs:
51+
DOCKER_TAG_MEALKIT: ${{ steps.build.outputs.DOCKER_TAG_MEALKIT }}
52+
DOCKER_TAG_FINAL: ${{ steps.build.outputs.DOCKER_TAG_FINAL }}
53+
54+
nccl-test:
55+
needs: build-mpi-operator-compatible-base
56+
strategy:
57+
matrix:
58+
test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi]
59+
runs-on: eks
60+
env:
61+
BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }}
62+
TEST_NAME: ${{ matrix.test }}
63+
steps:
64+
- name: Checkout repository
65+
uses: actions/checkout@v4
66+
- name: Login to GitHub Container Registry
67+
uses: docker/login-action@v3
68+
with:
69+
registry: ghcr.io
70+
username: ${{ github.repository_owner }}
71+
password: ${{ secrets.GITHUB_TOKEN }}
72+
- name: Create env vars
73+
id: var
74+
shell: bash
75+
run: |
76+
JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}"
77+
LAUNCHER_NAME="${JOB_NAME}-launcher"
78+
TOKEN_NAME="${JOB_NAME}-token"
79+
# Make these available to later steps
80+
echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV"
81+
echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV"
82+
- name: K8s GHCR store and delete token
83+
id: store-token
84+
uses: ./.github/actions/store-delete-k8s-ghcr
85+
- name: Configure Kubernetes job
86+
run: |
87+
export WORKER_NAME="${JOB_NAME}-worker"
88+
yq -i '.metadata.name = strenv(JOB_NAME)
89+
| .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE)
90+
| .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME)
91+
| .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
92+
| .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME)
93+
| .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE)
94+
| .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME)
95+
| .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
96+
.github/eks-workflow-files/mpi-nccl-test.yml
97+
git diff .github/eks-workflow-files/mpi-nccl-test.yml
98+
- name: Submit Kubernetes job
99+
id: submit_job
100+
run: |
101+
echo "Check whether the cluster is under maintenance"
102+
if kubectl get nodes -o custom-columns=TAINTS:.spec.taints | grep "maintenance"; then
103+
echo "Cluster is under maintenance, skipping job submission"
104+
echo "continue-run=false" >> "$GITHUB_OUTPUT"
105+
else
106+
kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml
107+
echo "continue-run=true" >> "$GITHUB_OUTPUT"
108+
fi
109+
- name: Wait for Kubernetes job to start
110+
if: steps.submit_job.outputs.continue-run == 'true'
111+
# Note that this is *not* using JOB_NAME
112+
run: |
113+
# Launcher job is created eagerly, but suspended. Kueue un-suspends it when
114+
# resources are available, but that is where there can be a long wait if the
115+
# cluster is busy executing other jobs.
116+
kubectl wait --for=create job/${LAUNCHER_NAME}
117+
kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=14400s
118+
- name: Stream Kubernetes job output
119+
if: steps.submit_job.outputs.continue-run == 'true'
120+
# Note that this is *not* JOB_NAME
121+
run: |
122+
# Streaming logs will fail if the container/pod is still pending
123+
while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
124+
sleep 1
125+
done
126+
# TODO: --all-containers=true --all-pods=true could make sense here, but it
127+
# prefixes lines with a rather verbose tag
128+
kubectl logs --follow job/${LAUNCHER_NAME}
129+
- name: Retrieve Kubernetes job status
130+
if: steps.submit_job.outputs.continue-run == 'true'
131+
shell: bash -exo pipefail {0}
132+
run: |
133+
while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
134+
failure=${status[0]:-0}
135+
success=${status[1]:-0}
136+
total=$((failure+success))
137+
if [[ ${total} -lt 1 ]]; then  # numeric test; '<' inside [[ ]] would compare as strings
138+
sleep 1
139+
elif [[ ${total} == 1 ]]; then
140+
break
141+
else
142+
# Shouldn't happen, maybe a sign the job being monitored does not have a
143+
# single launcher pod?
144+
exit 255
145+
fi
146+
done
147+
exit ${failure}
148+
# Provide more debug output in case of failure; note that some kinds of launch
149+
# failure do not produce any log output.
150+
- name: Debug failed Kubernetes job
151+
if: failure() && steps.submit_job.outputs.continue-run == 'true'
152+
run: |
153+
# Provide better debug in case of launch failures that will not produce log output
154+
pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)
155+
if [[ -n "${pods}" ]]; then
156+
kubectl describe ${pods}
157+
fi
158+
# Clean up in case of errors as well as success
159+
- name: Delete Kubernetes job
160+
if: always()
161+
run: kubectl delete --ignore-not-found -f .github/eks-workflow-files/mpi-nccl-test.yml  # always() also runs when submission was skipped, so a missing job must not fail cleanup

.github/workflows/ngc-release-testing.yaml

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -31,14 +31,12 @@ jobs:
3131
CONTAINER: ${{ inputs.JAX_IMAGE }}
3232
secrets: inherit
3333

34-
# EKS cluster offline due to maintenance - to run manually
35-
#
36-
# test-maxtext-eks:
37-
# if: inputs.MAXTEXT_IMAGE != ''
38-
# uses: ./.github/workflows/_test_maxtext_k8s.yaml
39-
# with:
40-
# MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }}
41-
# secrets: inherit
34+
test-maxtext-eks:
35+
if: inputs.MAXTEXT_IMAGE != ''
36+
uses: ./.github/workflows/_test_maxtext_k8s.yaml
37+
with:
38+
MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }}
39+
secrets: inherit
4240

4341
test-maxtext-gke:
4442
if: inputs.MAXTEXT_IMAGE != ''
@@ -48,7 +46,7 @@ jobs:
4846
secrets: inherit
4947

5048
finalize:
51-
needs: [ test-nccl, test-maxtext-gke] # ,test-maxtext-eks ]
49+
needs: [test-nccl, test-maxtext-gke, test-maxtext-eks]
5250
if: "!cancelled()"
5351
uses: ./.github/workflows/_finalize.yaml
5452
secrets: inherit

0 commit comments

Comments
 (0)