@@ -20,147 +20,142 @@ jobs:
2020 JAX_IMAGE : ${{ inputs.CONTAINER }}
2121 secrets : inherit
2222
23- # EKS cluster offline due to maintenance - to run manually
24- #
25- # build-mpi-operator-compatible-base:
26- # runs-on: [self-hosted, "amd64", "large"]
27- # steps:
28- # - name: Login to nvcr.io Container Registry
29- # uses: docker/login-action@v3
30- # with:
31- # registry: nvcr.io
32- # username: $oauthtoken
33- # password: ${{ secrets.NVCR_TOKEN }}
34- # - name: Checkout repository
35- # uses: actions/checkout@v4
36- # - name: Build MPI operator compatible base container
37- # id: build
38- # uses: ./.github/actions/build-container
39- # with:
40- # ARCHITECTURE: amd64
41- # ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build
42- # BADGE_FILENAME: badge-mpi-operator-compatible-base-build
43- # BUILD_DATE: 0000-00-00 # not important; this image is never published
44- # BASE_IMAGE: ${{ inputs.CONTAINER }}
45- # CONTAINER_NAME: mpi-operator-compatible-base
46- # DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base
47- # RUNNER_SIZE: small
48- # ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
49- # ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
50- # github-token: ${{ secrets.GITHUB_TOKEN }}
51- # bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
52- # outputs:
53- # DOCKER_TAG_MEALKIT: ${{ steps.build.outputs.DOCKER_TAG_MEALKIT }}
54- # DOCKER_TAG_FINAL: ${{ steps.build.outputs.DOCKER_TAG_FINAL }}
55- #
56- #
57- # EKS cluster offline due to maintenance - to run manually
58- #
59- # nccl-test:
60- # needs: build-mpi-operator-compatible-base
61- # strategy:
62- # matrix:
63- # test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi]
64- # runs-on: eks
65- # env:
66- # BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }}
67- # TEST_NAME: ${{ matrix.test }}
68- # steps:
69- # - name: Checkout repository
70- # uses: actions/checkout@v4
71- # - name: Login to GitHub Container Registry
72- # uses: docker/login-action@v3
73- # with:
74- # registry: ghcr.io
75- # username: ${{ github.repository_owner }}
76- # password: ${{ secrets.GITHUB_TOKEN }}
77- # - name: Create env vars
78- # id: var
79- # shell: bash
80- # run: |
81- # JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}"
82- # LAUNCHER_NAME="${JOB_NAME}-launcher"
83- # TOKEN_NAME="${JOB_NAME}-token"
84- # # Make these available to later steps
85- # echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV"
86- # echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV"
87- # - name: K8s GHCR store and delete token
88- # id: store-token
89- # uses: ./.github/actions/store-delete-k8s-ghcr
90- # - name: Configure Kubernetes job
91- # run: |
92- # export WORKER_NAME="${JOB_NAME}-worker"
93- # yq -i '.metadata.name = strenv(JOB_NAME)
94- # | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE)
95- # | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME)
96- # | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
97- # | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME)
98- # | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE)
99- # | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME)
100- # | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
101- # .github/eks-workflow-files/mpi-nccl-test.yml
102- # git diff .github/eks-workflow-files/mpi-nccl-test.yml
103- # - name: Submit Kubernetes job
104- # id: submit_job
105- # run: |
106- # echo "Check whether the cluster is under maintenance"
107- # if kubectl get nodes -o custom-columns=TAINTS:.spec.taints | grep "maintenance"; then
108- # echo "Cluster is under maintenance, skipping job submission"
109- # echo "continue-run=false" >> "$GITHUB_OUTPUT"
110- # else
111- # kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml
112- # echo "continue-run=true" >> "$GITHUB_OUTPUT"
113- # fi
114- # - name: Wait for Kubernetes job to start
115- # if: steps.submit_job.outputs.continue-run == 'true'
116- # # Note that this is *not* using JOB_NAME
117- # run: |
118- # # Launcher job is created eagerly, but suspended. Kueue un-suspends it when
119- # # resources are available, but that is where there can be a long wait if the
120- # # cluster is busy executing other jobs.
121- # kubectl wait --for=create job/${LAUNCHER_NAME}
122- # kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=14400s
123- # - name: Stream Kubernetes job output
124- # if: steps.submit_job.outputs.continue-run == 'true'
125- # # Note that this is *not* JOB_NAME
126- # run: |
127- # # Streaming logs will fail if the container/pod is still pending
128- # while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
129- # sleep 1
130- # done
131- # # TODO: --all-containers=true --all-pods=true could make sense here, but it
132- # # prefixes lines with a rather verbose tag
133- # kubectl logs --follow job/${LAUNCHER_NAME}
134- # - name: Retrieve Kubernetes job status
135- # if: steps.submit_job.outputs.continue-run == 'true'
136- # shell: bash -exo pipefail {0}
137- # run: |
138- # while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
139- # failure=${status[0]:-0}
140- # success=${status[1]:-0}
141- # total=$((failure+success))
142- # if [[ ${total} < 1 ]]; then
143- # sleep 1
144- # elif [[ ${total} == 1 ]]; then
145- # break
146- # else
147- # # Shouldn't happen, maybe a sign the job being monitored does not have a
148- # # single launcher pod?
149- # exit 255
150- # fi
151- # done
152- # exit ${failure}
153- # # Provide more debug output in case of failure; note that some kinds of launch
154- # # failure do not produce any log output.
155- # - name: Debug failed Kubernetes job
156- # if: failure() && steps.submit_job.outputs.continue-run == 'true'
157- # run: |
158- # # Provide better debug in case of launch failures that will not produce log output
159- # pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)
160- # if [[ -n "${pods}" ]]; then
161- # kubectl describe ${pods}
162- # fi
163- # # Clean up in case of errors as well as success
164- # - name: Delete Kubernetes job
165- # if: always()
166- # run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml
# Builds the "mpi-operator-compatible-base" container on top of the image given
# by inputs.CONTAINER and exposes the resulting tags as job outputs.
# DOCKER_TAG_FINAL is the image the nccl-test job runs.
build-mpi-operator-compatible-base:
  runs-on: [self-hosted, "amd64", "large"]
  steps:
    - name: Login to nvcr.io Container Registry
      uses: docker/login-action@v3
      with:
        registry: nvcr.io
        # Quoted so it is obviously a literal string and not a variable
        # reference; the value handed to the action is unchanged.
        username: '$oauthtoken'
        password: ${{ secrets.NVCR_TOKEN }}
    - name: Checkout repository
      uses: actions/checkout@v4
    - name: Build MPI operator compatible base container
      id: build
      uses: ./.github/actions/build-container
      with:
        ARCHITECTURE: amd64
        ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build
        BADGE_FILENAME: badge-mpi-operator-compatible-base-build
        # Quoted: the placeholder looks like a date but is not a valid one, so
        # YAML 1.1 loaders that resolve timestamps would reject it unquoted.
        BUILD_DATE: '0000-00-00'  # not important; this image is never published
        BASE_IMAGE: ${{ inputs.CONTAINER }}
        CONTAINER_NAME: mpi-operator-compatible-base
        DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base
        RUNNER_SIZE: small
        ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
        ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
        github-token: ${{ secrets.GITHUB_TOKEN }}
        bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
  outputs:
    DOCKER_TAG_MEALKIT: ${{ steps.build.outputs.DOCKER_TAG_MEALKIT }}
    DOCKER_TAG_FINAL: ${{ steps.build.outputs.DOCKER_TAG_FINAL }}
53+
# Runs each NCCL perf test from the matrix on the "eks" runner: patches
# .github/eks-workflow-files/mpi-nccl-test.yml with per-run names and the image
# built by build-mpi-operator-compatible-base, submits it with kubectl, streams
# the launcher logs, reports the launcher's success/failure and finally deletes
# the submitted resources.
nccl-test:
  needs: build-mpi-operator-compatible-base
  strategy:
    matrix:
      test:
        - all_gather_perf_mpi
        - all_reduce_perf_mpi
        - broadcast_perf_mpi
        - reduce_scatter_perf_mpi
  runs-on: eks
  env:
    BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }}
    TEST_NAME: ${{ matrix.test }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}
    - name: Create env vars
      id: var
      shell: bash
      run: |
        # Unique per run, attempt and test; underscores in the test name are
        # replaced with dashes (presumably because Kubernetes object names may
        # not contain underscores).
        JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}"
        LAUNCHER_NAME="${JOB_NAME}-launcher"
        # Make these available to later steps
        echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV"
        echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV"
    - name: K8s GHCR store and delete token
      id: store-token
      uses: ./.github/actions/store-delete-k8s-ghcr
    - name: Configure Kubernetes job
      env:
        # Passed through the environment and read with strenv(), like every
        # other substituted field, instead of being spliced into the yq
        # program text.
        TOKEN_NAME: ${{ steps.store-token.outputs.token-name }}
      run: |
        export WORKER_NAME="${JOB_NAME}-worker"
        yq -i '.metadata.name = strenv(JOB_NAME)
          | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE)
          | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME)
          | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
          | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME)
          | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE)
          | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME)
          | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \
          .github/eks-workflow-files/mpi-nccl-test.yml
        # Show the effective changes in the job log.
        git diff .github/eks-workflow-files/mpi-nccl-test.yml
    - name: Submit Kubernetes job
      id: submit_job
      run: |
        echo "Check whether the cluster is under maintenance"
        # Any node taint mentioning "maintenance" disables submission; the
        # later steps are gated on the continue-run output written here.
        if kubectl get nodes -o custom-columns=TAINTS:.spec.taints | grep "maintenance"; then
          echo "Cluster is under maintenance, skipping job submission"
          echo "continue-run=false" >> "$GITHUB_OUTPUT"
        else
          kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml
          echo "continue-run=true" >> "$GITHUB_OUTPUT"
        fi
    - name: Wait for Kubernetes job to start
      if: steps.submit_job.outputs.continue-run == 'true'
      # Note that this is *not* using JOB_NAME
      run: |
        # Launcher job is created eagerly, but suspended. Kueue un-suspends it when
        # resources are available, but that is where there can be a long wait if the
        # cluster is busy executing other jobs.
        kubectl wait --for=create "job/${LAUNCHER_NAME}"
        kubectl wait --for=jsonpath='{.spec.suspend}=false' "job/${LAUNCHER_NAME}" --timeout=14400s
    - name: Stream Kubernetes job output
      if: steps.submit_job.outputs.continue-run == 'true'
      # Note that this is *not* JOB_NAME
      run: |
        # Streaming logs will fail if the container/pod is still pending
        while [[ -n $(kubectl get pods --selector="batch.kubernetes.io/job-name=${LAUNCHER_NAME}" --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
          sleep 1
        done
        # TODO: --all-containers=true --all-pods=true could make sense here, but it
        # prefixes lines with a rather verbose tag
        kubectl logs --follow "job/${LAUNCHER_NAME}"
    - name: Retrieve Kubernetes job status
      if: steps.submit_job.outputs.continue-run == 'true'
      shell: bash -exo pipefail {0}
      run: |
        # Poll "<failed>:<succeeded>" until exactly one launcher pod has finished.
        while readarray -d : -t status < <(kubectl get "job/${LAUNCHER_NAME}" -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
          # Either field is empty until the corresponding count is set.
          failure=${status[0]:-0}
          success=${status[1]:-0}
          total=$((failure+success))
          # Arithmetic comparison; [[ a < b ]] would compare strings.
          if (( total < 1 )); then
            sleep 1
          elif (( total == 1 )); then
            break
          else
            # Shouldn't happen, maybe a sign the job being monitored does not have a
            # single launcher pod?
            exit 255
          fi
        done
        # 0 when the launcher succeeded, 1 when it failed.
        exit "${failure}"
    # Provide more debug output in case of failure; note that some kinds of launch
    # failure do not produce any log output.
    - name: Debug failed Kubernetes job
      if: failure() && steps.submit_job.outputs.continue-run == 'true'
      run: |
        # Provide better debug in case of launch failures that will not produce log output
        pods=$(kubectl get pods --selector="batch.kubernetes.io/job-name=${LAUNCHER_NAME}" -o name)
        if [[ -n "${pods}" ]]; then
          # Deliberately unquoted: one argument per pod name.
          kubectl describe ${pods}
        fi
    # Clean up in case of errors as well as success
    - name: Delete Kubernetes job
      if: always()
      # --ignore-not-found: when submission was skipped (cluster maintenance) or
      # an earlier step failed before `kubectl apply`, there is nothing to
      # delete and that must not fail the job.
      run: kubectl delete --ignore-not-found -f .github/eks-workflow-files/mpi-nccl-test.yml
0 commit comments