Commit 8dde981

Run NCCL tests on EKS using MPI operator (#1188)
This tests the latest CUDA DL image's native EFA support.
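
One quick way to confirm "native EFA support" from inside a worker pod is a sketch like the following; it assumes the cuda-dl-base image ships libfabric's fi_info utility, which is not confirmed by this commit:

# Should list one EFA interface per vpc.amazonaws.com/efa resource requested below
fi_info -p efa
# Setting NCCL_DEBUG=INFO on the launcher makes the NCCL tests log which network
# transport was selected, so EFA usage can also be confirmed from the job logs.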
1 parent bae53fc commit 8dde981

File tree

3 files changed: +234 −0

.github/container/Dockerfile.mpi-operator-compatible-base

Lines changed: 12 additions & 0 deletions

ARG BASE_IMAGE
FROM ${BASE_IMAGE} as mealkit
FROM mealkit as final
RUN apt-get update \
    && apt-get install -y openssh-server \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* \
    && mkdir /run/sshd
# https://github.com/kubeflow/mpi-operator/blob/c738a83b185b4bf3bf7e6eca9d4503653294c995/build/base/Dockerfile#L16
RUN sed -i "s/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g" /etc/ssh/ssh_config \
    && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
    && sed -i "s/#\(StrictModes \).*/\1no/g" /etc/ssh/sshd_config

.github/eks-workflow-files/mpi-nccl-test.yml

Lines changed: 75 additions & 0 deletions

apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: PLACEHOLDER
spec:
  # Without this, the first few attempts to run the launcher result in errors due to
  # failed DNS resolution of the worker names. It works eventually, given a big enough
  # backoffLimit, but that makes it harder to handle log streaming and to identify the
  # "real" exit code of the job.
  launcherCreationPolicy: WaitForWorkersReady
  runPolicy:
    cleanPodPolicy: Running
    # Surface errors directly to GitHub Actions without internal retries
    backoffLimit: 0
  # 1 MPI rank per GPU
  slotsPerWorker: 8
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      # Without this the launcher pod is deleted on failure, which makes it hard to
      # provide useful diagnostics
      restartPolicy: Never
      template:
        spec:
          containers:
            - image: PLACEHOLDER
              imagePullPolicy: IfNotPresent
              name: PLACEHOLDER
              command:
                - mpirun
                - --allow-run-as-root
                - -np
                - "16"
                - -N
                - "8"
                - PLACEHOLDER # the NCCL test binary, substituted by the workflow
                - -b
                - "8"
                - -e
                - "16G"
                - -f
                - "2"
                - -g
                - "1"
                - -c
                - "1"
                - -n
                - "100"
          imagePullSecrets:
            - name: PLACEHOLDER
    Worker:
      replicas: 2
      template:
        spec:
          nodeSelector:
            node.kubernetes.io/instance-type: "p5.48xlarge"
          containers:
            - image: PLACEHOLDER
              imagePullPolicy: IfNotPresent
              name: PLACEHOLDER
              volumeMounts:
                - name: shmem
                  mountPath: /dev/shm
              resources:
                limits:
                  nvidia.com/gpu: 8
                  hugepages-2Mi: 5120Mi
                  vpc.amazonaws.com/efa: 32
                  memory: 32000Mi
          imagePullSecrets:
            - name: PLACEHOLDER
          volumes:
            - name: shmem
              hostPath:
                path: /dev/shm
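
Once the placeholders are substituted, the launcher command expands to roughly the following (shown for the all-reduce case; flag meanings are as documented for Open MPI and the nccl-tests binaries):

# -np 16 : total MPI ranks = 2 workers x slotsPerWorker (8)
# -N 8   : 8 ranks per worker node, i.e. one rank per GPU
# -b 8 -e 16G -f 2 : sweep message sizes from 8 bytes to 16 GiB, doubling each step
# -g 1   : one GPU per rank
# -c 1   : check result correctness
# -n 100 : 100 iterations per message size
mpirun --allow-run-as-root -np 16 -N 8 all_reduce_perf_mpi -b 8 -e 16G -f 2 -g 1 -c 1 -n 100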

.github/workflows/nccl-k8s.yaml

Lines changed: 147 additions & 0 deletions

name: NCCL on Kubernetes
on:
  schedule:
    - cron: '30 8 * * *'
  pull_request:
    types:
      - opened
      - reopened
      - ready_for_review
      - synchronize
    paths-ignore:
      - '**.md'
  workflow_dispatch:
    inputs:
      # Note that cuda-dl-base installs the NCCL tests, while the vanilla nvidia/cuda
      # images do not; when JAX-Toolbox moves to using cuda-dl-base this workflow ought
      # to be modified to test one of the JAX-Toolbox containers.
      CUDA_IMAGE:
        type: string
        description: CUDA image to use as base, e.g. nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04
        default: ''
        required: false
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
permissions:
  actions: write  # to cancel previous workflows
  contents: read  # to fetch code
  packages: write # to upload container
jobs:
  build-mpi-operator-compatible-base:
    uses: ./.github/workflows/_build.yaml
    with:
      ARCHITECTURE: amd64
      ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build
      BADGE_FILENAME: badge-mpi-operator-compatible-base-build
      BUILD_DATE: 0000-00-00 # not important; this image is never published
      BASE_IMAGE: ${{ inputs.CUDA_IMAGE || 'nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04' }}
      CONTAINER_NAME: mpi-operator-compatible-base
      DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base
      RUNNER_SIZE: small
    secrets: inherit
  # TODO: expand beyond all-reduce
  nccl-test:
    needs: build-mpi-operator-compatible-base
    strategy:
      matrix:
        test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi]
    runs-on: eks
    env:
      BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }}
      TEST_NAME: ${{ matrix.test }}
    steps:
      - name: Check out the repository
        uses: actions/checkout@v4
      - name: Install yq
        run: |
          mkdir local_bin/
          curl -L -o ./local_bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture)
          chmod 777 ./local_bin/yq
          echo "${PWD}/local_bin" >> "${GITHUB_PATH}"
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Store GitHub Container Registry token as Kubernetes secret
        run: |
          # Replace underscores in TEST_NAME with - to make a valid Kubernetes name
          JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}"
          LAUNCHER_NAME="${JOB_NAME}-launcher"
          TOKEN_NAME="${JOB_NAME}-token"
          # Make these available to later steps
          echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV"
          echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV"
          echo "TOKEN_NAME=${TOKEN_NAME}" >> "$GITHUB_ENV"
          kubectl create secret generic \
            ${TOKEN_NAME} \
            --from-file=.dockerconfigjson=$HOME/.docker/config.json \
            --type=kubernetes.io/dockerconfigjson
      - name: Configure Kubernetes job
        run: |
          export WORKER_NAME="${JOB_NAME}-worker"
          # command[6] is the test-binary placeholder, following
          # [mpirun, --allow-run-as-root, -np, "16", -N, "8"]
          yq -i '.metadata.name = strenv(JOB_NAME)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[6] = strenv(TEST_NAME)
            | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE)
            | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME)
            | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \
            .github/eks-workflow-files/mpi-nccl-test.yml
          git diff .github/eks-workflow-files/mpi-nccl-test.yml
      - name: Submit Kubernetes job
        run: kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml
      - name: Wait for Kubernetes job to start
        # Note that this is *not* using JOB_NAME
        run: |
          # The launcher job is only created once the workers are ready; wait for its
          # creation. This is where we block if the cluster is busy executing other
          # jobs, but it might be better to impose more of a parallelism limit at the
          # GitHub Actions level to keep the Kubernetes queue length modest.
          kubectl wait --for=create job/${LAUNCHER_NAME} --timeout=3600s
          # Streaming logs will fail if the container/pod is still pending
          while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
            sleep 1
          done
      - name: Stream Kubernetes job output
        # Note that this is *not* JOB_NAME
        # TODO: --all-containers=true --all-pods=true could make sense here
        run: kubectl logs --follow job/${LAUNCHER_NAME}
      - name: Retrieve Kubernetes job status
        shell: bash -exo pipefail {0}
        run: |
          while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
            failure=${status[0]:-0}
            success=${status[1]:-0}
            total=$((failure+success))
            if [[ ${total} -lt 1 ]]; then
              sleep 1
            elif [[ ${total} -eq 1 ]]; then
              break
            else
              # Shouldn't happen; maybe a sign that the job being monitored does not
              # have a single launcher pod?
              exit 255
            fi
          done
          exit ${failure}
      # Provide more debug output in case of failure; note that some kinds of launch
      # failure do not produce any log output.
      - name: Debug failed Kubernetes job
        if: failure()
        run: |
          pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)
          if [[ -n "${pods}" ]]; then
            kubectl describe ${pods}
          fi
      # Clean up in case of errors as well as success
      - name: Delete Kubernetes job
        if: always()
        run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml
      - name: Delete GitHub Container Registry token
        if: always()
        run: kubectl delete secret ${TOKEN_NAME}
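
To illustrate the status-polling step above: the jsonpath prints the job's failed and succeeded counts separated by a colon, and empty fields fall back to 0 via the ${var:-0} expansions. A standalone sketch (the job name is hypothetical):

# Output is e.g. ":1" once the launcher succeeded, "1:" if it failed,
# and ":" while neither count has been set yet.
readarray -d : -t status < <(kubectl get job/example-launcher -o 'jsonpath={.status.failed}:{.status.succeeded}')
echo "failed=${status[0]:-0} succeeded=${status[1]:-0}"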
