Skip to content

Commit 0e9abfa

Browse files
authored
eks: use kueue (#1197)
This avoids deadlocks by providing basic gang scheduling. Also, the cluster now has a few cores' worth of non-GPU node capacity, so we no longer need to run the post-processing test on the large P5 nodes. `yq` is now pre-installed on the `eks` runner, as it is almost always needed.
1 parent d7cac31 commit 0e9abfa

File tree

5 files changed

+44
-51
lines changed

5 files changed

+44
-51
lines changed

.github/eks-workflow-files/job.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ apiVersion: batch/v1
1111
kind: Job
1212
metadata:
1313
name: PLACEHOLDER
14+
labels:
15+
kueue.x-k8s.io/queue-name: p5-queue
1416
spec:
1517
completions: 2 # number of nodes
1618
parallelism: 2 # number of nodes

.github/eks-workflow-files/mpi-nccl-test.yml

Lines changed: 29 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,15 @@ apiVersion: kubeflow.org/v2beta1
22
kind: MPIJob
33
metadata:
44
name: PLACEHOLDER
5+
labels:
6+
kueue.x-k8s.io/queue-name: p5-queue
57
spec:
6-
# Without this then the first few attempts to run the launcher will result in errors
7-
# due to failed DNS resolution of the worker names. It works eventually, given a big
8-
# enough backoffLimit, but it makes it harder to handle log-streaming and identifying
9-
# the "real" exit code of the job.
10-
launcherCreationPolicy: WaitForWorkersReady
118
runPolicy:
129
cleanPodPolicy: Running
13-
# surface errors direct to GitHub Actions without internal retries
10+
# surface errors direct to GitHub Actions without Kubernetes-internal retries
1411
backoffLimit: 0
12+
# start suspended, let kueue unblock
13+
suspend: true
1514
# 1 MPI rank per GPU
1615
slotsPerWorker: 8
1716
mpiReplicaSpecs:
@@ -27,25 +26,31 @@ spec:
2726
imagePullPolicy: IfNotPresent
2827
name: PLACEHOLDER
2928
command:
30-
- mpirun
31-
- --allow-run-as-root
32-
- -np
33-
- "16"
34-
- -N
35-
- "8"
36-
- PLACEHOLDER
37-
- -b
38-
- "8"
39-
- -e
40-
- "16G"
41-
- -f
42-
- "2"
43-
- -g
44-
- "1"
29+
- bash
4530
- -c
46-
- "1"
47-
- -n
48-
- "100"
31+
- |
32+
# kueue breaks the WaitForWorkersReady policy that mpi-operator
33+
# nominally supports, so manually wait a while for a basic mpirun to
34+
# start working (i.e. for the workers to be ready) before doing
35+
# anything interesting, instead of relying on mpi-operator not to
36+
# start the launcher before it is expected to succeed. This issue
37+
# seems related: https://github.com/kubeflow/mpi-operator/pull/617
38+
limit=5m
39+
if ! timeout ${limit} sh -c "while ! mpirun --allow-run-as-root -N 1 hostname; do sleep 5; done"; then
40+
echo "Workers were still not reachable after ${limit}, exiting"
41+
exit 1
42+
fi
43+
mpirun --allow-run-as-root -np 16 -N 8 $0 \
44+
-b 8 \
45+
-e 16G \
46+
-f 2 \
47+
-g 1 \
48+
-c 1 \
49+
-n 100
50+
- PLACEHOLDER
51+
resources:
52+
limits:
53+
cpu: 1
4954
imagePullSecrets:
5055
- name: PLACEHOLDER
5156
Worker:

.github/eks-workflow-files/post-process-job.yml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,6 @@ spec:
3232
- pipefail
3333
- -c
3434
- nsys-jax-combine -o /opt/output/combined.zip /opt/output/*.zip --analysis communication
35-
# FIXME: GPU not actually needed, but the test cluster doesn't have appropriate non-GPU nodes
36-
resources:
37-
limits:
38-
nvidia.com/gpu: 1
3935
volumeMounts:
4036
- mountPath: /opt/output
4137
name: output

.github/workflows/_ci.yaml

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -452,12 +452,6 @@ jobs:
452452
steps:
453453
- name: Check out the repository
454454
uses: actions/checkout@v4
455-
- name: Install yq
456-
run: |
457-
mkdir local_bin/
458-
curl -L -o ./local_bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture)
459-
chmod 777 ./local_bin/yq
460-
echo "${PWD}/local_bin" >> "${GITHUB_PATH}"
461455
- name: Login to GitHub Container Registry
462456
uses: docker/login-action@v3
463457
with:

.github/workflows/nccl-k8s.yaml

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,6 @@ jobs:
5353
steps:
5454
- name: Check out the repository
5555
uses: actions/checkout@v4
56-
- name: Install yq
57-
run: |
58-
mkdir local_bin/
59-
curl -L -o ./local_bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture)
60-
chmod 777 ./local_bin/yq
61-
echo "${PWD}/local_bin" >> "${GITHUB_PATH}"
6256
- name: Login to GitHub Container Registry
6357
uses: docker/login-action@v3
6458
with:
@@ -86,7 +80,7 @@ jobs:
8680
| .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE)
8781
| .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME)
8882
| .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
89-
| .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[6] = strenv(TEST_NAME)
83+
| .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME)
9084
| .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE)
9185
| .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME)
9286
| .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \
@@ -97,19 +91,21 @@ jobs:
9791
- name: Wait for Kubernetes job to start
9892
# Note that this is *not* using JOB_NAME
9993
run: |
100-
# Launcher job is only created once the workers are ready; wait for its
101-
# creation. This is where we block if the cluster is busy executing other jobs,
102-
# but it might be better to impose more of a parallelism limit at the GitHub
103-
# Actions level to keep the Kubernetes queue length modest
104-
kubectl wait --for=create job/${LAUNCHER_NAME} --timeout=3600s
94+
# Launcher job is created eagerly, but suspended. Kueue un-suspends it when
95+
# resources are available, but that is where there can be a long wait if the
96+
# cluster is busy executing other jobs.
97+
kubectl wait --for=create job/${LAUNCHER_NAME}
98+
kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=3600s
99+
- name: Stream Kubernetes job output
100+
# Note that this is *not* JOB_NAME
101+
run: |
105102
# Streaming logs will fail if the container/pod is still pending
106103
while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
107104
sleep 1
108105
done
109-
- name: Stream Kubernetes job output
110-
# Note that this is *not* JOB_NAME
111-
# TODO: --all-containers=true --all-pods=true could make sense here
112-
run: kubectl logs --follow job/${LAUNCHER_NAME}
106+
# TODO: --all-containers=true --all-pods=true could make sense here, but it
107+
# prefixes lines with a rather verbose tag
108+
kubectl logs --follow job/${LAUNCHER_NAME}
113109
- name: Retrieve Kubernetes job status
114110
shell: bash -exo pipefail {0}
115111
run: |
@@ -135,7 +131,7 @@ jobs:
135131
run: |
136132
# Provide better debug in case of launch failures that will not produce log output
137133
pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)
138-
if [[ -n "${powd}" ]]; then
134+
if [[ -n "${pods}" ]]; then
139135
kubectl describe ${pods}
140136
fi
141137
# Clean up in case of errors as well as success

0 commit comments

Comments
 (0)