eks: use kueue (#1197)

olupton · web-flow · commit 0e9abfa69b75 · 2024-12-16T15:01:30.000+01:00
Using Kueue avoids deadlocks by providing basic gang scheduling. Also,
the cluster now has a few cores' worth of non-GPU node capacity, so we no
longer need to run the post-processing test on the large P5 nodes.

`yq` is now pre-installed on the `eks` runner, as it is almost always needed.
diff --git a/.github/eks-workflow-files/job.yml b/.github/eks-workflow-files/job.yml
@@ -11,6 +11,8 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: PLACEHOLDER
+  labels:
+    kueue.x-k8s.io/queue-name: p5-queue
 spec:
   completions: 2 # number of nodes
   parallelism: 2 # number of nodes
diff --git a/.github/eks-workflow-files/mpi-nccl-test.yml b/.github/eks-workflow-files/mpi-nccl-test.yml
@@ -2,16 +2,15 @@ apiVersion: kubeflow.org/v2beta1
 kind: MPIJob
 metadata:
   name: PLACEHOLDER
+  labels:
+    kueue.x-k8s.io/queue-name: p5-queue
 spec:
-  # Without this then the first few attempts to run the launcher will result in errors
-  # due to failed DNS resolution of the worker names. It works eventually, given a big
-  # enough backoffLimit, but it makes it harder to handle log-streaming and identifying
-  # the "real" exit code of the job.
-  launcherCreationPolicy: WaitForWorkersReady
   runPolicy:
     cleanPodPolicy: Running
-    # surface errors direct to GitHub Actions without internal retries
+    # surface errors direct to GitHub Actions without Kubernetes-internal retries
     backoffLimit: 0
+    # start suspended, let kueue unblock
+    suspend: true
   # 1 MPI rank per GPU
   slotsPerWorker: 8
   mpiReplicaSpecs:
@@ -27,25 +26,31 @@ spec:
               imagePullPolicy: IfNotPresent
               name: PLACEHOLDER
               command:
-                - mpirun
-                - --allow-run-as-root
-                - -np
-                - "16"
-                - -N
-                - "8"
-                - PLACEHOLDER
-                - -b
-                - "8"
-                - -e
-                - "16G"
-                - -f
-                - "2"
-                - -g
-                - "1"
+                - bash
                 - -c
-                - "1"
-                - -n
-                - "100"
+                - |
+                  # kueue breaks the WaitForWorkersReady policy that mpi-operator
+                  # nominally supports, so manually wait a while for a basic mpirun to
+                  # start working (i.e. for the workers to be ready) before doing
+                  # anything interesting, instead of relying on mpi-operator not to
+                  # start the launcher before it is expected to succeed. This issue
+                  # seems related: https://github.com/kubeflow/mpi-operator/pull/617
+                  limit=5m
+                  if ! timeout ${limit} sh -c "while ! mpirun --allow-run-as-root -N 1 hostname; do sleep 5; done"; then
+                    echo "Workers were still not reachable after ${limit}, exiting"
+                    exit 1
+                  fi
+                  mpirun --allow-run-as-root -np 16 -N 8 $0 \
+                    -b 8 \
+                    -e 16G \
+                    -f 2 \
+                    -g 1 \
+                    -c 1 \
+                    -n 100
+                - PLACEHOLDER
+              resources:
+                limits:
+                  cpu: 1
           imagePullSecrets:
             - name: PLACEHOLDER
     Worker:
diff --git a/.github/eks-workflow-files/post-process-job.yml b/.github/eks-workflow-files/post-process-job.yml
@@ -32,10 +32,6 @@ spec:
             - pipefail
             - -c
             - nsys-jax-combine -o /opt/output/combined.zip /opt/output/*.zip --analysis communication
-          # FIXME: GPU not actually needed, but the test cluster doesn't have appropriate non-GPU nodes
-          resources:
-            limits:
-              nvidia.com/gpu: 1
           volumeMounts:
             - mountPath: /opt/output
               name: output
diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
@@ -452,12 +452,6 @@ jobs:
     steps:
     - name: Check out the repository
       uses: actions/checkout@v4
-    - name: Install yq
-      run: |
-        mkdir local_bin/
-        curl -L -o ./local_bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture)
-        chmod 777 ./local_bin/yq
-        echo "${PWD}/local_bin" >> "${GITHUB_PATH}"
     - name: Login to GitHub Container Registry
       uses: docker/login-action@v3
       with:
diff --git a/.github/workflows/nccl-k8s.yaml b/.github/workflows/nccl-k8s.yaml
@@ -53,12 +53,6 @@ jobs:
     steps:
       - name: Check out the repository
         uses: actions/checkout@v4
-      - name: Install yq
-        run: |
-          mkdir local_bin/
-          curl -L -o ./local_bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture)
-          chmod 777 ./local_bin/yq
-          echo "${PWD}/local_bin" >> "${GITHUB_PATH}"
       - name: Login to GitHub Container Registry
         uses: docker/login-action@v3
         with:
@@ -86,7 +80,7 @@ jobs:
             | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE)
             | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME)
             | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
-            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[6] = strenv(TEST_NAME)
+            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME)
             | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE)
             | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME)
             | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \
@@ -97,19 +91,21 @@ jobs:
       - name: Wait for Kubernetes job to start
         # Note that this is *not* using JOB_NAME
         run: |
-          # Launcher job is only created once the workers are ready; wait for its
-          # creation. This is where we block if the cluster is busy executing other jobs,
-          # but it might be better to impose more of a parallelism limit at the GitHub
-          # Actions level to keep the Kubernetes queue length modest
-          kubectl wait --for=create job/${LAUNCHER_NAME} --timeout=3600s
+          # Launcher job is created eagerly, but suspended. Kueue un-suspends it when
+          # resources are available, but that is where there can be a long wait if the
+          # cluster is busy executing other jobs.
+          kubectl wait --for=create job/${LAUNCHER_NAME}
+          kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=3600s
+      - name: Stream Kubernetes job output
+        # Note that this is *not* JOB_NAME
+        run: |
           # Streaming logs will fail if the container/pod is still pending
           while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
             sleep 1
           done
-      - name: Stream Kubernetes job output
-        # Note that this is *not* JOB_NAME
-        # TODO: --all-containers=true --all-pods=true could make sense here
-        run: kubectl logs --follow job/${LAUNCHER_NAME}
+          # TODO: --all-containers=true --all-pods=true could make sense here, but it
+          # prefixes lines with a rather verbose tag
+          kubectl logs --follow job/${LAUNCHER_NAME}
       - name: Retrieve Kubernetes job status
         shell: bash -exo pipefail {0}
         run: |
@@ -135,7 +131,7 @@ jobs:
         run: |
           # Provide better debug in case of launch failures that will not produce log output
           pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)
-          if [[ -n "${powd}" ]]; then
+          if [[ -n "${pods}" ]]; then
             kubectl describe ${pods}
           fi
       # Clean up in case of errors as well as success