diff --git a/.ci/Dockerfile.slurm_executor b/.ci/Dockerfile.slurm_executor new file mode 100644 index 00000000000..01487c7d405 --- /dev/null +++ b/.ci/Dockerfile.slurm_executor @@ -0,0 +1,13 @@ +FROM nbu-harbor.gtm.nvidia.com/swx-lab-platform/scctl:latest + +RUN apk update && \ + apk add \ + shadow \ + openssh-client \ + sudo + + +RUN groupadd -g 30 hardware +RUN useradd --no-create-home --uid 149917 --gid 30 -d /custom_home/svcnbu-swx-hpcx --shell /bin/bash svcnbu-swx-hpcx + +USER svcnbu-swx-hpcx diff --git a/.ci/scripts/run_test_nvls.sh b/.ci/scripts/run_test_nvls.sh new file mode 100755 index 00000000000..b11d74ffccb --- /dev/null +++ b/.ci/scripts/run_test_nvls.sh @@ -0,0 +1,4 @@ +OMPI_MCA_coll=^hcoll \ +OMPI_MCA_coll_ucc_enable=0 \ +UCC_TLS=cuda,ucp UCC_LOG_LEVEL=info UCC_TL_CUDA_NVLS_SM_COUNT=20 UCC_TL_CUDA_TUNE=allreduce:cuda:@0 \ +/opt/nvidia/bin/ucc/build/bin/ucc_perftest -c allreduce -F -m cuda -b 1k -e 32M -d bfloat16 -o sum diff --git a/.ci/scripts/run_test_nvls_slurm.sh b/.ci/scripts/run_test_nvls_slurm.sh new file mode 100755 index 00000000000..864771513e4 --- /dev/null +++ b/.ci/scripts/run_test_nvls_slurm.sh @@ -0,0 +1,15 @@ +#! /bin/bash + +# NOTE: script is preprocessed by envsubst +# ensure all variables to be set are in stand alone and simple format +# complex bash string operations are not supported in envsubst + +srun \ + --jobid=${SLM_JOB_ID} \ + --nodes=${SLM_NODES} \ + --ntasks-per-node=1 \ + --gpus-per-node=1 \ + --mpi=pmix \ + --cpu-bind=verbose \ + --container-image=${UCC_ENROOT_IMAGE_NAME} \ + bash -l /opt/nvidia/src/ucc/.ci/scripts/run_test_nvls.sh diff --git a/.ci/slurm_tests/README.md b/.ci/slurm_tests/README.md new file mode 100644 index 00000000000..6314df74a86 --- /dev/null +++ b/.ci/slurm_tests/README.md @@ -0,0 +1,14 @@ +# Slurm Tests + +This directory contains configuration and scripts for running UCC tests on Slurm clusters. 
+ +## Required Environment Variables for Job Matrix + +| Variable | Description | +|----------|-------------| +| `UCC_ENROOT_IMAGE_NAME` | The container image name to use for running tests. Must be in Enroot-compatible format with `#` after the registry host (e.g., `harbor.mellanox.com#torch-ucc/ucc/1.0.0/x86_64/centos8/cuda12.9:31`). | +| `SLM_JOB_NAME` | The name assigned to the Slurm job allocation. Used with `salloc --job-name`. Typically includes the build number for traceability (e.g., `ucc_tests_${BUILD_NUMBER}`). | +| `SLM_NODES` | The number of nodes to allocate for the Slurm job. Used with `salloc -N`. | +| `SLM_HEAD_NODE` | The Slurm head node to connect to. Can be `scctl` (uses scctl client) or a hostname for direct SSH access (e.g., `hpchead`). | +| `SLM_PARTITION` | The Slurm partition to submit the job to. Used with `salloc -p` (e.g., `funk`, `soul`). | + diff --git a/.ci/slurm_tests/job_matrix.yaml b/.ci/slurm_tests/job_matrix.yaml new file mode 100644 index 00000000000..b5d522675e6 --- /dev/null +++ b/.ci/slurm_tests/job_matrix.yaml @@ -0,0 +1,85 @@ +--- +job: "ucc" + +registry_host: "harbor.mellanox.com" +registry_path: "/torch-ucc" +registry_auth: "05d98651-e11c-4a57-9cc6-52df79014b89" + +volumes: + - { mountPath: "/custom_home/svcnbu-swx-hpcx", hostPath: "/labhome/svcnbu-swx-hpcx" } + +env: + CUDA_VER: 12.9 + UCC_URI_SUFFIX: "ucc/${UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}" + NVIDIA_ROOT_DIR: "/opt/nvidia" + SRC_DIR: "${NVIDIA_ROOT_DIR}/src" + BIN_DIR: "${NVIDIA_ROOT_DIR}/bin" + # Enroot requires # in the image name after the registry host + UCC_ENROOT_IMAGE_NAME: "${registry_host}#torch-ucc/${UCC_URI_SUFFIX}:${BUILD_NUMBER}" + SLM_JOB_NAME: "ucc_tests_${BUILD_NUMBER}" + SLM_NODES: "2" + + +kubernetes: + cloud: il-ipp-blossom-prod + namespace: hpcx + limits: "{memory: 16Gi, cpu: 16000m}" + requests: "{memory: 16Gi, cpu: 16000m}" + +credentials: + - { + credentialsId: "svcnbu-swx-hpcx-corporate-user-pass", + usernameVariable: 
"SERVICE_USER_USERNAME", + passwordVariable: "SERVICE_USER_PASSWORD", + } + +runs_on_dockers: + # cloud pod to build the shared docker image + - { + file: ".ci/Dockerfile.ngc_pytorch", + name: "ngc_pytorch", + tag: "${BUILD_NUMBER}", + arch: "x86_64", + uri: "${UCC_URI_SUFFIX}", + build_args: "--no-cache --build-arg CUDA_VER=${CUDA_VER}", + category: 'tool' # tool category makes this container not to participate in standard execution flow, one has to explicitly specify it in the job matrix + } + - { + name: 'slurm_executor_1', + file: ".ci/Dockerfile.slurm_executor", + arch: 'x86_64', + tag: '${BUILD_NUMBER}', + SLM_HEAD_NODE: 'scctl', + SLM_PARTITION: 'funk' + } + - { + name: 'slurm_executor_2', + file: ".ci/Dockerfile.slurm_executor", + arch: 'x86_64', + tag: '${BUILD_NUMBER}', + SLM_HEAD_NODE: 'hpchead', + SLM_PARTITION: 'soul' + } + + + +timeout_minutes: 60 + +steps: + - name: Allocate Slurm resources + credentialsId: "svcnbu-swx-hpcx-corporate-user-pass" + run: | + ${WORKSPACE}/.ci/slurm_tests/slurm_scripts/slurm_allocate.sh + + + - name: Run UCC tests on slurm cluster + run: | + ${WORKSPACE}/.ci/slurm_tests/slurm_scripts/slurm_run_test_script.sh ${WORKSPACE}/.ci/scripts/run_test_nvls_slurm.sh + + +pipeline_stop: + credentialsId: "svcnbu-swx-hpcx-corporate-user-pass" + containerSelector: "{name: 'slurm_executor_1'}" + run: | + ${WORKSPACE}/.ci/slurm_tests/slurm_scripts/slurm_release.sh + diff --git a/.ci/slurm_tests/proj_jjb.yaml b/.ci/slurm_tests/proj_jjb.yaml new file mode 100644 index 00000000000..1660a89e4ba --- /dev/null +++ b/.ci/slurm_tests/proj_jjb.yaml @@ -0,0 +1,75 @@ +- job-template: + name: "{jjb_proj}" + project-type: pipeline + properties: + - github: + url: "{jjb_git}" + - build-discarder: + days-to-keep: 30 + num-to-keep: 20 + - inject: + keep-system-variables: true + properties-content: | + jjb_proj={jjb_proj} + description: Do NOT edit this job through the Web GUI ! 
+ concurrent: true + sandbox: true + parameters: + - string: + name: "sha1" + default: "master" + description: "Commit to be checked, set by PR" + - bool: + name: "build_dockers" + default: true + description: "Rebuild docker containers" + - string: + name: "conf_file" + default: ".ci/slurm_tests/job_matrix.yaml" + description: "Regex to select job config file" + - string: + name: "script" + default: "{jjb_jenkinsfile}" + description: "Jenkinsfile to load on trigger" + - string: + name: "DEBUG" + default: 0 + description: "Enable debug prints and traces, valid values are 0-9" + - string: + name: "UCC_VERSION" + default: "1.0.0" + description: "UCC version" +# triggers: +# - github-pull-request: +# cron: 'H/5 * * * *' +# trigger-phrase: '.*\bbot:retest\b.*' +# status-context: "ucc" +# success-status: "Test PASSed." +# failure-status: "Test FAILed." +# error-status: "Test FAILed." +# status-add-test-results: true +# # swx-jenkins2 from GitHub Pull Request Builder +# auth-id: 'cb48aefb-4f90-4d52-a9bc-63d92382e0be' +# org-list: ["Mellanox","openucx"] +# white-list: ["swx-jenkins","swx-jenkins2","swx-jenkins3","mellanox-github"] +# allow-whitelist-orgs-as-admins: true + pipeline-scm: + scm: + - git: + url: "{jjb_git}" + credentials-id: 'b042dbee-a0cf-4e23-9efc-a6dc6586f49c' + branches: [ '$sha1' ] + shallow-clone: true + depth: 10 + refspec: "+refs/heads/*:refs/remotes/origin/* +refs/pull/*:refs/remotes/origin/pr/*" + browser: githubweb + browser-url: "{jjb_git}" + script-path: "$script" + +- project: + name: proj_name + jjb_proj: 'UCC/ucc-ci-nvls-tests' + jjb_git: 'git@github.com:openucx/ucc.git' + jjb_jenkinsfile: '.ci/Jenkinsfile.shlib' + jobs: + - "{jjb_proj}" diff --git a/.ci/slurm_tests/slurm_cmd.sh b/.ci/slurm_tests/slurm_cmd.sh new file mode 100755 index 00000000000..c3616a6665d --- /dev/null +++ b/.ci/slurm_tests/slurm_cmd.sh @@ -0,0 +1,44 @@ +#!/bin/bash +set -xvEe -o pipefail + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + 
+script_cmd="${1}" +slurm_via=${SLURM_VIA:-"scctl"} +slurm_cmd=${SLURM_CMD} + +echo "INFO: Allocate Slurm resources" + +if [ "${slurm_via}" == "scctl" ]; then + if [ "${script_cmd}" == "init" ]; then + export RANCHER_USER=${SERVICE_USER_USERNAME} + export RANCHER_PASSWORD=${SERVICE_USER_PASSWORD} + + scctl -v + scctl --raw-errors login + result=$(scctl --raw-errors client exists) + if [ "$result" == "client does not exist" ]; then + scctl --raw-errors client create + fi + scctl --raw-errors client connect -s "${SCRIPT_DIR}/fix_enroot.sh" + elif [ "${script_cmd}" == "exec" ]; then + scctl --raw-errors client connect "${slurm_cmd}" + elif [ "${script_cmd}" == "exec_file" ]; then + scctl --raw-errors client connect -s "${slurm_cmd}" + else + echo "ERROR: invalid script command: ${script_cmd}" + exit 1 + fi +elif [ "${slurm_via}" == "ssh" ]; then + if [ "${script_cmd}" == "init" ]; then + : # TODO: implement ssh allocation, run fix_enroot.sh + elif [ "${script_cmd}" == "exec" ]; then + : # TODO: implement ssh allocation, run slurm_cmd + else + echo "ERROR: invalid script command: ${script_cmd}" + exit 1 + fi +else + echo "ERROR: invalid allocation via: ${slurm_via}" + exit 1 +fi \ No newline at end of file diff --git a/.ci/slurm_tests/slurm_scripts/common.sh b/.ci/slurm_tests/slurm_scripts/common.sh new file mode 100644 index 00000000000..732751fb29e --- /dev/null +++ b/.ci/slurm_tests/slurm_scripts/common.sh @@ -0,0 +1,16 @@ +# shellcheck shell=bash + +function trap_error() { + local lineno=$1 + local msg=$2 + echo "Error at line $lineno: $msg" + if [ "${DEBUG}" == "9" ]; then + echo "Debug mode, sleeping for 3600 seconds to allow for debugging of the pod" + sleep 3600 + fi + exit 1 +} + +trap 'trap_error $LINENO "Error in script"' ERR + +export ssh_cmd="ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -l svcnbu-swx-hpcx" diff --git a/.ci/slurm_tests/slurm_scripts/fix_enroot.sh b/.ci/slurm_tests/slurm_scripts/fix_enroot.sh new file mode 100755 index 
00000000000..f4d81b4697a --- /dev/null +++ b/.ci/slurm_tests/slurm_scripts/fix_enroot.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -xvEe -o pipefail + +mkdir -p ~/.config/enroot +if [ ! -s ~/.config/enroot/.credentials ]; then + echo "INFO: Create enroot credentials file with some content" + echo "this mitigates error reporting for anonymous image pull" + echo "# This comment is to mitigate Enroot credentials file missing error" > ~/.config/enroot/.credentials +fi diff --git a/.ci/slurm_tests/slurm_scripts/fix_ssh_key.sh b/.ci/slurm_tests/slurm_scripts/fix_ssh_key.sh new file mode 100755 index 00000000000..c6421cd9efd --- /dev/null +++ b/.ci/slurm_tests/slurm_scripts/fix_ssh_key.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -xvEe -o pipefail + +sudo -u svcnbu-swx-hpcx cp /custom_home/svcnbu-swx-hpcx/.ssh/id_rsa /tmp/id_rsa +sudo chown $(id -u):$(id -g) /tmp/id_rsa +mkdir -p ~/.ssh +chown $(id -u):$(id -g) ~/.ssh +chmod 700 ~/.ssh +mv /tmp/id_rsa ~/.ssh/id_rsa +chmod 600 ~/.ssh/id_rsa diff --git a/.ci/slurm_tests/slurm_scripts/slurm_allocate.sh b/.ci/slurm_tests/slurm_scripts/slurm_allocate.sh new file mode 100755 index 00000000000..ec709728183 --- /dev/null +++ b/.ci/slurm_tests/slurm_scripts/slurm_allocate.sh @@ -0,0 +1,31 @@ +#!/bin/bash +set -xvEe -o pipefail + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +source "${SCRIPT_DIR}/common.sh" + +: "${SLM_JOB_NAME:?SLM_JOB_NAME is not set}" +: "${SLM_PARTITION:?SLM_PARTITION is not set}" +: "${SLM_NODES:?SLM_NODES is not set}" +: "${SLM_HEAD_NODE:?SLM_HEAD_NODE is not set}" + +slurm_cmd="salloc -N ${SLM_NODES} -p ${SLM_PARTITION} --job-name=${SLM_JOB_NAME} --immediate=120 --time=00:30:00 --no-shell" + +if [ "${SLM_HEAD_NODE}" == "scctl" ]; then + export RANCHER_USER=${SERVICE_USER_USERNAME} + export RANCHER_PASSWORD=${SERVICE_USER_PASSWORD} + + scctl -v + scctl --raw-errors login + result=$(scctl --raw-errors client exists) + if [ "$result" == "client does not exist" ]; then + scctl 
--raw-errors client create + fi + scctl --raw-errors client connect -s "${SCRIPT_DIR}/fix_enroot.sh" + scctl --raw-errors client connect "${slurm_cmd}" +else + "${SCRIPT_DIR}/fix_ssh_key.sh" + ${ssh_cmd} "${SLM_HEAD_NODE}" "${slurm_cmd}" +fi + +echo "INFO: Allocate Slurm resources" diff --git a/.ci/slurm_tests/slurm_scripts/slurm_release.sh b/.ci/slurm_tests/slurm_scripts/slurm_release.sh new file mode 100755 index 00000000000..95a35b42c8f --- /dev/null +++ b/.ci/slurm_tests/slurm_scripts/slurm_release.sh @@ -0,0 +1,26 @@ +#!/bin/bash +set -xvEe -o pipefail + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +source "${SCRIPT_DIR}/common.sh" + +: "${SLM_JOB_NAME:?SLM_JOB_NAME is not set}" + +slurm_headnode_list="scctl hpchead" +for slurm_headnode in ${slurm_headnode_list}; do + slurm_cmd="scancel --name=${SLM_JOB_NAME}" + if [ "${slurm_headnode}" == "scctl" ]; then + export RANCHER_USER=${SERVICE_USER_USERNAME} + export RANCHER_PASSWORD=${SERVICE_USER_PASSWORD} + + scctl -v + scctl --raw-errors login + result=$(scctl --raw-errors client exists) + if [ "$result" == "client does not exist" ]; then + scctl --raw-errors client create + fi + scctl --raw-errors client connect "${slurm_cmd}" + else + ${ssh_cmd} "${slurm_headnode}" "${slurm_cmd}" + fi +done diff --git a/.ci/slurm_tests/slurm_scripts/slurm_run_test_script.sh b/.ci/slurm_tests/slurm_scripts/slurm_run_test_script.sh new file mode 100755 index 00000000000..aaddd26221e --- /dev/null +++ b/.ci/slurm_tests/slurm_scripts/slurm_run_test_script.sh @@ -0,0 +1,29 @@ +#!/bin/bash +set -xvEe -o pipefail + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +source "${SCRIPT_DIR}/common.sh" + +test_script="${1}" +: "${test_script:?test_script is not set}" + +: "${SLM_JOB_NAME:?SLM_JOB_NAME is not set}" +: "${SLM_HEAD_NODE:?SLM_HEAD_NODE is not set}" +: "${UCC_ENROOT_IMAGE_NAME:?UCC_ENROOT_IMAGE_NAME is not set}" + +if [ "${SLM_HEAD_NODE}" == "scctl" ]; then + 
slurm_test_script="${WORKSPACE}"/slurm_test.sh + SLM_JOB_ID=$(scctl client connect "squeue --noheader --name=${SLM_JOB_NAME} -o '%i'") + export SLM_JOB_ID + envsubst < "${test_script}" > "${slurm_test_script}" + cat "${slurm_test_script}" + scctl --raw-errors client connect -s "${slurm_test_script}" +else + SLM_JOB_ID=$(${ssh_cmd} "${SLM_HEAD_NODE}" "squeue --noheader --name=${SLM_JOB_NAME} -o '%i'") + export SLM_JOB_ID +#shellcheck disable=SC2087 +${ssh_cmd} "${SLM_HEAD_NODE}" << EOF +$(envsubst < "${test_script}") +EOF + +fi