From 735d835d8f7ed5ececbf981fc62ff7a6714d3ae2 Mon Sep 17 00:00:00 2001 From: Michael Braverman Date: Sun, 30 Nov 2025 18:08:33 +0200 Subject: [PATCH 1/7] CI on slurm: Copy Jenkins job from existing AS IS Comes to show further changes Issue: HPCINFRA-3983 Signed-off-by: Michael Braverman --- .ci/slurm_tests/job_matrix.yaml | 132 ++++++++++++++++++++++++++++++++ .ci/slurm_tests/proj_jjb.yaml | 85 ++++++++++++++++++++ 2 files changed, 217 insertions(+) create mode 100644 .ci/slurm_tests/job_matrix.yaml create mode 100644 .ci/slurm_tests/proj_jjb.yaml diff --git a/.ci/slurm_tests/job_matrix.yaml b/.ci/slurm_tests/job_matrix.yaml new file mode 100644 index 00000000000..21fcd72672a --- /dev/null +++ b/.ci/slurm_tests/job_matrix.yaml @@ -0,0 +1,132 @@ +--- +job: "ucc" + +step_allow_single_selector: true + +registry_host: "harbor.mellanox.com" +registry_path: "/torch-ucc" +registry_auth: "05d98651-e11c-4a57-9cc6-52df79014b89" + +volumes: + - { mountPath: "/home/swx-jenkins", hostPath: "/labhome/swx-jenkins" } + +env: + CUDA_VER: 12.9 + UCC_URI_SUFFIX: "ucc/${UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}" + UCC_DOCKER_IMAGE_NAME: "${registry_host}${registry_path}/${UCC_URI_SUFFIX}" + NVIDIA_ROOT_DIR: "/opt/nvidia" + SRC_DIR: "${NVIDIA_ROOT_DIR}/src" + BIN_DIR: "${NVIDIA_ROOT_DIR}/bin" + DOCKER_OPT: "--pull always --network=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined --cap-add=SYS_ADMIN --device=/dev/infiniband/ --gpus all" + +kubernetes: + cloud: il-ipp-blossom-prod + namespace: hpcx + limits: "{memory: 16Gi, cpu: 16000m}" + requests: "{memory: 16Gi, cpu: 16000m}" + +credentials: + - { + credentialsId: "bc9a18d3-1153-449c-b924-7fc9249c9cc0", + usernameVariable: "UCC_USERNAME", + passwordVariable: "UCC_PASSWORD", + } + +# cloud pod to build the shared docker image +runs_on_dockers: + - { + file: ".ci/Dockerfile.ngc_pytorch", + name: "ngc_pytorch", + tag: "${BUILD_NUMBER}", + arch: "x86_64", + uri: 
"${UCC_URI_SUFFIX}", + build_args: "--no-cache --build-arg CUDA_VER=${CUDA_VER}", + } + - { + file: ".ci/dockerfiles/Dockerfile.build_helper", + name: "build_helper", + tag: "latest", + arch: "x86_64", + uri: "$arch/$name", + build_args: "--no-cache", + } + +# bare metal +runs_on_agents: + - nodeLabel: "swx-clx01" + - nodeLabel: "swx-clx02" + - nodeLabel: "ml-test-node-gpu" + +timeout_minutes: 360 + +steps: + #============================================================================ + - name: Init docker + agentSelector: + - "{nodeLabel: 'swx-clx01'}" + - "{nodeLabel: 'swx-clx02'}" + run: | + set -x + # make sure we always have base image for faster pull of CI image + docker pull "${UCC_DOCKER_IMAGE_NAME}:base" + # pull the CI image + docker pull "${UCC_DOCKER_IMAGE_NAME}:${BUILD_NUMBER}" + + #============================================================================ + - name: Run Coverity + credentialsId: "bc9a18d3-1153-449c-b924-7fc9249c9cc0" + agentSelector: "{nodeLabel: 'ml-test-node-gpu'}" + run: | + export UCC_PASSWORD=$UCC_PASSWORD + export UCC_USERNAME=$UCC_USERNAME + echo "Running coverity" + ${WORKSPACE}/.ci/scripts/coverity.sh + archiveArtifacts: .ci/scripts/cov-build/* + + #============================================================================ + - name: Run UCC / Torch-UCC tests + agentSelector: "{nodeLabel: 'swx-clx02'}" + run: | + set -x + echo "INFO: Run UCC tests" + hostname + timeout -k 20 90m docker run -t --rm --name="ucc_tests_${BUILD_NUMBER}" $DOCKER_OPT "${UCC_DOCKER_IMAGE_NAME}:${BUILD_NUMBER}" bash -c "\${SRC_DIR}/ucc/.ci/scripts/run_tests_ucc.sh" + always: | + docker rm --force "ucc_tests_${BUILD_NUMBER}" || true + + #============================================================================ + - name: Run docker containers + agentSelector: "{nodeLabel: 'swx-clx01'}" + run: | + echo "INFO: Run docker containers" + ${WORKSPACE}/.ci/scripts/run_docker.sh + onfail: | + ${WORKSPACE}/.ci/scripts/stop_docker.sh + + 
#============================================================================ + - name: Run UCC MPI tests + agentSelector: "{nodeLabel: 'swx-clx01'}" + run: | + [ "$UCC_MPI_TESTS" = "false" ] && { echo "MPI tests were skipped !!!";exit 0; } + echo "INFO: Run UCC MPI tests" + ${WORKSPACE}/.ci/scripts/run_tests_ucc_mpi_docker.sh + onfail: | + ${WORKSPACE}/.ci/scripts/stop_docker.sh + + #============================================================================ + - name: Run DLRM tests (UCC/GPU) + agentSelector: "{nodeLabel: 'swx-clx01'}" + run: | + echo "INFO: Run DLRM tests (UCC/GPU)" + ${WORKSPACE}/.ci/scripts/run_dlrm_docker.sh + always: | + ${WORKSPACE}/.ci/scripts/stop_docker.sh + +pipeline_stop: + containerSelector: "{name:'build_helper'}" + run: | + set -x + export HOSTFILE=${WORKSPACE}/.ci/configs/swx-clx01/hostfile.txt + export BUILD_NUMBER=${BUILD_NUMBER} + export UCC_DOCKER_IMAGE_NAME=${UCC_DOCKER_IMAGE_NAME} + sudo -E -u swx-jenkins ${WORKSPACE}/.ci/scripts/clean_docker.sh diff --git a/.ci/slurm_tests/proj_jjb.yaml b/.ci/slurm_tests/proj_jjb.yaml new file mode 100644 index 00000000000..caa3bd018ae --- /dev/null +++ b/.ci/slurm_tests/proj_jjb.yaml @@ -0,0 +1,85 @@ +- job-template: + name: "{jjb_proj}" + project-type: pipeline + properties: + - github: + url: "{jjb_git}" + - build-discarder: + days-to-keep: 30 + num-to-keep: 20 + - inject: + keep-system-variables: true + properties-content: | + jjb_proj={jjb_proj} + description: Do NOT edit this job through the Web GUI ! 
+ concurrent: true + sandbox: true + parameters: + - string: + name: "sha1" + default: "master" + description: "Commit to be checked, set by PR" + - bool: + name: "build_dockers" + default: true + description: "Rebuild docker containers" + - string: + name: "conf_file" + default: ".ci/job_matrix.yaml" + description: "Regex to select job config file" + - string: + name: "script" + default: "{jjb_jenkinsfile}" + description: "Jenkinsfile to load on trigger" + - string: + name: "DEBUG" + default: 0 + description: "Enable debug prints and traces, valid values are 0-9" + - string: + name: "UCC_VERSION" + default: "1.0.0" + description: "UCC version" + triggers: +# - github-pull-request: +# cron: 'H/5 * * * *' +# trigger-phrase: '.*\bbot:retest\b.*' +# status-add-test-results: true +# auth-id: '549927eb-7f38-4a8f-997a-81dd63605782' +# org-list: ["Mellanox","openucx"] +# white-list: ["swx-jenkins","swx-jenkins2","swx-jenkins3","mellanox-github"] +# allow-whitelist-orgs-as-admins: true + - github-pull-request: + cron: 'H/5 * * * *' + trigger-phrase: '.*\bbot:retest\b.*' + status-context: "ucc" + success-status: "Test PASSed." + failure-status: "Test FAILed." + error-status: "Test FAILed." 
+ status-add-test-results: true + # swx-jenkins2 from GitHub Pull Request Builder + auth-id: 'cb48aefb-4f90-4d52-a9bc-63d92382e0be' + org-list: ["Mellanox","openucx"] + white-list: ["swx-jenkins","swx-jenkins2","swx-jenkins3","mellanox-github"] + allow-whitelist-orgs-as-admins: true + pipeline-scm: + scm: + - git: + url: "{jjb_git}" + credentials-id: '549927eb-7f38-4a8f-997a-81dd63605782' + branches: [ '$sha1' ] + shallow-clone: true + depth: 10 + refspec: "+refs/heads/*:refs/remotes/origin/* +refs/pull/*:refs/remotes/origin/pr/*" + browser: githubweb + browser-url: "{jjb_git}" + script-path: "$script" + +- project: + name: proj_name + jjb_email: 'artemry@nvidia.com' + jjb_proj: 'ucc' + jjb_git: 'git@github.com:openucx/ucc.git' + jjb_owner: 'artemry' + jjb_jenkinsfile: '.ci/Jenkinsfile.shlib' + jobs: + - "{jjb_proj}" From e4a89fb337b93fd116b2b6e629af0bdabd9c624d Mon Sep 17 00:00:00 2001 From: Michael Braverman Date: Wed, 26 Nov 2025 16:22:32 +0200 Subject: [PATCH 2/7] CI on slurm: run on scctl managed clusters Add support for slurm clusters managed by scctl slurm_cmd.sh - provides abstraction to execute slurm based commands run_slurm_tests_ucc.sh - basic test file fix_enroot.sh - mitigates enroot errors with anonymous image access Issue: HPCINFRA-3983 Signed-off-by: Michael Braverman --- .ci/scripts/run_slurm_tests_ucc.sh | 22 ++++++ .ci/slurm_tests/fix_enroot.sh | 10 +++ .ci/slurm_tests/job_matrix.yaml | 116 +++++++++-------------------- .ci/slurm_tests/proj_jjb.yaml | 32 +++----- .ci/slurm_tests/slurm_cmd.sh | 44 +++++++++++ 5 files changed, 123 insertions(+), 101 deletions(-) create mode 100755 .ci/scripts/run_slurm_tests_ucc.sh create mode 100644 .ci/slurm_tests/fix_enroot.sh create mode 100755 .ci/slurm_tests/slurm_cmd.sh diff --git a/.ci/scripts/run_slurm_tests_ucc.sh b/.ci/scripts/run_slurm_tests_ucc.sh new file mode 100755 index 00000000000..b4624bec737 --- /dev/null +++ b/.ci/scripts/run_slurm_tests_ucc.sh @@ -0,0 +1,22 @@
+#!/bin/bash +set -xvEe -o pipefail + +# NOTE: script is preprocessed by envsubst +# ensure all variables to be set are in stand alone and simple format +# complex bash string operations are not supported in envsubst + +# Note2: docker image name format should be converted to enroot format (replace first / with #) +# Example: harbor.mellanox.com/torch-ucc/ucc/1.0.0/x86_64/centos8/cuda12.9 -> harbor.mellanox.com#torch-ucc/ucc/1.0.0/x86_64/centos8/cuda12.9 + +srun --job-name=${SLM_JOB_NAME} --nodes=${SLM_NODES} --partition=${SLM_PARTITION} \ + --ntasks-per-node=1 \ + --gpus-per-node=1 \ + --mpi=pmix \ + --cpu-bind=verbose \ + --container-image=$(echo "${UCC_DOCKER_IMAGE_NAME}" |sed 's/\//#/'):${BUILD_NUMBER} \ + bash -lc ' + OMPI_MCA_coll=^hcoll \ + OMPI_MCA_coll_ucc_enable=0 \ + UCC_TLS=cuda,ucp UCC_LOG_LEVEL=info UCC_TL_CUDA_NVLS_SM_COUNT=20 UCC_TL_CUDA_TUNE=allreduce:cuda:@0 \ + /opt/nvidia/bin/ucc/build/bin/ucc_perftest -c allreduce -F -m cuda -b 1k -e 32M -d bfloat16 -o sum \ + ' \ No newline at end of file diff --git a/.ci/slurm_tests/fix_enroot.sh b/.ci/slurm_tests/fix_enroot.sh new file mode 100644 index 00000000000..f4d81b4697a --- /dev/null +++ b/.ci/slurm_tests/fix_enroot.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -xvEe -o pipefail + +mkdir -p ~/.config/enroot +if [ ! 
-s ~/.config/enroot/.credentials ]; then + echo "INFO: Create enroot credentials file with some content" + echo "this mitigates error reporting for anonimous image pull" + echo "# This comment is to mitigate Enroot credentials file missing error" > ~/.config/enroot/.credentials +fi diff --git a/.ci/slurm_tests/job_matrix.yaml b/.ci/slurm_tests/job_matrix.yaml index 21fcd72672a..e653251b7ba 100644 --- a/.ci/slurm_tests/job_matrix.yaml +++ b/.ci/slurm_tests/job_matrix.yaml @@ -18,6 +18,10 @@ env: SRC_DIR: "${NVIDIA_ROOT_DIR}/src" BIN_DIR: "${NVIDIA_ROOT_DIR}/bin" DOCKER_OPT: "--pull always --network=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined --cap-add=SYS_ADMIN --device=/dev/infiniband/ --gpus all" + SLM_JOB_NAME: "ucc_tests_${BUILD_NUMBER}" + SLM_PARTITION: "funk" + SLM_NODES: "2" + kubernetes: cloud: il-ipp-blossom-prod @@ -27,13 +31,13 @@ kubernetes: credentials: - { - credentialsId: "bc9a18d3-1153-449c-b924-7fc9249c9cc0", - usernameVariable: "UCC_USERNAME", - passwordVariable: "UCC_PASSWORD", + credentialsId: "svcnbu-swx-hpcx-corporate-user-pass", + usernameVariable: "SERVICE_USER_USERNAME", + passwordVariable: "SERVICE_USER_PASSWORD", } -# cloud pod to build the shared docker image runs_on_dockers: + # cloud pod to build the shared docker image - { file: ".ci/Dockerfile.ngc_pytorch", name: "ngc_pytorch", @@ -43,90 +47,40 @@ runs_on_dockers: build_args: "--no-cache --build-arg CUDA_VER=${CUDA_VER}", } - { - file: ".ci/dockerfiles/Dockerfile.build_helper", - name: "build_helper", - tag: "latest", - arch: "x86_64", - uri: "$arch/$name", - build_args: "--no-cache", + name: 'slurm_executor', + url: 'nbu-harbor.gtm.nvidia.com/swx-lab-platform/scctl:latest', + arch: 'x86_64' } -# bare metal -runs_on_agents: - - nodeLabel: "swx-clx01" - - nodeLabel: "swx-clx02" - - nodeLabel: "ml-test-node-gpu" - -timeout_minutes: 360 +timeout_minutes: 60 steps: - 
#============================================================================ - - name: Init docker - agentSelector: - - "{nodeLabel: 'swx-clx01'}" - - "{nodeLabel: 'swx-clx02'}" - run: | - set -x - # make sure we always have base image for faster pull of CI image - docker pull "${UCC_DOCKER_IMAGE_NAME}:base" - # pull the CI image - docker pull "${UCC_DOCKER_IMAGE_NAME}:${BUILD_NUMBER}" - - #============================================================================ - - name: Run Coverity - credentialsId: "bc9a18d3-1153-449c-b924-7fc9249c9cc0" - agentSelector: "{nodeLabel: 'ml-test-node-gpu'}" - run: | - export UCC_PASSWORD=$UCC_PASSWORD - export UCC_USERNAME=$UCC_USERNAME - echo "Running coverity" - ${WORKSPACE}/.ci/scripts/coverity.sh - archiveArtifacts: .ci/scripts/cov-build/* - - #============================================================================ - - name: Run UCC / Torch-UCC tests - agentSelector: "{nodeLabel: 'swx-clx02'}" - run: | - set -x - echo "INFO: Run UCC tests" - hostname - timeout -k 20 90m docker run -t --rm --name="ucc_tests_${BUILD_NUMBER}" $DOCKER_OPT "${UCC_DOCKER_IMAGE_NAME}:${BUILD_NUMBER}" bash -c "\${SRC_DIR}/ucc/.ci/scripts/run_tests_ucc.sh" - always: | - docker rm --force "ucc_tests_${BUILD_NUMBER}" || true - - #============================================================================ - - name: Run docker containers - agentSelector: "{nodeLabel: 'swx-clx01'}" - run: | - echo "INFO: Run docker containers" - ${WORKSPACE}/.ci/scripts/run_docker.sh - onfail: | - ${WORKSPACE}/.ci/scripts/stop_docker.sh - - #============================================================================ - - name: Run UCC MPI tests - agentSelector: "{nodeLabel: 'swx-clx01'}" + - name: Allocate Slurm resources + credentialsId: "svcnbu-swx-hpcx-corporate-user-pass" + containerSelector: "{name: 'slurm_executor'}" run: | - [ "$UCC_MPI_TESTS" = "false" ] && { echo "MPI tests were skipped !!!";exit 0; } - echo "INFO: Run UCC MPI tests" - 
${WORKSPACE}/.ci/scripts/run_tests_ucc_mpi_docker.sh - onfail: | - ${WORKSPACE}/.ci/scripts/stop_docker.sh + export SLURM_VIA="scctl" + export SLURM_CMD="salloc -N ${SLM_NODES} -p ${SLM_PARTITION} --job-name=${SLM_JOB_NAME} --immediate=120 --time=00:30:00 --no-shell" + ${WORKSPACE}/.ci/slurm_tests/slurm_cmd.sh init + ${WORKSPACE}/.ci/slurm_tests/slurm_cmd.sh exec - #============================================================================ - - name: Run DLRM tests (UCC/GPU) - agentSelector: "{nodeLabel: 'swx-clx01'}" + - name: Run UCC tests on slurm cluster + containerSelector: "{name: 'slurm_executor'}" run: | - echo "INFO: Run DLRM tests (UCC/GPU)" - ${WORKSPACE}/.ci/scripts/run_dlrm_docker.sh - always: | - ${WORKSPACE}/.ci/scripts/stop_docker.sh + set -xv + echo "INFO: Run UCC tests on Slurm cluster" + envsubst < ${WORKSPACE}/.ci/scripts/run_slurm_tests_ucc.sh > ${WORKSPACE}/slurm_test.sh + cat ${WORKSPACE}/slurm_test.sh + export SLURM_VIA="scctl" + export SLURM_CMD="${WORKSPACE}/slurm_test.sh" + ${WORKSPACE}/.ci/slurm_tests/slurm_cmd.sh exec_file pipeline_stop: - containerSelector: "{name:'build_helper'}" + containerSelector: "{name:'slurm_executor'}" + credentialsId: "svcnbu-swx-hpcx-corporate-user-pass" run: | - set -x - export HOSTFILE=${WORKSPACE}/.ci/configs/swx-clx01/hostfile.txt - export BUILD_NUMBER=${BUILD_NUMBER} - export UCC_DOCKER_IMAGE_NAME=${UCC_DOCKER_IMAGE_NAME} - sudo -E -u swx-jenkins ${WORKSPACE}/.ci/scripts/clean_docker.sh + set -xv + export SLURM_VIA="scctl" + export SLURM_CMD="scancel --jobname=${SLM_JOB_NAME}" + ${WORKSPACE}/.ci/slurm_tests/slurm_cmd.sh init + ${WORKSPACE}/.ci/slurm_tests/slurm_cmd.sh exec diff --git a/.ci/slurm_tests/proj_jjb.yaml b/.ci/slurm_tests/proj_jjb.yaml index caa3bd018ae..ee94fb524a6 100644 --- a/.ci/slurm_tests/proj_jjb.yaml +++ b/.ci/slurm_tests/proj_jjb.yaml @@ -25,7 +25,7 @@ description: "Rebuild docker containers" - string: name: "conf_file" - default: ".ci/job_matrix.yaml" + default: 
".ci/slurm_tests/job_matrix.yaml" description: "Regex to select job config file" - string: name: "script" @@ -39,33 +39,25 @@ name: "UCC_VERSION" default: "1.0.0" description: "UCC version" - triggers: +# triggers: # - github-pull-request: # cron: 'H/5 * * * *' # trigger-phrase: '.*\bbot:retest\b.*' +# status-context: "ucc" +# success-status: "Test PASSed." +# failure-status: "Test FAILed." +# error-status: "Test FAILed." # status-add-test-results: true -# auth-id: '549927eb-7f38-4a8f-997a-81dd63605782' +# # swx-jenkins2 from GitHub Pull Request Builder +# auth-id: 'cb48aefb-4f90-4d52-a9bc-63d92382e0be' # org-list: ["Mellanox","openucx"] # white-list: ["swx-jenkins","swx-jenkins2","swx-jenkins3","mellanox-github"] # allow-whitelist-orgs-as-admins: true - - github-pull-request: - cron: 'H/5 * * * *' - trigger-phrase: '.*\bbot:retest\b.*' - status-context: "ucc" - success-status: "Test PASSed." - failure-status: "Test FAILed." - error-status: "Test FAILed." - status-add-test-results: true - # swx-jenkins2 from GitHub Pull Request Builder - auth-id: 'cb48aefb-4f90-4d52-a9bc-63d92382e0be' - org-list: ["Mellanox","openucx"] - white-list: ["swx-jenkins","swx-jenkins2","swx-jenkins3","mellanox-github"] - allow-whitelist-orgs-as-admins: true pipeline-scm: scm: - git: url: "{jjb_git}" - credentials-id: '549927eb-7f38-4a8f-997a-81dd63605782' + credentials-id: 'b042dbee-a0cf-4e23-9efc-a6dc6586f49c' branches: [ '$sha1' ] shallow-clone: true depth: 10 @@ -76,10 +68,10 @@ - project: name: proj_name - jjb_email: 'artemry@nvidia.com' - jjb_proj: 'ucc' + jjb_email: 'michaelbr@nvidia.com' + jjb_proj: 'UCC/ucc_slurm_tests' jjb_git: 'git@github.com:openucx/ucc.git' - jjb_owner: 'artemry' + jjb_owner: 'michaelbr' jjb_jenkinsfile: '.ci/Jenkinsfile.shlib' jobs: - "{jjb_proj}" diff --git a/.ci/slurm_tests/slurm_cmd.sh b/.ci/slurm_tests/slurm_cmd.sh new file mode 100755 index 00000000000..c3616a6665d --- /dev/null +++ b/.ci/slurm_tests/slurm_cmd.sh @@ -0,0 +1,44 @@ +#!/bin/bash +set -xvEe 
-o pipefail + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +script_cmd="${1}" +slurm_via=${SLURM_VIA:-"scctl"} +slurm_cmd=${SLURM_CMD} + +echo "INFO: Allocate Slurm resources" + +if [ "${slurm_via}" == "scctl" ]; then + if [ "${script_cmd}" == "init" ]; then + export RANCHER_USER=${SERVICE_USER_USERNAME} + export RANCHER_PASSWORD=${SERVICE_USER_PASSWORD} + + scctl -v + scctl --raw-errors login + result=$(scctl --raw-errors client exists) + if [ "$result" == "client does not exist" ]; then + scctl --raw-errors client create + fi + scctl --raw-errors client connect -s "${SCRIPT_DIR}/fix_enroot.sh" + elif [ "${script_cmd}" == "exec" ]; then + scctl --raw-errors client connect "${slurm_cmd}" + elif [ "${script_cmd}" == "exec_file" ]; then + scctl --raw-errors client connect -s "${slurm_cmd}" + else + echo "ERROR: invalid script command: ${script_cmd}" + exit 1 + fi +elif [ "${slurm_via}" == "ssh" ]; then + if [ "${script_cmd}" == "init" ]; then + : # TODO: implement ssh allocation, run fix_enroot.sh + elif [ "${script_cmd}" == "exec" ]; then + : # TODO: implement ssh allocation, run slurm_cmd + else + echo "ERROR: invalid script command: ${script_cmd}" + exit 1 + fi +else + echo "ERROR: invalid allocation via: ${slurm_via}" + exit 1 +fi \ No newline at end of file From 9e16ad6a8a871bee043377ea96bea42bd037fd17 Mon Sep 17 00:00:00 2001 From: Michael Braverman Date: Wed, 3 Dec 2025 14:49:05 +0200 Subject: [PATCH 3/7] Rename Jenkins job Also remove stale variables Signed-off-by: Michael Braverman --- .ci/slurm_tests/proj_jjb.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.ci/slurm_tests/proj_jjb.yaml b/.ci/slurm_tests/proj_jjb.yaml index ee94fb524a6..1660a89e4ba 100644 --- a/.ci/slurm_tests/proj_jjb.yaml +++ b/.ci/slurm_tests/proj_jjb.yaml @@ -68,10 +68,8 @@ - project: name: proj_name - jjb_email: 'michaelbr@nvidia.com' - jjb_proj: 'UCC/ucc_slurm_tests' + jjb_proj: 'UCC/ucc-ci-nvls-tests' jjb_git: 
'git@github.com:openucx/ucc.git' - jjb_owner: 'michaelbr' jjb_jenkinsfile: '.ci/Jenkinsfile.shlib' jobs: - "{jjb_proj}" From cf35910df116a9c961e1ce32249c7cea47a8ee99 Mon Sep 17 00:00:00 2001 From: Michael Braverman Date: Wed, 3 Dec 2025 21:56:42 +0200 Subject: [PATCH 4/7] Add README Signed-off-by: Michael Braverman --- .ci/slurm_tests/README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .ci/slurm_tests/README.md diff --git a/.ci/slurm_tests/README.md b/.ci/slurm_tests/README.md new file mode 100644 index 00000000000..6314df74a86 --- /dev/null +++ b/.ci/slurm_tests/README.md @@ -0,0 +1,14 @@ +# Slurm Tests + +This directory contains configuration and scripts for running UCC tests on Slurm clusters. + +## Required Environment Variables for Job Matrix + +| Variable | Description | +|----------|-------------| +| `UCC_ENROOT_IMAGE_NAME` | The container image name to use for running tests. Must be in Enroot-compatible format with `#` after the registry host (e.g., `harbor.mellanox.com#torch-ucc/ucc/1.0.0/x86_64/centos8/cuda12.9:31`). | +| `SLM_JOB_NAME` | The name assigned to the Slurm job allocation. Used with `salloc --job-name`. Typically includes the build number for traceability (e.g., `ucc_tests_${BUILD_NUMBER}`). | +| `SLM_NODES` | The number of nodes to allocate for the Slurm job. Used with `salloc -N`. | +| `SLM_HEAD_NODE` | The Slurm head node to connect to. Can be `scctl` (uses scctl client) or a hostname for direct SSH access (e.g., `hpchead`). | +| `SLM_PARTITION` | The Slurm partition to submit the job to. Used with `salloc -p` (e.g., `funk`, `soul`). 
| + From b165777858b30357e3e1dd841560e908f8c12fcb Mon Sep 17 00:00:00 2001 From: Michael Braverman Date: Wed, 3 Dec 2025 11:54:59 +0200 Subject: [PATCH 5/7] Hide slurm business logic from pipeline Move initialization and allocation to slurm_allocate.sh Move slurm cleanup logic to slurm_release.sh Move slurm test execution logic to slurm_run_test_script.sh Signed-off-by: Michael Braverman --- .ci/scripts/run_slurm_test_ucc_nvls.sh | 19 +++++++ .ci/scripts/run_slurm_tests_ucc.sh | 22 --------- .ci/slurm_tests/job_matrix.yaml | 49 +++++++++---------- .../{ => slurm_scripts}/fix_enroot.sh | 0 .../slurm_scripts/slurm_allocate.sh | 29 +++++++++++ .../slurm_scripts/slurm_release.sh | 23 +++++++++ .../slurm_scripts/slurm_run_test_script.sh | 18 +++++++ 7 files changed, 112 insertions(+), 48 deletions(-) create mode 100755 .ci/scripts/run_slurm_test_ucc_nvls.sh delete mode 100755 .ci/scripts/run_slurm_tests_ucc.sh rename .ci/slurm_tests/{ => slurm_scripts}/fix_enroot.sh (100%) mode change 100644 => 100755 create mode 100755 .ci/slurm_tests/slurm_scripts/slurm_allocate.sh create mode 100755 .ci/slurm_tests/slurm_scripts/slurm_release.sh create mode 100755 .ci/slurm_tests/slurm_scripts/slurm_run_test_script.sh diff --git a/.ci/scripts/run_slurm_test_ucc_nvls.sh b/.ci/scripts/run_slurm_test_ucc_nvls.sh new file mode 100755 index 00000000000..0c27f936333 --- /dev/null +++ b/.ci/scripts/run_slurm_test_ucc_nvls.sh @@ -0,0 +1,19 @@ +#!/bin/bash +set -xvEe -o pipefail + +# NOTE: script is preprocessed by envsubst +# ensure all variables to be set are in stand alone and simple format +# complex bash string operations are not supported in envsubst + +srun --job-name=${SLM_JOB_NAME} \ + --ntasks-per-node=1 \ + --gpus-per-node=1 \ + --mpi=pmix \ + --cpu-bind=verbose \ + --container-image=${UCC_ENROOT_IMAGE_NAME} \ + bash -lc ' + OMPI_MCA_coll=^hcoll \ + OMPI_MCA_coll_ucc_enable=0 \ + UCC_TLS=cuda,ucp UCC_LOG_LEVEL=info UCC_TL_CUDA_NVLS_SM_COUNT=20 UCC_TL_CUDA_TUNE=allreduce:cuda:@0 \
/opt/nvidia/bin/ucc/build/bin/ucc_perftest -c allreduce -F -m cuda -b 1k -e 32M -d bfloat16 -o sum \ + ' diff --git a/.ci/scripts/run_slurm_tests_ucc.sh b/.ci/scripts/run_slurm_tests_ucc.sh deleted file mode 100755 index b4624bec737..00000000000 --- a/.ci/scripts/run_slurm_tests_ucc.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -set -xvEe -o pipefail - -# NOTE: script is preprocessed by envsubst -# ensure all variables to be set are in stand alone and simple format -# complex bash string operations are not supported in envsubst - -# Note2: docker image name format should be converted to enroot format (replace first / with #) -# Example: harbor.mellanox.com/torch-ucc/ucc/1.0.0/x86_64/centos8/cuda12.9 -> harbor.mellanox.com#torch-ucc/ucc/1.0.0/x86_64/centos8/cuda12.9 - -srun --job-name=${SLM_JOB_NAME} --nodes=${SLM_NODES} --partition=${SLM_PARTITION} \ - --ntasks-per-node=1 \ - --gpus-per-node=1 \ - --mpi=pmix \ - --cpu-bind=verbose \ - --container-image=$(echo "${UCC_DOCKER_IMAGE_NAME}" |sed 's/\//#/'):${BUILD_NUMBER} \ - bash -lc ' - OMPI_MCA_coll=^hcoll \ - OMPI_MCA_coll_ucc_enable=0 \ - UCC_TLS=cuda,ucp UCC_LOG_LEVEL=info UCC_TL_CUDA_NVLS_SM_COUNT=20 UCC_TL_CUDA_TUNE=allreduce:cuda:@0 \ - /opt/nvidia/bin/ucc/build/bin/ucc_perftest -c allreduce -F -m cuda -b 1k -e 32M -d bfloat16 -o sum \ - ' \ No newline at end of file diff --git a/.ci/slurm_tests/job_matrix.yaml b/.ci/slurm_tests/job_matrix.yaml index e653251b7ba..291b432efca 100644 --- a/.ci/slurm_tests/job_matrix.yaml +++ b/.ci/slurm_tests/job_matrix.yaml @@ -1,8 +1,6 @@ --- job: "ucc" -step_allow_single_selector: true - registry_host: "harbor.mellanox.com" registry_path: "/torch-ucc" registry_auth: "05d98651-e11c-4a57-9cc6-52df79014b89" @@ -13,13 +11,12 @@ volumes: env: CUDA_VER: 12.9 UCC_URI_SUFFIX: "ucc/${UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}" - UCC_DOCKER_IMAGE_NAME: "${registry_host}${registry_path}/${UCC_URI_SUFFIX}" NVIDIA_ROOT_DIR: "/opt/nvidia" SRC_DIR: "${NVIDIA_ROOT_DIR}/src" BIN_DIR: 
"${NVIDIA_ROOT_DIR}/bin" - DOCKER_OPT: "--pull always --network=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined --cap-add=SYS_ADMIN --device=/dev/infiniband/ --gpus all" + # Enroot requires # in the image name after the registry host + UCC_ENROOT_IMAGE_NAME: "${registry_host}#torch-ucc/${UCC_URI_SUFFIX}:${BUILD_NUMBER}" SLM_JOB_NAME: "ucc_tests_${BUILD_NUMBER}" - SLM_PARTITION: "funk" SLM_NODES: "2" @@ -45,42 +42,42 @@ runs_on_dockers: arch: "x86_64", uri: "${UCC_URI_SUFFIX}", build_args: "--no-cache --build-arg CUDA_VER=${CUDA_VER}", + category: 'tool' # tool category makes this container not to participate in standard execution flow, one has to explicitly specify it in the job matrix } - { - name: 'slurm_executor', + name: 'slurm_executor_1', url: 'nbu-harbor.gtm.nvidia.com/swx-lab-platform/scctl:latest', - arch: 'x86_64' + arch: 'x86_64', + SLM_HEAD_NODE: 'scctl', + SLM_PARTITION: 'funk' } + - { + name: 'slurm_executor_2', + url: 'nbu-harbor.gtm.nvidia.com/swx-lab-platform/scctl:latest', + arch: 'x86_64', + SLM_HEAD_NODE: 'hpchead', + SLM_PARTITION: 'soul' + } + + timeout_minutes: 60 steps: - name: Allocate Slurm resources credentialsId: "svcnbu-swx-hpcx-corporate-user-pass" - containerSelector: "{name: 'slurm_executor'}" run: | - export SLURM_VIA="scctl" - export SLURM_CMD="salloc -N ${SLM_NODES} -p ${SLM_PARTITION} --job-name=${SLM_JOB_NAME} --immediate=120 --time=00:30:00 --no-shell" - ${WORKSPACE}/.ci/slurm_tests/slurm_cmd.sh init - ${WORKSPACE}/.ci/slurm_tests/slurm_cmd.sh exec + ${WORKSPACE}/.ci/slurm_tests/slurm_scripts/slurm_allocate.sh + - name: Run UCC tests on slurm cluster - containerSelector: "{name: 'slurm_executor'}" run: | - set -xv - echo "INFO: Run UCC tests on Slurm cluster" - envsubst < ${WORKSPACE}/.ci/scripts/run_slurm_tests_ucc.sh > ${WORKSPACE}/slurm_test.sh - cat ${WORKSPACE}/slurm_test.sh - export SLURM_VIA="scctl" - export SLURM_CMD="${WORKSPACE}/slurm_test.sh" - 
${WORKSPACE}/.ci/slurm_tests/slurm_cmd.sh exec_file + ${WORKSPACE}/.ci/slurm_tests/slurm_scripts/slurm_run_test_script.sh ${WORKSPACE}/.ci/scripts/run_slurm_test_ucc_nvls.sh + pipeline_stop: - containerSelector: "{name:'slurm_executor'}" credentialsId: "svcnbu-swx-hpcx-corporate-user-pass" + containerSelector: "{name: 'slurm_executor_1'}" run: | - set -xv - export SLURM_VIA="scctl" - export SLURM_CMD="scancel --jobname=${SLM_JOB_NAME}" - ${WORKSPACE}/.ci/slurm_tests/slurm_cmd.sh init - ${WORKSPACE}/.ci/slurm_tests/slurm_cmd.sh exec + ${WORKSPACE}/.ci/slurm_tests/slurm_scripts/slurm_release.sh + diff --git a/.ci/slurm_tests/fix_enroot.sh b/.ci/slurm_tests/slurm_scripts/fix_enroot.sh old mode 100644 new mode 100755 similarity index 100% rename from .ci/slurm_tests/fix_enroot.sh rename to .ci/slurm_tests/slurm_scripts/fix_enroot.sh diff --git a/.ci/slurm_tests/slurm_scripts/slurm_allocate.sh b/.ci/slurm_tests/slurm_scripts/slurm_allocate.sh new file mode 100755 index 00000000000..bc860a9c4bf --- /dev/null +++ b/.ci/slurm_tests/slurm_scripts/slurm_allocate.sh @@ -0,0 +1,29 @@ +#!/bin/bash +set -xvEe -o pipefail + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +: "${SLM_JOB_NAME:?SLM_JOB_NAME is not set}" +: "${SLM_PARTITION:?SLM_PARTITION is not set}" +: "${SLM_NODES:?SLM_NODES is not set}" +: "${SLM_HEAD_NODE:?SLM_HEAD_NODE is not set}" + +slurm_cmd="salloc -N ${SLM_NODES} -p ${SLM_PARTITION} --job-name=${SLM_JOB_NAME} --immediate=120 --time=00:30:00 --no-shell" + +if [ "${SLM_HEAD_NODE}" == "scctl" ]; then + export RANCHER_USER=${SERVICE_USER_USERNAME} + export RANCHER_PASSWORD=${SERVICE_USER_PASSWORD} + + scctl -v + scctl --raw-errors login + result=$(scctl --raw-errors client exists) + if [ "$result" == "client does not exist" ]; then + scctl --raw-errors client create + fi + scctl --raw-errors client connect -s "${SCRIPT_DIR}/fix_enroot.sh" + scctl --raw-errors client connect "${slurm_cmd}" +else + : # TODO: implement ssh 
allocation, run fix_enroot.sh +fi + +echo "INFO: Allocate Slurm resources" diff --git a/.ci/slurm_tests/slurm_scripts/slurm_release.sh b/.ci/slurm_tests/slurm_scripts/slurm_release.sh new file mode 100755 index 00000000000..0e6d249cd21 --- /dev/null +++ b/.ci/slurm_tests/slurm_scripts/slurm_release.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -xvEe -o pipefail + +: "${SLM_JOB_NAME:?SLM_JOB_NAME is not set}" + +slurm_headnode_list="scctl hpchead" +for slurm_headnode in ${slurm_headnode_list}; do + slurm_cmd="scancel --name=${SLM_JOB_NAME}" + if [ "${slurm_headnode}" == "scctl" ]; then + export RANCHER_USER=${SERVICE_USER_USERNAME} + export RANCHER_PASSWORD=${SERVICE_USER_PASSWORD} + + scctl -v + scctl --raw-errors login + result=$(scctl --raw-errors client exists) + if [ "$result" == "client does not exist" ]; then + scctl --raw-errors client create + fi + scctl --raw-errors client connect "${slurm_cmd}" + else + ssh "${slurm_headnode}" "${slurm_cmd}" + fi +done diff --git a/.ci/slurm_tests/slurm_scripts/slurm_run_test_script.sh b/.ci/slurm_tests/slurm_scripts/slurm_run_test_script.sh new file mode 100755 index 00000000000..7e41e503a7e --- /dev/null +++ b/.ci/slurm_tests/slurm_scripts/slurm_run_test_script.sh @@ -0,0 +1,18 @@ +#!/bin/bash +set -xvEe -o pipefail + +test_script="${1}" +: "${test_script:?test_script is not set}" + +: "${SLM_JOB_NAME:?SLM_JOB_NAME is not set}" +: "${SLM_HEAD_NODE:?SLM_HEAD_NODE is not set}" +: "${UCC_ENROOT_IMAGE_NAME:?UCC_ENROOT_IMAGE_NAME is not set}" + +if [ "${SLM_HEAD_NODE}" == "scctl" ]; then + slurm_test_script="${WORKSPACE}"/slurm_test.sh + envsubst < "${test_script}" > "${slurm_test_script}" + cat "${slurm_test_script}" + scctl --raw-errors client connect -s "${slurm_test_script}" +else + : # TODO: implement ssh script execution, use heredoc to eliminate need for envsubst +fi From 66c32cd84d6a6948f5fcd75a970711c69d0d2a45 Mon Sep 17 00:00:00 2001 From: Michael Braverman Date: Wed, 3 Dec 2025 21:35:18 +0200 Subject: [PATCH 6/7] Add ssh 
slurm flow Signed-off-by: Michael Braverman --- .ci/Dockerfile.slurm_executor | 13 +++++++++++++ .ci/scripts/run_slurm_test_ucc_nvls.sh | 4 +++- .ci/slurm_tests/job_matrix.yaml | 8 +++++--- .ci/slurm_tests/slurm_scripts/common.sh | 16 ++++++++++++++++ .ci/slurm_tests/slurm_scripts/fix_ssh_key.sh | 10 ++++++++++ .ci/slurm_tests/slurm_scripts/slurm_allocate.sh | 4 +++- .ci/slurm_tests/slurm_scripts/slurm_release.sh | 5 ++++- .../slurm_scripts/slurm_run_test_script.sh | 13 ++++++++++++- 8 files changed, 66 insertions(+), 7 deletions(-) create mode 100644 .ci/Dockerfile.slurm_executor create mode 100644 .ci/slurm_tests/slurm_scripts/common.sh create mode 100755 .ci/slurm_tests/slurm_scripts/fix_ssh_key.sh diff --git a/.ci/Dockerfile.slurm_executor b/.ci/Dockerfile.slurm_executor new file mode 100644 index 00000000000..27a9cdc72f6 --- /dev/null +++ b/.ci/Dockerfile.slurm_executor @@ -0,0 +1,13 @@ +FROM nbu-harbor.gtm.nvidia.com/swx-lab-platform/scctl:latest + +RUN apk update && \ + apk add \ + shadow \ + openssh-client \ + sudo + + +RUN groupadd -g 30 hardware +RUN useradd --no-create-home --uid 149917 --gid 30 --shell /bin/bash svcnbu-swx-hpcx + +USER svcnbu-swx-hpcx diff --git a/.ci/scripts/run_slurm_test_ucc_nvls.sh b/.ci/scripts/run_slurm_test_ucc_nvls.sh index 0c27f936333..2eff57f25ed 100755 --- a/.ci/scripts/run_slurm_test_ucc_nvls.sh +++ b/.ci/scripts/run_slurm_test_ucc_nvls.sh @@ -5,7 +5,9 @@ set -xvEe -o pipefail # ensure all variables to be set are in stand alone and simple format # complex bash string operations are not supported in envsubst -srun --job-name=${SLM_JOB_NAME} \ +srun \ + --jobid=${SLM_JOB_ID} \ + --nodes=${SLM_NODES} \ --ntasks-per-node=1 \ --gpus-per-node=1 \ --mpi=pmix \ diff --git a/.ci/slurm_tests/job_matrix.yaml b/.ci/slurm_tests/job_matrix.yaml index 291b432efca..50c7a98df6a 100644 --- a/.ci/slurm_tests/job_matrix.yaml +++ b/.ci/slurm_tests/job_matrix.yaml @@ -6,7 +6,7 @@ registry_path: "/torch-ucc" registry_auth: 
"05d98651-e11c-4a57-9cc6-52df79014b89" volumes: - - { mountPath: "/home/swx-jenkins", hostPath: "/labhome/swx-jenkins" } + - { mountPath: "/home/svcnbu-swx-hpcx", hostPath: "/labhome/svcnbu-swx-hpcx" } env: CUDA_VER: 12.9 @@ -46,15 +46,17 @@ runs_on_dockers: } - { name: 'slurm_executor_1', - url: 'nbu-harbor.gtm.nvidia.com/swx-lab-platform/scctl:latest', + file: ".ci/Dockerfile.slurm_executor", arch: 'x86_64', + tag: '${BUILD_NUMBER}', SLM_HEAD_NODE: 'scctl', SLM_PARTITION: 'funk' } - { name: 'slurm_executor_2', - url: 'nbu-harbor.gtm.nvidia.com/swx-lab-platform/scctl:latest', + file: ".ci/Dockerfile.slurm_executor", arch: 'x86_64', + tag: '${BUILD_NUMBER}', SLM_HEAD_NODE: 'hpchead', SLM_PARTITION: 'soul' } diff --git a/.ci/slurm_tests/slurm_scripts/common.sh b/.ci/slurm_tests/slurm_scripts/common.sh new file mode 100644 index 00000000000..732751fb29e --- /dev/null +++ b/.ci/slurm_tests/slurm_scripts/common.sh @@ -0,0 +1,16 @@ +# shellcheck shell=bash + +function trap_error() { + local lineno=$1 + local msg=$2 + echo "Error at line $lineno: $msg" + if [ "${DEBUG}" == "9" ]; then + echo "Debug mode, sleeping for 3600 seconds to allow for debugging of the pod" + sleep 3600 + fi + exit 1 +} + +trap 'trap_error $LINENO "Error in script"' ERR + +export ssh_cmd="ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -l svcnbu-swx-hpcx" diff --git a/.ci/slurm_tests/slurm_scripts/fix_ssh_key.sh b/.ci/slurm_tests/slurm_scripts/fix_ssh_key.sh new file mode 100755 index 00000000000..c6421cd9efd --- /dev/null +++ b/.ci/slurm_tests/slurm_scripts/fix_ssh_key.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -xvEe -o pipefail + +sudo -u svcnbu-swx-hpcx cp /custom_home/svcnbu-swx-hpcx/.ssh/id_rsa /tmp/id_rsa +sudo chown $(id -u):$(id -g) /tmp/id_rsa +mkdir -p ~/.ssh +chown $(id -u):$(id -g) ~/.ssh +chmod 700 ~/.ssh +mv /tmp/id_rsa ~/.ssh/id_rsa +chmod 600 ~/.ssh/id_rsa diff --git a/.ci/slurm_tests/slurm_scripts/slurm_allocate.sh b/.ci/slurm_tests/slurm_scripts/slurm_allocate.sh index 
bc860a9c4bf..ec709728183 100755 --- a/.ci/slurm_tests/slurm_scripts/slurm_allocate.sh +++ b/.ci/slurm_tests/slurm_scripts/slurm_allocate.sh @@ -2,6 +2,7 @@ set -xvEe -o pipefail SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +source "${SCRIPT_DIR}/common.sh" : "${SLM_JOB_NAME:?SLM_JOB_NAME is not set}" : "${SLM_PARTITION:?SLM_PARTITION is not set}" @@ -23,7 +24,8 @@ if [ "${SLM_HEAD_NODE}" == "scctl" ]; then scctl --raw-errors client connect -s "${SCRIPT_DIR}/fix_enroot.sh" scctl --raw-errors client connect "${slurm_cmd}" else - : # TODO: implement ssh allocation, run fix_enroot.sh + "${SCRIPT_DIR}/fix_ssh_key.sh" + ${ssh_cmd} "${SLM_HEAD_NODE}" "${slurm_cmd}" fi echo "INFO: Allocate Slurm resources" diff --git a/.ci/slurm_tests/slurm_scripts/slurm_release.sh b/.ci/slurm_tests/slurm_scripts/slurm_release.sh index 0e6d249cd21..95a35b42c8f 100755 --- a/.ci/slurm_tests/slurm_scripts/slurm_release.sh +++ b/.ci/slurm_tests/slurm_scripts/slurm_release.sh @@ -1,6 +1,9 @@ #!/bin/bash set -xvEe -o pipefail +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +source "${SCRIPT_DIR}/common.sh" + : "${SLM_JOB_NAME:?SLM_JOB_NAME is not set}" slurm_headnode_list="scctl hpchead" @@ -18,6 +21,6 @@ for slurm_headnode in ${slurm_headnode_list}; do fi scctl --raw-errors client connect "${slurm_cmd}" else - ssh "${slurm_headnode}" "${slurm_cmd}" + ${ssh_cmd} "${slurm_headnode}" "${slurm_cmd}" fi done diff --git a/.ci/slurm_tests/slurm_scripts/slurm_run_test_script.sh b/.ci/slurm_tests/slurm_scripts/slurm_run_test_script.sh index 7e41e503a7e..aaddd26221e 100755 --- a/.ci/slurm_tests/slurm_scripts/slurm_run_test_script.sh +++ b/.ci/slurm_tests/slurm_scripts/slurm_run_test_script.sh @@ -1,6 +1,9 @@ #!/bin/bash set -xvEe -o pipefail +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +source "${SCRIPT_DIR}/common.sh" + test_script="${1}" : "${test_script:?test_script is not set}" @@ -10,9 +13,17 @@ 
test_script="${1}" if [ "${SLM_HEAD_NODE}" == "scctl" ]; then slurm_test_script="${WORKSPACE}"/slurm_test.sh + SLM_JOB_ID=$(scctl client connect "squeue --noheader --name=${SLM_JOB_NAME} -o '%i'") + export SLM_JOB_ID envsubst < "${test_script}" > "${slurm_test_script}" cat "${slurm_test_script}" scctl --raw-errors client connect -s "${slurm_test_script}" else - : # TODO: implement ssh script execution, use heredoc to eliminate need for envsubst + SLM_JOB_ID=$(${ssh_cmd} "${SLM_HEAD_NODE}" "squeue --noheader --name=${SLM_JOB_NAME} -o '%i'") + export SLM_JOB_ID +#shellcheck disable=SC2087 +${ssh_cmd} "${SLM_HEAD_NODE}" << EOF +$(envsubst < "${test_script}") +EOF + fi From 3473706836b6660d8290c02d12cba01de2f716da Mon Sep 17 00:00:00 2001 From: Michael Braverman Date: Thu, 4 Dec 2025 14:17:52 +0200 Subject: [PATCH 7/7] Split test into 2 files: dispatcher and the test itself Signed-off-by: Michael Braverman --- .ci/Dockerfile.slurm_executor | 2 +- .ci/scripts/run_test_nvls.sh | 4 ++++ ...n_slurm_test_ucc_nvls.sh => run_test_nvls_slurm.sh} | 10 ++-------- .ci/slurm_tests/job_matrix.yaml | 4 ++-- 4 files changed, 9 insertions(+), 11 deletions(-) create mode 100755 .ci/scripts/run_test_nvls.sh rename .ci/scripts/{run_slurm_test_ucc_nvls.sh => run_test_nvls_slurm.sh} (52%) diff --git a/.ci/Dockerfile.slurm_executor b/.ci/Dockerfile.slurm_executor index 27a9cdc72f6..01487c7d405 100644 --- a/.ci/Dockerfile.slurm_executor +++ b/.ci/Dockerfile.slurm_executor @@ -8,6 +8,6 @@ RUN apk update && \ RUN groupadd -g 30 hardware -RUN useradd --no-create-home --uid 149917 --gid 30 --shell /bin/bash svcnbu-swx-hpcx +RUN useradd --no-create-home --uid 149917 --gid 30 -d /custom_home/svcnbu-swx-hpcx --no-create-home --shell /bin/bash svcnbu-swx-hpcx USER svcnbu-swx-hpcx diff --git a/.ci/scripts/run_test_nvls.sh b/.ci/scripts/run_test_nvls.sh new file mode 100755 index 00000000000..b11d74ffccb --- /dev/null +++ b/.ci/scripts/run_test_nvls.sh @@ -0,0 +1,4 @@ +OMPI_MCA_coll=^hcoll \
+OMPI_MCA_coll_ucc_enable=0 \ +UCC_TLS=cuda,ucp UCC_LOG_LEVEL=info UCC_TL_CUDA_NVLS_SM_COUNT=20 UCC_TL_CUDA_TUNE=allreduce:cuda:@0 \ +/opt/nvidia/bin/ucc/build/bin/ucc_perftest -c allreduce -F -m cuda -b 1k -e 32M -d bfloat16 -o sum diff --git a/.ci/scripts/run_slurm_test_ucc_nvls.sh b/.ci/scripts/run_test_nvls_slurm.sh similarity index 52% rename from .ci/scripts/run_slurm_test_ucc_nvls.sh rename to .ci/scripts/run_test_nvls_slurm.sh index 2eff57f25ed..864771513e4 100755 --- a/.ci/scripts/run_slurm_test_ucc_nvls.sh +++ b/.ci/scripts/run_test_nvls_slurm.sh @@ -1,5 +1,4 @@ -#!/bin/bash -set -xvEe -o pipefail +#! /bin/bash # NOTE: script is preprocessed by envsubst # ensure all variables to be set are in stand alone and simple format @@ -13,9 +12,4 @@ srun \ --mpi=pmix \ --cpu-bind=verbose \ --container-image=${UCC_ENROOT_IMAGE_NAME} \ - bash -lc ' - OMPI_MCA_coll=^hcoll \ - OMPI_MCA_coll_ucc_enable=0 \ - UCC_TLS=cuda,ucp UCC_LOG_LEVEL=info UCC_TL_CUDA_NVLS_SM_COUNT=20 UCC_TL_CUDA_TUNE=allreduce:cuda:@0 \ - /opt/nvidia/bin/ucc/build/bin/ucc_perftest -c allreduce -F -m cuda -b 1k -e 32M -d bfloat16 -o sum \ - ' + bash -l /opt/nvidia/src/ucc/.ci/scripts/run_test_nvls.sh diff --git a/.ci/slurm_tests/job_matrix.yaml b/.ci/slurm_tests/job_matrix.yaml index 50c7a98df6a..b5d522675e6 100644 --- a/.ci/slurm_tests/job_matrix.yaml +++ b/.ci/slurm_tests/job_matrix.yaml @@ -6,7 +6,7 @@ registry_path: "/torch-ucc" registry_auth: "05d98651-e11c-4a57-9cc6-52df79014b89" volumes: - - { mountPath: "/home/svcnbu-swx-hpcx", hostPath: "/labhome/svcnbu-swx-hpcx" } + - { mountPath: "/custom_home/svcnbu-swx-hpcx", hostPath: "/labhome/svcnbu-swx-hpcx" } env: CUDA_VER: 12.9 @@ -74,7 +74,7 @@ steps: - name: Run UCC tests on slurm cluster run: | - ${WORKSPACE}/.ci/slurm_tests/slurm_scripts/slurm_run_test_script.sh ${WORKSPACE}/.ci/scripts/run_slurm_test_ucc_nvls.sh + ${WORKSPACE}/.ci/slurm_tests/slurm_scripts/slurm_run_test_script.sh ${WORKSPACE}/.ci/scripts/run_test_nvls_slurm.sh pipeline_stop: