Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions .ci/scripts/run_slurm_tests_ucc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
# Launch the UCC allreduce perftest on previously allocated Slurm nodes
# inside an enroot container.  This file is a TEMPLATE: it is rendered with
# envsubst before execution, so every ${VAR} below must be exported by the
# caller beforehand (see job_matrix.yaml "Run UCC tests" step).
set -xvEe -o pipefail

# NOTE: script is preprocessed by envsubst
# ensure all variables to be substituted appear in standalone, simple ${VAR} format;
# complex bash string operations are not supported by envsubst

# Note2: docker image name format should be converted to enroot format (replace first / with #)
# Example: harbor.mellanox.com/torch-ucc/ucc/1.0.0/x86_64/centos8/cuda12.9 -> harbor.mellanox.com#torch-ucc/ucc/1.0.0/x86_64/centos8/cuda12.9

# One task and one GPU per node; PMIx for MPI wire-up.
# The sed call performs the "/" -> "#" conversion from Note2 at run time
# (command substitution is untouched by envsubst, which only expands ${VAR}).
# Inside the container: OMPI_MCA_coll=^hcoll excludes hcoll and
# OMPI_MCA_coll_ucc_enable=0 disables Open MPI's UCC coll component, so the
# standalone ucc_perftest binary drives UCC directly (TLs cuda+ucp,
# allreduce, bfloat16, sum, message sizes 1k..32M).
srun --job-name=${SLM_JOB_NAME} --nodes=${SLM_NODES} --partition=${SLM_PARTITION} \
--ntasks-per-node=1 \
--gpus-per-node=1 \
--mpi=pmix \
--cpu-bind=verbose \
--container-image=$(echo "${UCC_DOCKER_IMAGE_NAME}" |sed 's/\//#/'):${BUILD_NUMBER} \
bash -lc '
OMPI_MCA_coll=^hcoll \
OMPI_MCA_coll_ucc_enable=0 \
UCC_TLS=cuda,ucp UCC_LOG_LEVEL=info UCC_TL_CUDA_NVLS_SM_COUNT=20 UCC_TL_CUDA_TUNE=allreduce:cuda:@0 \
/opt/nvidia/bin/ucc/build/bin/ucc_perftest -c allreduce -F -m cuda -b 1k -e 32M -d bfloat16 -o sum \
'
10 changes: 10 additions & 0 deletions .ci/slurm_tests/fix_enroot.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
# Ensure a non-empty enroot credentials file exists.  Without it, enroot
# reports a "credentials file missing" error even for anonymous image pulls;
# a file containing only a comment is enough to silence it.

set -xvEe -o pipefail

mkdir -p ~/.config/enroot
# -s: only (re)create the file when it is missing or empty
if [ ! -s ~/.config/enroot/.credentials ]; then
    echo "INFO: Create enroot credentials file with some content"
    echo "INFO: this mitigates error reporting for anonymous image pull"
    echo "# This comment is to mitigate Enroot credentials file missing error" > ~/.config/enroot/.credentials
fi
86 changes: 86 additions & 0 deletions .ci/slurm_tests/job_matrix.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
---
# CI job matrix: build the UCC docker image, then run UCC perf tests on a
# Slurm cluster through the scctl client container.
job: "ucc"

step_allow_single_selector: true

registry_host: "harbor.mellanox.com"
registry_path: "/torch-ucc"
# Jenkins credentials ID used to authenticate against the registry.
registry_auth: "05d98651-e11c-4a57-9cc6-52df79014b89"

volumes:
  - { mountPath: "/home/swx-jenkins", hostPath: "/labhome/swx-jenkins" }

env:
  # Quoted: an unquoted version is parsed as a YAML float
  # (e.g. 12.10 would silently become 12.1).
  CUDA_VER: "12.9"
  UCC_URI_SUFFIX: "ucc/${UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}"
  UCC_DOCKER_IMAGE_NAME: "${registry_host}${registry_path}/${UCC_URI_SUFFIX}"
  NVIDIA_ROOT_DIR: "/opt/nvidia"
  SRC_DIR: "${NVIDIA_ROOT_DIR}/src"
  BIN_DIR: "${NVIDIA_ROOT_DIR}/bin"
  DOCKER_OPT: "--pull always --network=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined --cap-add=SYS_ADMIN --device=/dev/infiniband/ --gpus all"
  SLM_JOB_NAME: "ucc_tests_${BUILD_NUMBER}"
  SLM_PARTITION: "funk"
  SLM_NODES: "2"

kubernetes:
  cloud: "il-ipp-blossom-prod"
  namespace: "hpcx"
  limits: "{memory: 16Gi, cpu: 16000m}"
  requests: "{memory: 16Gi, cpu: 16000m}"

credentials:
  - {
      credentialsId: "svcnbu-swx-hpcx-corporate-user-pass",
      usernameVariable: "SERVICE_USER_USERNAME",
      passwordVariable: "SERVICE_USER_PASSWORD",
    }

runs_on_dockers:
  # cloud pod to build the shared docker image
  - {
      file: ".ci/Dockerfile.ngc_pytorch",
      name: "ngc_pytorch",
      tag: "${BUILD_NUMBER}",
      arch: "x86_64",
      uri: "${UCC_URI_SUFFIX}",
      build_args: "--no-cache --build-arg CUDA_VER=${CUDA_VER}",
    }
  # pre-built image carrying the scctl Slurm client
  - {
      name: "slurm_executor",
      url: "nbu-harbor.gtm.nvidia.com/swx-lab-platform/scctl:latest",
      arch: "x86_64",
    }

timeout_minutes: 60

steps:
  - name: Allocate Slurm resources
    credentialsId: "svcnbu-swx-hpcx-corporate-user-pass"
    containerSelector: "{name: 'slurm_executor'}"
    run: |
      export SLURM_VIA="scctl"
      export SLURM_CMD="salloc -N ${SLM_NODES} -p ${SLM_PARTITION} --job-name=${SLM_JOB_NAME} --immediate=120 --time=00:30:00 --no-shell"
      ${WORKSPACE}/.ci/slurm_tests/slurm_cmd.sh init
      ${WORKSPACE}/.ci/slurm_tests/slurm_cmd.sh exec

  - name: Run UCC tests on slurm cluster
    containerSelector: "{name: 'slurm_executor'}"
    run: |
      set -xv
      echo "INFO: Run UCC tests on Slurm cluster"
      # render the envsubst template with the current environment
      envsubst < ${WORKSPACE}/.ci/scripts/run_slurm_tests_ucc.sh > ${WORKSPACE}/slurm_test.sh
      cat ${WORKSPACE}/slurm_test.sh
      export SLURM_VIA="scctl"
      export SLURM_CMD="${WORKSPACE}/slurm_test.sh"
      ${WORKSPACE}/.ci/slurm_tests/slurm_cmd.sh exec_file

# teardown stage: cancel the named Slurm job to release the allocation
pipeline_stop:
  containerSelector: "{name:'slurm_executor'}"
  credentialsId: "svcnbu-swx-hpcx-corporate-user-pass"
  run: |
    set -xv
    export SLURM_VIA="scctl"
    export SLURM_CMD="scancel --jobname=${SLM_JOB_NAME}"
    ${WORKSPACE}/.ci/slurm_tests/slurm_cmd.sh init
    ${WORKSPACE}/.ci/slurm_tests/slurm_cmd.sh exec
77 changes: 77 additions & 0 deletions .ci/slurm_tests/proj_jjb.yaml
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use the existing JJB YAML; we don't want to duplicate it, since many of these variables are common to all jobs.

Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Jenkins Job Builder definition for the UCC Slurm-tests pipeline job.
- job-template:
    name: "{jjb_proj}"
    project-type: pipeline
    properties:
      - github:
          url: "{jjb_git}"
      - build-discarder:
          days-to-keep: 30
          num-to-keep: 20
      - inject:
          keep-system-variables: true
          properties-content: |
            jjb_proj={jjb_proj}
    description: Do NOT edit this job through the Web GUI !
    concurrent: true
    sandbox: true
    parameters:
      - string:
          name: "sha1"
          default: "master"
          description: "Commit to be checked, set by PR"
      - bool:
          name: "build_dockers"
          default: true
          description: "Rebuild docker containers"
      - string:
          name: "conf_file"
          default: ".ci/slurm_tests/job_matrix.yaml"
          description: "Regex to select job config file"
      - string:
          name: "script"
          default: "{jjb_jenkinsfile}"
          description: "Jenkinsfile to load on trigger"
      - string:
          name: "DEBUG"
          # quoted: this is a string parameter; an unquoted 0 is a YAML int
          default: "0"
          description: "Enable debug prints and traces, valid values are 0-9"
      - string:
          name: "UCC_VERSION"
          default: "1.0.0"
          description: "UCC version"
    # triggers:
    #   - github-pull-request:
    #       cron: 'H/5 * * * *'
    #       trigger-phrase: '.*\bbot:retest\b.*'
    #       status-context: "ucc"
    #       success-status: "Test PASSed."
    #       failure-status: "Test FAILed."
    #       error-status: "Test FAILed."
    #       status-add-test-results: true
    #       # swx-jenkins2 from GitHub Pull Request Builder
    #       auth-id: 'cb48aefb-4f90-4d52-a9bc-63d92382e0be'
    #       org-list: ["Mellanox","openucx"]
    #       white-list: ["swx-jenkins","swx-jenkins2","swx-jenkins3","mellanox-github"]
    #       allow-whitelist-orgs-as-admins: true
    pipeline-scm:
      scm:
        - git:
            url: "{jjb_git}"
            credentials-id: 'b042dbee-a0cf-4e23-9efc-a6dc6586f49c'
            branches: [ '$sha1' ]
            shallow-clone: true
            depth: 10
            refspec: "+refs/heads/*:refs/remotes/origin/* +refs/pull/*:refs/remotes/origin/pr/*"
            browser: githubweb
            browser-url: "{jjb_git}"
      script-path: "$script"

- project:
    name: proj_name
    jjb_email: 'michaelbr@nvidia.com'
    jjb_proj: 'UCC/ucc_slurm_tests'
    jjb_git: 'git@github.com:openucx/ucc.git'
    jjb_owner: 'michaelbr'
    jjb_jenkinsfile: '.ci/Jenkinsfile.shlib'
    jobs:
      - "{jjb_proj}"
44 changes: 44 additions & 0 deletions .ci/slurm_tests/slurm_cmd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash
# Dispatch Slurm cluster operations through the configured transport.
#
# Usage: slurm_cmd.sh <init|exec|exec_file>
#   init      - log in / create the scctl client and push fix_enroot.sh
#   exec      - run ${SLURM_CMD} as a command line on the cluster
#   exec_file - run ${SLURM_CMD} as a script file (-s) on the cluster
#
# Environment:
#   SLURM_VIA - transport: "scctl" (default) or "ssh" (not implemented yet)
#   SLURM_CMD - command (exec) or script path (exec_file) to run
#   SERVICE_USER_USERNAME / SERVICE_USER_PASSWORD - scctl login (init only)
set -xvEe -o pipefail

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

script_cmd="${1}"
slurm_via="${SLURM_VIA:-scctl}"
slurm_cmd="${SLURM_CMD}"

# Report what is actually being dispatched (this script serves allocation,
# execution, and teardown alike).
echo "INFO: Slurm dispatch: command='${script_cmd}' via='${slurm_via}'"

case "${slurm_via}" in
    scctl)
        case "${script_cmd}" in
            init)
                export RANCHER_USER="${SERVICE_USER_USERNAME}"
                export RANCHER_PASSWORD="${SERVICE_USER_PASSWORD}"

                scctl -v
                scctl --raw-errors login
                result=$(scctl --raw-errors client exists)
                if [ "${result}" == "client does not exist" ]; then
                    scctl --raw-errors client create
                fi
                # make sure enroot credentials exist before any image pull
                scctl --raw-errors client connect -s "${SCRIPT_DIR}/fix_enroot.sh"
                ;;
            exec)
                scctl --raw-errors client connect "${slurm_cmd}"
                ;;
            exec_file)
                scctl --raw-errors client connect -s "${slurm_cmd}"
                ;;
            *)
                echo "ERROR: invalid script command: ${script_cmd}"
                exit 1
                ;;
        esac
        ;;
    ssh)
        case "${script_cmd}" in
            init)
                : # TODO: implement ssh allocation, run fix_enroot.sh
                ;;
            exec)
                : # TODO: implement ssh allocation, run slurm_cmd
                ;;
            *)
                echo "ERROR: invalid script command: ${script_cmd}"
                exit 1
                ;;
        esac
        ;;
    *)
        echo "ERROR: invalid allocation via: ${slurm_via}"
        exit 1
        ;;
esac
Loading