-
Notifications
You must be signed in to change notification settings - Fork 128
Expand file tree
/
Copy path job_matrix.yaml
More file actions
86 lines (74 loc) · 2.73 KB
/
job_matrix.yaml
File metadata and controls
86 lines (74 loc) · 2.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
---
# Job matrix: build the torch-ucc test image, then run UCC tests on a
# Slurm cluster (allocation -> test run -> teardown via pipeline_stop).
job: "ucc"
step_allow_single_selector: true

# Registry hosting the shared torch-ucc docker images.
registry_host: "harbor.mellanox.com"
registry_path: "/torch-ucc"
registry_auth: "05d98651-e11c-4a57-9cc6-52df79014b89"

volumes:
  - { mountPath: "/home/swx-jenkins", hostPath: "/labhome/swx-jenkins" }

env:
  # Quoted so the version stays a string: an unquoted 12.9 is parsed as a
  # YAML float (and e.g. a future "12.90" would silently collapse to 12.9).
  # The value is substituted into UCC_URI_SUFFIX and the docker build args.
  CUDA_VER: "12.9"
  UCC_URI_SUFFIX: "ucc/${UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}"
  UCC_DOCKER_IMAGE_NAME: "${registry_host}${registry_path}/${UCC_URI_SUFFIX}"
  NVIDIA_ROOT_DIR: "/opt/nvidia"
  SRC_DIR: "${NVIDIA_ROOT_DIR}/src"
  BIN_DIR: "${NVIDIA_ROOT_DIR}/bin"
  DOCKER_OPT: "--pull always --network=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined --cap-add=SYS_ADMIN --device=/dev/infiniband/ --gpus all"
  # Slurm allocation parameters; the job name is unique per CI build so
  # pipeline_stop can cancel exactly this build's allocation.
  SLM_JOB_NAME: "ucc_tests_${BUILD_NUMBER}"
  SLM_PARTITION: "funk"
  SLM_NODES: "2"

kubernetes:
  cloud: il-ipp-blossom-prod
  namespace: hpcx
  limits: "{memory: 16Gi, cpu: 16000m}"
  requests: "{memory: 16Gi, cpu: 16000m}"

credentials:
  - {
      credentialsId: "svcnbu-swx-hpcx-corporate-user-pass",
      usernameVariable: "SERVICE_USER_USERNAME",
      passwordVariable: "SERVICE_USER_PASSWORD",
    }

runs_on_dockers:
  # cloud pod to build the shared docker image
  - {
      file: ".ci/Dockerfile.ngc_pytorch",
      name: "ngc_pytorch",
      tag: "${BUILD_NUMBER}",
      arch: "x86_64",
      uri: "${UCC_URI_SUFFIX}",
      build_args: "--no-cache --build-arg CUDA_VER=${CUDA_VER}",
    }
  # helper container that talks to the Slurm controller via scctl
  - {
      name: "slurm_executor",
      url: "nbu-harbor.gtm.nvidia.com/swx-lab-platform/scctl:latest",
      arch: "x86_64",
    }

timeout_minutes: 60

steps:
  # Reserve nodes up front (--no-shell keeps the allocation alive without
  # an interactive shell); later steps submit work into this allocation.
  - name: Allocate Slurm resources
    credentialsId: "svcnbu-swx-hpcx-corporate-user-pass"
    containerSelector: "{name: 'slurm_executor'}"
    run: |
      export SLURM_VIA="scctl"
      export SLURM_CMD="salloc -N ${SLM_NODES} -p ${SLM_PARTITION} --job-name=${SLM_JOB_NAME} --immediate=120 --time=00:30:00 --no-shell"
      ${WORKSPACE}/.ci/slurm_tests/slurm_cmd.sh init
      ${WORKSPACE}/.ci/slurm_tests/slurm_cmd.sh exec

  - name: Run UCC tests on slurm cluster
    containerSelector: "{name: 'slurm_executor'}"
    run: |
      set -xv
      echo "INFO: Run UCC tests on Slurm cluster"
      # Render the test script with the current environment before submission.
      envsubst < ${WORKSPACE}/.ci/scripts/run_slurm_tests_ucc.sh > ${WORKSPACE}/slurm_test.sh
      cat ${WORKSPACE}/slurm_test.sh
      export SLURM_VIA="scctl"
      export SLURM_CMD="${WORKSPACE}/slurm_test.sh"
      ${WORKSPACE}/.ci/slurm_tests/slurm_cmd.sh exec_file

# Teardown: always cancel this build's Slurm allocation at pipeline end.
pipeline_stop:
  containerSelector: "{name:'slurm_executor'}"
  credentialsId: "svcnbu-swx-hpcx-corporate-user-pass"
  run: |
    set -xv
    export SLURM_VIA="scctl"
    export SLURM_CMD="scancel --jobname=${SLM_JOB_NAME}"
    ${WORKSPACE}/.ci/slurm_tests/slurm_cmd.sh init
    ${WORKSPACE}/.ci/slurm_tests/slurm_cmd.sh exec