Skip to content

Commit a44239e

Browse files
committed
AZP: drop redundant /tmp/nvidia-mps bind from UCXX GPU containers
nvidia-container-runtime auto-mounts /tmp/nvidia-mps inside any container started with --gpus all; an explicit `-v /tmp/nvidia-mps:/tmp/nvidia-mps` on top of that is a no-op duplicate. Verified live on swx-rdmz-ucx-gpu-01: the MPS pipe + control socket show up under /tmp/nvidia-mps inside the container even when the bind is omitted. Other UCX GPU test containers in this file already follow this convention. Dropped from both ucxx_rapidsai_ci_conda_gpu and ucxx_rapidsai_ci_wheel_gpu container options; added a comment so the next reader doesn't add the bind back. Also condensed the stale debugging-history comments in buildlib/pr/main.yml and buildlib/tools/test_ucxx.sh (comment-only changes, no behavior change).
1 parent a06846f commit a44239e

2 files changed

Lines changed: 9 additions & 33 deletions

File tree

buildlib/pr/main.yml

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -266,27 +266,18 @@ resources:
266266
image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-conda:26.06-azp-1
267267
options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES)
268268
- container: ucxx_rapidsai_ci_conda_gpu
269-
# Same image as above, +GPU. Match upstream rapidsai/ucxx CI container
270-
# setup: GPU passthrough + MPS socket, but NO IB devices/host net. UCX
271-
# falls back to tcp+sm+cuda_* transports same as upstream; with IB
272-
# exposed it picks rc_mlx5 and AM tests hang on this MLNX env.
273-
# seccomp=unconfined + apparmor=unconfined: Docker's default profile
274-
# blocks the ptrace syscall even when SYS_PTRACE cap is granted, so
275-
# ucxx ci/timeout_with_stack.py's gdb backtraces return empty. Lifting
276-
# the syscall filter restores gdb attach on hung tests.
269+
# GPU passthrough only, no IB/host-net. seccomp/apparmor unconfined so
270+
# SYS_PTRACE works (ucxx timeout_with_stack.py uses gdb attach).
271+
# /tmp/nvidia-mps auto-mounted by nvidia-container-runtime under --gpus all.
277272
image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-conda:26.06-azp-1
278-
options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) --gpus all --ipc=host -v /tmp/nvidia-mps:/tmp/nvidia-mps --security-opt=seccomp=unconfined --security-opt=apparmor=unconfined
273+
options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) --gpus all --ipc=host --security-opt=seccomp=unconfined --security-opt=apparmor=unconfined
279274
- container: ucxx_rapidsai_ci_wheel
280-
# Thin wrapper of rapidsai/ci-wheel; see buildlib/dockers/rapidsai-ci-wheel.Dockerfile.
281275
image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.06-azp-1
282276
options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES)
283277
- container: ucxx_rapidsai_ci_wheel_gpu
284-
# Same image as ucxx_rapidsai_ci_wheel, +GPU + MPS socket + shm + ulimit
285-
# bumped to match upstream wheels-test container-options. seccomp/
286-
# apparmor unconfined so gdb attach works on hung tests (see conda_gpu
287-
# note above).
278+
# +GPU + shm + ulimit to match upstream wheel-test container options.
288279
image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.06-azp-1
289-
options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) --gpus all --ipc=host --shm-size=8g --ulimit nofile=1000000:1000000 -v /tmp/nvidia-mps:/tmp/nvidia-mps --security-opt=seccomp=unconfined --security-opt=apparmor=unconfined
280+
options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) --gpus all --ipc=host --shm-size=8g --ulimit nofile=1000000:1000000 --security-opt=seccomp=unconfined --security-opt=apparmor=unconfined
290281

291282
stages:
292283
- stage: Codestyle

buildlib/tools/test_ucxx.sh

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,7 @@ cd "$UCXX_DIR"
3636
# on CPU containers). Idempotent.
3737
sed -i 's#^ nvidia-smi$# command -v nvidia-smi >/dev/null \&\& nvidia-smi || echo "(no GPU)"#' ci/test_common.sh
3838

39-
# Diagnostic: surface container uid + key file perms early. Runs A-K
40-
# walked the wrong path assuming we ran as root and could useradd/su to
41-
# the MPS daemon's owner uid (61206 swx-azure-svc). Run K proved we are
42-
# NOT root - useradd hits "cannot lock /etc/passwd; Permission denied".
43-
# Print id/whoami so the next step decides on real data.
39+
# Diagnostic: dump container uid + key perms before each GPU test phase.
4440
diag_container_identity() {
4541
echo "[diag] === container identity ==="
4642
id || true
@@ -84,13 +80,8 @@ EOF
8480
test_python)
8581
if [ "$IS_GPU" = "true" ]; then
8682
diag_container_identity
87-
# Run O probe (hypothesis A): MPS active (no CUDA_MPS_PIPE_DIRECTORY
88-
# override). Drop xdist entirely so pytest runs in a single process,
89-
# makes one cuInit through MPS, no fork/spawn fan-out. UCX gtest works
90-
# under MPS as a single-process client; if single-process pytest also
91-
# works, hypothesis A is confirmed: MPS daemon can't sustain
92-
# multi-process cuInit fan-out from xdist workers (control.log shows
93-
# `Unable to accept connection` loop).
83+
# Drop xdist - single-process pytest issues one cuInit through MPS,
84+
# avoids fan-out of half-completed handshakes under MPS load.
9485
sed -i 's/pytest -n 4/pytest -p no:xdist/g' ci/run_python.sh
9586
export UCX_LOG_LEVEL=${UCX_LOG_LEVEL:-DEBUG}
9687
export UCXPY_LOG_LEVEL=${UCXPY_LOG_LEVEL:-DEBUG}
@@ -106,10 +97,6 @@ EOF
10697
printf '#!/bin/bash\necho "%s"\n' "$PYTHON_CHANNEL_DIR" > "$HOME/.local/bin/rapids-download-from-github"
10798
chmod +x "$HOME/.local/bin/rapids-download-conda-from-github" "$HOME/.local/bin/rapids-download-from-github"
10899
diag_container_identity
109-
# No MPS bypass for Run O. run_python_distributed.sh does not use xdist
110-
# in the test invocation - already single-process - so no patch needed
111-
# here. Failure under MPS for the distributed leg is a separate signal
112-
# (test_ucxx_localcluster spawns dask workers; those are extra procs).
113100
export UCX_LOG_LEVEL=${UCX_LOG_LEVEL:-DEBUG}
114101
export UCXPY_LOG_LEVEL=${UCXPY_LOG_LEVEL:-DEBUG}
115102
export UCX_LOG_FILE=/tmp/ucx_%P.log
@@ -141,8 +128,6 @@ EOF
141128
fi
142129
chmod +x "$HOME/.local/bin/rapids-download-from-github"
143130
diag_container_identity
144-
# Run O probe (hypothesis A): MPS active, drop xdist for single-process
145-
# pytest. See test_python phase comment for rationale.
146131
sed -i 's/pytest -n 4/pytest -p no:xdist/g' ci/run_python.sh
147132
export UCX_LOG_LEVEL=${UCX_LOG_LEVEL:-DEBUG}
148133
export UCXPY_LOG_LEVEL=${UCXPY_LOG_LEVEL:-DEBUG}

0 commit comments

Comments
 (0)