Skip to content

Commit cda0a2d

Browse files
authored
Merge branch 'main' into feat/drainGPUPods
2 parents cb354d2 + 7c22ee0 commit cda0a2d

8 files changed

Lines changed: 515 additions & 10 deletions

File tree

docs/designs/040-external-remediation-request.md

Lines changed: 503 additions & 0 deletions
Large diffs are not rendered by default.

health-monitors/gpu-health-monitor/Dockerfile

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,6 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
4949
apt-get install -y --no-install-recommends \
5050
apt-transport-https \
5151
ca-certificates \
52-
gnupg \
5352
python3-pip && \
5453
rm -rf /var/lib/apt/lists/*
5554

log-collector/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@
2020
FROM public.ecr.aws/docker/library/ubuntu:24.04
2121

2222
ARG DEBIAN_FRONTEND=noninteractive
23-
ARG KUBECTL_VERSION=v1.34.1
23+
ARG KUBECTL_VERSION=v1.34.8
2424

2525
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
2626
--mount=type=cache,target=/var/lib/apt,sharing=locked \

log-collector/Dockerfile.cleanup

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
FROM public.ecr.aws/docker/library/python:3.13-alpine AS build
15+
FROM public.ecr.aws/docker/library/python:3.14-alpine AS build
1616

1717
RUN --mount=type=cache,target=/root/.cache/pip \
1818
pip install poetry==2.3.3 poetry-plugin-export==1.10.0
@@ -31,7 +31,7 @@ RUN --mount=type=cache,target=/tmp/poetry_cache \
3131
poetry build --format wheel
3232
RUN poetry export --format requirements.txt --output constraints.txt --without-hashes
3333

34-
FROM public.ecr.aws/docker/library/python:3.13-alpine AS runtime
34+
FROM public.ecr.aws/docker/library/python:3.14-alpine AS runtime
3535

3636
ENV PYTHONUNBUFFERED=1
3737

preflight-checks/dcgm-diag/Dockerfile

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,6 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
4747
apt-get install -y --no-install-recommends \
4848
apt-transport-https \
4949
ca-certificates \
50-
gnupg \
5150
python3-pip && \
5251
rm -rf /var/lib/apt/lists/*
5352

preflight-checks/nccl-allreduce/Dockerfile

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -22,8 +22,8 @@
2222

2323
# PyTorch version determines CUDA runtime version - must match cluster's GPU driver
2424
# See: https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#id4
25-
# 26.03-py3 = CUDA 13.2 (Ubuntu 24.04, Python 3.12)
26-
ARG PYTORCH_VERSION=26.03-py3
25+
# 26.04-py3 = CUDA 13.2.1 (Ubuntu 24.04, Python 3.12)
26+
ARG PYTORCH_VERSION=26.04-py3
2727
ARG VERSION="0.1.0"
2828

2929
# =============================================================================
@@ -67,6 +67,8 @@ FROM nvcr.io/nvidia/pytorch:${PYTORCH_VERSION} AS runtime
6767

6868
ARG VERSION
6969

70+
RUN rm -f /usr/local/bin/uv /usr/local/bin/uvx
71+
7072
# Remove the NGC image's bundled EFA/OFI-NCCL stack and stale RDMA libs.
7173
# On AWS, the host's /opt/amazon is mounted at runtime and provides the
7274
# EFA libs (libfabric, ofi-nccl plugin) that match the host's kernel

preflight-checks/nccl-allreduce/Makefile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ include ../../make/common.mk
2222
include ../../make/python.mk
2323
include ../../make/docker.mk
2424

25-
PYTORCH_VERSION ?= 25.01-py3
25+
PYTORCH_VERSION ?= 26.04-py3
2626
IMAGE_NAME := preflight-nccl-allreduce
2727

2828
.PHONY: all

preflight-checks/nccl-loopback/Dockerfile

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -14,8 +14,8 @@
1414

1515
# PyTorch version determines CUDA runtime version - must match cluster's GPU driver
1616
# See: https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#id4
17-
# 26.03-py3 = CUDA 13.2 (Ubuntu 24.04, Python 3.12)
18-
ARG PYTORCH_VERSION=26.03-py3
17+
# 26.04-py3 = CUDA 13.2.1 (Ubuntu 24.04, Python 3.12)
18+
ARG PYTORCH_VERSION=26.04-py3
1919
ARG NCCL_TESTS_VERSION=v2.13.11
2020

2121
# =============================================================================
@@ -67,6 +67,8 @@ FROM nvcr.io/nvidia/pytorch:${PYTORCH_VERSION} AS runtime
6767

6868
ARG NCCL_TESTS_VERSION
6969

70+
RUN rm -f /usr/local/bin/uv /usr/local/bin/uvx
71+
7072
# Build nccl-tests from source
7173
# Uses NCCL bundled with PyTorch image for version compatibility
7274
# NVCC_GENCODE targets modern data center GPUs:

0 commit comments

Comments
 (0)