Skip to content

Commit 93b6143

Browse files
committed
Remove nvjpeg patching script and rebuild with normal test path
1 parent 454b693 commit 93b6143

File tree

5 files changed

+72
-111
lines changed

5 files changed

+72
-111
lines changed

dlc_developer_config.toml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,12 @@ build_frameworks = ["pytorch"]
4141

4242

4343
# By default we build both training and inference containers. Set true/false values to determine which to build.
44-
build_training = true
44+
build_training = true
4545
build_inference = false
4646

4747
# Set do_build to "false" to skip builds and test the latest image built by this PR
4848
# Note: at least one build is required to set do_build to "false"
49-
do_build = true
49+
do_build = true
5050

5151
[notify]
5252
### Notify on test failures
@@ -71,7 +71,7 @@ ec2_benchmark_tests = true
7171
### default. If false, these types of tests will be skipped while other tests will run as usual.
7272
### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true.
7373
### Off by default (set to false)
74-
ec2_tests_on_heavy_instances = true
74+
ec2_tests_on_heavy_instances = true
7575
### SM specific tests
7676
### On by default
7777
sagemaker_local_tests = true
@@ -119,7 +119,7 @@ use_scheduler = false
119119
### TRAINING PR JOBS ###
120120

121121
# Standard Framework Training
122-
dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml"
122+
dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml"
123123
dlc-pr-tensorflow-2-training = ""
124124
dlc-pr-autogluon-training = ""
125125

pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ FROM nvidia/cuda:12.9.1-base-ubuntu22.04 AS base_image
1919

2020
# This arg is required to stop the docker build from waiting for region configuration while installing tz data from ubuntu 20
2121
ENV DEBIAN_FRONTEND=noninteractive
22-
# ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}"
2322

2423
RUN apt-get update \
2524
&& apt-get upgrade -y \
@@ -61,8 +60,6 @@ ARG NCCL_VERSION
6160
ARG EFA_VERSION
6261

6362
ENV CUDA_HOME="/usr/local/cuda"
64-
# ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}"
65-
# ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"
6663
ENV PATH="${CUDA_HOME}/bin:${PATH}"
6764
ENV EFA_PATH="/opt/amazon/efa"
6865
ENV OPEN_MPI_PATH="/opt/amazon/openmpi"
@@ -132,20 +129,6 @@ RUN apt-get update \
132129
&& rm -rf /var/lib/apt/lists/* \
133130
&& apt-get clean
134131

135-
# patch nvjpeg to fix CVE
136-
RUN mkdir -p /tmp/nvjpeg \
137-
&& cd /tmp/nvjpeg \
138-
&& wget https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
139-
&& tar -xvf libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
140-
&& rm -rf /usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg* \
141-
&& rm -rf /usr/local/cuda/targets/x86_64-linux/include/nvjpeg.h \
142-
&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/lib/libnvjpeg* /usr/local/cuda/targets/x86_64-linux/lib/ \
143-
&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/include/* /usr/local/cuda/targets/x86_64-linux/include/ \
144-
&& rm -rf /tmp/nvjpeg \
145-
# patch cuobjdump and nvdisasm
146-
&& rm -rf /usr/local/cuda/bin/cuobjdump* \
147-
&& rm -rf /usr/local/cuda/bin/nvdisasm*
148-
149132
# For EFA, below flags are needed to install EFA on docker image
150133
# -n, --no-verify Skip EFA device verification and test
151134
# -l, --skip-limit-conf Skip EFA limit configuration

test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -25,18 +25,18 @@ def test_pytorch_2_8_gpu(
2525
)
2626

2727
test_cases = [
28-
# (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)),
29-
# (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)),
30-
# (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)),
31-
# (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)),
32-
# (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)),
33-
# (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)),
34-
# (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)),
35-
# (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)),
36-
# (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)),
37-
# (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)),
38-
# (common_cases.pytorch_telemetry_bashrc_gpu, (pytorch_training, ec2_connection)),
39-
# (common_cases.pytorch_telemetry_entrypoint_gpu, (pytorch_training, ec2_connection)),
28+
(common_cases.pytorch_standalone, (pytorch_training, ec2_connection)),
29+
(common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)),
30+
(common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)),
31+
(common_cases.pytorch_gloo, (pytorch_training, ec2_connection)),
32+
(common_cases.pytorch_nccl, (pytorch_training, ec2_connection)),
33+
(common_cases.pytorch_mpi, (pytorch_training, ec2_connection)),
34+
(common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)),
35+
(common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)),
36+
(common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)),
37+
(common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)),
38+
(common_cases.pytorch_telemetry_bashrc_gpu, (pytorch_training, ec2_connection)),
39+
(common_cases.pytorch_telemetry_entrypoint_gpu, (pytorch_training, ec2_connection)),
4040
]
4141

4242
if "sagemaker" in pytorch_training:
@@ -74,8 +74,8 @@ def test_pytorch_2_8_gpu_heavy(
7474
)
7575

7676
test_cases = [
77-
# (common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)),
78-
# (common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)),
77+
(common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)),
78+
(common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)),
7979
]
8080

8181
test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.8 GPU Heavy")
@@ -118,15 +118,15 @@ def test_pytorch_2_8_cpu(pytorch_training___2__8, ec2_connection, cpu_only):
118118
pytorch_training = pytorch_training___2__8
119119

120120
test_cases = [
121-
# (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)),
122-
# (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)),
123-
# (common_cases.pytorch_linear_regression_cpu, (pytorch_training, ec2_connection)),
124-
# (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)),
125-
# (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)),
126-
# (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)),
127-
# (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)),
128-
# (common_cases.pytorch_telemetry_bashrc_cpu, (pytorch_training, ec2_connection)),
129-
# (common_cases.pytorch_telemetry_entrypoint_cpu, (pytorch_training, ec2_connection)),
121+
(common_cases.pytorch_standalone, (pytorch_training, ec2_connection)),
122+
(common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)),
123+
(common_cases.pytorch_linear_regression_cpu, (pytorch_training, ec2_connection)),
124+
(common_cases.pytorch_gloo, (pytorch_training, ec2_connection)),
125+
(common_cases.pytorch_mpi, (pytorch_training, ec2_connection)),
126+
(common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)),
127+
(common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)),
128+
(common_cases.pytorch_telemetry_bashrc_cpu, (pytorch_training, ec2_connection)),
129+
(common_cases.pytorch_telemetry_entrypoint_cpu, (pytorch_training, ec2_connection)),
130130
]
131131

132132
if "sagemaker" in pytorch_training:

test/dlc_tests/ec2/test_efa.py

Lines changed: 44 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -147,50 +147,50 @@ def test_efa_tensorflow(
147147
)
148148

149149

150-
# @pytest.mark.skip(
151-
# "EFA healthcheck binaries are not maintained by DLC, we will skip these tests moving foward unless binaries are added otherwise."
152-
# )
153-
# @pytest.mark.processor("gpu")
154-
# @pytest.mark.model("N/A")
155-
# @pytest.mark.integration("efa")
156-
# @pytest.mark.usefixtures("sagemaker_only")
157-
# @pytest.mark.usefixtures("pt201_and_above_only")
158-
# @pytest.mark.allow_p4de_use
159-
# @pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_ONLY_P4_INSTANCE_TYPE_AND_REGION)
160-
# @pytest.mark.team("conda")
161-
# @pytest.mark.skipif(
162-
# is_pr_context() and not are_heavy_instance_ec2_tests_enabled(),
163-
# reason="Skip EFA test in PR context unless explicitly enabled",
164-
# )
165-
# def test_pytorch_efa_healthcheck(
166-
# pytorch_training,
167-
# efa_ec2_instances,
168-
# efa_ec2_connections,
169-
# ec2_instance_type,
170-
# region,
171-
# gpu_only,
172-
# ):
173-
# """
174-
# Run EFA Health Check tests on DLC.
175-
# :param pytorch_training: str PyTorch Training DLC image URI
176-
# :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances
177-
# :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances
178-
# :param ec2_instance_type: str Instance Type being tested
179-
# :param region: str Region in which EFA-enabled instances are launched
180-
# :param gpu_only: pytest fixture to limit test only to GPU DLCs
181-
# """
182-
# _setup_multinode_efa_instances(
183-
# pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region
184-
# )
185-
# master_connection = efa_ec2_connections[0]
186-
# run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False)
187-
# run_cmd_on_container(
188-
# MASTER_CONTAINER_NAME,
189-
# master_connection,
190-
# f"{EFA_PYTORCH_HEALTHCHECK_TEST_CMD}",
191-
# hide=False,
192-
# timeout=DEFAULT_EFA_TIMEOUT,
193-
# )
150+
@pytest.mark.skip(
151+
"EFA healthcheck binaries are not maintained by DLC, we will skip these tests moving foward unless binaries are added otherwise."
152+
)
153+
@pytest.mark.processor("gpu")
154+
@pytest.mark.model("N/A")
155+
@pytest.mark.integration("efa")
156+
@pytest.mark.usefixtures("sagemaker_only")
157+
@pytest.mark.usefixtures("pt201_and_above_only")
158+
@pytest.mark.allow_p4de_use
159+
@pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_ONLY_P4_INSTANCE_TYPE_AND_REGION)
160+
@pytest.mark.team("conda")
161+
@pytest.mark.skipif(
162+
is_pr_context() and not are_heavy_instance_ec2_tests_enabled(),
163+
reason="Skip EFA test in PR context unless explicitly enabled",
164+
)
165+
def test_pytorch_efa_healthcheck(
166+
pytorch_training,
167+
efa_ec2_instances,
168+
efa_ec2_connections,
169+
ec2_instance_type,
170+
region,
171+
gpu_only,
172+
):
173+
"""
174+
Run EFA Health Check tests on DLC.
175+
:param pytorch_training: str PyTorch Training DLC image URI
176+
:param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances
177+
:param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances
178+
:param ec2_instance_type: str Instance Type being tested
179+
:param region: str Region in which EFA-enabled instances are launched
180+
:param gpu_only: pytest fixture to limit test only to GPU DLCs
181+
"""
182+
_setup_multinode_efa_instances(
183+
pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region
184+
)
185+
master_connection = efa_ec2_connections[0]
186+
run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False)
187+
run_cmd_on_container(
188+
MASTER_CONTAINER_NAME,
189+
master_connection,
190+
f"{EFA_PYTORCH_HEALTHCHECK_TEST_CMD}",
191+
hide=False,
192+
timeout=DEFAULT_EFA_TIMEOUT,
193+
)
194194

195195

196196
def _setup_multinode_efa_instances(
@@ -380,33 +380,12 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst
380380
for worker_ip in worker_instance_private_ips:
381381
hosts_string += f"\n{worker_ip} slots={slots} "
382382

383-
# TODO: remove logging
384-
LOGGER.info(f"Attempting to create hosts file with content:\n{hosts_string}")
385-
386-
LOGGER.info(f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""")
387383
run_cmd_on_container(
388384
MASTER_CONTAINER_NAME,
389385
master_connection,
390386
f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""",
391387
)
392388

393-
# TODO: remove logging
394-
LOGGER.info(f"Checking if hosts file exists:")
395-
run_cmd_on_container(
396-
MASTER_CONTAINER_NAME,
397-
master_connection,
398-
f"ls -l {HOSTS_FILE_LOCATION}",
399-
hide=False
400-
)
401-
402-
LOGGER.info(f"Checking hosts file contents:")
403-
run_cmd_on_container(
404-
MASTER_CONTAINER_NAME,
405-
master_connection,
406-
f"cat {HOSTS_FILE_LOCATION}",
407-
hide=False
408-
)
409-
410389

411390
def _setup_worker_efa_ssh_config(connection, master_pub_key):
412391
"""

test/testrunner.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -444,8 +444,7 @@ def main():
444444
pytest_cmd = [
445445
"-s",
446446
"-rA",
447-
# test_path,
448-
os.path.join(test_path, "test_efa.py::test_pytorch_efa"),
447+
test_path,
449448
f"--junitxml={report}",
450449
"-n=auto",
451450
]

0 commit comments

Comments
 (0)