Remove the nvjpeg patch step from the PyTorch 2.8 cu129 Dockerfile, re-enable the commented-out EC2 tests, and rebuild with the normal test path

jinyan-li1 · jinyan-li1 · commit 93b6143b5eeb · 2025-08-13T20:30:09.000-07:00
diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
@@ -41,12 +41,12 @@ build_frameworks = ["pytorch"]
 
 
 # By default we build both training and inference containers. Set true/false values to determine which to build.
-build_training = true
+build_training = true 
 build_inference = false
 
 # Set do_build to "false" to skip builds and test the latest image built by this PR
 # Note: at least one build is required to set do_build to "false"
-do_build = true
+do_build = true 
 
 [notify]
 ### Notify on test failures
@@ -71,7 +71,7 @@ ec2_benchmark_tests = true
 ### default. If false, these types of tests will be skipped while other tests will run as usual.
 ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true.
 ### Off by default (set to false)
-ec2_tests_on_heavy_instances = true
+ec2_tests_on_heavy_instances = true 
 ### SM specific tests
 ### On by default
 sagemaker_local_tests = true
@@ -119,7 +119,7 @@ use_scheduler = false
 ### TRAINING PR JOBS ###
 
 # Standard Framework Training
-dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml"
+dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-ec2.yml" 
 dlc-pr-tensorflow-2-training = ""
 dlc-pr-autogluon-training = ""
 
diff --git a/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu b/pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu
@@ -19,7 +19,6 @@ FROM nvidia/cuda:12.9.1-base-ubuntu22.04 AS base_image
 
 # This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20
 ENV DEBIAN_FRONTEND=noninteractive
-# ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}"
 
 RUN apt-get update \
  && apt-get upgrade -y \
@@ -61,8 +60,6 @@ ARG NCCL_VERSION
 ARG EFA_VERSION
 
 ENV CUDA_HOME="/usr/local/cuda"
-# ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}"
-# ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"
 ENV PATH="${CUDA_HOME}/bin:${PATH}"
 ENV EFA_PATH="/opt/amazon/efa"
 ENV OPEN_MPI_PATH="/opt/amazon/openmpi"
@@ -132,20 +129,6 @@ RUN apt-get update \
  && rm -rf /var/lib/apt/lists/* \
  && apt-get clean
 
- # patch nvjpeg to fix CVE
-RUN mkdir -p /tmp/nvjpeg \
-&& cd /tmp/nvjpeg \
-&& wget https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
-&& tar -xvf libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
-&& rm -rf /usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg* \
-&& rm -rf /usr/local/cuda/targets/x86_64-linux/include/nvjpeg.h \
-&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/lib/libnvjpeg* /usr/local/cuda/targets/x86_64-linux/lib/ \
-&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/include/* /usr/local/cuda/targets/x86_64-linux/include/ \
-&& rm -rf /tmp/nvjpeg \
-# patch cuobjdump and nvdisasm
-&& rm -rf /usr/local/cuda/bin/cuobjdump* \
-&& rm -rf /usr/local/cuda/bin/nvdisasm* 
-
 # For EFA, below flags are needed to install EFA on docker image
 #  -n, --no-verify       Skip EFA device verification and test
 #  -l, --skip-limit-conf Skip EFA limit configuration
diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_8.py
@@ -25,18 +25,18 @@ def test_pytorch_2_8_gpu(
         )
 
     test_cases = [
-        # (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)),
-        # (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)),
-        # (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)),
-        # (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)),
-        # (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)),
-        # (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)),
-        # (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)),
-        # (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)),
-        # (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)),
-        # (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)),
-        # (common_cases.pytorch_telemetry_bashrc_gpu, (pytorch_training, ec2_connection)),
-        # (common_cases.pytorch_telemetry_entrypoint_gpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)),
+        (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_telemetry_bashrc_gpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_telemetry_entrypoint_gpu, (pytorch_training, ec2_connection)),
     ]
 
     if "sagemaker" in pytorch_training:
@@ -74,8 +74,8 @@ def test_pytorch_2_8_gpu_heavy(
         )
 
     test_cases = [
-        # (common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)),
-        # (common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)),
     ]
 
     test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.8 GPU Heavy")
@@ -118,15 +118,15 @@ def test_pytorch_2_8_cpu(pytorch_training___2__8, ec2_connection, cpu_only):
     pytorch_training = pytorch_training___2__8
 
     test_cases = [
-        # (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)),
-        # (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)),
-        # (common_cases.pytorch_linear_regression_cpu, (pytorch_training, ec2_connection)),
-        # (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)),
-        # (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)),
-        # (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)),
-        # (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)),
-        # (common_cases.pytorch_telemetry_bashrc_cpu, (pytorch_training, ec2_connection)),
-        # (common_cases.pytorch_telemetry_entrypoint_cpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_linear_regression_cpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_telemetry_bashrc_cpu, (pytorch_training, ec2_connection)),
+        (common_cases.pytorch_telemetry_entrypoint_cpu, (pytorch_training, ec2_connection)),
     ]
 
     if "sagemaker" in pytorch_training:
diff --git a/test/dlc_tests/ec2/test_efa.py b/test/dlc_tests/ec2/test_efa.py
@@ -147,50 +147,50 @@ def test_efa_tensorflow(
     )
 
 
-# @pytest.mark.skip(
-#     "EFA healthcheck binaries are not maintained by DLC, we will skip these tests moving foward unless binaries are added otherwise."
-# )
-# @pytest.mark.processor("gpu")
-# @pytest.mark.model("N/A")
-# @pytest.mark.integration("efa")
-# @pytest.mark.usefixtures("sagemaker_only")
-# @pytest.mark.usefixtures("pt201_and_above_only")
-# @pytest.mark.allow_p4de_use
-# @pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_ONLY_P4_INSTANCE_TYPE_AND_REGION)
-# @pytest.mark.team("conda")
-# @pytest.mark.skipif(
-#     is_pr_context() and not are_heavy_instance_ec2_tests_enabled(),
-#     reason="Skip EFA test in PR context unless explicitly enabled",
-# )
-# def test_pytorch_efa_healthcheck(
-#     pytorch_training,
-#     efa_ec2_instances,
-#     efa_ec2_connections,
-#     ec2_instance_type,
-#     region,
-#     gpu_only,
-# ):
-#     """
-#     Run EFA Health Check tests on DLC.
-#     :param pytorch_training: str PyTorch Training DLC image URI
-#     :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances
-#     :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances
-#     :param ec2_instance_type: str Instance Type being tested
-#     :param region: str Region in which EFA-enabled instances are launched
-#     :param gpu_only: pytest fixture to limit test only to GPU DLCs
-#     """
-#     _setup_multinode_efa_instances(
-#         pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region
-#     )
-#     master_connection = efa_ec2_connections[0]
-#     run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False)
-#     run_cmd_on_container(
-#         MASTER_CONTAINER_NAME,
-#         master_connection,
-#         f"{EFA_PYTORCH_HEALTHCHECK_TEST_CMD}",
-#         hide=False,
-#         timeout=DEFAULT_EFA_TIMEOUT,
-#     )
+@pytest.mark.skip(
+    "EFA healthcheck binaries are not maintained by DLC, we will skip these tests moving foward unless binaries are added otherwise."
+)
+@pytest.mark.processor("gpu")
+@pytest.mark.model("N/A")
+@pytest.mark.integration("efa")
+@pytest.mark.usefixtures("sagemaker_only")
+@pytest.mark.usefixtures("pt201_and_above_only")
+@pytest.mark.allow_p4de_use
+@pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_ONLY_P4_INSTANCE_TYPE_AND_REGION)
+@pytest.mark.team("conda")
+@pytest.mark.skipif(
+    is_pr_context() and not are_heavy_instance_ec2_tests_enabled(),
+    reason="Skip EFA test in PR context unless explicitly enabled",
+)
+def test_pytorch_efa_healthcheck(
+    pytorch_training,
+    efa_ec2_instances,
+    efa_ec2_connections,
+    ec2_instance_type,
+    region,
+    gpu_only,
+):
+    """
+    Run EFA Health Check tests on DLC.
+    :param pytorch_training: str PyTorch Training DLC image URI
+    :param efa_ec2_instances: list of tuples of instance-ids and SSH-keys for EFA-enabled instances
+    :param efa_ec2_connections: list of Fabric Connection objects for EFA-enabled instances
+    :param ec2_instance_type: str Instance Type being tested
+    :param region: str Region in which EFA-enabled instances are launched
+    :param gpu_only: pytest fixture to limit test only to GPU DLCs
+    """
+    _setup_multinode_efa_instances(
+        pytorch_training, efa_ec2_instances, efa_ec2_connections, ec2_instance_type, region
+    )
+    master_connection = efa_ec2_connections[0]
+    run_cmd_on_container(MASTER_CONTAINER_NAME, master_connection, EFA_SANITY_TEST_CMD, hide=False)
+    run_cmd_on_container(
+        MASTER_CONTAINER_NAME,
+        master_connection,
+        f"{EFA_PYTORCH_HEALTHCHECK_TEST_CMD}",
+        hide=False,
+        timeout=DEFAULT_EFA_TIMEOUT,
+    )
 
 
 def _setup_multinode_efa_instances(
@@ -380,33 +380,12 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst
         for worker_ip in worker_instance_private_ips:
             hosts_string += f"\n{worker_ip} slots={slots} "
 
-        # TODO: remove logging
-        LOGGER.info(f"Attempting to create hosts file with content:\n{hosts_string}")
-
-        LOGGER.info(f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""")
         run_cmd_on_container(
             MASTER_CONTAINER_NAME,
             master_connection,
             f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""",
         )
 
-        # TODO: remove logging
-        LOGGER.info(f"Checking if hosts file exists:")
-        run_cmd_on_container(
-            MASTER_CONTAINER_NAME,
-            master_connection,
-            f"ls -l {HOSTS_FILE_LOCATION}",
-            hide=False
-        )
-
-        LOGGER.info(f"Checking hosts file contents:")
-        run_cmd_on_container(
-            MASTER_CONTAINER_NAME,
-            master_connection,
-            f"cat {HOSTS_FILE_LOCATION}",
-            hide=False
-        )
-
 
 def _setup_worker_efa_ssh_config(connection, master_pub_key):
     """
diff --git a/test/testrunner.py b/test/testrunner.py
@@ -444,8 +444,7 @@ def main():
         pytest_cmd = [
             "-s",
             "-rA",
-            # test_path,
-            os.path.join(test_path, "test_efa.py::test_pytorch_efa"),
+            test_path,
             f"--junitxml={report}",
             "-n=auto",
         ]