Skip to content

Commit 211c9c5

Browse files
authored
vLLM DLC (#4839)
* initial dockerfile change * add vllm build type * add oss compliance * add python arg * update dockerfile * fix dockerfile * take security update * fix syntax * fix dockerfile, tests * fix tests * revert dev changes and format
1 parent 6baeba1 commit 211c9c5

File tree

10 files changed

+156
-12
lines changed

10 files changed

+156
-12
lines changed

data/common-ecr-scan-allowlist.json

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,35 @@
11
{
2+
"vllm": [
3+
{
4+
"description": "vLLM is an inference and serving engine for large language models. In a multi-node vLLM deployment using the V0 engine, vLLM uses ZeroMQ for some multi-node communication purposes. The secondary vLLM hosts open a `SUB` ZeroMQ socket and connect to an `XPUB` socket on the primary vLLM host. When data is received on this `SUB` socket, it is deserialized with `pickle`. This is unsafe, as it can be abused to execute code on a remote machine. Since the vulnerability exists in a client that connects to the primary vLLM host, this vulnerability serves as an escalation point. If the primary vLLM host is compromised, this vulnerability could be used to compromise the rest of the hosts in the vLLM deployment. Attackers could also use other means to exploit the vulnerability without requiring access to the primary vLLM host. One example would be the use of ARP cache poisoning to redirect traffic to a malicious endpoint used to deliver a payload with arbitrary code to execute on the target machine. Note that this issue o",
5+
"vulnerability_id": "CVE-2025-30165",
6+
"name": "CVE-2025-30165",
7+
"package_name": "vllm",
8+
"package_details": {
9+
"file_path": "/usr/local/lib/python3.12/dist-packages/vllm-0.8.5.dist-info/METADATA",
10+
"name": "vllm",
11+
"package_manager": "PYTHON",
12+
"version": "0.8.5",
13+
"release": null
14+
},
15+
"remediation": {
16+
"recommendation": {
17+
"text": "None Provided"
18+
}
19+
},
20+
"cvss_v3_score": 8.0,
21+
"cvss_v30_score": 0.0,
22+
"cvss_v31_score": 8.0,
23+
"cvss_v2_score": 0.0,
24+
"cvss_v3_severity": "HIGH",
25+
"source_url": "https://nvd.nist.gov/vuln/detail/CVE-2025-30165",
26+
"source": "NVD",
27+
"severity": "HIGH",
28+
"status": "ACTIVE",
29+
"title": "CVE-2025-30165 - vllm",
30+
"reason_to_ignore": "this vulnerability specifically affects the vllm package when used with the v0 engine, which is not the default engine in vLLM. The default engine is v1, which does not use ZeroMQ for communication. Therefore, this vulnerability does not affect the default configuration of vLLM."
31+
}
32+
],
233
"pip": [
334
{
435
"description": "An issue was discovered in pip (all versions) because it installs the version with the highest version number, even if the user had intended to obtain a private package from a private index. This only affects use of the --extra-index-url option, and exploitation requires that the package does not already exist in the public index (and thus the attacker can put the package there with an arbitrary version number). NOTE: it has been reported that this is intended functionality and the user is responsible for using --extra-index-url securely",

dlc_developer_config.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ deep_canary_mode = false
3636

3737
[build]
3838
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
39-
# available frameworks - ["base", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
39+
# available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
4040
build_frameworks = []
4141

4242

scripts/install_efa.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ function install_efa {
3030
echo "hwloc_base_binding_policy = none" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf
3131
echo "rmaps_base_mapping_policy = slot" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf
3232
echo NCCL_DEBUG=INFO >> /etc/nccl.conf
33-
echo NCCL_SOCKET_IFNAME=^docker0 >> /etc/nccl.conf
33+
echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf
3434

3535
# Install OpenSSH for MPI to communicate between containers, allow OpenSSH to talk to containers without asking for confirmation
3636
apt-get install -y --no-install-recommends \

src/image_builder.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,8 @@ def image_builder(buildspec, image_types=[], device_types=[]):
247247
label_job_type = "inference"
248248
elif "base" in image_repo_uri:
249249
label_job_type = "base"
250+
elif "vllm" in image_repo_uri:
251+
label_job_type = "vllm"
250252
else:
251253
raise RuntimeError(
252254
f"Cannot find inference, training or base job type in {image_repo_uri}. "

test/dlc_tests/sanity/test_pre_release.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,10 @@ def test_pip_check(image):
573573
574574
:param image: ECR image URI
575575
"""
576+
if "vllm" in image:
577+
pytest.skip(
578+
"vLLM images do not require pip check as they are managed by vLLM devs. Skipping test."
579+
)
576580

577581
allowed_exceptions = []
578582

@@ -715,9 +719,9 @@ def test_cuda_paths(gpu):
715719
:param gpu: gpu image uris
716720
"""
717721
image = gpu
718-
if "base" in image:
722+
if "base" in image or "vllm" in image:
719723
pytest.skip(
720-
"Base DLC doesn't have the same directory structure and buildspec as other images"
724+
"Base/vLLM DLC doesn't have the same directory structure and buildspec as other images"
721725
)
722726
if "example" in image:
723727
pytest.skip("Skipping Example Dockerfiles which are not explicitly tied to a cuda version")
@@ -857,8 +861,10 @@ def _test_framework_and_cuda_version(gpu, ec2_connection):
857861
:param ec2_connection: fixture to establish connection with an ec2 instance
858862
"""
859863
image = gpu
860-
if "base" in image:
861-
pytest.skip("Base DLC has no framework version in the tag. Skipping test.")
864+
if "base" in image or "vllm" in image:
865+
pytest.skip(
866+
"Base/vLLM DLC doesn't follow the assumptions made by inference/training. Skipping test."
867+
)
862868
tested_framework, tag_framework_version = get_framework_and_version_from_tag(image)
863869

864870
image_repo_name, _ = get_repository_and_tag_from_image_uri(image)
@@ -1110,8 +1116,8 @@ def test_core_package_version(image):
11101116
In this test, we ensure that if a core_packages.json file exists for an image, the packages installed in the image
11111117
satisfy the version constraints specified in the core_packages.json file.
11121118
"""
1113-
if "base" in image:
1114-
pytest.skip("Base images do not have core packages. Skipping test.")
1119+
if "base" in image or "vllm" in image:
1120+
pytest.skip("Base/vLLM images do not have core packages. Skipping test.")
11151121
core_packages_path = src_utils.get_core_packages_path(image)
11161122
if not os.path.exists(core_packages_path):
11171123
pytest.skip(f"Core packages file {core_packages_path} does not exist for {image}")
@@ -1160,9 +1166,9 @@ def test_package_version_regression_in_image(image):
11601166
keys in the buildspec - as these keys are used to extract the released image uri. Additionally, if the image is not already
11611167
released, this test would be skipped.
11621168
"""
1163-
if "base" in image:
1169+
if "base" in image or "vllm" in image:
11641170
pytest.skip(
1165-
"Base images don't have python packages that needs to be checked. Skipping test."
1171+
"Base/vLLM images don't have python packages that need to be checked. Skipping test."
11661172
)
11671173
dlc_path = os.getcwd().split("/test/")[0]
11681174
corresponding_image_spec = get_image_spec_from_buildspec(

test/dlc_tests/sanity/test_safety_report_file.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@ def test_safety_file_exists_and_is_valid(image):
7373
pytest.skip(
7474
"Base images do not require safety file as there isn't much python libs in it. Skipping test."
7575
)
76+
if "vllm" in image:
77+
pytest.skip(
78+
"vllm images do not require safety file as they are managed by vLLM devs. Skipping test."
79+
)
7680
repo_name, image_tag = image.split("/")[-1].split(":")
7781
# Make sure this container name doesn't conflict with the safety check test container name
7882
container_name = f"{repo_name}-{image_tag}-safety-file"

test/test_utils/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1892,6 +1892,7 @@ def get_framework_from_image_uri(image_uri):
18921892
"tensorflow": "tensorflow",
18931893
"autogluon": "autogluon",
18941894
"base": "base",
1895+
"vllm": "vllm",
18951896
}
18961897

18971898
for image_pattern, framework in framework_map.items():
@@ -2019,7 +2020,7 @@ def get_job_type_from_image(image_uri):
20192020
:return: Job Type
20202021
"""
20212022
tested_job_type = None
2022-
allowed_job_types = ("training", "inference", "base")
2023+
allowed_job_types = ("training", "inference", "base", "vllm")
20232024
for job_type in allowed_job_types:
20242025
if job_type in image_uri:
20252026
tested_job_type = job_type

test/test_utils/security.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -614,6 +614,14 @@ def construct_allowlist_from_ecr_scan_result(self, ecr_format_vulnerability_list
614614
for vulnerable_package in ecr_format_vulnerability["packageVulnerabilityDetails"][
615615
"vulnerablePackages"
616616
]:
617+
if "fixedInVersion" in vulnerable_package:
618+
fixed_version = vulnerable_package["fixedInVersion"].lower()
619+
if "esm" in fixed_version and "ubuntu" in fixed_version:
620+
LOGGER.info(
621+
f"Skipping ESM version {fixed_version} for package {vulnerable_package['name']}"
622+
)
623+
continue
624+
617625
allowlist_format_vulnerability_object = AllowListFormatVulnerabilityForEnhancedScan(
618626
**ecr_format_vulnerability
619627
)

vllm/buildspec.yml

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,48 @@
1-
# placeholder
1+
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
2+
prod_account_id: &PROD_ACCOUNT_ID 763104351884
3+
region: &REGION <set-$REGION-in-environment>
4+
framework: &FRAMEWORK vllm
5+
version: &VERSION "0.8.5"
6+
short_version: &SHORT_VERSION "0.8"
7+
arch_type: &ARCH_TYPE x86_64
8+
autopatch_build: "False"
9+
10+
repository_info:
11+
build_repository: &BUILD_REPOSITORY
12+
image_type: &IMAGE_TYPE gpu
13+
root: .
14+
repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK ]
15+
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
16+
release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK ]
17+
release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]
18+
19+
context:
20+
build_context: &BUILD_CONTEXT
21+
deep_learning_container:
22+
source: src/deep_learning_container.py
23+
target: deep_learning_container.py
24+
install_efa:
25+
source: scripts/install_efa.sh
26+
target: install_efa.sh
27+
28+
images:
29+
vllm_x86_64:
30+
<<: *BUILD_REPOSITORY
31+
context:
32+
<<: *BUILD_CONTEXT
33+
image_size_baseline: 20000
34+
device_type: &DEVICE_TYPE gpu
35+
cuda_version: &CUDA_VERSION cu124
36+
python_version: &DOCKER_PYTHON_VERSION py3
37+
tag_python_version: &TAG_PYTHON_VERSION py312
38+
os_version: &OS_VERSION ubuntu22.04
39+
tag: !join [ *DEVICE_TYPE, "-", *CUDA_VERSION, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-", *ARCH_TYPE, "-ec2" ]
40+
latest_release_tag: !join [ *DEVICE_TYPE, "-", *CUDA_VERSION, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-", *ARCH_TYPE, "-ec2" ]
41+
docker_file: !join [ *FRAMEWORK, /, *ARCH_TYPE, /, *DEVICE_TYPE, /, *CUDA_VERSION, /Dockerfile ]
42+
target: final
43+
build: true
44+
enable_common_stage_build: false
45+
test_configs:
46+
test_platforms:
47+
- sanity
48+
- security

vllm/x86_64/gpu/cu124/Dockerfile

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
FROM docker.io/vllm/vllm-openai:v0.8.5 as final
2+
ARG PYTHON="python3"
3+
ARG EFA_VERSION="1.40.0"
4+
LABEL maintainer="Amazon AI"
5+
LABEL dlc_major_version="1"
6+
ENV DEBIAN_FRONTEND=noninteractive \
7+
LANG=C.UTF-8 \
8+
LC_ALL=C.UTF-8 \
9+
DLC_CONTAINER_TYPE=base \
10+
# Python won’t try to write .pyc or .pyo files on the import of source modules
11+
# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
12+
PYTHONDONTWRITEBYTECODE=1 \
13+
PYTHONUNBUFFERED=1 \
14+
PYTHONIOENCODING=UTF-8 \
15+
LD_LIBRARY_PATH="/usr/local/lib:/opt/amazon/ofi-nccl/lib:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" \
16+
PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}"
17+
18+
WORKDIR /
19+
20+
COPY install_efa.sh install_efa.sh
21+
COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
22+
RUN chmod +x /usr/local/bin/deep_learning_container.py && \
23+
# Install EFA
24+
bash install_efa.sh ${EFA_VERSION} && \
25+
rm install_efa.sh && \
26+
# OSS compliance and software update
27+
apt-get update && \
28+
apt-get upgrade -y && \
29+
apt-get install -y --allow-change-held-packages --no-install-recommends unzip && \
30+
apt-get clean && \
31+
HOME_DIR=/root && \
32+
curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \
33+
unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \
34+
cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \
35+
chmod +x /usr/local/bin/testOSSCompliance && \
36+
chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \
37+
${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} && \
38+
# create symlink for python
39+
ln -s /usr/bin/python3 /usr/bin/python && \
40+
# clean up
41+
rm -rf ${HOME_DIR}/oss_compliance* && \
42+
rm -rf /tmp/tmp* && \
43+
rm -rf /tmp/uv* && \
44+
rm -rf /var/lib/apt/lists/* && \
45+
rm -rf /root/.cache || true

0 commit comments

Comments
 (0)