45 commits
af074e3  Added hf-vllm v0.12.0 (Dec 10, 2025)
d431ead  Added tests for hf-vllm (Dec 10, 2025)
9e1d266  Changed dlc_developer_config.toml (Dec 10, 2025)
9e4d893  update conflict (Dec 15, 2025)
8930020  Merge branch 'master' into hf-vllm-0.12.0 (DevakiBolleneni, Dec 17, 2025)
b84eab4  modify toml file to add huggingface-vllm (Dec 17, 2025)
6099561  Merge branch 'master' into hf-vllm-0.12.0 (DevakiBolleneni, Dec 17, 2025)
81a94d9  updated buildspec following new pipeline creation (Dec 18, 2025)
289fb12  Fix test role (Dec 18, 2025)
8948619  added transformers version (Dec 18, 2025)
198d432  Merge branch 'master' into hf-vllm-0.12.0 (fgbelidji, Dec 18, 2025)
7d0e3a7  fix region and suffix of base image (Dec 18, 2025)
36e5adc  fix suffix of base image (Dec 18, 2025)
73dae49  fix repo name (Dec 19, 2025)
23cdda6  Merge branch 'master' into hf-vllm-0.12.0 (fgbelidji, Dec 19, 2025)
5aa6216  reverted dlc_developer_config.toml (Dec 19, 2025)
b36a91e  Merge branch 'master' into hf-vllm-0.12.0 (DevakiBolleneni, Jan 5, 2026)
4e95e1f  Merge branch 'master' into hf-vllm-0.12.0 (DevakiBolleneni, Jan 6, 2026)
ea680df  huggingface_vllm in dlc_developer_config.toml (Jan 6, 2026)
cf7f384  Merge branch 'master' into hf-vllm-0.12.0 (fgbelidji, Jan 6, 2026)
17d50f3  Renamed hf-vllm to vllm (Jan 6, 2026)
fb2c4a3  renamed hf-vllm tests to vllm (Jan 6, 2026)
f46e252  removed renamed folders (Jan 6, 2026)
e120746  added conftest, utils, requirements, and updated text_vllm (Jan 7, 2026)
9ae89fb  changed testrunner so it won't skip hf-vllm tests (Jan 7, 2026)
b2c4295  support for huggingface_vllm (Jan 7, 2026)
7a5e5ba  changed image_type buildspec (Jan 8, 2026)
8f0092e  enforce g6 instance (Jan 9, 2026)
ee50601  fix instance (Jan 12, 2026)
bae57d3  added local test (Jan 12, 2026)
0e40bdd  Merge branch 'master' into hf-vllm-0.12.0 (fgbelidji, Jan 12, 2026)
243eb2f  Merge branch 'master' into hf-vllm-0.12.0 (Jan 16, 2026)
181d61b  changed cuda compat logic (Jan 22, 2026)
b5a604b  updated test to sagemaker v3 (Jan 22, 2026)
b7769c3  Merge branch 'master' into hf-vllm-0.12.0 (fgbelidji, Jan 22, 2026)
8eb99bd  Enable local tests for huggingface_vllm (Jan 23, 2026)
2d993e1  Merge branch 'hf-vllm-0.12.0' of github.com:fgbelidji/deep-learning-c… (Jan 23, 2026)
ea1f74c  Add huggingface/vllm local mode tests with tiny-random-qwen3 model (Jan 27, 2026)
83d5d3f  Fix indentation error in __init__.py (Jan 27, 2026)
524f3aa  Download Qwen2.5-0.5B model at runtime for huggingface/vllm local tests (Jan 27, 2026)
73f1d41  hf hub in requirements.txt (Jan 27, 2026)
ee13c4f  Trigger CI (Jan 27, 2026)
66602d1  Merge branch 'master' into hf-vllm-0.12.0 (fgbelidji, Jan 27, 2026)
8b9347a  Fix: use docker_image instead of ecr_image for local tests (Jan 27, 2026)
9f42c8c  Merge branch 'master' into hf-vllm-0.12.0 (fgbelidji, Jan 27, 2026)
9 changes: 6 additions & 3 deletions dlc_developer_config.toml
@@ -1,6 +1,6 @@
[dev]
# Set to "huggingface", for example, if you are a huggingface developer. Default is ""
partner_developer = ""
partner_developer = "huggingface"
# Please only set it to true if you are preparing an EI related PR
# Do remember to revert it back to false before merging any PR (including EI dedicated PR)
ei_mode = false
@@ -36,8 +36,8 @@ deep_canary_mode = false

[build]
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
- # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
- build_frameworks = []
+ # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
+ build_frameworks = ["huggingface_vllm"]


# By default we build both training and inference containers. Set true/false values to determine which to build.
@@ -186,5 +186,8 @@ dlc-pr-tensorflow-2-eia-inference = ""
# vllm
dlc-pr-vllm = ""

+ # HuggingFace vLLM
+ dlc-pr-huggingface-vllm = ""

# sglang
dlc-pr-sglang = ""
25 changes: 25 additions & 0 deletions huggingface/vllm/build_artifacts/sagemaker_entrypoint.sh
@@ -0,0 +1,25 @@
#!/bin/bash
# Run the telemetry script if present; suppress errors so startup never fails on it
bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true

# Source CUDA compat for older drivers (e.g., g5 instances)
if command -v nvidia-smi >/dev/null 2>&1 && command -v nvcc >/dev/null 2>&1; then
    source /usr/local/bin/start_cuda_compat.sh
fi

# Translate SM_VLLM_*-prefixed environment variables into vLLM CLI flags:
# SM_VLLM_FOO_BAR=baz becomes --foo-bar baz; an empty value yields a bare flag.
PREFIX="SM_VLLM_"
ARG_PREFIX="--"

ARGS=(--port 8080)

while IFS='=' read -r key value; do
    arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')

    ARGS+=("${ARG_PREFIX}${arg_name}")
    if [ -n "$value" ]; then
        ARGS+=("$value")
    fi
done < <(env | grep "^${PREFIX}")

exec python3 -m vllm.entrypoints.openai.api_server "${ARGS[@]}"
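For illustration, a hypothetical invocation showing how the loop above rewrites SM_VLLM_-prefixed variables into server flags (the variable names, model, and image URI here are examples, not defaults shipped with the image):

docker run --gpus all \
    -e SM_VLLM_MODEL=Qwen/Qwen2.5-0.5B \
    -e SM_VLLM_TENSOR_PARALLEL_SIZE=1 \
    -e SM_VLLM_DTYPE=bfloat16 \
    <image-uri>
# ...which the entrypoint expands to roughly:
#   python3 -m vllm.entrypoints.openai.api_server --port 8080 \
#       --model Qwen/Qwen2.5-0.5B --tensor-parallel-size 1 --dtype bfloat16
# (flag order follows env(1) output and is not guaranteed)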
25 changes: 25 additions & 0 deletions huggingface/vllm/build_artifacts/start_cuda_compat.sh
@@ -0,0 +1,25 @@
#!/bin/bash

verlte() {
    # Succeeds when $1 sorts strictly before $2 in version order; equal versions fail.
    [ "$1" = "$2" ] && return 1 || [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]
}

COMPAT_FILE=/usr/local/cuda/compat/libcuda.so.1
if [ -f "$COMPAT_FILE" ]; then
    CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink "$COMPAT_FILE" | cut -d'.' -f 3-)
    echo "CUDA compat package should be installed for NVIDIA drivers older than ${CUDA_COMPAT_MAX_DRIVER_VERSION}"
    # Prefer the kernel module version; fall back to nvidia-smi if /proc is unavailable
    NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
    if [ -z "$NVIDIA_DRIVER_VERSION" ]; then
        NVIDIA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>/dev/null || true)
    fi
    echo "Current installed NVIDIA driver version is ${NVIDIA_DRIVER_VERSION}"
    if verlte "$NVIDIA_DRIVER_VERSION" "$CUDA_COMPAT_MAX_DRIVER_VERSION"; then
        echo "Adding CUDA compat to LD_LIBRARY_PATH"
        export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH
        echo "$LD_LIBRARY_PATH"
    else
        echo "Skipping CUDA compat setup as newer NVIDIA driver is installed"
    fi
else
    echo "Skipping CUDA compat setup as package not found"
fi
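A quick sketch of the comparison helper's behavior (note that verlte, as written, fails on equal versions, so the compat library is only used for strictly older drivers; the version numbers are illustrative):

# sort -V orders version strings numerically; head -n1 picks the smaller one
printf '535.104.05\n550.54.14\n' | sort -V | head -n1   # -> 535.104.05
# verlte 535.104.05 550.54.14   -> success: driver is older, compat path is added
# verlte 550.54.14  550.54.14   -> failure: equal versions return 1 in this helper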
58 changes: 56 additions & 2 deletions huggingface/vllm/buildspec.yml
@@ -1,2 +1,56 @@


account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
base_framework: &BASE_FRAMEWORK vllm
framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
version: &VERSION "0.12.0"
short_version: &SHORT_VERSION "0.12"
arch_type: &ARCH_TYPE x86_64
autopatch_build: "False"

repository_info:
  build_repository: &BUILD_REPOSITORY
    image_type: &IMAGE_TYPE inference
    root: huggingface/vllm
    repository_name: &REPOSITORY_NAME !join [ "pr", "-", "huggingface", "-", *BASE_FRAMEWORK ]
    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ "huggingface", "-", *BASE_FRAMEWORK ]
    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

context:
  build_context: &BUILD_CONTEXT
    deep_learning_container:
      source: ../../src/deep_learning_container.py
      target: deep_learning_container.py
    start_cuda_compat:
      source: build_artifacts/start_cuda_compat.sh
      target: start_cuda_compat.sh
    sagemaker_entrypoint:
      source: build_artifacts/sagemaker_entrypoint.sh
      target: sagemaker_entrypoint.sh


images:
  BuildHuggingFaceVllmGpuPy312Cu129DockerImage:
    <<: *BUILD_REPOSITORY
    context:
      <<: *BUILD_CONTEXT
    image_size_baseline: 26000
    device_type: &DEVICE_TYPE gpu
    cuda_version: &CUDA_VERSION cu129
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py312
    os_version: &OS_VERSION ubuntu22.04
    transformers_version: &TRANSFORMERS_VERSION 4.57.3
    vllm_version: &VLLM_VERSION 0.12.0
    tag: !join [ "vllm", "-", *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    latest_release_tag: !join [ "vllm", "-", *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    docker_file: !join [ docker/, *SHORT_VERSION, /, *CUDA_VERSION, /Dockerfile ]
    target: sagemaker
    build: true
    enable_common_stage_build: false
    test_configs:
      test_platforms:
        - sanity
        - security
        - sagemaker
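For reference, a sketch of how the !join anchors above resolve, assuming the placeholder values ACCOUNT_ID=123456789012 and REGION=us-west-2:

# framework:           huggingface_vllm
# repository_name:     pr-huggingface-vllm
# repository:          123456789012.dkr.ecr.us-west-2.amazonaws.com/pr-huggingface-vllm
# release_repository:  763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-vllm
# tag:                 vllm-0.12.0-gpu-py312-cu129-ubuntu22.04-sagemaker
# docker_file:         docker/0.12/cu129/Dockerfile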
44 changes: 44 additions & 0 deletions huggingface/vllm/docker/0.12/cu129/Dockerfile
@@ -0,0 +1,44 @@
ARG FINAL_BASE_IMAGE=763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:0.12.0-gpu-py312-cu129-ubuntu22.04-sagemaker-v1.0
FROM ${FINAL_BASE_IMAGE} AS vllm-base

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"

ARG HUGGINGFACE_HUB_VERSION=0.36.0
ARG HF_XET_VERSION=1.2.0

RUN apt-get update -y \
    && apt-get install -y --no-install-recommends curl unzip \
    && rm -rf /var/lib/apt/lists/*


RUN pip install --upgrade pip && \
    pip install --no-cache-dir \
    huggingface-hub==${HUGGINGFACE_HUB_VERSION} \
    hf-xet==${HF_XET_VERSION} \
    grpcio


FROM vllm-base AS sagemaker
ENV HF_HUB_ENABLE_HF_TRANSFER="1" \
    HF_HUB_USER_AGENT_ORIGIN="aws:sagemaker:gpu-cuda:inference:hf-vllm"

# Copy CUDA compat and entrypoint scripts
COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh
COPY sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh

RUN chmod +x /usr/local/bin/start_cuda_compat.sh \
    && chmod +x /usr/local/bin/sagemaker_entrypoint.sh

# Fetch and run the OSS license-compliance check, then remove its artifacts
RUN HOME_DIR=/root \
    && uv pip install --system --upgrade pip requests PTable \
    && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
    && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
    && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
    && chmod +x /usr/local/bin/testOSSCompliance \
    && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
    && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python3 \
    && rm -rf ${HOME_DIR}/oss_compliance*


ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"]
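A minimal local smoke test of the resulting image (a sketch: the tag, model, and staging steps are illustrative, and the buildspec's build context normally supplies the copied scripts):

# Stage the build-context artifacts the Dockerfile COPYs, then build the sagemaker target
cp huggingface/vllm/build_artifacts/*.sh .
docker build --target sagemaker -t huggingface-vllm:local \
    -f huggingface/vllm/docker/0.12/cu129/Dockerfile .
# Serve a small model and probe the OpenAI-compatible endpoint on port 8080
docker run --gpus all -p 8080:8080 -e SM_VLLM_MODEL=Qwen/Qwen2.5-0.5B huggingface-vllm:local
curl http://localhost:8080/v1/models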
1 change: 1 addition & 0 deletions src/constants.py
@@ -27,6 +27,7 @@
"base",
"vllm",
"sglang",
"huggingface_vllm",
}
DEVICE_TYPES = {"cpu", "gpu", "hpu", "eia", "inf", "neuron", "neuronx"}
IMAGE_TYPES = {"training", "inference"}
Empty file.
13 changes: 13 additions & 0 deletions test/sagemaker_tests/huggingface/vllm/__init__.py
@@ -0,0 +1,13 @@
# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import absolute_import