diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 2ddfe8ccb932..6005bb6bf7ae 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -1,6 +1,6 @@ [dev] # Set to "huggingface", for example, if you are a huggingface developer. Default is "" -partner_developer = "" +partner_developer = "huggingface" # Please only set it to true if you are preparing an EI related PR # Do remember to revert it back to false before merging any PR (including EI dedicated PR) ei_mode = false @@ -36,8 +36,8 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. -# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] +build_frameworks = ["huggingface_vllm"] # By default we build both training and inference containers. Set true/false values to determine which to build. 
@@ -186,5 +186,8 @@ dlc-pr-tensorflow-2-eia-inference = "" # vllm dlc-pr-vllm = "" +# HuggingFace vLLM +dlc-pr-huggingface-vllm = "" + # sglang dlc-pr-sglang = "" \ No newline at end of file diff --git a/huggingface/vllm/build_artifacts/sagemaker_entrypoint.sh b/huggingface/vllm/build_artifacts/sagemaker_entrypoint.sh new file mode 100644 index 000000000000..f6591fb1a821 --- /dev/null +++ b/huggingface/vllm/build_artifacts/sagemaker_entrypoint.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Check if telemetry file exists before executing +# Execute telemetry script if it exists, suppress errors +bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true + +# Source CUDA compat for older drivers (e.g., g5 instances) +if command -v nvidia-smi >/dev/null 2>&1 && command -v nvcc >/dev/null 2>&1; then + source /usr/local/bin/start_cuda_compat.sh +fi + +PREFIX="SM_VLLM_" +ARG_PREFIX="--" + +ARGS=(--port 8080) + +while IFS='=' read -r key value; do + arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-') + + ARGS+=("${ARG_PREFIX}${arg_name}") + if [ -n "$value" ]; then + ARGS+=("$value") + fi +done < <(env | grep "^${PREFIX}") + +exec python3 -m vllm.entrypoints.openai.api_server "${ARGS[@]}" diff --git a/huggingface/vllm/build_artifacts/start_cuda_compat.sh b/huggingface/vllm/build_artifacts/start_cuda_compat.sh new file mode 100644 index 000000000000..791d355c5abe --- /dev/null +++ b/huggingface/vllm/build_artifacts/start_cuda_compat.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +verlte() { + [ "$1" = "$2" ] && return 1 || [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] +} + +COMPAT_FILE=/usr/local/cuda/compat/libcuda.so.1 +if [ -f $COMPAT_FILE ]; then + CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink $COMPAT_FILE | cut -d'.' 
-f 3-) + echo "CUDA compat package should be installed for NVIDIA driver smaller than ${CUDA_COMPAT_MAX_DRIVER_VERSION}" + NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true) + if [ -z "$NVIDIA_DRIVER_VERSION" ]; then + NVIDIA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>/dev/null || true) + fi + echo "Current installed NVIDIA driver version is ${NVIDIA_DRIVER_VERSION}" + if verlte $NVIDIA_DRIVER_VERSION $CUDA_COMPAT_MAX_DRIVER_VERSION; then + echo "Adding CUDA compat to LD_LIBRARY_PATH" + export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH + echo $LD_LIBRARY_PATH + else + echo "Skipping CUDA compat setup as newer NVIDIA driver is installed" + fi +else + echo "Skipping CUDA compat setup as package not found" +fi diff --git a/huggingface/vllm/buildspec.yml b/huggingface/vllm/buildspec.yml index cb9bb599a059..2865fd471413 100644 --- a/huggingface/vllm/buildspec.yml +++ b/huggingface/vllm/buildspec.yml @@ -1,2 +1,56 @@ - - \ No newline at end of file +account_id: &ACCOUNT_ID +prod_account_id: &PROD_ACCOUNT_ID 763104351884 +region: ®ION +base_framework: &BASE_FRAMEWORK vllm +framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK] +version: &VERSION "0.12.0" +short_version: &SHORT_VERSION "0.12" +arch_type: &ARCH_TYPE x86_64 +autopatch_build: "False" + +repository_info: + build_repository: &BUILD_REPOSITORY + image_type: &IMAGE_TYPE inference + root: huggingface/vllm + repository_name: &REPOSITORY_NAME !join [ "pr", "-", "huggingface", "-", *BASE_FRAMEWORK ] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + release_repository_name: &RELEASE_REPOSITORY_NAME !join [ "huggingface", "-", *BASE_FRAMEWORK ] + release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] + +context: + build_context: &BUILD_CONTEXT + 
deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py + start_cuda_compat: + source: build_artifacts/start_cuda_compat.sh + target: start_cuda_compat.sh + sagemaker_entrypoint: + source: build_artifacts/sagemaker_entrypoint.sh + target: sagemaker_entrypoint.sh + + +images: + BuildHuggingFaceVllmGpuPy312Cu129DockerImage: + <<: *BUILD_REPOSITORY + context: + <<: *BUILD_CONTEXT + image_size_baseline: 26000 + device_type: &DEVICE_TYPE gpu + cuda_version: &CUDA_VERSION cu129 + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + os_version: &OS_VERSION ubuntu22.04 + transformers_version: &TRANSFORMERS_VERSION 4.57.3 + vllm_version: &VLLM_VERSION 0.12.0 + tag: !join [ "vllm", "-", *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + latest_release_tag: !join [ "vllm", "-", *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + docker_file: !join [ docker/, *SHORT_VERSION, /, *CUDA_VERSION, /Dockerfile ] + target: sagemaker + build: true + enable_common_stage_build: false + test_configs: + test_platforms: + - sanity + - security + - sagemaker \ No newline at end of file diff --git a/huggingface/vllm/docker/0.12/cu129/Dockerfile b/huggingface/vllm/docker/0.12/cu129/Dockerfile new file mode 100644 index 000000000000..7eaba9d38f3c --- /dev/null +++ b/huggingface/vllm/docker/0.12/cu129/Dockerfile @@ -0,0 +1,44 @@ +ARG FINAL_BASE_IMAGE=763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:0.12.0-gpu-py312-cu129-ubuntu22.04-sagemaker-v1.0 +FROM ${FINAL_BASE_IMAGE} AS vllm-base + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG HUGGINGFACE_HUB_VERSION=0.36.0 +ARG HF_XET_VERSION=1.2.0 + +RUN apt-get update -y \ +&& apt-get install -y --no-install-recommends curl unzip \ +&& rm -rf /var/lib/apt/lists/* + + +RUN pip install --upgrade pip && \ + pip install 
--no-cache-dir \ + huggingface-hub==${HUGGINGFACE_HUB_VERSION} \ + hf-xet==${HF_XET_VERSION} \ + grpcio + + +FROM vllm-base AS sagemaker +ENV HF_HUB_ENABLE_HF_TRANSFER="1" \ + HF_HUB_USER_AGENT_ORIGIN="aws:sagemaker:gpu-cuda:inference:hf-vllm" + +# Copy CUDA compat and entrypoint scripts +COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh +COPY sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh + +RUN chmod +x /usr/local/bin/start_cuda_compat.sh \ + && chmod +x /usr/local/bin/sagemaker_entrypoint.sh + +RUN HOME_DIR=/root \ + && uv pip install --system --upgrade pip requests PTable \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python3 \ + && rm -rf ${HOME_DIR}/oss_compliance* + + +ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"] diff --git a/src/constants.py b/src/constants.py index 73f07931c2be..bb4baa4385c3 100644 --- a/src/constants.py +++ b/src/constants.py @@ -27,6 +27,7 @@ "base", "vllm", "sglang", + "huggingface_vllm", } DEVICE_TYPES = {"cpu", "gpu", "hpu", "eia", "inf", "neuron", "neuronx"} IMAGE_TYPES = {"training", "inference"} diff --git a/test/sagemaker_tests/huggingface/inference/resources/local_mode_lock b/test/sagemaker_tests/huggingface/inference/resources/local_mode_lock new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/sagemaker_tests/huggingface/vllm/__init__.py b/test/sagemaker_tests/huggingface/vllm/__init__.py new file mode 100644 index 000000000000..199e66b95926 --- /dev/null +++ b/test/sagemaker_tests/huggingface/vllm/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2019-2020 Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import diff --git a/test/sagemaker_tests/huggingface/vllm/conftest.py b/test/sagemaker_tests/huggingface/vllm/conftest.py new file mode 100644 index 000000000000..20daa2c701fd --- /dev/null +++ b/test/sagemaker_tests/huggingface/vllm/conftest.py @@ -0,0 +1,393 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+from __future__ import absolute_import
+
+import json
+import logging
+import os
+import platform
+import shutil
+import sys
+import tempfile
+
+import boto3
+import pytest
+
+from botocore.exceptions import ClientError
+from sagemaker import LocalSession, Session
+from sagemaker.pytorch import PyTorch
+
+from .utils import image_utils, get_ecr_registry
+
+NO_P4_REGIONS = [
+    "af-south-1",
+    "ap-east-1",
+    "ap-northeast-3",
+    "ap-southeast-1",
+    "ap-southeast-2",
+    "ap-south-1",
+    "ca-central-1",
+    "eu-central-1",
+    "eu-north-1",
+    "eu-west-2",
+    "eu-west-3",
+    "eu-south-1",
+    "me-south-1",
+    "sa-east-1",
+    "us-west-1",
+    "cn-northwest-1",
+    "il-central-1",
+]
+
+NO_G5_REGIONS = [
+    "us-west-1",
+    "ca-west-1",
+    "mx-central-1",
+    "af-south-1",
+    "ap-east-1",
+    "ap-south-2",
+    "ap-southeast-5",
+    "ap-southeast-4",
+    "ap-northeast-3",
+    "ap-southeast-1",
+    "ap-southeast-7",
+    "eu-south-1",
+    "eu-west-3",
+    "eu-south-2",
+    "eu-central-2",
+    "me-south-1",
+]
+
+
+logger = logging.getLogger(__name__)
+logging.getLogger("boto").setLevel(logging.INFO)
+logging.getLogger("boto3").setLevel(logging.INFO)
+logging.getLogger("botocore").setLevel(logging.INFO)
+logging.getLogger("factory.py").setLevel(logging.INFO)
+logging.getLogger("auth.py").setLevel(logging.INFO)
+logging.getLogger("connectionpool.py").setLevel(logging.INFO)
+
+
+dir_path = os.path.dirname(os.path.realpath(__file__))
+
+
+def pytest_addoption(parser):
+    parser.addoption("--build-image", "-D", action="store_true")
+    parser.addoption("--build-base-image", "-B", action="store_true")
+    parser.addoption("--aws-id")
+    parser.addoption("--instance-type")
+    parser.addoption("--accelerator-type", default=None)
+    parser.addoption("--docker-base-name", default="huggingface_pytorch")
+    parser.addoption("--region", default="us-west-2")
+    parser.addoption("--framework-version", default="")
+    parser.addoption(
+        "--py-version",
+        choices=["2", "3", "37", "38", "39", "310", "311", "312"],
+        
default=str(sys.version_info.major), + ) + # Processor is still "cpu" for EIA tests + parser.addoption( + "--processor", choices=["gpu", "cpu", "eia", "neuron", "neuronx"], default="cpu" + ) + # If not specified, will default to {framework-version}-{processor}-py{py-version} + parser.addoption("--tag", default=None) + parser.addoption( + "--generate-coverage-doc", + default=False, + action="store_true", + help="use this option to generate test coverage doc", + ) + parser.addoption( + "--efa", + action="store_true", + default=False, + help="Run only efa tests", + ) + parser.addoption("--sagemaker-regions", default="us-west-2") + + +def pytest_configure(config): + config.addinivalue_line("markers", "efa(): explicitly mark to run efa tests") + + +def pytest_runtest_setup(item): + if item.config.getoption("--efa"): + efa_tests = [mark for mark in item.iter_markers(name="efa")] + if not efa_tests: + pytest.skip("Skipping non-efa tests") + + +def pytest_collection_modifyitems(session, config, items): + for item in items: + print(f"item {item}") + for marker in item.iter_markers(name="team"): + print(f"item {marker}") + team_name = marker.args[0] + item.user_properties.append(("team_marker", team_name)) + print(f"item.user_properties {item.user_properties}") + + if config.getoption("--generate-coverage-doc"): + from test.test_utils.test_reporting import TestReportGenerator + + report_generator = TestReportGenerator(items, is_sagemaker=True) + report_generator.generate_coverage_doc( + framework="huggingface_pytorch", job_type="inference" + ) + + +@pytest.fixture(scope="session", name="docker_base_name") +def fixture_docker_base_name(request): + return request.config.getoption("--docker-base-name") + + +@pytest.fixture(scope="session", name="region") +def fixture_region(request): + return request.config.getoption("--region") + + +@pytest.fixture(scope="session", name="framework_version") +def fixture_framework_version(request): + return 
request.config.getoption("--framework-version") + + +@pytest.fixture(scope="session", name="py_version") +def fixture_py_version(request): + return "py{}".format(int(request.config.getoption("--py-version"))) + + +@pytest.fixture(scope="session", name="processor") +def fixture_processor(request): + return request.config.getoption("--processor") + + +@pytest.fixture(scope="session", name="tag") +def fixture_tag(request, framework_version, processor, py_version): + provided_tag = request.config.getoption("--tag") + default_tag = "{}-{}-{}".format(framework_version, processor, py_version) + return provided_tag if provided_tag else default_tag + + +@pytest.fixture(scope="session", name="docker_image") +def fixture_docker_image(docker_base_name, tag): + return "{}:{}".format(docker_base_name, tag) + + +@pytest.fixture +def opt_ml(): + tmp = tempfile.mkdtemp() + os.mkdir(os.path.join(tmp, "output")) + + # Docker cannot mount Mac OS /var folder properly see + # https://forums.docker.com/t/var-folders-isnt-mounted-properly/9600 + opt_ml_dir = "/private{}".format(tmp) if platform.system() == "Darwin" else tmp + yield opt_ml_dir + + shutil.rmtree(tmp, True) + + +@pytest.fixture(scope="session", name="use_gpu") +def fixture_use_gpu(processor): + return processor == "gpu" + + +@pytest.fixture(scope="session", name="build_base_image", autouse=True) +def fixture_build_base_image( + request, framework_version, py_version, processor, tag, docker_base_name +): + build_base_image = request.config.getoption("--build-base-image") + if build_base_image: + return image_utils.build_base_image( + framework_name=docker_base_name, + framework_version=framework_version, + py_version=py_version, + base_image_tag=tag, + processor=processor, + cwd=os.path.join(dir_path, ".."), + ) + + return tag + + +@pytest.fixture(scope="session", name="sagemaker_session") +def fixture_sagemaker_session(region): + return Session(boto_session=boto3.Session(region_name=region)) + + 
+@pytest.fixture(scope="session", name="sagemaker_regions") +def fixture_sagemaker_regions(request): + sagemaker_regions = request.config.getoption("--sagemaker-regions") + return sagemaker_regions.split(",") + + +@pytest.fixture(scope="session", name="sagemaker_local_session") +def fixture_sagemaker_local_session(region): + return LocalSession(boto_session=boto3.Session(region_name=region)) + + +@pytest.fixture(name="aws_id", scope="session") +def fixture_aws_id(request): + return request.config.getoption("--aws-id") + + +@pytest.fixture(name="instance_type", scope="session") +def fixture_instance_type(request, processor): + provided_instance_type = request.config.getoption("--instance-type") + default_instance_type = "local" if processor == "cpu" else "local_gpu" + return provided_instance_type or default_instance_type + + +@pytest.fixture(name="accelerator_type", scope="session") +def fixture_accelerator_type(request): + return request.config.getoption("--accelerator-type") + + +@pytest.fixture(name="docker_registry", scope="session") +def fixture_docker_registry(aws_id, region): + return get_ecr_registry(aws_id, region) + + +@pytest.fixture(name="ecr_image", scope="session") +def fixture_ecr_image(docker_registry, docker_base_name, tag): + return "{}/{}:{}".format(docker_registry, docker_base_name, tag) + + +@pytest.fixture(autouse=True) +def skip_by_device_type(request, use_gpu, instance_type, accelerator_type): + is_gpu = use_gpu or instance_type[3] in ["g", "p"] + is_eia = accelerator_type is not None + is_neuron = instance_type.startswith("ml.inf1") + is_neuronx = instance_type.startswith("ml.inf2") or instance_type.startswith("ml.trn1") + + # Separate out cases for clearer logic. + # When running Neuron test, skip CPU and GPU test. 
+ if request.node.get_closest_marker("neuron_test") and not is_neuron: + pytest.skip("Skipping because running on '{}' instance".format(instance_type)) + elif request.node.get_closest_marker("neuronx_test") and not is_neuronx: + pytest.skip("Skipping because running on '{}' instance".format(instance_type)) + + # When running GPU test, skip CPU and neuron test. When running CPU test, skip GPU and neuron test. + elif (request.node.get_closest_marker("gpu_test") and not is_gpu) or ( + request.node.get_closest_marker("cpu_test") and (is_gpu or is_neuron or is_neuronx) + ): + pytest.skip("Skipping because running on '{}' instance".format(instance_type)) + + # When running EIA test, skip the CPU, GPU and Neuron functions + elif ( + request.node.get_closest_marker("neuron_test") + or request.node.get_closest_marker("gpu_test") + or request.node.get_closest_marker("cpu_test") + ) and is_eia: + pytest.skip("Skipping because running on '{}' instance".format(instance_type)) + + # When running CPU or GPU or Neuron test, skip EIA test. 
+ elif request.node.get_closest_marker("eia_test") and not is_eia: + pytest.skip("Skipping because running on '{}' instance".format(instance_type)) + + +@pytest.fixture(autouse=True) +def skip_by_py_version(request, py_version): + if request.node.get_closest_marker("skip_py2") and py_version != "py3": + pytest.skip("Skipping the test because Python 2 is not supported.") + + +@pytest.fixture(autouse=True) +def skip_gpu_instance_restricted_regions(region, instance_type): + if (region in NO_P4_REGIONS and instance_type.startswith("ml.p4")) or ( + region in NO_G5_REGIONS and instance_type.startswith("ml.g5") + ): + pytest.skip( + "Skipping GPU test in region {} with instance type {}".format(region, instance_type) + ) + + +@pytest.fixture(autouse=True) +def skip_gpu_py2(request, use_gpu, instance_type, py_version, framework_version): + is_gpu = use_gpu or instance_type[3] in ["g", "p"] + if ( + request.node.get_closest_marker("skip_gpu_py2") + and is_gpu + and py_version != "py3" + and framework_version == "1.4.0" + ): + pytest.skip("Skipping the test until mms issue resolved.") + + +def _get_remote_override_flags(): + try: + s3_client = boto3.client("s3") + sts_client = boto3.client("sts") + account_id = sts_client.get_caller_identity().get("Account") + result = s3_client.get_object( + Bucket=f"dlc-cicd-helper-{account_id}", Key="override_tests_flags.json" + ) + json_content = json.loads(result["Body"].read().decode("utf-8")) + except ClientError as e: + logger.warning("ClientError when performing S3/STS operation: {}".format(e)) + json_content = {} + return json_content + + +def _is_test_disabled(test_name, build_name, version): + """ + Expected format of remote_override_flags: + { + "CB Project Name for Test Type A": { + "CodeBuild Resolved Source Version": ["test_type_A_test_function_1", "test_type_A_test_function_2"] + }, + "CB Project Name for Test Type B": { + "CodeBuild Resolved Source Version": ["test_type_B_test_function_1", "test_type_B_test_function_2"] + } 
+ } + + :param test_name: str Test Function node name (includes parametrized values in string) + :param build_name: str Build Project name of current execution + :param version: str Source Version of current execution + :return: bool True if test is disabled as per remote override, False otherwise + """ + remote_override_flags = _get_remote_override_flags() + remote_override_build = remote_override_flags.get(build_name, {}) + if version in remote_override_build: + return not remote_override_build[version] or any( + [test_keyword in test_name for test_keyword in remote_override_build[version]] + ) + return False + + +@pytest.fixture(autouse=True) +def disable_test(request): + test_name = request.node.name + # We do not have a regex pattern to find CB name, which means we must resort to string splitting + build_arn = os.getenv("CODEBUILD_BUILD_ARN") + build_name = build_arn.split("/")[-1].split(":")[0] if build_arn else None + version = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION") + + if build_name and version and _is_test_disabled(test_name, build_name, version): + pytest.skip(f"Skipping {test_name} test because it has been disabled.") + + +@pytest.fixture(autouse=True) +def skip_test_successfully_executed_before(request): + """ + "cache/lastfailed" contains information about failed tests only. We're running SM tests in separate threads for each image. + So when we retry SM tests, successfully executed tests executed again because pytest doesn't have that info in /.cache. + But the flag "--last-failed-no-failures all" requires pytest to execute all the available tests. + The only sign that a test passed last time - lastfailed file exists and the test name isn't in that file. + The method checks whether lastfailed file exists and the test name is not in it. 
+ """ + test_name = request.node.name + lastfailed = request.config.cache.get("cache/lastfailed", None) + + if lastfailed is not None and not any( + test_name in failed_test_name for failed_test_name in lastfailed.keys() + ): + pytest.skip(f"Skipping {test_name} because it was successfully executed for this commit") diff --git a/test/sagemaker_tests/huggingface/vllm/integration/__init__.py b/test/sagemaker_tests/huggingface/vllm/integration/__init__.py new file mode 100644 index 000000000000..1ae0ad1bf05d --- /dev/null +++ b/test/sagemaker_tests/huggingface/vllm/integration/__init__.py @@ -0,0 +1,111 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+from __future__ import absolute_import + +import json +import os +import re +import shutil +import tarfile + +import boto3 + +# Path to test resources +resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "resources")) + +# Model artifacts for local mode tests - downloaded from HuggingFace Hub at runtime +MODEL_ID = "Qwen/Qwen2.5-0.5B" +model_dir = os.path.join(resources_path, "qwen2.5-0.5b") +model_data = "qwen2.5-0.5b.tar.gz" +model_data_path = os.path.join(model_dir, model_data) + + +def ensure_model_downloaded(): + """Download model from HuggingFace Hub and create tarball if not already present.""" + if os.path.exists(model_data_path): + return model_data_path + + from huggingface_hub import snapshot_download + + os.makedirs(model_dir, exist_ok=True) + local_model_dir = os.path.join(model_dir, "model") + + print(f"Downloading {MODEL_ID} from HuggingFace Hub...") + snapshot_download(repo_id=MODEL_ID, local_dir=local_model_dir, ignore_patterns=["*.gguf", "*.onnx"]) + + # Remove cache folder if present + cache_dir = os.path.join(local_model_dir, ".cache") + if os.path.exists(cache_dir): + shutil.rmtree(cache_dir) + + print(f"Creating tarball {model_data}...") + with tarfile.open(model_data_path, "w:gz") as tar: + for item in os.listdir(local_model_dir): + tar.add(os.path.join(local_model_dir, item), arcname=item) + + # Clean up extracted model + shutil.rmtree(local_model_dir) + + print(f"Model ready at {model_data_path}") + return model_data_path + +# Role for local mode (not used but required by SageMaker SDK) +ROLE = "dummy/unused-role" +DEFAULT_TIMEOUT = 45 + + +class NoLogStreamFoundError(Exception): + pass + + +class SageMakerEndpointFailure(Exception): + pass + + +def dump_logs_from_cloudwatch(e, region="us-west-2"): + """ + Function to dump logs from cloudwatch during error handling. + Gracefully handles missing log groups/streams. 
+ """ + error_hosting_endpoint_regex = re.compile(r"Error hosting endpoint ((\w|-)+):") + endpoint_url_regex = re.compile(r"/aws/sagemaker/Endpoints/((\w|-)+)") + endpoint_match = error_hosting_endpoint_regex.search(str(e)) or endpoint_url_regex.search( + str(e) + ) + if endpoint_match: + logs_client = boto3.client("logs", region_name=region) + endpoint = endpoint_match.group(1) + log_group_name = f"/aws/sagemaker/Endpoints/{endpoint}" + try: + log_stream_resp = logs_client.describe_log_streams(logGroupName=log_group_name) + all_traffic_log_stream = "" + for log_stream in log_stream_resp.get("logStreams", []): + log_stream_name = log_stream.get("logStreamName") + if log_stream_name.startswith("AllTraffic"): + all_traffic_log_stream = log_stream_name + break + if not all_traffic_log_stream: + raise NoLogStreamFoundError( + f"Cannot find all traffic log streams for endpoint {endpoint}" + ) from e + events = logs_client.get_log_events( + logGroupName=log_group_name, logStreamName=all_traffic_log_stream + ) + raise SageMakerEndpointFailure( + f"Error from endpoint {endpoint}:\n{json.dumps(events, indent=4)}" + ) from e + except logs_client.exceptions.ResourceNotFoundException: + # Log group doesn't exist yet - endpoint may have failed before creating logs + raise SageMakerEndpointFailure( + f"Endpoint {endpoint} failed. No CloudWatch logs available yet." + ) from e diff --git a/test/sagemaker_tests/huggingface/vllm/integration/local/__init__.py b/test/sagemaker_tests/huggingface/vllm/integration/local/__init__.py new file mode 100644 index 000000000000..199e66b95926 --- /dev/null +++ b/test/sagemaker_tests/huggingface/vllm/integration/local/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. 
A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import diff --git a/test/sagemaker_tests/huggingface/vllm/integration/local/test_serving.py b/test/sagemaker_tests/huggingface/vllm/integration/local/test_serving.py new file mode 100644 index 000000000000..dc16cacc1e70 --- /dev/null +++ b/test/sagemaker_tests/huggingface/vllm/integration/local/test_serving.py @@ -0,0 +1,106 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +from contextlib import contextmanager + +import pytest +from sagemaker.model import Model +from sagemaker.predictor import Predictor +from sagemaker.serializers import JSONSerializer +from sagemaker.deserializers import JSONDeserializer + +from ...integration import ROLE, ensure_model_downloaded +from ...utils import local_mode_utils + + +@contextmanager +def _predictor(image, sagemaker_local_session, instance_type): + """Context manager for vLLM model deployment and cleanup. + + Model is extracted to /opt/ml/model by SageMaker from model_data tar.gz. + vLLM loads the model from this local path. 
+ """ + # Download model from HuggingFace Hub if not already present + model_data_path = ensure_model_downloaded() + + env = { + "SM_VLLM_MODEL": "/opt/ml/model", + "SM_VLLM_MAX_MODEL_LEN": "512", + "SM_VLLM_HOST": "0.0.0.0", + } + + model = Model( + model_data=f"file://{model_data_path}", + role=ROLE, + image_uri=image, + env=env, + sagemaker_session=sagemaker_local_session, + predictor_cls=Predictor, + ) + with local_mode_utils.lock(): + predictor = None + try: + predictor = model.deploy(1, instance_type) + yield predictor + finally: + if predictor is not None: + predictor.delete_endpoint() + + +def _assert_vllm_prediction(predictor): + """Test vLLM inference using OpenAI-compatible completions API.""" + predictor.serializer = JSONSerializer() + predictor.deserializer = JSONDeserializer() + + data = { + "prompt": "What is Deep Learning?", + "max_tokens": 50, + "temperature": 0.7, + } + output = predictor.predict(data) + + assert output is not None + assert "choices" in output + + +def _assert_vllm_chat_prediction(predictor): + """Test vLLM inference using OpenAI-compatible chat completions API.""" + predictor.serializer = JSONSerializer() + predictor.deserializer = JSONDeserializer() + + data = { + "messages": [{"role": "user", "content": "What is Deep Learning?"}], + "max_tokens": 50, + "temperature": 0.7, + } + output = predictor.predict(data) + + assert output is not None + assert "choices" in output + + +@pytest.mark.model("qwen2.5-0.5b") +@pytest.mark.team("sagemaker-1p-algorithms") +def test_vllm_local_completions(docker_image, sagemaker_local_session, instance_type): + """Test vLLM local deployment with completions API.""" + with _predictor(docker_image, sagemaker_local_session, instance_type) as predictor: + _assert_vllm_prediction(predictor) + + +@pytest.mark.model("qwen2.5-0.5b") +@pytest.mark.team("sagemaker-1p-algorithms") +def test_vllm_local_chat(docker_image, sagemaker_local_session, instance_type): + """Test vLLM local deployment with chat 
completions API.""" + with _predictor(docker_image, sagemaker_local_session, instance_type) as predictor: + _assert_vllm_chat_prediction(predictor) diff --git a/test/sagemaker_tests/huggingface/vllm/integration/sagemaker/__init__.py b/test/sagemaker_tests/huggingface/vllm/integration/sagemaker/__init__.py new file mode 100644 index 000000000000..04fbf5d9a144 --- /dev/null +++ b/test/sagemaker_tests/huggingface/vllm/integration/sagemaker/__init__.py @@ -0,0 +1,12 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. diff --git a/test/sagemaker_tests/huggingface/vllm/integration/sagemaker/test_vllm.py b/test/sagemaker_tests/huggingface/vllm/integration/sagemaker/test_vllm.py new file mode 100644 index 000000000000..6e00720a13d3 --- /dev/null +++ b/test/sagemaker_tests/huggingface/vllm/integration/sagemaker/test_vllm.py @@ -0,0 +1,124 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+from __future__ import absolute_import + +import json +import logging + +import pytest +import sagemaker +from sagemaker.model import Model +from sagemaker.predictor import Predictor +from sagemaker.serializers import JSONSerializer +from sagemaker.deserializers import JSONDeserializer + +from ...integration import dump_logs_from_cloudwatch +from ...integration.sagemaker.timeout import timeout_and_delete_endpoint +from ..... import invoke_sm_endpoint_helper_function + +LOGGER = logging.getLogger(__name__) + + +@pytest.mark.model("bloom-560m") +@pytest.mark.processor("gpu") +@pytest.mark.gpu_test +@pytest.mark.team("sagemaker-1p-algorithms") +def test_vllm_bloom(framework_version, ecr_image, instance_type, sagemaker_regions): + invoke_sm_endpoint_helper_function( + ecr_image=ecr_image, + sagemaker_regions=sagemaker_regions, + test_function=_test_vllm_model, + dump_logs_from_cloudwatch=dump_logs_from_cloudwatch, + framework_version=framework_version, + instance_type=instance_type, + model_id="bigscience/bloom-560m", + ) + + +@pytest.mark.model("qwen3-8b") +@pytest.mark.processor("gpu") +@pytest.mark.gpu_test +@pytest.mark.team("sagemaker-1p-algorithms") +def test_vllm_qwen(framework_version, ecr_image, instance_type, sagemaker_regions): + invoke_sm_endpoint_helper_function( + ecr_image=ecr_image, + sagemaker_regions=sagemaker_regions, + test_function=_test_vllm_model, + dump_logs_from_cloudwatch=dump_logs_from_cloudwatch, + framework_version=framework_version, + instance_type=instance_type, + model_id="Qwen/Qwen3-8B", + ) + + +def _test_vllm_model( + image_uri, + sagemaker_session, + instance_type, + model_id, + framework_version=None, + **kwargs, +): + """Test vLLM model deployment and inference using OpenAI-compatible API format + + Uses sagemaker.model.Model for SDK v3 compatibility instead of HuggingFaceModel. 
+ + Args: + image_uri: ECR image URI + sagemaker_session: SageMaker session + instance_type: ML instance type + model_id: HuggingFace model ID + framework_version: Optional version info + **kwargs: Additional args from helper (boto_session, sagemaker_client, etc.) + """ + endpoint_name = sagemaker.utils.unique_name_from_base("sagemaker-hf-vllm-serving") + + env = { + "SM_VLLM_MODEL": model_id, + "SM_VLLM_MAX_MODEL_LEN": "512", + "SM_VLLM_HOST": "0.0.0.0", + } + + model = Model( + name=endpoint_name, + image_uri=image_uri, + role="SageMakerRole", + env=env, + sagemaker_session=sagemaker_session, + predictor_cls=Predictor, + ) + + with timeout_and_delete_endpoint(endpoint_name, sagemaker_session, minutes=45): + predictor = model.deploy( + initial_instance_count=1, + instance_type=instance_type, + endpoint_name=endpoint_name, + container_startup_health_check_timeout=1800, + inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1", + ) + + predictor.serializer = JSONSerializer() + predictor.deserializer = JSONDeserializer() + + # vLLM uses OpenAI-compatible API format + data = { + "prompt": "What is Deep Learning?", + "max_tokens": 50, + "temperature": 0.7, + } + + LOGGER.info(f"Running inference with data: {data}") + output = predictor.predict(data) + LOGGER.info(f"Output: {json.dumps(output)}") + + assert output is not None \ No newline at end of file diff --git a/test/sagemaker_tests/huggingface/vllm/integration/sagemaker/timeout.py b/test/sagemaker_tests/huggingface/vllm/integration/sagemaker/timeout.py new file mode 100644 index 000000000000..1d13878031f7 --- /dev/null +++ b/test/sagemaker_tests/huggingface/vllm/integration/sagemaker/timeout.py @@ -0,0 +1,66 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. 
A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import +import signal +from contextlib import contextmanager +import logging + +from botocore.exceptions import ClientError + +LOGGER = logging.getLogger("timeout") + + +class TimeoutError(Exception): + pass + + +@contextmanager +def timeout(seconds=0, minutes=0, hours=0): + """Add a signal-based timeout to any block of code. + If multiple time units are specified, they will be added together to determine time limit. + Usage: + with timeout(seconds=5): + my_slow_function(...) + Args: + - seconds: The time limit, in seconds. + - minutes: The time limit, in minutes. + - hours: The time limit, in hours. + """ + + limit = seconds + 60 * minutes + 3600 * hours + + def handler(signum, frame): + raise TimeoutError("timed out after {} seconds".format(limit)) + + try: + signal.signal(signal.SIGALRM, handler) + signal.alarm(limit) + + yield + finally: + signal.alarm(0) + + +@contextmanager +def timeout_and_delete_endpoint(endpoint_name, sagemaker_session, seconds=0, minutes=0, hours=0): + with timeout(seconds=seconds, minutes=minutes, hours=hours) as t: + try: + yield [t] + finally: + try: + sagemaker_session.delete_endpoint(endpoint_name) + LOGGER.info("deleted endpoint {}".format(endpoint_name)) + except ClientError as ce: + if ce.response["Error"]["Code"] == "ValidationException": + # avoids the inner exception to be overwritten + pass diff --git a/test/sagemaker_tests/huggingface/vllm/requirements.txt b/test/sagemaker_tests/huggingface/vllm/requirements.txt new file mode 100644 index 000000000000..3cf0114ebead --- /dev/null +++ b/test/sagemaker_tests/huggingface/vllm/requirements.txt 
@@ -0,0 +1,29 @@ +boto3 +coverage +# Docker v7.0.0 breaks compatibility with Docker Compose v1 (SageMaker Local) +docker>=5,<=6.1.3 +flake8==3.7.7 +Flask==1.1.1 +mock +pytest==8.3.5 +pytest-cov +pytest-rerunfailures +pytest-xdist +PyYAML +protobuf>=3.20,<=3.20.2 +sagemaker>=2.237.0,<3 +six +requests<2.32.0 +requests_mock +Pillow +retrying==1.3.3 +urllib3>=1.26.8 +pluggy>=1.5,<2 +requests_mock +sagemaker-inference +tenacity +fabric +invoke +gitpython +toml +huggingface_hub \ No newline at end of file diff --git a/test/sagemaker_tests/huggingface/vllm/test.ipynb b/test/sagemaker_tests/huggingface/vllm/test.ipynb new file mode 100644 index 000000000000..d582d9bb8b64 --- /dev/null +++ b/test/sagemaker_tests/huggingface/vllm/test.ipynb @@ -0,0 +1,203 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "id": "a944ea92", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import sagemaker\n", + "import boto3\n", + "\n", + "iam = boto3.client('iam')\n", + "role = iam.get_role(RoleName='sagemaker-dlc-demo')['Role']['Arn']\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "86a84f2d", + "metadata": {}, + "outputs": [], + "source": [ + "llm_image = \"754289655784.dkr.ecr.us-east-1.amazonaws.com/test-tgi:vllm-0.12.0-gpu-py312-cu129-ubuntu22.04-sagemaker\"" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "0c961547", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import json\n", + "from sagemaker.huggingface import HuggingFaceModel\n", + "\n", + "# sagemaker config\n", + "instance_type = \"ml.g5.12xlarge\"\n", + "health_check_timeout = 900\n", + "\n", + "\n", + "# Define Model and Endpoint configuration parameter\n", + "config = {\n", + " 'SM_VLLM_MODEL': \"Qwen/Qwen3-4B-Thinking-2507\", # model_id from hf.co/models\n", + " 'SM_VLLM_MAX_MODEL_LEN': \"2048\", # Max length of input text\n", + " 'SM_VLLM_HOST': \"0.0.0.0\", # Required for SageMaker networking\n", + " 
#'HUGGING_FACE_HUB_TOKEN': \"\"\n", + "}\n", + "\n", + "\n", + "# create HuggingFaceModel with the image uri\n", + "llm_model = HuggingFaceModel(\n", + " role=role,\n", + " image_uri=llm_image,\n", + " env=config,\n", + " name=\"qwen3-4b-thinking-2507-demo-endpoint\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "002412ef", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using already existing model: qwen3-4b-thinking-2507-demo-endpoint\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------!" + ] + } + ], + "source": [ + "# Deploy model to an endpoint\n", + "# https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.Model.deploy\n", + "llm = llm_model.deploy(\n", + " initial_instance_count=1,\n", + " instance_type=instance_type,\n", + " container_startup_health_check_timeout=health_check_timeout,\n", + " inference_ami_version=\"al2-ami-sagemaker-inference-gpu-3-1\", \n", + " wait=True\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "dfff1dc6", + "metadata": {}, + "outputs": [], + "source": [ + "# Prompt to generate\n", + "messages=[\n", + " { \"role\": \"user\", \"content\": \"Give me a short introduction to large language model\" }\n", + " ]\n", + "\n", + "# Generation arguments\n", + "parameters = {\n", + " \"top_p\": 0.6,\n", + " \"temperature\": 0.9,\n", + " \"max_tokens\": 128,\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6041a998", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Okay, the user asked for a short introduction to large language models. Let me start by recalling what I know. LLMs are a big topic in AI, so I need to keep it concise but informative. \n", + "\n", + "First, I should define what an LLM is. They're AI models trained on massive amounts of text data. 
The key points are their size—billions or trillions of parameters—and their ability to generate human-like text. \n", + "\n", + "Wait, the user might not know terms like \"parameters.\" I should explain it simply. Maybe say they're huge neural networks that learn patterns from text. Also, mention common tasks:\n" + ] + } + ], + "source": [ + "chat = llm.predict({\"messages\" :messages, **parameters})\n", + "\n", + "print(chat[\"choices\"][0][\"message\"][\"content\"].strip())" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "293363c7", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'llm' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[14]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mllm\u001b[49m.delete_model()\n\u001b[32m 2\u001b[39m llm.delete_endpoint()\n", + "\u001b[31mNameError\u001b[39m: name 'llm' is not defined" + ] + } + ], + "source": [ + "llm.delete_model()\n", + "llm.delete_endpoint()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "8144ef34", + "metadata": {}, + "outputs": [], + "source": [ + "llm_model.delete_model()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81e7302d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git 
a/test/sagemaker_tests/huggingface/vllm/utils/__init__.py b/test/sagemaker_tests/huggingface/vllm/utils/__init__.py new file mode 100644 index 000000000000..6932ed1abd5b --- /dev/null +++ b/test/sagemaker_tests/huggingface/vllm/utils/__init__.py @@ -0,0 +1,36 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import boto3 +import botocore + + +def _botocore_resolver(): + """ + Get the DNS suffix for the given region. + :return: endpoint object + """ + loader = botocore.loaders.create_loader() + return botocore.regions.EndpointResolver(loader.load_data("endpoints")) + + +def get_ecr_registry(account, region): + """ + Get prefix of ECR image URI + :param account: Account ID + :param region: region where ECR repo exists + :return: AWS ECR registry + """ + endpoint_data = _botocore_resolver().construct_endpoint("ecr", region) + return "{}.dkr.{}".format(account, endpoint_data["hostname"]) diff --git a/test/sagemaker_tests/huggingface/vllm/utils/image_utils.py b/test/sagemaker_tests/huggingface/vllm/utils/image_utils.py new file mode 100644 index 000000000000..3421e6ce2b42 --- /dev/null +++ b/test/sagemaker_tests/huggingface/vllm/utils/image_utils.py @@ -0,0 +1,67 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. 
A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import os +import subprocess +import sys + +CYAN_COLOR = "\033[36m" +END_COLOR = "\033[0m" + + +def build_base_image( + framework_name, framework_version, py_version, processor, base_image_tag, cwd="." +): + base_image_uri = get_base_image_uri(framework_name, base_image_tag) + + dockerfile_location = os.path.join( + "docker", framework_version, "base", "Dockerfile.{}".format(processor) + ) + + subprocess.check_call( + [ + "docker", + "build", + "-t", + base_image_uri, + "-f", + dockerfile_location, + "--build-arg", + "py_version={}".format(py_version[-1]), + cwd, + ], + cwd=cwd, + ) + print("created image {}".format(base_image_uri)) + return base_image_uri + + +def get_base_image_uri(framework_name, base_image_tag): + return "{}-base:{}".format(framework_name, base_image_tag) + + +def get_image_uri(framework_name, tag): + return "{}:{}".format(framework_name, tag) + + +def _check_call(cmd, *popenargs, **kwargs): + if isinstance(cmd, str): + cmd = cmd.split(" ") + _print_cmd(cmd) + subprocess.check_call(cmd, *popenargs, **kwargs) + + +def _print_cmd(cmd): + print("executing docker command: {}{}{}".format(CYAN_COLOR, " ".join(cmd), END_COLOR)) + sys.stdout.flush() diff --git a/test/sagemaker_tests/huggingface/vllm/utils/local_mode_utils.py b/test/sagemaker_tests/huggingface/vllm/utils/local_mode_utils.py new file mode 100644 index 000000000000..fa6b3cf00c36 --- /dev/null +++ b/test/sagemaker_tests/huggingface/vllm/utils/local_mode_utils.py @@ -0,0 +1,46 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +from contextlib import contextmanager +import fcntl +import os +import tarfile +import time + +from ..integration import resources_path + +LOCK_PATH = os.path.join(resources_path, "local_mode_lock") + + +@contextmanager +def lock(): + # Since Local Mode uses the same port for serving, we need a lock in order + # to allow concurrent test execution. + local_mode_lock_fd = open(LOCK_PATH, "w") + local_mode_lock = local_mode_lock_fd.fileno() + + fcntl.lockf(local_mode_lock, fcntl.LOCK_EX) + + try: + yield + finally: + time.sleep(5) + fcntl.lockf(local_mode_lock, fcntl.LOCK_UN) + + +def assert_files_exist(output_path, directory_file_map): + for directory, files in directory_file_map.items(): + with tarfile.open(os.path.join(output_path, "{}.tar.gz".format(directory))) as tar: + for f in files: + tar.getmember(f) diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index c7d0dc605588..8a0045919925 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -1820,6 +1820,7 @@ def get_framework_and_version_from_tag(image_uri): "huggingface_pytorch_trcomp", "huggingface_tensorflow", "huggingface_pytorch", + "huggingface_vllm", "stabilityai_pytorch", "pytorch_trcomp", "tensorflow", @@ -1935,6 +1936,7 @@ def get_framework_from_image_uri(image_uri): "huggingface-pytorch-trcomp": "huggingface_pytorch_trcomp", "pytorch-trcomp": "pytorch_trcomp", "huggingface-pytorch": "huggingface_pytorch", + "huggingface-vllm": 
"huggingface_vllm", "stabilityai-pytorch": "stabilityai_pytorch", "mxnet": "mxnet", "pytorch": "pytorch", diff --git a/test/test_utils/sagemaker.py b/test/test_utils/sagemaker.py index 0fe31bea4e8e..58d83aa52381 100644 --- a/test/test_utils/sagemaker.py +++ b/test/test_utils/sagemaker.py @@ -160,7 +160,10 @@ def generate_sagemaker_pytest_cmd(image, sagemaker_test_type): framework_major_version = framework_version.split(".")[0] job_type = get_job_type_from_image(image) framework = framework.replace("_trcomp", "").replace("stabilityai_", "") - path = os.path.join("test", "sagemaker_tests", framework, job_type) + if framework == "huggingface_vllm": + path = os.path.join("test", "sagemaker_tests", "huggingface", "vllm") + else: + path = os.path.join("test", "sagemaker_tests", framework, job_type) aws_id_arg = "--aws-id" docker_base_arg = "--docker-base-name" instance_type_arg = "--instance-type" @@ -271,7 +274,9 @@ def generate_sagemaker_pytest_cmd(image, sagemaker_test_type): ) if framework == "tensorflow" and job_type == "training": path = os.path.join(os.path.dirname(path), f"{framework}{framework_major_version}_training") - if "huggingface" in framework and job_type == "inference": + if "huggingface" in framework and "vllm" in framework: + path = os.path.join("test", "sagemaker_tests", "huggingface", "vllm") + elif "huggingface" in framework and job_type == "inference": path = os.path.join("test", "sagemaker_tests", "huggingface", "inference") if "trcomp" in framework: path = os.path.join( diff --git a/test/testrunner.py b/test/testrunner.py index eb75c5bff1cb..485180281d67 100644 --- a/test/testrunner.py +++ b/test/testrunner.py @@ -333,7 +333,8 @@ def main(): is_hf_image_present = any("huggingface" in image_uri for image_uri in all_image_list) is_ag_image_present = any("autogluon" in image_uri for image_uri in all_image_list) is_trcomp_image_present = any("trcomp" in image_uri for image_uri in all_image_list) - is_hf_image_present = is_hf_image_present and not 
is_trcomp_image_present + is_vllm_image_present = any("vllm" in image_uri for image_uri in all_image_list) + is_hf_image_present = is_hf_image_present and not is_trcomp_image_present and not is_vllm_image_present is_hf_trcomp_image_present = is_hf_image_present and is_trcomp_image_present if ( (is_hf_image_present or is_ag_image_present) @@ -499,7 +500,8 @@ def main(): if os.path.exists(KEYS_TO_DESTROY_FILE): delete_key_pairs(KEYS_TO_DESTROY_FILE) elif specific_test_type == "sagemaker": - if "vllm" in dlc_images: + # Route generic vLLM (not HuggingFace vLLM) to vllm-specific tests + if "vllm" in dlc_images and "huggingface" not in dlc_images: run_vllm_tests("sagemaker", all_image_list, new_test_structure_enabled) return @@ -557,7 +559,6 @@ def main(): "habana": "Skipping SM tests because SM does not yet support Habana", "neuron": "Skipping - there are no local mode tests for Neuron", "huggingface-tensorflow-training": "Skipping - there are no local mode tests for HF TF training", - "vllm": "Skipping - there are no local mode tests for VLLM", "sglang": "Skipping - there are no local mode tests for sglang", } @@ -569,6 +570,13 @@ def main(): sm_utils.generate_empty_report(report, test_type, skip_condition) return + # Skip base vllm (not huggingface_vllm) - huggingface_vllm has local tests + if "vllm" in dlc_images and "huggingface" not in dlc_images: + LOGGER.info(f"Skipping - there are no local mode tests for base VLLM. Images: {dlc_images}") + report = os.path.join(os.getcwd(), "test", f"{test_type}.xml") + sm_utils.generate_empty_report(report, test_type, "vllm") + return + testing_image_list = [ image for image in standard_images_list