12 | 12 | # language governing permissions and limitations under the License. |
13 | 13 | from __future__ import absolute_import |
14 | 14 |
15 | | -import json |
16 | | -import logging |
17 | 15 | from contextlib import contextmanager |
18 | 16 |
19 | 17 | import pytest |
22 | 20 | from sagemaker.serializers import JSONSerializer |
23 | 21 | from sagemaker.deserializers import JSONDeserializer |
24 | 22 |
25 | | -from ...integration import ROLE |
26 | | - |
27 | | -LOGGER = logging.getLogger(__name__) |
| 23 | +from ...integration import ROLE, model_data_path |
| 24 | +from ...utils import local_mode_utils |
28 | 25 |
29 | 26 |
30 | 27 | @contextmanager |
31 | | -def _predictor(image, sagemaker_local_session, instance_type, model_id): |
32 | | - """Context manager for vLLM model deployment and cleanup.""" |
| 28 | +def _predictor(image, sagemaker_local_session, instance_type): |
| 29 | + """Context manager for vLLM model deployment and cleanup. |
| 30 | + |
| 31 | + Model is extracted to /opt/ml/model by SageMaker from model_data tar.gz. |
| 32 | + vLLM loads the model from this local path. |
| 33 | + """ |
33 | 34 | env = { |
34 | | - "SM_VLLM_MODEL": model_id, |
| 35 | + "SM_VLLM_MODEL": "/opt/ml/model", |
35 | 36 | "SM_VLLM_MAX_MODEL_LEN": "512", |
36 | 37 | "SM_VLLM_HOST": "0.0.0.0", |
37 | 38 | } |
38 | 39 |
39 | 40 | model = Model( |
| 41 | + model_data=f"file://{model_data_path}", |
40 | 42 | role=ROLE, |
41 | 43 | image_uri=image, |
42 | 44 | env=env, |
43 | 45 | sagemaker_session=sagemaker_local_session, |
44 | 46 | predictor_cls=Predictor, |
45 | 47 | ) |
46 | | - |
47 | | - predictor = None |
48 | | - try: |
49 | | - predictor = model.deploy(1, instance_type) |
50 | | - yield predictor |
51 | | - finally: |
52 | | - if predictor is not None: |
53 | | - predictor.delete_endpoint() |
| 48 | + with local_mode_utils.lock(): |
| 49 | + predictor = None |
| 50 | + try: |
| 51 | + predictor = model.deploy(1, instance_type) |
| 52 | + yield predictor |
| 53 | + finally: |
| 54 | + if predictor is not None: |
| 55 | + predictor.delete_endpoint() |
54 | 56 |
55 | 57 |
56 | 58 | def _assert_vllm_prediction(predictor): |
57 | | - """Test vLLM inference using OpenAI-compatible API format.""" |
| 59 | + """Test vLLM inference using OpenAI-compatible completions API.""" |
58 | 60 | predictor.serializer = JSONSerializer() |
59 | 61 | predictor.deserializer = JSONDeserializer() |
60 | 62 |
61 | | - # vLLM uses OpenAI-compatible API format |
62 | 63 | data = { |
63 | 64 | "prompt": "What is Deep Learning?", |
64 | 65 | "max_tokens": 50, |
65 | 66 | "temperature": 0.7, |
66 | 67 | } |
67 | | - |
68 | | - LOGGER.info(f"Running inference with data: {data}") |
69 | 68 | output = predictor.predict(data) |
70 | | - LOGGER.info(f"Output: {json.dumps(output)}") |
71 | 69 |
72 | 70 | assert output is not None |
73 | | - # vLLM returns OpenAI-compatible response with 'choices' field |
74 | | - assert "choices" in output or "text" in output |
| 71 | + assert "choices" in output |
75 | 72 |
76 | 73 |
77 | 74 | def _assert_vllm_chat_prediction(predictor): |
78 | | - """Test vLLM inference using chat completions format.""" |
| 75 | + """Test vLLM inference using OpenAI-compatible chat completions API.""" |
79 | 76 | predictor.serializer = JSONSerializer() |
80 | 77 | predictor.deserializer = JSONDeserializer() |
81 | 78 |
82 | | - # vLLM chat completions format |
83 | 79 | data = { |
84 | | - "messages": [ |
85 | | - {"role": "user", "content": "What is Deep Learning?"} |
86 | | - ], |
| 80 | + "messages": [{"role": "user", "content": "What is Deep Learning?"}], |
87 | 81 | "max_tokens": 50, |
88 | 82 | "temperature": 0.7, |
89 | 83 | } |
90 | | - |
91 | | - LOGGER.info(f"Running chat inference with data: {data}") |
92 | 84 | output = predictor.predict(data) |
93 | | - LOGGER.info(f"Output: {json.dumps(output)}") |
94 | 85 |
95 | 86 | assert output is not None |
96 | 87 | assert "choices" in output |
97 | 88 |
98 | 89 |
99 | | -@pytest.mark.model("qwen3-0.6b") |
100 | | -@pytest.mark.processor("gpu") |
101 | | -@pytest.mark.gpu_test |
| 90 | +@pytest.mark.model("tiny-random-qwen3") |
102 | 91 | @pytest.mark.team("sagemaker-1p-algorithms") |
103 | 92 | def test_vllm_local_completions(ecr_image, sagemaker_local_session, instance_type): |
104 | 93 | """Test vLLM local deployment with completions API.""" |
105 | | - instance_type = instance_type if instance_type != "local" else "local_gpu" |
106 | | - with _predictor( |
107 | | - ecr_image, sagemaker_local_session, instance_type, "Qwen/Qwen3-0.6B" |
108 | | - ) as predictor: |
| 94 | + with _predictor(ecr_image, sagemaker_local_session, instance_type) as predictor: |
109 | 95 | _assert_vllm_prediction(predictor) |
110 | 96 |
111 | 97 |
112 | | -@pytest.mark.model("qwen3-0.6b") |
113 | | -@pytest.mark.processor("gpu") |
114 | | -@pytest.mark.gpu_test |
| 98 | +@pytest.mark.model("tiny-random-qwen3") |
115 | 99 | @pytest.mark.team("sagemaker-1p-algorithms") |
116 | 100 | def test_vllm_local_chat(ecr_image, sagemaker_local_session, instance_type): |
117 | 101 | """Test vLLM local deployment with chat completions API.""" |
118 | | - instance_type = instance_type if instance_type != "local" else "local_gpu" |
119 | | - with _predictor( |
120 | | - ecr_image, sagemaker_local_session, instance_type, "Qwen/Qwen3-0.6B" |
121 | | - ) as predictor: |
| 102 | + with _predictor(ecr_image, sagemaker_local_session, instance_type) as predictor: |
122 | 103 | _assert_vllm_chat_prediction(predictor) |
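
For readers reproducing this locally, here is a minimal sketch (not part of the diff) of how the `model_data_path` tarball consumed by `Model(model_data=f"file://{model_data_path}", ...)` might be produced. The repository id, output paths, and helper name are assumptions for illustration; SageMaker local mode simply extracts the archive contents into /opt/ml/model inside the container, which is where `SM_VLLM_MODEL` points.

```python
# Hypothetical helper: package local model weights as model.tar.gz for local mode.
# The repo id and paths below are placeholders, not values from this PR.
import tarfile
from pathlib import Path

from huggingface_hub import snapshot_download


def build_model_data(repo_id="tiny-random/qwen3", output_dir="resources/vllm"):
    """Download a small model and bundle it so SageMaker extracts it to /opt/ml/model."""
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    model_dir = Path(snapshot_download(repo_id, local_dir=out / "model"))
    tar_path = out / "model.tar.gz"
    with tarfile.open(tar_path, "w:gz") as tar:
        # Add files at the archive root so they land directly under /opt/ml/model.
        for item in model_dir.iterdir():
            tar.add(item, arcname=item.name)
    return tar_path
```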
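To make the assertions concrete, the shapes below illustrate the OpenAI-compatible responses vLLM returns for the two endpoints. Field values are illustrative and metadata varies by vLLM version, but both payloads carry the `choices` list that `_assert_vllm_prediction` and `_assert_vllm_chat_prediction` check for.

```python
# Illustrative response shapes (not captured from a real run).
completion_output = {
    "object": "text_completion",
    "choices": [{"index": 0, "text": " Deep Learning is ...", "finish_reason": "length"}],
}

chat_output = {
    "object": "chat.completion",
    "choices": [
        {"index": 0, "message": {"role": "assistant", "content": "Deep Learning is ..."}}
    ],
}

# Both satisfy the tests' shared check.
assert "choices" in completion_output and "choices" in chat_output
```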