Commit fd2b4b5

Adding example for sagemaker testing with vLLM DLC

1 parent c9bcf6d commit fd2b4b5

4 files changed, +287 -0 lines changed

vllm-samples/sagemaker/README.md

Lines changed: 111 additions & 0 deletions
# AWS SageMaker vLLM Inference

Deploy and run inference on vLLM models using AWS SageMaker and the vLLM Deep Learning Container (DLC).

## Files

- `endpoint.py` - Deploy a vLLM model to a SageMaker endpoint
- `inference.py` - Run inference against the deployed endpoint
- `testNixlConnector.sh` - Local NixlConnector test script (see the Test NixlConnector section below)

## Prerequisites

- AWS CLI configured with appropriate permissions (a quick credential check is sketched below)
- HuggingFace token for model access (if required)
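A quick way to confirm credentials are in place before deploying (a minimal sketch; boto3 and the SageMaker SDK read the same credential chain as the AWS CLI):

```python
import boto3

# Prints the account and IAM identity that API calls will run as;
# raises an error if no credentials are configured.
identity = boto3.client("sts").get_caller_identity()
print(identity["Account"], identity["Arn"])
```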
## Setup

### Create IAM Role

```bash
# Create trust policy
cat > trust-policy.json << EOF
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Service": "sagemaker.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
}
EOF

# Create role
aws iam create-role --role-name SageMakerExecutionRole --assume-role-policy-document file://trust-policy.json

# Attach policies
aws iam attach-role-policy --role-name SageMakerExecutionRole --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
aws iam attach-role-policy --role-name SageMakerExecutionRole --policy-arn arn:aws:iam::aws:policy/AmazonElasticContainerRegistryPublicFullAccess
```
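If you need the role ARN for `endpoint.py`, one way to look it up (a minimal boto3 sketch, assuming the `SageMakerExecutionRole` created above):

```python
import boto3

# Fetch the ARN of the execution role created above
role_arn = boto3.client("iam").get_role(RoleName="SageMakerExecutionRole")["Role"]["Arn"]
print(role_arn)
```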
## Quick Start

### 1. Get Latest Image URI

```bash
# Check available images: https://gallery.ecr.aws/deep-learning-containers/vllm
# Get latest vLLM DLC image URI
export CONTAINER_URI="public.ecr.aws/deep-learning-containers/vllm:0.11.0-gpu-py312-cu128-ubuntu22.04-sagemaker-v1.1"
```

### 2. Deploy Endpoint

```bash
# Update the variables in endpoint.py and run
python endpoint.py
```
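`endpoint.py` deploys with `wait=False`, so the script returns before the endpoint is ready. A minimal sketch for checking deployment status with boto3 (use the endpoint name you set in `endpoint.py`):

```python
import boto3

sm = boto3.client("sagemaker")

# One-off status check: Creating -> InService (or Failed)
print(sm.describe_endpoint(EndpointName="<NAME>")["EndpointStatus"])

# Or block until the endpoint is in service
sm.get_waiter("endpoint_in_service").wait(EndpointName="<NAME>")
```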
### 3. Run Inference

```bash
# Update endpoint_name in inference.py and run
python inference.py
```
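If you prefer not to use the SageMaker Python SDK, the endpoint can also be invoked directly through the runtime API. A minimal sketch, assuming the container returns an OpenAI-style chat completion object (replace `<NAME>` with your endpoint name):

```python
import json

import boto3

runtime = boto3.client("sagemaker-runtime")

payload = {
    "messages": [{"role": "user", "content": "Write a haiku about GPUs"}],
    "max_tokens": 128,
}

response = runtime.invoke_endpoint(
    EndpointName="<NAME>",
    ContentType="application/json",
    Body=json.dumps(payload),
)

body = json.loads(response["Body"].read())
# In an OpenAI-style response the generated text is under choices[0].message.content
print(body["choices"][0]["message"]["content"])
```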
## Configuration

### Model Parameters

- `SM_VLLM_MODEL` - HuggingFace model ID
- `SM_VLLM_HF_TOKEN` - HuggingFace access token

### Inference Parameters

- `max_tokens` - Maximum response length
- `temperature` - Sampling randomness (0-1)
- `top_p` - Nucleus sampling threshold
- `top_k` - Top-k sampling limit

See the sketch below for where each group of parameters is applied.
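A minimal illustration using the same names as `endpoint.py` and `inference.py` (values are placeholders):

```python
# Container-level settings: passed as environment variables when the model is created
env = {
    "SM_VLLM_MODEL": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "SM_VLLM_HF_TOKEN": "<HF-TOKEN>",
}

# Request-level settings: included in each invocation payload
payload = {
    "messages": [{"role": "user", "content": "Hello"}],
    "max_tokens": 256,   # Maximum response length
    "temperature": 0.7,  # Sampling randomness (0-1)
    "top_p": 0.9,        # Nucleus sampling threshold
    "top_k": 50,         # Top-k sampling limit
}
```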
## Instance Types

Recommended GPU instances:

- `ml.g5.12xlarge` - 4 A10G GPUs, 48 vCPUs, 192 GB RAM
- `ml.g5.24xlarge` - 4 A10G GPUs, 96 vCPUs, 384 GB RAM
- `ml.p4d.24xlarge` - 8 A100 GPUs, 96 vCPUs, 1152 GB RAM
## Test NixlConnector

Test NixlConnector locally; see the [NixlConnector documentation](https://docs.vllm.ai/en/latest/features/nixl_connector_usage.html#transport-configuration) for transport configuration details.

```bash
# Pull the latest vLLM DLC
docker pull public.ecr.aws/deep-learning-containers/vllm:0.11.0-gpu-py312-cu128-ubuntu22.04-sagemaker-v1.1

# Run container with GPU access
docker run -it --entrypoint=/bin/bash --gpus=all \
  -v $(pwd):/workspace \
  public.ecr.aws/deep-learning-containers/vllm:0.11.0-gpu-py312-cu128-ubuntu22.04-sagemaker-v1.1

# Inside the container, run the NixlConnector test
export HF_TOKEN="<TOKEN>"
./testNixlConnector.sh
```
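Once `testNixlConnector.sh` has the proxy running, you can send it a quick smoke test. A minimal sketch, assuming the proxy forwards vLLM's OpenAI-compatible `/v1/completions` route on port 8192 (the port used in the script):

```python
import requests

# Simple completion request routed through the prefiller/decoder proxy
resp = requests.post(
    "http://localhost:8192/v1/completions",
    json={
        "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "prompt": "The capital of France is",
        "max_tokens": 32,
    },
    timeout=60,
)
print(resp.json())
```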
## Cleanup

```python
import boto3

sagemaker = boto3.client('sagemaker')
sagemaker.delete_endpoint(EndpointName='<endpoint-name>')
```
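Deleting the endpoint alone leaves the endpoint configuration and model behind. A fuller cleanup sketch (when deployed via `endpoint.py`, the configuration and model names typically match the endpoint name, but verify in the SageMaker console):

```python
import boto3

sm = boto3.client('sagemaker')
name = '<endpoint-name>'

sm.delete_endpoint(EndpointName=name)
sm.delete_endpoint_config(EndpointConfigName=name)  # usually created with the endpoint's name
sm.delete_model(ModelName=name)                     # endpoint.py sets Model(name=endpoint_name)
```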

vllm-samples/sagemaker/endpoint.py

Lines changed: 36 additions & 0 deletions

import boto3
import sagemaker
from sagemaker.model import Model

# Configuration - replace placeholders with actual values
aws_region = "<REGION>"
instance_type = "ml.g5.12xlarge"  # GPU instance for vLLM
iam_role = "<IAM-ROLE>"  # Execution role ARN (e.g. the SageMakerExecutionRole from the README)
endpoint_name = "<NAME>"
container_uri = "<IMAGE_URI>"  # DLC image with vLLM

# Session pinned to the configured region
session = sagemaker.Session(boto_session=boto3.Session(region_name=aws_region))

try:
    print(f"Starting deployment of endpoint: {endpoint_name}")
    print(f"Using image: {container_uri}")
    print(f"Instance type: {instance_type}")

    print("Creating SageMaker model...")

    model = Model(
        name=endpoint_name,
        image_uri=container_uri,
        role=iam_role,
        sagemaker_session=session,
        env={
            "SM_VLLM_MODEL": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",  # Model to load
            "SM_VLLM_HF_TOKEN": "<HF-TOKEN>",  # HuggingFace token for model access
        },
    )
    print("Model created successfully")
    print("Starting endpoint deployment (this may take 10-15 minutes)...")

    model.deploy(
        instance_type=instance_type,
        initial_instance_count=1,
        endpoint_name=endpoint_name,
        wait=False,  # Deploy asynchronously
    )
except Exception as e:
    print(f"Deployment failed: {str(e)}")
vllm-samples/sagemaker/inference.py

Lines changed: 67 additions & 0 deletions
import json

from sagemaker import serializers
from sagemaker.predictor import Predictor


def invoke_endpoint(endpoint_name, prompt, max_tokens=2400, temperature=0.01):
    """Invoke SageMaker endpoint with vLLM model for text generation"""
    try:
        predictor = Predictor(
            endpoint_name=endpoint_name,
            serializer=serializers.JSONSerializer(),
        )

        payload = {
            "messages": [{"role": "user", "content": prompt}],  # Chat format
            "max_tokens": max_tokens,  # Response length limit
            "temperature": temperature,  # Randomness (0=deterministic, 1=creative)
            "top_p": 0.9,  # Nucleus sampling
            "top_k": 50,  # Top-k sampling
        }

        response = predictor.predict(payload)

        # Handle different response formats
        if isinstance(response, bytes):
            response = response.decode("utf-8")

        if isinstance(response, str):
            try:
                response = json.loads(response)
            except json.JSONDecodeError:
                print("Warning: Response is not valid JSON. Returning as string.")

        return response

    except Exception as e:
        print(f"Inference failed: {str(e)}")
        return None


def main():
    endpoint_name = "<NAME>"  # Replace with actual endpoint name

    # Sample prompt for testing
    test_prompt = "Write a python code to generate n prime numbers"

    print("Sending request to endpoint...")
    response = invoke_endpoint(
        endpoint_name=endpoint_name,
        prompt=test_prompt,
        max_tokens=2400,  # Adjust based on expected response length
        temperature=0.01,  # Low temperature for consistent code generation
    )

    if response:
        print("\nResponse from endpoint:")
        if isinstance(response, (dict, list)):
            print(json.dumps(response, indent=2))
        else:
            print(response)
    else:
        print("No response received from the endpoint.")


if __name__ == "__main__":
    main()
vllm-samples/sagemaker/testNixlConnector.sh

Lines changed: 73 additions & 0 deletions
#!/bin/bash

# Function to wait for server to be ready
wait_for_server() {
    local host=$1
    local port=$2
    local timeout=120
    local count=0

    echo "Waiting for server at $host:$port to be ready..."
    while ! curl -s http://$host:$port/health > /dev/null 2>&1; do
        sleep 5
        count=$((count + 5))
        if [ $count -ge $timeout ]; then
            echo "Timeout waiting for server at $host:$port"
            return 1
        fi
    done
    echo "Server at $host:$port is ready"
}

# Start first GPU (prefiller)
echo "Starting prefiller on GPU 0..."
CUDA_VISIBLE_DEVICES=0 \
UCX_NET_DEVICES=all \
VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
    --port 8100 \
    --max-model-len 6000 \
    --enforce-eager \
    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
    > vllm_gpu0.log 2>&1 &

# Start second GPU (decoder)
echo "Starting decoder on GPU 1..."
CUDA_VISIBLE_DEVICES=1 \
UCX_NET_DEVICES=all \
VLLM_NIXL_SIDE_CHANNEL_PORT=5601 \
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
    --port 8200 \
    --max-model-len 6000 \
    --enforce-eager \
    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
    > vllm_gpu1.log 2>&1 &


# Wait for GPU servers
wait_for_server localhost 8100
wait_for_server localhost 8200

# Start proxy server
echo "Starting proxy server..."
python3 proxy.py \
    --host 0.0.0.0 \
    --port 8192 \
    --prefiller-hosts localhost \
    --prefiller-ports 8100 \
    --decoder-hosts localhost \
    --decoder-ports 8200 \
    > proxy_server.log 2>&1 &

# Wait for proxy server
wait_for_server localhost 8192

# The benchmark below expects the ShareGPT dataset; download it first if needed:
# wget -q https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
vllm bench serve \
    --host 0.0.0.0 \
    --port 8192 \
    --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
    --dataset-name sharegpt \
    --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \
    --num-prompts 30
