aws-samples
diff --git a/‎vllm-samples/sagemaker/README.md‎
Lines changed: 34 additions & 49 deletions b/‎vllm-samples/sagemaker/README.md‎
Lines changed: 34 additions & 49 deletions
diff --git a/‎vllm-samples/sagemaker/deploy_and_test_sm_endpoint.py‎
Lines changed: 176 additions & 0 deletions b/‎vllm-samples/sagemaker/deploy_and_test_sm_endpoint.py‎
Lines changed: 176 additions & 0 deletions
diff --git a/‎vllm-samples/sagemaker/endpoint.py‎
Lines changed: 0 additions & 36 deletions b/‎vllm-samples/sagemaker/endpoint.py‎
Lines changed: 0 additions & 36 deletions
@@ -4,8 +4,8 @@ Deploy and run inference on vLLM models using AWS SageMaker and vLLM DLC.
 
 ## Files
 
-- `endpoint.py` - Deploy vLLM model to SageMaker endpoint
-- `inference.py` - Run inference against deployed endpoint
+- `deploy_and_test_sm_endpoint.py` - Complete workflow: deploy, inference, and cleanup
+- `testNixlConnector.sh` - Multi-GPU NixlConnector test script
 
 ## Prerequisites
 
@@ -17,24 +17,8 @@ Deploy and run inference on vLLM models using AWS SageMaker and vLLM DLC.
 ### Create IAM Role
 
 ```bash
-# Create trust policy
-cat > trust-policy.json << EOF
-{
-  "Version": "2012-10-17",
-  "Statement": [
-    {
-      "Effect": "Allow",
-      "Principal": {
-        "Service": "sagemaker.amazonaws.com"
-      },
-      "Action": "sts:AssumeRole"
-    }
-  ]
-}
-EOF
-
 # Create role
-aws iam create-role --role-name SageMakerExecutionRole --assume-role-policy-document file://trust-policy.json
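+aws iam create-role --role-name SageMakerExecutionRole \
+  --assume-role-policy-document '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"Service":"sagemaker.amazonaws.com"},"Action":"sts:AssumeRole"}]}'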
 
 # Attach policies
 aws iam attach-role-policy --role-name SageMakerExecutionRole --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
@@ -43,39 +27,42 @@ aws iam attach-role-policy --role-name SageMakerExecutionRole --policy-arn arn:a
 
 ## Quick Start
 
-### 1. Get Latest Image URI
+### 1. Set Environment Variables
 
 ```bash
 # Check available images: https://gallery.ecr.aws/deep-learning-containers/vllm
-# Get latest vLLM DLC image URI
 export CONTAINER_URI="public.ecr.aws/deep-learning-containers/vllm:0.11.0-gpu-py312-cu128-ubuntu22.04-sagemaker-v1.1"
+export IAM_ROLE="SageMakerExecutionRole"
+export HF_TOKEN="your-huggingface-token" 
 ```
 
-### 2. Deploy Endpoint
-
-```bash
-# update variables in endpoint.py and run
-python endpoint.py
-```
-
-### 3. Run Inference
+### 2. Run Complete Workflow
 
 ```bash
-# update endpoint_name in inference.py and run
-python inference.py
+# Deploy, run inference, and cleanup automatically
+python deploy_and_test_sm_endpoint.py --endpoint-name vllm-test-$(date +%s) --prompt "Write a Python function to calculate fibonacci numbers"
+
+# Alternate with custom parameters
+python deploy_and_test_sm_endpoint.py \
+  --endpoint-name my-vllm-endpoint \
+  --model-id deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
+  --instance-type ml.g5.12xlarge \
+  --prompt "Explain machine learning" \
+  --max-tokens 1000 \
+  --temperature 0.7
 ```
 
-## Configuration
-
-### Model Parameters
-- `SM_VLLM_MODEL` - HuggingFace model ID
-- `SM_VLLM_HF_TOKEN` - HuggingFace access token
+## Command Line Options
 
-### Inference Parameters
-- `max_tokens` - Maximum response length
-- `temperature` - Sampling randomness (0-1)
-- `top_p` - Nucleus sampling threshold
-- `top_k` - Top-k sampling limit
+- `--endpoint-name` - SageMaker endpoint name (required)
+- `--container-uri` - DLC image URI (default from env)
+- `--iam-role` - IAM role ARN (default from env)
+- `--instance-type` - Instance type (default: ml.g5.12xlarge)
+- `--model-id` - HuggingFace model ID (default: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B)
+- `--hf-token` - HuggingFace token (default from env)
+- `--prompt` - Inference prompt (default: code generation example)
+- `--max-tokens` - Maximum response length (default: 2400)
+- `--temperature` - Sampling randomness 0-1 (default: 0.01)
 
 ## Instance Types
 
@@ -90,22 +77,20 @@ Test NixlConnector locally - [NixlConnector Documentation](https://docs.vllm.ai/
 
 ```bash
 # Pull latest vLLM DLC for EC2
-docker pull public.ecr.aws/deep-learning-containers/vllm:0.11.0-gpu-py312-cu128-ubuntu22.04-sagemaker-v1.1
+docker pull public.ecr.aws/deep-learning-containers/vllm:0.11-gpu-py312
 
 # Run container with GPU access
 docker run -it --entrypoint=/bin/bash --gpus=all \
   -v $(pwd):/workspace \
-  public.ecr.aws/deep-learning-containers/vllm:0.11.0-gpu-py312-cu128-ubuntu22.04-sagemaker-v1.1
+  public.ecr.aws/deep-learning-containers/vllm:0.11-gpu-py312
 
 # Inside container, run the NixlConnector test
 export HF_TOKEN="<TOKEN>"
 ./testNixlConnector.sh
 ```
 
-## Cleanup
+## Notes
 
-```python
-import boto3
-sagemaker = boto3.client('sagemaker')
-sagemaker.delete_endpoint(EndpointName='<endpoint-name>')
-```
+- The script automatically cleans up resources after inference to avoid ongoing costs
+- Deployment waits for endpoint to be ready before running inference
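+- The container URI, IAM role, and HuggingFace token can be set via environment variables (`CONTAINER_URI`, `IAM_ROLE`, `HF_TOKEN`) or command line arguments; all other parameters are set via command line arguments only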
@@ -0,0 +1,176 @@
+import argparse
+import json
+import os
+import sagemaker
+from sagemaker.model import Model
+from sagemaker import serializers
+from sagemaker.predictor import Predictor
+
+
+def deploy_endpoint(
+    endpoint_name, container_uri, iam_role, instance_type, model_id, hf_token
+):
+    """Deploy vLLM model to SageMaker endpoint"""
+    try:
+        print(f"Starting deployment of endpoint: {endpoint_name}")
+        print(f"Using image: {container_uri}")
+        print(f"Instance type: {instance_type}")
+
+        print("Creating SageMaker model...")
+        model = Model(
+            name=endpoint_name,
+            image_uri=container_uri,
+            role=iam_role,
+            env={
+                "SM_VLLM_MODEL": model_id,  # Model to load
+                "SM_VLLM_HF_TOKEN": hf_token,  # HuggingFace token for model access
+            },
+        )
+        print("Model created successfully")
+        print("Starting endpoint deployment (this may take 10-15 minutes)...")
+
+        model.deploy(
+            instance_type=instance_type,
+            initial_instance_count=1,
+            endpoint_name=endpoint_name,
+            wait=True,  # Wait for deployment to complete
+        )
+        print(f"Endpoint {endpoint_name} deployed successfully")
+        return True
+    except Exception as e:
+        print(f"Deployment failed: {str(e)}")
+        return False
+
+
+def cleanup_endpoint(endpoint_name):
+    """Delete SageMaker endpoint and model"""
+    try:
+        import boto3
+
+        sagemaker_client = boto3.client("sagemaker")
+
+        print(f"Cleaning up endpoint: {endpoint_name}")
+        sagemaker_client.delete_endpoint(EndpointName=endpoint_name)
+        sagemaker_client.delete_endpoint_config(EndpointConfigName=endpoint_name)
+        sagemaker_client.delete_model(ModelName=endpoint_name)
+        print(f"Endpoint {endpoint_name} cleaned up successfully")
+        return True
+    except Exception as e:
+        print(f"Cleanup failed: {str(e)}")
+        return False
+
+
+def invoke_endpoint(endpoint_name, prompt, max_tokens=2400, temperature=0.01):
+    """Invoke SageMaker endpoint with vLLM model for text generation"""
+    try:
+        predictor = Predictor(
+            endpoint_name=endpoint_name,
+            serializer=serializers.JSONSerializer(),
+        )
+
+        payload = {
+            "messages": [{"role": "user", "content": prompt}],  # Chat format
+            "max_tokens": max_tokens,  # Response length limit
+            "temperature": temperature,  # Randomness (0=deterministic, 1=creative)
+            "top_p": 0.9,  # Nucleus sampling
+            "top_k": 50,  # Top-k sampling
+        }
+
+        response = predictor.predict(payload)
+
+        # Handle different response formats
+        if isinstance(response, bytes):
+            response = response.decode("utf-8")
+
+        if isinstance(response, str):
+            try:
+                response = json.loads(response)
+            except json.JSONDecodeError:
+                print("Warning: Response is not valid JSON. Returning as string.")
+
+        return response
+
+    except Exception as e:
+        print(f"Inference failed: {str(e)}")
+        return None
+
+
+def main():
+    parser = argparse.ArgumentParser(description="SageMaker vLLM Inference")
+    parser.add_argument(
+        "--endpoint-name", required=True, help="SageMaker endpoint name"
+    )
+    parser.add_argument(
+        "--container-uri",
+        help="DLC image URI",
+        default=os.getenv(
+            "CONTAINER_URI",
+            "public.ecr.aws/deep-learning-containers/vllm:0.11.0-gpu-py312",
+        ),
+    )
+    parser.add_argument(
+        "--iam-role", help="IAM role ARN", default=os.getenv("IAM_ROLE")
+    )
+    parser.add_argument(
+        "--instance-type", default="ml.g5.12xlarge", help="Instance type"
+    )
+    parser.add_argument(
+        "--model-id",
+        default="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+        help="HuggingFace model ID",
+    )
+    parser.add_argument(
+        "--hf-token", help="HuggingFace token", default=os.getenv("HF_TOKEN", "")
+    )
+    parser.add_argument(
+        "--prompt",
+        default="Write a python code to generate n prime numbers",
+        help="Inference prompt",
+    )
+    parser.add_argument("--max-tokens", type=int, default=2400, help="Maximum tokens")
+    parser.add_argument(
+        "--temperature", type=float, default=0.01, help="Sampling temperature"
+    )
+
+    args = parser.parse_args()
+
+    if not args.iam_role:
+        print("Error: IAM role required")
+        return
+
+    # Deploy endpoint
+    if not deploy_endpoint(
+        args.endpoint_name,
+        args.container_uri,
+        args.iam_role,
+        args.instance_type,
+        args.model_id,
+        args.hf_token,
+    ):
+        return
+
+    # Run inference
+    print("\nSending request to endpoint...")
+    response = invoke_endpoint(
+        endpoint_name=args.endpoint_name,
+        prompt=args.prompt,
+        max_tokens=args.max_tokens,
+        temperature=args.temperature,
+    )
+
+    if response:
+        print("\nResponse from endpoint:")
+        if isinstance(response, (dict, list)):
+            print(json.dumps(response, indent=2))
+        else:
+            print(response)
+    else:
+        print("No response received from the endpoint.")
+
+    # Cleanup
+    print("\nCleaning up resources...")
+    cleanup_endpoint(args.endpoint_name)
+
+
+if __name__ == "__main__":
+    main()