103 changes: 45 additions & 58 deletions docs/tutorial.md
@@ -5,7 +5,8 @@ SPDX-License-Identifier: Apache-2.0

# Profiling with AIPerf

This tutorial shows how to measure model performance across different inference solutions using AIPerf.
This tutorial will demonstrate how you can use AIPerf to measure the performance of
models using various inference solutions.

### Table of Contents
- [Profile Qwen3-0.6B Using Dynamo](#dynamo-qwen3-0.6B)
@@ -16,6 +17,7 @@ This tutorial shows how to measure model performance across different inference
> [!NOTE]
> The latest installation instructions for Dynamo are available on [GitHub](https://github.com/ai-dynamo/dynamo?tab=readme-ov-file#1-initial-setup)

<!-- setup-dynamo-default-openai-endpoint-server -->
```bash
# Set environment variables
export AIPERF_REPO_TAG="main"
@@ -40,7 +42,10 @@ docker run \
--network host \
${DYNAMO_PREBUILT_IMAGE_TAG} \
/bin/bash -c "python3 -m dynamo.frontend & python3 -m dynamo.vllm --model ${MODEL} --enforce-eager --no-enable-prefix-caching" > server.log 2>&1 &
```
<!-- /setup-dynamo-default-openai-endpoint-server -->

```bash
# Set up AIPerf
docker run \
-it \
@@ -64,82 +69,64 @@ source .venv/bin/activate
git clone -b ${AIPERF_REPO_TAG} --depth 1 https://github.com/ai-dynamo/aiperf.git

uv pip install ./aiperf

# It can take some time for Dynamo to become ready.
# The following command returns when Dynamo is ready to accept requests.
while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"'"${MODEL}"'","messages":[{"role":"user","content":"a"}],"max_completion_tokens":1}')" != "200" ]; do sleep 1; done

```
<!-- health-check-dynamo-default-openai-endpoint-server -->
```bash
timeout 900 bash -c 'while [ "$(curl -s -o /dev/null -w "%{http_code}" localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d "{\"model\":\"Qwen/Qwen3-0.6B\",\"messages\":[{\"role\":\"user\",\"content\":\"a\"}],\"max_completion_tokens\":1}")" != "200" ]; do sleep 2; done' || { echo "Dynamo not ready after 15min"; exit 1; }
```
<!-- /health-check-dynamo-default-openai-endpoint-server -->
<!-- aiperf-run-dynamo-default-openai-endpoint-server -->
```bash
# Profile the model
aiperf profile \
--model Qwen/Qwen3-0.6B \
--endpoint-type chat \
--endpoint /v1/chat/completions \
--streaming \
--url localhost:8000 \
--synthetic-input-tokens-mean 1000 \
--url localhost:8080 \
--synthetic-input-tokens-mean 100 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean 2000 \
--output-tokens-mean 200 \
--output-tokens-stddev 0 \
--extra-inputs min_tokens:2000 \
--extra-inputs min_tokens:200 \
--extra-inputs ignore_eos:true \
--concurrency 2048 \
--request-count 6144 \
--warmup-request-count 1000 \
--conversation-num 8000 \
--random-seed 100 \
-v \
-H 'Authorization: Bearer NOT USED' \
-H 'Accept: text/event-stream'
--concurrency 4 \
--request-count 64 \
--warmup-request-count 1 \
--conversation-num 8 \
--random-seed 100
```

## Profile Qwen3-0.6B Using vLLM <a id="vllm-qwen3-0.6B">
```bash
# Install vLLM from pip:
pip install vllm
<!-- /aiperf-run-dynamo-default-openai-endpoint-server -->

# Load and run the model:
vllm serve "Qwen/Qwen3-0.6B"

uv venv
source .venv/bin/activate
uv pip install git+https://github.com/ai-dynamo/aiperf.git

aiperf profile \
--model Qwen/Qwen3-0.6B \
--endpoint-type chat \
--endpoint /v1/chat/completions \
--streaming \
--request-rate 1000 \
--request-count 6500
## Profile Qwen3-0.6B Using vLLM <a id="vllm-qwen3-0.6B">
<!-- setup-vllm-default-openai-endpoint-server -->
```bash
# Pull and run vLLM Docker container:
docker pull vllm/vllm-openai:latest
docker run --gpus all -p 8000:8000 vllm/vllm-openai:latest \
--model Qwen/Qwen3-0.6B \
--host 0.0.0.0 --port 8000
```
<!-- /setup-vllm-default-openai-endpoint-server -->

## Profile Qwen3-0.6B Using vLLM and Docker <a id="vllm-qwen3-0.6B-docker">


<!-- health-check-vllm-default-openai-endpoint-server -->
```bash
# Install the latest vLLM docker container:
docker run \
-it \
--rm \
--gpus all \
--network host \
vllm/vllm-openai:latest \
--model Qwen/Qwen3-0.6B

# In a separate terminal, ensure dependencies are installed
apt update && apt install -y curl git
curl -LsSf https://astral.sh/uv/install.sh | sh
uv venv --python 3.10
source .venv/bin/activate
timeout 900 bash -c 'while [ "$(curl -s -o /dev/null -w "%{http_code}" localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d "{\"model\":\"Qwen/Qwen3-0.6B\",\"messages\":[{\"role\":\"user\",\"content\":\"test\"}],\"max_tokens\":1}")" != "200" ]; do sleep 2; done' || { echo "vLLM not ready after 15min"; exit 1; }
```
<!-- /health-check-vllm-default-openai-endpoint-server -->

# Install and Run AIPerf
uv pip install git+https://github.com/ai-dynamo/aiperf.git

<!-- aiperf-run-vllm-default-openai-endpoint-server -->
```bash
# Profile the model
aiperf profile \
--model Qwen/Qwen3-0.6B \
--endpoint-type chat \
--endpoint /v1/chat/completions \
--streaming \
--request-rate 100 \
--request-count 650
```
--request-rate 32 \
--request-count 64 \
--url localhost:8000
```
<!-- /aiperf-run-vllm-default-openai-endpoint-server -->
22 changes: 0 additions & 22 deletions tests/ci/basic_end_to_end/start_server.sh

This file was deleted.

52 changes: 0 additions & 52 deletions tests/ci/basic_end_to_end/test.sh

This file was deleted.

17 changes: 0 additions & 17 deletions tests/ci/common/setup_test.sh

This file was deleted.

26 changes: 26 additions & 0 deletions tests/ci/test_docs_end_to_end/constants.py
@@ -0,0 +1,26 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Constants for the end-to-end testing framework.
"""

# Tag patterns
SETUP_TAG_PREFIX = "setup-"
HEALTH_CHECK_TAG_PREFIX = "health-check-"
AIPERF_RUN_TAG_PREFIX = "aiperf-run-"
TAG_SUFFIX = "endpoint-server"

# Tag lengths for parsing
SETUP_TAG_PREFIX_LEN = len(SETUP_TAG_PREFIX)
HEALTH_CHECK_TAG_PREFIX_LEN = len(HEALTH_CHECK_TAG_PREFIX)
AIPERF_RUN_TAG_PREFIX_LEN = len(AIPERF_RUN_TAG_PREFIX)
TAG_SUFFIX_LEN = len(TAG_SUFFIX)

# AIPerf execution
AIPERF_UI_TYPE = "simple"

# Timeouts
SETUP_MONITOR_TIMEOUT = 30 # seconds to monitor setup output
CONTAINER_BUILD_TIMEOUT = 600 # seconds for Docker build
CONTAINER_START_TIMEOUT = 60 # seconds for container startup
AIPERF_COMMAND_TIMEOUT = 300 # seconds for AIPerf commands
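
The tag constants above correspond to the HTML comment markers added to `docs/tutorial.md` (for example `<!-- setup-dynamo-default-openai-endpoint-server -->`). As a minimal sketch of how a server name could be recovered from such a tag using these constants, consider the snippet below; the real parsing lives in `parser.py`, which is not shown in this diff, and `server_name_from_tag` is a hypothetical helper invented for illustration.

```python
# Hypothetical sketch only -- the actual tag parsing is implemented in parser.py,
# which is not part of the lines shown in this diff.
from constants import (
    AIPERF_RUN_TAG_PREFIX,
    HEALTH_CHECK_TAG_PREFIX,
    SETUP_TAG_PREFIX,
    TAG_SUFFIX,
    TAG_SUFFIX_LEN,
)


def server_name_from_tag(tag: str) -> str | None:
    """Strip a known prefix and the common suffix to recover the server name.

    e.g. "setup-dynamo-default-openai-endpoint-server" -> "dynamo-default-openai"
    """
    if not tag.endswith(TAG_SUFFIX):
        return None
    for prefix in (SETUP_TAG_PREFIX, HEALTH_CHECK_TAG_PREFIX, AIPERF_RUN_TAG_PREFIX):
        if tag.startswith(prefix):
            # Drop the prefix, the suffix, and the hyphen joining name and suffix.
            return tag[len(prefix) : -(TAG_SUFFIX_LEN + 1)]
    return None
```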
28 changes: 28 additions & 0 deletions tests/ci/test_docs_end_to_end/data_types.py
@@ -0,0 +1,28 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Data models for the end-to-end testing framework.
"""

from dataclasses import dataclass


@dataclass
class Command:
    """Represents a command extracted from markdown"""

    tag_name: str
    command: str
    file_path: str
    start_line: int
    end_line: int


@dataclass
class Server:
    """Represents a server with its setup, health check, and aiperf commands"""

    name: str
    setup_command: Command | None
    health_check_command: Command | None
    aiperf_commands: list[Command]
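
To make the relationship between `Command` and `Server` concrete, here is a small illustrative construction; all field values below are invented for the example, and in practice these objects are built by the markdown parser rather than by hand.

```python
# Illustrative only -- field values are made up; the parser assembles these objects.
from data_types import Command, Server

setup = Command(
    tag_name="setup-vllm-default-openai-endpoint-server",
    command="docker run --gpus all -p 8000:8000 vllm/vllm-openai:latest --model Qwen/Qwen3-0.6B",
    file_path="docs/tutorial.md",
    start_line=1,  # placeholder line numbers for illustration
    end_line=1,
)

server = Server(
    name="vllm-default-openai",
    setup_command=setup,
    health_check_command=None,  # optional: main.py reports it as "MISSING" when absent
    aiperf_commands=[],
)
```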
85 changes: 85 additions & 0 deletions tests/ci/test_docs_end_to_end/main.py
@@ -0,0 +1,85 @@
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Simple end-to-end test tool for AIPerf documentation.
Parses markdown files for server setup and AIPerf run commands,
builds AIPerf container, and executes tests.
"""

import logging
import sys

from parser import MarkdownParser
from test_runner import TestRunner
from utils import get_repo_root, setup_logging

# Configure logging using centralized utility
setup_logging()
logger = logging.getLogger(__name__)


def main():
    """Main function"""
    import argparse

    parser = argparse.ArgumentParser(
        description="Run end-to-end tests from markdown documentation"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show discovered commands without executing",
    )
    parser.add_argument(
        "--all-servers",
        action="store_true",
        help="Run tests for all discovered servers",
    )
    args = parser.parse_args()

    # Get repository root using centralized function
    repo_root = get_repo_root()

    # Parse markdown files
    md_parser = MarkdownParser()
    servers = md_parser.parse_directory(str(repo_root))

    if not servers:
        logger.warning("No servers found")
        return 0

    logger.info(f"Discovered {len(servers)} servers:")
    for name, server in servers.items():
        setup_file = (
            server.setup_command.file_path if server.setup_command else "MISSING"
        )
        health_file = (
            server.health_check_command.file_path
            if server.health_check_command
            else "MISSING"
        )
        aiperf_count = len(server.aiperf_commands)
        logger.info(
            f" {name}: setup={setup_file}, health={health_file}, aiperf_commands={aiperf_count}"
        )

    if args.dry_run:
        logger.info("Dry run completed")
        return 0

    # Run tests
    runner = TestRunner()
    success = runner.run_tests(servers)

    if success:
        logger.info("All tests passed!")
        return 0
    else:
        logger.error("Some tests failed")
        return 1


if __name__ == "__main__":
    sys.exit(main())