83 changes: 83 additions & 0 deletions .github/workflows/e2e.yml
@@ -0,0 +1,83 @@
# This workflow builds the vLLM simulator binary via Docker, caches it based on
# the Dockerfile content, and then runs the e2e tests using that artifact.

name: E2E Tests

on:
push:
branches:
- main
pull_request:
branches:
- main
workflow_dispatch:

jobs:
build-and-test:
runs-on: ubuntu-latest

steps:
- name: Check out repository
uses: actions/checkout@v4

      # 1. Set up uv and Python, which are needed for the test step
- name: Set up Python
run: |
echo uv
curl -LsSf https://astral.sh/uv/install.sh | sh
chmod +x $HOME/.local/bin/uv $HOME/.local/bin/uvx
uv self update
uvx --version
echo python
uv python install 3.12 --default
python --version

# 2. Cache the binary artifact
# The key is based on the runner's OS and the hash of the Dockerfile.
# If the Dockerfile changes, the hash changes, and a new cache is created.
- name: Cache vLLM-sim binary
id: cache-vllm-sim
uses: actions/cache@v4
with:
# The path to the file you want to cache
path: bin/llm-d-inference-sim
# The unique key for the cache
key: vllm-sim-binary-${{ runner.os }}-${{ hashFiles('tests/e2e/vllm-sim.Dockerfile') }}

# 3. Set up Docker Buildx (required for the 'docker build -o' command)
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

# 4. Conditionally build the artifact
# This step only runs if the cache step above did NOT find a match.
# 'steps.cache-vllm-sim.outputs.cache-hit' will be 'true' if the cache was restored.
- name: Build vLLM-sim artifact (if not cached)
if: steps.cache-vllm-sim.outputs.cache-hit != 'true'
run: |
echo "Cache miss. Building artifact..."
docker build . -f tests/e2e/vllm-sim.Dockerfile -o type=local,dest=./
shell: bash

- name: Verify artifact
run: |
if [ -f "bin/llm-d-inference-sim" ]; then
echo "Artifact found."
else
echo "ERROR: Artifact bin/llm-d-inference-sim not found!"
exit 1
fi
shell: bash

- name: Cache dependencies
uses: actions/cache@v4
with:
path: ~/.cache/
key: ${{ runner.os }}
restore-keys: |
${{ runner.os }}

# 5. Run the e2e tests
# This step runs every time, using either the restored cache or the freshly built artifact.
- name: Run E2E tests
run: uvx --with tox-uv tox -e test-e2e
shell: bash
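The caching scheme above keys the prebuilt simulator binary on a hash of the Dockerfile, so the Docker build only reruns when `tests/e2e/vllm-sim.Dockerfile` changes. For readers who want the same skip-if-unchanged behavior locally, here is a minimal Python sketch of that idea; the script is illustrative, not part of this PR, and the stamp-file path is an assumption:

```python
# Illustrative local equivalent of the workflow's hashFiles()-based cache key;
# this script is not part of the repository, just a sketch of the idea.
import hashlib
import subprocess
from pathlib import Path

DOCKERFILE = Path("tests/e2e/vllm-sim.Dockerfile")
BINARY = Path("bin/llm-d-inference-sim")
STAMP = Path("bin/.vllm-sim.dockerfile.sha256")  # hypothetical stamp file


def dockerfile_hash() -> str:
    # Hash the Dockerfile content, mirroring hashFiles() in the workflow.
    return hashlib.sha256(DOCKERFILE.read_bytes()).hexdigest()


def build_if_needed() -> None:
    current = dockerfile_hash()
    if BINARY.exists() and STAMP.exists() and STAMP.read_text() == current:
        print("Cache hit: reusing existing bin/llm-d-inference-sim")
        return
    print("Cache miss. Building artifact...")
    subprocess.run(
        ["docker", "build", ".", "-f", str(DOCKERFILE), "-o", "type=local,dest=./"],
        check=True,
    )
    STAMP.write_text(current)


if __name__ == "__main__":
    build_if_needed()
```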
3 changes: 3 additions & 0 deletions .gitignore
@@ -230,3 +230,6 @@ src/ui/next-env.d.ts
!src/ui/public/manifest.json
!src/ui/serve.json
.eslintcache

# e2e tests
bin/
810 changes: 79 additions & 731 deletions pylock.toml

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -115,6 +115,7 @@ dev = [
"pytest-cov~=5.0.0",
"pytest-mock~=3.14.0",
"pytest-rerunfailures~=14.0",
"pytest-timeout~=2.4.0",
"respx~=0.22.0",

# code quality
17 changes: 17 additions & 0 deletions tests/e2e/README.md
@@ -0,0 +1,17 @@
# E2E tests

The E2E tests in GuideLLM use the [vLLM simulator by llm-d](https://llm-d.ai/docs/architecture/Components/inf-simulator). To build the simulator binary, run the following command:

```shell
docker build . -f tests/e2e/vllm-sim.Dockerfile -o type=local,dest=./
```

On macOS, run:
```shell
docker build . -f tests/e2e/vllm-sim.Dockerfile -o type=local,dest=./ --build-arg BUILDOS=darwin
```

Then, to run the tests:
```shell
tox -e test-e2e
```
78 changes: 78 additions & 0 deletions tests/e2e/test_max_error_benchmark.py
@@ -0,0 +1,78 @@
# E2E test for max error rate constraint functionality

from pathlib import Path

import pytest

from tests.e2e.utils import (
GuidellmClient,
assert_constraint_triggered,
assert_no_python_exceptions,
cleanup_report_file,
load_benchmark_report,
)
from tests.e2e.vllm_sim_server import VllmSimServer


@pytest.fixture(scope="module")
def server():
"""
Pytest fixture to start and stop the server for the entire module
    using the VllmSimServer class.
"""
server = VllmSimServer(
port=8000,
model="databricks/dolly-v2-12b",
mode="random",
time_to_first_token=1, # 1ms TTFT
inter_token_latency=1, # 1ms ITL
)
try:
server.start()
        yield server  # Yield the server for tests to use
finally:
server.stop() # Teardown: Stop the server after tests are done


@pytest.mark.timeout(30)
def test_max_error_benchmark(server: VllmSimServer):
"""
    Test that the max error rate constraint is properly triggered
    when the server goes down.
"""
report_path = Path("tests/e2e/max_error_benchmarks.json")
rate = 10
max_error_rate = 0.1

# Create and configure the guidellm client
client = GuidellmClient(target=server.get_url(), output_path=report_path)

try:
# Start the benchmark
client.start_benchmark(
rate=rate,
max_seconds=25,
max_error_rate=max_error_rate,
)

# Wait for the benchmark to complete (server will be stopped after 15 seconds)
client.wait_for_completion(timeout=30, stop_server_after=15, server=server)

# Assert no Python exceptions occurred
assert_no_python_exceptions(client.stderr)

# Load and validate the report
report = load_benchmark_report(report_path)
benchmark = report["benchmarks"][0]

# Check that the max error rate constraint was triggered
assert_constraint_triggered(
benchmark,
"max_error_rate",
{
"exceeded_error_rate": True,
"current_error_rate": lambda rate: rate >= max_error_rate,
},
)

finally:
cleanup_report_file(report_path)
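The `GuidellmClient` helper these tests drive comes from `tests/e2e/utils.py`, which is not included in this diff. As a rough sketch of what such a wrapper could look like, assuming the `guidellm benchmark` CLI and the flag names shown here (the class shape and flags are assumptions, not confirmed by this PR):

```python
# Hypothetical sketch of a guidellm CLI wrapper; tests/e2e/utils.py is not part
# of this diff, so the class shape and CLI flags below are assumptions.
import subprocess
from pathlib import Path


class GuidellmClientSketch:
    def __init__(self, target: str, output_path: Path):
        self.target = target
        self.output_path = output_path
        self.process: subprocess.Popen | None = None
        self.stderr: str = ""

    def start_benchmark(self, rate: float, max_seconds: int | None = None,
                        max_error_rate: float | None = None) -> None:
        # Launch the benchmark as a background process so the test can keep
        # interacting with the simulator (e.g. stop it mid-run).
        cmd = [
            "guidellm", "benchmark",
            "--target", self.target,
            "--rate", str(rate),
            "--output-path", str(self.output_path),
        ]
        if max_seconds is not None:
            cmd += ["--max-seconds", str(max_seconds)]
        if max_error_rate is not None:
            cmd += ["--max-error-rate", str(max_error_rate)]
        self.process = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
        )

    def wait_for_completion(self, timeout: int) -> None:
        # Capture stderr so a check like assert_no_python_exceptions() can
        # scan it for tracebacks afterwards.
        assert self.process is not None, "start_benchmark() must be called first"
        _, self.stderr = self.process.communicate(timeout=timeout)
```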
75 changes: 75 additions & 0 deletions tests/e2e/test_over_saturated_benchmark.py
@@ -0,0 +1,75 @@
from pathlib import Path

import pytest

from tests.e2e.utils import (
GuidellmClient,
assert_constraint_triggered,
assert_no_python_exceptions,
cleanup_report_file,
load_benchmark_report,
)
from tests.e2e.vllm_sim_server import VllmSimServer


@pytest.fixture(scope="module")
def server():
"""
Pytest fixture to start and stop the server for the entire module
    using the VllmSimServer class.
"""
server = VllmSimServer(
port=8000,
model="databricks/dolly-v2-12b",
mode="random",
        time_to_first_token=10000,  # 10s TTFT
        inter_token_latency=100,  # 100ms ITL
max_num_seqs=1,
)
try:
server.start()
        yield server  # Yield the server for tests to use
finally:
server.stop() # Teardown: Stop the server after tests are done


@pytest.mark.skip(reason="Skipping future feature test")
@pytest.mark.timeout(60)
def test_over_saturated_benchmark(server: VllmSimServer):
"""
    Test that the over-saturation stop constraint is properly triggered.
"""
report_path = Path("tests/e2e/over_saturated_benchmarks.json")
rate = 100

# Create and configure the guidellm client
client = GuidellmClient(target=server.get_url(), output_path=report_path)

cleanup_report_file(report_path)
# Start the benchmark
client.start_benchmark(
rate=rate,
max_seconds=20,
stop_over_saturated=True,
extra_env={
"GUIDELLM__CONSTRAINT_OVER_SATURATION_MIN_SECONDS": "0",
"GOMAXPROCS": "1",
},
)

# Wait for the benchmark to complete
client.wait_for_completion(timeout=55)

# Assert no Python exceptions occurred
assert_no_python_exceptions(client.stderr)

# Load and validate the report
report = load_benchmark_report(report_path)
benchmark = report["benchmarks"][0]

    # Check that the over-saturation constraint was triggered
assert_constraint_triggered(
benchmark, "stop_over_saturated", {"is_over_saturated": True}
)

cleanup_report_file(report_path)
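Likewise, `VllmSimServer` is defined in `tests/e2e/vllm_sim_server.py`, which this diff does not show. A minimal sketch of such a fixture wrapper, assuming it launches the `bin/llm-d-inference-sim` binary built above and that the simulator accepts flags of this shape (the flag names are guesses):

```python
# Hypothetical sketch of the simulator wrapper; tests/e2e/vllm_sim_server.py is
# not shown in this diff, so the binary's flag names below are guesses.
import subprocess
import time

import httpx  # assumed available in the test environment


class VllmSimServerSketch:
    def __init__(self, port: int, model: str, mode: str,
                 time_to_first_token: int = 0, inter_token_latency: int = 0,
                 max_num_seqs: int | None = None):
        self.port = port
        self.model = model
        self.mode = mode
        self.time_to_first_token = time_to_first_token
        self.inter_token_latency = inter_token_latency
        self.max_num_seqs = max_num_seqs
        self.process: subprocess.Popen | None = None

    def get_url(self) -> str:
        return f"http://localhost:{self.port}"

    def start(self, startup_timeout: float = 30.0) -> None:
        cmd = [
            "bin/llm-d-inference-sim",
            "--port", str(self.port),
            "--model", self.model,
            "--mode", self.mode,
            "--time-to-first-token", str(self.time_to_first_token),
            "--inter-token-latency", str(self.inter_token_latency),
        ]
        if self.max_num_seqs is not None:
            cmd += ["--max-num-seqs", str(self.max_num_seqs)]
        self.process = subprocess.Popen(cmd)
        # Poll the OpenAI-compatible /v1/models endpoint until the simulator
        # answers or the startup timeout expires.
        deadline = time.time() + startup_timeout
        while time.time() < deadline:
            try:
                if httpx.get(f"{self.get_url()}/v1/models", timeout=1.0).status_code == 200:
                    return
            except httpx.HTTPError:
                pass
            time.sleep(0.5)
        self.stop()
        raise RuntimeError("Simulator did not become ready in time")

    def stop(self) -> None:
        if self.process is not None and self.process.poll() is None:
            self.process.terminate()
            self.process.wait(timeout=10)
```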
6 changes: 0 additions & 6 deletions tests/e2e/test_placeholder.py

This file was deleted.

120 changes: 120 additions & 0 deletions tests/e2e/test_successful_benchmark.py
@@ -0,0 +1,120 @@
# E2E tests for successful benchmark scenarios with timing validation

from pathlib import Path

import pytest

from tests.e2e.utils import (
GuidellmClient,
assert_constraint_triggered,
assert_no_python_exceptions,
assert_successful_requests_fields,
cleanup_report_file,
load_benchmark_report,
)
from tests.e2e.vllm_sim_server import VllmSimServer


@pytest.fixture(scope="module")
def server():
"""
Pytest fixture to start and stop the server for the entire module
    using the VllmSimServer class.
"""
server = VllmSimServer(
port=8000,
model="databricks/dolly-v2-12b",
mode="random",
time_to_first_token=1, # 1ms TTFT
inter_token_latency=1, # 1ms ITL
)
try:
server.start()
        yield server  # Yield the server for tests to use
finally:
server.stop() # Teardown: Stop the server after tests are done


@pytest.mark.timeout(30)
def test_max_seconds_benchmark(server: VllmSimServer):
"""
Test that the max seconds constraint is properly triggered.
"""
report_path = Path("tests/e2e/max_duration_benchmarks.json")
rate = 10

# Create and configure the guidellm client
client = GuidellmClient(target=server.get_url(), output_path=report_path)

try:
# Start the benchmark
client.start_benchmark(
rate=rate,
max_seconds=1,
)

# Wait for the benchmark to complete
client.wait_for_completion(timeout=30)

# Assert no Python exceptions occurred
assert_no_python_exceptions(client.stderr)

# Load and validate the report
report = load_benchmark_report(report_path)
benchmark = report["benchmarks"][0]

# Check that the max duration constraint was triggered
assert_constraint_triggered(
benchmark, "max_seconds", {"duration_exceeded": True}
)

# Validate successful requests have all expected fields
successful_requests = benchmark["requests"]["successful"]
assert_successful_requests_fields(successful_requests)

finally:
cleanup_report_file(report_path)


@pytest.mark.timeout(30)
def test_max_requests_benchmark(server: VllmSimServer):
"""
Test that the max requests constraint is properly triggered.
"""
report_path = Path("tests/e2e/max_number_benchmarks.json")
rate = 10

# Create and configure the guidellm client
client = GuidellmClient(target=server.get_url(), output_path=report_path)

try:
# Start the benchmark
client.start_benchmark(
rate=rate,
max_requests=rate,
)

# Wait for the benchmark to complete
client.wait_for_completion(timeout=30)

# Assert no Python exceptions occurred
assert_no_python_exceptions(client.stderr)

# Load and validate the report
report = load_benchmark_report(report_path)
benchmark = report["benchmarks"][0]

# Check that the max requests constraint was triggered
assert_constraint_triggered(
benchmark, "max_requests", {"processed_exceeded": True}
)

# Validate successful requests have all expected fields
successful_requests = benchmark["requests"]["successful"]
assert len(successful_requests) == rate, (
f"Expected {rate} successful requests, got {len(successful_requests)}"
)
assert_successful_requests_fields(successful_requests)

finally:
cleanup_report_file(report_path)
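The assertion helpers (`assert_constraint_triggered`, `assert_no_python_exceptions`, `assert_successful_requests_fields`, ...) also live in `tests/e2e/utils.py`, outside this diff. One plausible shape for the constraint check, assuming the report nests per-constraint termination details under a `constraints` key (the exact report layout is an assumption):

```python
# Hypothetical sketch of the constraint assertion helper; tests/e2e/utils.py is
# not part of this diff, and the report structure accessed here is an assumption.
from typing import Any


def assert_constraint_triggered_sketch(
    benchmark: dict[str, Any],
    constraint_name: str,
    expected: dict[str, Any],
) -> None:
    # Look up the termination metadata recorded for the named constraint.
    constraints = benchmark.get("constraints", {})
    assert constraint_name in constraints, (
        f"Constraint '{constraint_name}' not found in benchmark report"
    )
    details = constraints[constraint_name]

    for key, expectation in expected.items():
        actual = details.get(key)
        if callable(expectation):
            # Callables (e.g. lambda rate: rate >= max_error_rate) express
            # open-ended checks instead of exact equality.
            assert expectation(actual), f"Check on '{key}' failed (value: {actual})"
        else:
            assert actual == expectation, f"Expected {key}={expectation}, got {actual}"
```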