83 changes: 83 additions & 0 deletions .github/workflows/e2e.yml
@@ -0,0 +1,83 @@
# This workflow builds the vLLM simulator binary via Docker, caches it based on
# the Dockerfile content, and then runs the e2e tests using that artifact.

name: E2E Tests

on:
push:
branches:
- main
pull_request:
branches:
- main
workflow_dispatch:

jobs:
build-and-test:
runs-on: ubuntu-latest

steps:
- name: Check out repository
uses: actions/checkout@v4

      # 1. Set up uv and Python, which are needed for the test step
- name: Set up Python
run: |
echo uv
curl -LsSf https://astral.sh/uv/install.sh | sh
chmod +x $HOME/.local/bin/uv $HOME/.local/bin/uvx
uv self update
uvx --version
echo python
uv python install 3.12 --default
python --version

# 2. Cache the binary artifact
# The key is based on the runner's OS and the hash of the Dockerfile.
# If the Dockerfile changes, the hash changes, and a new cache is created.
- name: Cache vLLM-sim binary
id: cache-vllm-sim
uses: actions/cache@v4
with:
# The path to the file you want to cache
path: bin/llm-d-inference-sim
# The unique key for the cache
key: vllm-sim-binary-${{ runner.os }}-${{ hashFiles('tests/e2e/vllm-sim.Dockerfile') }}

# 3. Set up Docker Buildx (required for the 'docker build -o' command)
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

# 4. Conditionally build the artifact
# This step only runs if the cache step above did NOT find a match.
# 'steps.cache-vllm-sim.outputs.cache-hit' will be 'true' if the cache was restored.
- name: Build vLLM-sim artifact (if not cached)
if: steps.cache-vllm-sim.outputs.cache-hit != 'true'
run: |
echo "Cache miss. Building artifact..."
docker build . -f tests/e2e/vllm-sim.Dockerfile -o type=local,dest=./
shell: bash

- name: Verify artifact
run: |
if [ -f "bin/llm-d-inference-sim" ]; then
echo "Artifact found."
else
echo "ERROR: Artifact bin/llm-d-inference-sim not found!"
exit 1
fi
shell: bash

- name: Cache dependencies
uses: actions/cache@v4
with:
path: ~/.cache/
key: ${{ runner.os }}
restore-keys: |
${{ runner.os }}

# 5. Run the e2e tests
# This step runs every time, using either the restored cache or the freshly built artifact.
- name: Run E2E tests
run: uvx --with tox-uv tox -e test-e2e
shell: bash
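The caching scheme above keys the prebuilt simulator binary on a hash of the Dockerfile, so the Docker build only reruns when `tests/e2e/vllm-sim.Dockerfile` changes. For readers who want the same skip-if-unchanged behavior locally, here is a minimal Python sketch of that idea; the script is illustrative, not part of this PR, and the stamp-file path is an assumption:

```python
# Illustrative local equivalent of the workflow's hashFiles()-based cache key;
# this script is not part of the repository, just a sketch of the idea.
import hashlib
import subprocess
from pathlib import Path

DOCKERFILE = Path("tests/e2e/vllm-sim.Dockerfile")
BINARY = Path("bin/llm-d-inference-sim")
STAMP = Path("bin/.vllm-sim.dockerfile.sha256")  # hypothetical stamp file


def dockerfile_hash() -> str:
    # Hash the Dockerfile content, mirroring hashFiles() in the workflow.
    return hashlib.sha256(DOCKERFILE.read_bytes()).hexdigest()


def build_if_needed() -> None:
    current = dockerfile_hash()
    if BINARY.exists() and STAMP.exists() and STAMP.read_text() == current:
        print("Cache hit: reusing existing bin/llm-d-inference-sim")
        return
    print("Cache miss. Building artifact...")
    subprocess.run(
        ["docker", "build", ".", "-f", str(DOCKERFILE), "-o", "type=local,dest=./"],
        check=True,
    )
    STAMP.write_text(current)


if __name__ == "__main__":
    build_if_needed()
```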
3 changes: 3 additions & 0 deletions .gitignore
@@ -230,3 +230,6 @@ src/ui/next-env.d.ts
!src/ui/public/manifest.json
!src/ui/serve.json
.eslintcache

# e2e tests
bin/
810 changes: 79 additions & 731 deletions pylock.toml

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -115,6 +115,7 @@ dev = [
"pytest-cov~=5.0.0",
"pytest-mock~=3.14.0",
"pytest-rerunfailures~=14.0",
"pytest-timeout~=2.4.0",
"respx~=0.22.0",

# code quality
17 changes: 17 additions & 0 deletions tests/e2e/README.md
@@ -0,0 +1,17 @@
# E2E tests

The E2E tests in GuideLLM use the [vLLM simulator by llm-d](https://llm-d.ai/docs/architecture/Components/inf-simulator). To build the simulator binary, run the following command:

```shell
docker build . -f tests/e2e/vllm-sim.Dockerfile -o type=local,dest=./
```

On macOS, run:
```shell
docker build . -f tests/e2e/vllm-sim.Dockerfile -o type=local,dest=./ --build-arg BUILDOS=darwin
```

Then, to run the tests:
```shell
tox -e test-e2e
```
78 changes: 78 additions & 0 deletions tests/e2e/test_max_error_benchmark.py
@@ -0,0 +1,78 @@
# E2E test for max error rate constraint functionality

from pathlib import Path

import pytest

from tests.e2e.utils import (
GuidellmClient,
assert_constraint_triggered,
assert_no_python_exceptions,
cleanup_report_file,
load_benchmark_report,
)
from tests.e2e.vllm_sim_server import VllmSimServer


@pytest.fixture(scope="module")
def server():
"""
Pytest fixture to start and stop the server for the entire module
    using the VllmSimServer class.
"""
server = VllmSimServer(
port=8000,
model="databricks/dolly-v2-12b",
mode="random",
time_to_first_token=1, # 1ms TTFT
inter_token_latency=1, # 1ms ITL
)
try:
server.start()
        yield server  # Yield the server for tests to use
finally:
server.stop() # Teardown: Stop the server after tests are done


@pytest.mark.timeout(30)
def test_max_error_benchmark(server: VllmSimServer):
"""
    Test that the max error rate constraint is properly triggered
    when the server goes down.
"""
report_path = Path("tests/e2e/max_error_benchmarks.json")
rate = 10
max_error_rate = 0.1

# Create and configure the guidellm client
client = GuidellmClient(target=server.get_url(), output_path=report_path)

try:
# Start the benchmark
client.start_benchmark(
rate=rate,
max_seconds=25,
max_error_rate=max_error_rate,
)

# Wait for the benchmark to complete (server will be stopped after 15 seconds)
client.wait_for_completion(timeout=30, stop_server_after=15, server=server)

# Assert no Python exceptions occurred
assert_no_python_exceptions(client.stderr)

# Load and validate the report
report = load_benchmark_report(report_path)
benchmark = report["benchmarks"][0]

# Check that the max error rate constraint was triggered
assert_constraint_triggered(
benchmark,
"max_error_rate",
{
"exceeded_error_rate": True,
"current_error_rate": lambda rate: rate >= max_error_rate,
},
)

finally:
cleanup_report_file(report_path)
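The `GuidellmClient` helper these tests drive comes from `tests/e2e/utils.py`, which is not included in this diff. As a rough sketch of what such a wrapper could look like, assuming the `guidellm benchmark` CLI and the flag names shown here (the class shape and flags are assumptions, not confirmed by this PR):

```python
# Hypothetical sketch of a guidellm CLI wrapper; tests/e2e/utils.py is not part
# of this diff, so the class shape and CLI flags below are assumptions.
import subprocess
from pathlib import Path


class GuidellmClientSketch:
    def __init__(self, target: str, output_path: Path):
        self.target = target
        self.output_path = output_path
        self.process: subprocess.Popen | None = None
        self.stderr: str = ""

    def start_benchmark(self, rate: float, max_seconds: int | None = None,
                        max_error_rate: float | None = None) -> None:
        # Launch the benchmark as a background process so the test can keep
        # interacting with the simulator (e.g. stop it mid-run).
        cmd = [
            "guidellm", "benchmark",
            "--target", self.target,
            "--rate", str(rate),
            "--output-path", str(self.output_path),
        ]
        if max_seconds is not None:
            cmd += ["--max-seconds", str(max_seconds)]
        if max_error_rate is not None:
            cmd += ["--max-error-rate", str(max_error_rate)]
        self.process = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
        )

    def wait_for_completion(self, timeout: int) -> None:
        # Capture stderr so a check like assert_no_python_exceptions() can
        # scan it for tracebacks afterwards.
        assert self.process is not None, "start_benchmark() must be called first"
        _, self.stderr = self.process.communicate(timeout=timeout)
```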
75 changes: 75 additions & 0 deletions tests/e2e/test_over_saturated_benchmark.py
@@ -0,0 +1,75 @@
from pathlib import Path

import pytest

from tests.e2e.utils import (
GuidellmClient,
assert_constraint_triggered,
assert_no_python_exceptions,
cleanup_report_file,
load_benchmark_report,
)
from tests.e2e.vllm_sim_server import VllmSimServer


@pytest.fixture(scope="module")
def server():
"""
Pytest fixture to start and stop the server for the entire module
    using the VllmSimServer class.
"""
server = VllmSimServer(
port=8000,
model="databricks/dolly-v2-12b",
mode="random",
        time_to_first_token=10000,  # 10s TTFT
        inter_token_latency=100,  # 100ms ITL
max_num_seqs=1,
)
try:
server.start()
        yield server  # Yield the server for tests to use
finally:
server.stop() # Teardown: Stop the server after tests are done


@pytest.mark.skip(reason="Skipping future feature test")
@pytest.mark.timeout(60)
def test_over_saturated_benchmark(server: VllmSimServer):
"""
    Test that the over-saturation stop constraint is properly triggered.
"""
report_path = Path("tests/e2e/over_saturated_benchmarks.json")
rate = 100

# Create and configure the guidellm client
client = GuidellmClient(target=server.get_url(), output_path=report_path)

cleanup_report_file(report_path)
# Start the benchmark
client.start_benchmark(
rate=rate,
max_seconds=20,
stop_over_saturated=True,
extra_env={
"GUIDELLM__CONSTRAINT_OVER_SATURATION_MIN_SECONDS": "0",
"GOMAXPROCS": "1",
},
)

# Wait for the benchmark to complete
client.wait_for_completion(timeout=55)

# Assert no Python exceptions occurred
assert_no_python_exceptions(client.stderr)

# Load and validate the report
report = load_benchmark_report(report_path)
benchmark = report["benchmarks"][0]

    # Check that the over-saturation constraint was triggered
assert_constraint_triggered(
benchmark, "stop_over_saturated", {"is_over_saturated": True}
)

cleanup_report_file(report_path)
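Likewise, `VllmSimServer` is defined in `tests/e2e/vllm_sim_server.py`, which this diff does not show. A minimal sketch of such a fixture wrapper, assuming it launches the `bin/llm-d-inference-sim` binary built above and that the simulator accepts flags of this shape (the flag names are guesses):

```python
# Hypothetical sketch of the simulator wrapper; tests/e2e/vllm_sim_server.py is
# not shown in this diff, so the binary's flag names below are guesses.
import subprocess
import time

import httpx  # assumed available in the test environment


class VllmSimServerSketch:
    def __init__(self, port: int, model: str, mode: str,
                 time_to_first_token: int = 0, inter_token_latency: int = 0,
                 max_num_seqs: int | None = None):
        self.port = port
        self.model = model
        self.mode = mode
        self.time_to_first_token = time_to_first_token
        self.inter_token_latency = inter_token_latency
        self.max_num_seqs = max_num_seqs
        self.process: subprocess.Popen | None = None

    def get_url(self) -> str:
        return f"http://localhost:{self.port}"

    def start(self, startup_timeout: float = 30.0) -> None:
        cmd = [
            "bin/llm-d-inference-sim",
            "--port", str(self.port),
            "--model", self.model,
            "--mode", self.mode,
            "--time-to-first-token", str(self.time_to_first_token),
            "--inter-token-latency", str(self.inter_token_latency),
        ]
        if self.max_num_seqs is not None:
            cmd += ["--max-num-seqs", str(self.max_num_seqs)]
        self.process = subprocess.Popen(cmd)
        # Poll the OpenAI-compatible /v1/models endpoint until the simulator
        # answers or the startup timeout expires.
        deadline = time.time() + startup_timeout
        while time.time() < deadline:
            try:
                if httpx.get(f"{self.get_url()}/v1/models", timeout=1.0).status_code == 200:
                    return
            except httpx.HTTPError:
                pass
            time.sleep(0.5)
        self.stop()
        raise RuntimeError("Simulator did not become ready in time")

    def stop(self) -> None:
        if self.process is not None and self.process.poll() is None:
            self.process.terminate()
            self.process.wait(timeout=10)
```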
6 changes: 0 additions & 6 deletions tests/e2e/test_placeholder.py

This file was deleted.

120 changes: 120 additions & 0 deletions tests/e2e/test_successful_benchmark.py
@@ -0,0 +1,120 @@
# E2E tests for successful benchmark scenarios with timing validation

from pathlib import Path

import pytest

from tests.e2e.utils import (
GuidellmClient,
assert_constraint_triggered,
assert_no_python_exceptions,
assert_successful_requests_fields,
cleanup_report_file,
load_benchmark_report,
)
from tests.e2e.vllm_sim_server import VllmSimServer


@pytest.fixture(scope="module")
def server():
"""
Pytest fixture to start and stop the server for the entire module
    using the VllmSimServer class.
"""
server = VllmSimServer(
port=8000,
model="databricks/dolly-v2-12b",
mode="random",
time_to_first_token=1, # 1ms TTFT
inter_token_latency=1, # 1ms ITL
)
try:
server.start()
        yield server  # Yield the server for tests to use
finally:
server.stop() # Teardown: Stop the server after tests are done


@pytest.mark.timeout(30)
def test_max_seconds_benchmark(server: VllmSimServer):
"""
Test that the max seconds constraint is properly triggered.
"""
report_path = Path("tests/e2e/max_duration_benchmarks.json")
rate = 10

# Create and configure the guidellm client
client = GuidellmClient(target=server.get_url(), output_path=report_path)

try:
# Start the benchmark
client.start_benchmark(
rate=rate,
max_seconds=1,
)

# Wait for the benchmark to complete
client.wait_for_completion(timeout=30)

# Assert no Python exceptions occurred
assert_no_python_exceptions(client.stderr)

# Load and validate the report
report = load_benchmark_report(report_path)
benchmark = report["benchmarks"][0]

# Check that the max duration constraint was triggered
assert_constraint_triggered(
benchmark, "max_seconds", {"duration_exceeded": True}
)

# Validate successful requests have all expected fields
successful_requests = benchmark["requests"]["successful"]
assert_successful_requests_fields(successful_requests)

finally:
cleanup_report_file(report_path)


@pytest.mark.timeout(30)
def test_max_requests_benchmark(server: VllmSimServer):
"""
Test that the max requests constraint is properly triggered.
"""
report_path = Path("tests/e2e/max_number_benchmarks.json")
rate = 10

# Create and configure the guidellm client
client = GuidellmClient(target=server.get_url(), output_path=report_path)

try:
# Start the benchmark
client.start_benchmark(
rate=rate,
max_requests=rate,
)

# Wait for the benchmark to complete
client.wait_for_completion(timeout=30)

# Assert no Python exceptions occurred
assert_no_python_exceptions(client.stderr)

# Load and validate the report
report = load_benchmark_report(report_path)
benchmark = report["benchmarks"][0]

# Check that the max requests constraint was triggered
assert_constraint_triggered(
benchmark, "max_requests", {"processed_exceeded": True}
)

# Validate successful requests have all expected fields
successful_requests = benchmark["requests"]["successful"]
assert len(successful_requests) == rate, (
f"Expected {rate} successful requests, got {len(successful_requests)}"
)
assert_successful_requests_fields(successful_requests)

finally:
cleanup_report_file(report_path)
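The assertion helpers (`assert_constraint_triggered`, `assert_no_python_exceptions`, `assert_successful_requests_fields`, ...) also live in `tests/e2e/utils.py`, outside this diff. One plausible shape for the constraint check, assuming the report nests per-constraint termination details under a `constraints` key (the exact report layout is an assumption):

```python
# Hypothetical sketch of the constraint assertion helper; tests/e2e/utils.py is
# not part of this diff, and the report structure accessed here is an assumption.
from typing import Any


def assert_constraint_triggered_sketch(
    benchmark: dict[str, Any],
    constraint_name: str,
    expected: dict[str, Any],
) -> None:
    # Look up the termination metadata recorded for the named constraint.
    constraints = benchmark.get("constraints", {})
    assert constraint_name in constraints, (
        f"Constraint '{constraint_name}' not found in benchmark report"
    )
    details = constraints[constraint_name]

    for key, expectation in expected.items():
        actual = details.get(key)
        if callable(expectation):
            # Callables (e.g. lambda rate: rate >= max_error_rate) express
            # open-ended checks instead of exact equality.
            assert expectation(actual), f"Check on '{key}' failed (value: {actual})"
        else:
            assert actual == expectation, f"Expected {key}={expectation}, got {actual}"
```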