Commit 99671c0

add test_llm_d_inference_sim end-to-end testing
run with the same `pdm run test:e2e`. this requires `llm-d-inference-sim` to be present in the local environment; see the module docstring for more information. the `e2e_test-on-change.yml` workflow has been updated to run on all `push` and `pull_request` events, not just those targeting the `main` and `feature/**` branches.
1 parent 28d4b2b · commit 99671c0

File tree: 8 files changed, +313 −21 lines

.github/workflows/e2e_test-on-change.yml

Lines changed: 10 additions & 20 deletions

@@ -2,34 +2,24 @@ name: E2E Test on change
 
 on:
   push:
-    branches:
-      - main
-      - 'feature/**'
   pull_request:
-    branches:
-      - main
-      - 'feature/**'
 
 jobs:
   e2e-tests:
     runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ['3.13']
     steps:
-      - name: Checkout Code
+      - name: Checkout code
         uses: actions/checkout@v4
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Set up PDM
-        uses: pdm-project/setup-pdm@v4
+
+      - name: Install Nix
+        uses: cachix/install-nix-action@v31
         with:
-          python-version: ${{ matrix.python-version }}
+          github_access_token: ${{ secrets.GITHUB_TOKEN }}
+
       - name: Install dependencies
         run: |
-          pdm sync -d
-      - name: Run e2e tests
+          nix develop -c pdm sync -d
+
+      - name: Run end-to-end tests
         run: |
-          pdm run test:e2e
+          nix develop -c pdm run test:e2e

e2e/testdata/models/.gitignore

Lines changed: 4 additions & 0 deletions

*
!.gitignore
!*.tar.gz
!*.tar.zst
e2e/testdata/models/google_gemma-3-270m.tar.gz

5.38 MB · Binary file not shown.
e2e/test_llm_d_inference_sim.py

Lines changed: 119 additions & 0 deletions

"""
End-to-end integration testing of inference-perf using llm-d-inference-sim[1].

In order for these tests to run, you must have `llm-d-inference-sim` in your
PATH. The GitHub Actions runner will have this, but you may also install it
locally by following llm-d-inference-sim's README or by entering the Nix shell
of this repository (i.e. `nix develop`).

If your local environment is missing `llm-d-inference-sim`, tests here will
automatically be skipped.

[1]: https://github.com/llm-d/llm-d-inference-sim
"""

import pytest

from utils.llm_d_inference_sim import LLMDInferenceSimRunner
from utils.benchmark import run_benchmark_minimal
from utils.testdata import extract_tarball


TEST_SIM_PORT = 18000
TEST_MODEL_NAME = "google/gemma-3-270m"
TEST_MODEL_TARBALL = "e2e/testdata/models/google_gemma-3-270m.tar.gz"


@pytest.mark.asyncio
@pytest.mark.skipif(not LLMDInferenceSimRunner.is_available(), reason="local environment missing llm-d-inference-sim")
@pytest.mark.parametrize(
    "data",
    [
        pytest.param(
            {
                "type": "mock",
            },
            id="data_mock",
        ),
        pytest.param(
            {
                "type": "shared_prefix",
                "shared_prefix": {
                    "num_groups": 256,
                    "num_prompts_per_group": 16,
                    "system_prompt_len": 512,
                    "question_len": 256,
                    "output_len": 256,
                },
            },
            id="data_shared_prefix",
        ),
    ],
)
@pytest.mark.parametrize(
    "load",
    [
        pytest.param(
            {
                "type": "constant",
                "stages": [{"rate": 1, "duration": 5}],
                "num_workers": 2,
            },
            id="load_constant_slow",
        ),
        pytest.param(
            {
                "type": "constant",
                "interval": 2,
                "stages": [{"rate": 1, "duration": 5}, {"rate": 2, "duration": 5}],
                "num_workers": 2,
            },
            id="load_constant_slow_two_stages",
        ),
        pytest.param(
            {
                "type": "constant",
                "stages": [{"rate": 100, "duration": 5}],
                "num_workers": 2,
            },
            id="load_constant_fast",
        ),
    ],
)
async def test_completion_successful_run(data: dict, load: dict):
    """
    Very simple inference-perf integration test that ensures a wide range of
    vLLM benchmarking configurations can run successfully.
    """
    config = {
        "data": data,
        "load": load,
        "api": {
            "type": "completion",
            "streaming": True,
        },
        "server": {
            "type": "vllm",
            "model_name": TEST_MODEL_NAME,
            "base_url": f"http://127.0.0.1:{TEST_SIM_PORT}",
            "ignore_eos": True,
        },
        "tokenizer": {
            "pretrained_model_name_or_path": str(extract_tarball(TEST_MODEL_TARBALL)),
        },
        "report": {
            "request_lifecycle": {
                "summary": True,
                "per_stage": True,
                "per_request": True,
            },
        },
    }

    async with LLMDInferenceSimRunner(TEST_MODEL_NAME, port=TEST_SIM_PORT):
        result = await run_benchmark_minimal(config)

    assert result.success, "Benchmark failed"
    assert result.reports, "No reports generated from benchmark"
    assert result.reports["summary_lifecycle_metrics.json"], "Missing summary report"
    assert result.reports["per_request_lifecycle_metrics.json"], "Missing requests report"
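The stacked `parametrize` decorators cross the two `data` cases with the three `load` cases, so the single test function expands into six test runs. A minimal sketch of that expansion (illustrative only; the IDs are the `pytest.param` IDs above, and this helper is not part of the commit):

```python
import itertools

# IDs from the pytest.param(...) declarations in the test above.
data_ids = ["data_mock", "data_shared_prefix"]
load_ids = ["load_constant_slow", "load_constant_slow_two_stages", "load_constant_fast"]

# Stacked @pytest.mark.parametrize decorators behave like a cross product:
# one generated test per (data, load) pair, six in total.
for data_id, load_id in itertools.product(data_ids, load_ids):
    print(data_id, load_id)
```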

e2e/utils/llm_d_inference_sim.py

Lines changed: 139 additions & 0 deletions

import aiohttp
import asyncio
import logging
import sys
import textwrap
import shutil
from contextlib import AsyncContextDecorator


logger = logging.getLogger(__name__)


class LLMDInferenceSimRunner(AsyncContextDecorator):
    @staticmethod
    def is_available(executable: str = "llm-d-inference-sim") -> bool:
        """
        Returns whether llm-d-inference-sim is present in the local
        environment.
        """
        return shutil.which(executable) is not None

    executable: str
    argv: list[str]

    _port: int
    _proc: asyncio.subprocess.Process | None = None
    _wait_until_ready: bool

    def __init__(
        self,
        model: str,
        *cmd_args: str,
        port: int = 8000,
        max_waiting_queue_length: int = 10000,
        executable: str = "llm-d-inference-sim",
        wait_until_ready=True,
    ) -> None:
        self.executable = executable
        self.argv = [
            *("--port", str(port)),
            *("--model", model),
            *("--max-waiting-queue-length", str(max_waiting_queue_length)),
            *cmd_args,
        ]
        self._port = port
        self._wait_until_ready = wait_until_ready

    async def __aenter__(self) -> "LLMDInferenceSimRunner":
        """
        Starts running the llm-d-inference-sim server in the background.
        Once the context manager exits, the server is stopped with a SIGTERM.
        """
        if not LLMDInferenceSimRunner.is_available(self.executable):
            raise FileNotFoundError(f"executable not found: {self.executable}")

        logger.debug(f"starting server: {self.argv=}")
        self._proc = await asyncio.create_subprocess_exec(
            self.executable,
            *self.argv,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.STDOUT,
        )

        if self._wait_until_ready:
            try:
                await self.wait_until_ready()
            except Exception:
                await self.__aexit__(*sys.exc_info())
                raise

        return self

    async def __aexit__(self, *exc):
        """
        Sends a SIGTERM to the server and waits briefly for it to stop,
        escalating to a SIGKILL if it is still running.
        """
        terminate_task = asyncio.create_task(self._terminate())
        await self._wait()
        await terminate_task

    async def wait_until_ready(
        self,
        polling_sec: float = 0.5,
        timeout_sec: float | None = 10,
    ) -> None:
        """Waits until the server is ready to serve requests."""
        assert self._proc

        async def wait_http():
            async with aiohttp.ClientSession() as http:
                while True:
                    try:
                        async with http.head(f"http://localhost:{self._port}") as resp:
                            await resp.read()
                            logger.debug(f"querying server's / endpoint returned {resp.status=}")
                            return True
                    except asyncio.exceptions.CancelledError:
                        raise
                    except asyncio.exceptions.TimeoutError:
                        raise
                    except Exception as e:
                        logger.debug(f"http polling error: {e}, retrying...")
                        await asyncio.sleep(polling_sec)
                        continue

        async def wait_proc():
            await self._wait()
            raise ConnectionRefusedError("server process exited before port was ready")

        done, pending = await asyncio.wait(
            [asyncio.create_task(x) for x in [wait_http(), wait_proc()]],
            return_when=asyncio.FIRST_COMPLETED,
            timeout=timeout_sec,
        )
        [task.cancel() for task in pending]  # cancel pending tasks
        [task.result() for task in done]  # ensure an exception is thrown

    async def _wait(self) -> None:
        proc = self._proc
        assert proc

        stdout, _ = await proc.communicate()
        stdout_pretty = textwrap.indent(stdout.decode(), " | ")
        logger.debug(f"server exited with status {proc.returncode}, output:\n{stdout_pretty}")

    async def _terminate(self) -> None:
        proc = self._proc
        assert proc

        try:
            proc.terminate()
            await asyncio.sleep(2)
            proc.kill()
        except ProcessLookupError:
            pass  # process already exited
        except Exception as e:
            logger.debug(f"server failed to be terminated: {e}")
            raise
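For orientation, a hypothetical standalone use of the runner outside pytest; this assumes `llm-d-inference-sim` is on PATH and that port 18000 is free, mirroring the constants in the test module (not part of the commit):

```python
import asyncio

from utils.llm_d_inference_sim import LLMDInferenceSimRunner


async def main() -> None:
    # __aenter__ spawns the simulator and polls the port until it accepts
    # HTTP requests; __aexit__ SIGTERMs the process and drains its output.
    async with LLMDInferenceSimRunner("google/gemma-3-270m", port=18000):
        ...  # issue requests against http://127.0.0.1:18000 here


asyncio.run(main())
```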

e2e/utils/testdata.py

Lines changed: 30 additions & 0 deletions

import os
import pathlib
import subprocess

TEST_E2E_DIR = pathlib.Path(__file__).parent.parent
TEST_E2E_TESTDATA = TEST_E2E_DIR.joinpath("testdata")


def extract_tarball(name: str | pathlib.Path) -> pathlib.Path:
    """
    Extracts the tarball at the given path into the directory that the
    tarball is in.

    The returned path is the directory containing the contents of the
    tarball, named after the tarball itself without its extensions.
    """
    name = pathlib.Path(name).resolve()

    dest = name
    while dest.suffix:
        dest = dest.with_suffix("")

    if not dest.is_dir():
        if not name.is_file():
            raise FileNotFoundError(f"Tarball {name} not found!")

        os.makedirs(dest)
        subprocess.run(["tar", "-xzvf", name, "-C", dest], check=True)

    return dest
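Used against the tarball added in this commit, the helper would behave roughly as follows (a sketch, assuming it is invoked from the repository root with `e2e` on the import path):

```python
from utils.testdata import extract_tarball

# The first call extracts into a sibling directory named after the tarball
# with all suffixes stripped; subsequent calls find that directory already
# present and skip the extraction entirely.
path = extract_tarball("e2e/testdata/models/google_gemma-3-270m.tar.gz")
print(path)  # .../e2e/testdata/models/google_gemma-3-270m
```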

flake.nix

Lines changed: 9 additions & 1 deletion

@@ -56,7 +56,15 @@
         (nixRunWrap "llm-d-inference-sim")
       ];
 
-      buildInputs = [ pkgs.python3Packages.venvShellHook ];
+      buildInputs =
+        with pkgs;
+        with python3Packages;
+        [
+          numpy
+          torch
+          venvShellHook
+        ];
+
       venvDir = "venv";
     };
 

pyproject.toml

Lines changed: 2 additions & 0 deletions

@@ -115,6 +115,8 @@ docstring-code-format = false
 docstring-code-line-length = "dynamic"
 
 [tool.pytest.ini_options]
+asyncio_mode = "auto"
+asyncio_default_fixture_loop_scope = "session"
 log_cli = true
 log_cli_level = "INFO"
 testpaths = ["."]
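A note on `asyncio_mode = "auto"`: in this mode pytest-asyncio collects bare `async def` test functions as asyncio tests, so the explicit `@pytest.mark.asyncio` marker in the new test module is redundant but harmless. A minimal sketch (hypothetical test, not part of the commit):

```python
import asyncio


# Under asyncio_mode = "auto", this coroutine runs as a test without an
# explicit @pytest.mark.asyncio marker.
async def test_event_loop_is_running() -> None:
    assert asyncio.get_running_loop() is not None
```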
