Skip to content
Open
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 4 additions & 9 deletions .github/workflows/pre-commit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,11 @@ jobs:
with:
fetch-depth: 0

- name: Set up Python
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: "3.12"
- name: Install uv
uses: astral-sh/setup-uv@e58605a9b6da7c637471fab8847a5e5a6b8df081 # v5

- name: Install dependencies
run: |
python -m pip install pip==26.0.1
pip install -e ".[dev]"
run: uv sync --frozen --extra dev

- name: Run pre-commit
run: |
pre-commit run --all-files --show-diff-on-failure
run: uv run pre-commit run --all-files --show-diff-on-failure
29 changes: 6 additions & 23 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,26 +9,14 @@ on:
jobs:
test:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.12"]

steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: ${{ matrix.python-version }}

- name: Install dependencies
run: |
python -m pip install pip==26.0.1
pip install -e ".[test]"
- name: Install uv
uses: astral-sh/setup-uv@e58605a9b6da7c637471fab8847a5e5a6b8df081 # v5

- name: Run tests
run: |
pytest -xv -m "not slow and not performance" --cov=src --cov-report=xml --cov-report=html
run: uv run --frozen --extra test pytest -xv -m "not slow and not performance" --cov=src --cov-report=xml --cov-report=html

- name: Upload coverage report
uses: actions/upload-artifact@v4
Expand All @@ -43,13 +31,8 @@ jobs:
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

- name: Set up Python
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: "3.12"
- name: Install uv
uses: astral-sh/setup-uv@e58605a9b6da7c637471fab8847a5e5a6b8df081 # v5

- name: Audit dependencies for known vulnerabilities
run: |
python -m pip install pip==26.0.1
pip install -e ".[dev,test,performance]"
pip-audit
run: uv run --frozen --extra dev --extra test --extra performance pip-audit
7 changes: 7 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,10 @@ repos:
types: [python]
pass_filenames: true
exclude: ^(src/inference_endpoint/openai/openai_types_gen.py)$

- id: uv-lock-check
name: Check uv.lock is up-to-date
entry: uv lock --check
language: system
pass_filenames: false
files: ^pyproject\.toml$
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new uv-lock-check hook only runs when pyproject.toml changes (files: ^pyproject\.toml$). If uv.lock is modified (or accidentally corrupted) without a pyproject.toml change, this check won't run and an inconsistent lockfile could still be committed. Consider widening the files selector to also trigger on uv.lock changes (or remove the selector so it always runs when relevant).

Suggested change
files: ^pyproject\.toml$
files: ^(pyproject\.toml|uv\.lock)$

Copilot uses AI. Check for mistakes.
1 change: 1 addition & 0 deletions .python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.12
42 changes: 28 additions & 14 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,28 +10,42 @@ High-performance benchmarking tool for LLM inference endpoints targeting 50k+ QP

```bash
# Development setup
python3.12 -m venv venv && source venv/bin/activate
pip install -e ".[dev,test]"
pre-commit install
uv sync --extra dev --extra test
uv run pre-commit install

# Testing
pytest # All tests (excludes slow/performance)
pytest -m unit # Unit tests only
pytest -m integration # Integration tests only
pytest --cov=src --cov-report=html # With coverage
pytest -xvs tests/unit/path/to/test_file.py # Single test file
uv run pytest # All tests (excludes slow/performance)
uv run pytest -m unit # Unit tests only
uv run pytest -m integration # Integration tests only
uv run pytest --cov=src --cov-report=html # With coverage
uv run pytest -xvs tests/unit/path/to/test_file.py # Single test file

# Code quality (run before commits)
pre-commit run --all-files
uv run pre-commit run --all-files

# Local testing with echo server
python -m inference_endpoint.testing.echo_server --port 8765
inference-endpoint probe --endpoints http://localhost:8765 --model test-model
uv run python -m inference_endpoint.testing.echo_server --port 8765
uv run inference-endpoint probe --endpoints http://localhost:8765 --model test-model

# CLI usage
uv run inference-endpoint benchmark offline --endpoints URL --model NAME --dataset PATH
uv run inference-endpoint benchmark online --endpoints URL --model NAME --dataset PATH --load-pattern poisson --target-qps 100
uv run inference-endpoint benchmark from-config --config config.yaml
```

### Backward-compatible setup (pip + venv)

Does not use `uv.lock` — dependency versions may differ from the lockfile.

```bash
python3.12 -m venv venv && source venv/bin/activate
pip install -e ".[dev,test]"
pre-commit install

# After activating the venv, commands run without the `uv run` prefix:
pytest -m unit
pre-commit run --all-files
inference-endpoint benchmark offline --endpoints URL --model NAME --dataset PATH
inference-endpoint benchmark online --endpoints URL --model NAME --dataset PATH --load-pattern poisson --target-qps 100
inference-endpoint benchmark from-config --config config.yaml
```

## Architecture
Expand Down Expand Up @@ -345,5 +359,5 @@ Known failure modes when AI tools generate code for this project. Reference thes

### Dependency & Environment

- **Adding new dependencies without justification**: AI may `pip install` or add imports for packages not in `pyproject.toml`. Any new dependency must be justified, added to the correct optional group, and pinned to an exact version (`==`). After adding a dependency, run `pip-audit` (included in `dev` extras) to verify it has no known vulnerabilities.
- **Adding new dependencies without justification**: AI may `pip install` or add imports for packages not in `pyproject.toml`. Any new dependency must be justified, added to the correct optional group, and pinned to an exact version (`==`). After adding a dependency, run `pip-audit` (included in `dev` extras) to verify it has no known vulnerabilities. When adding dependencies, use `uv add <package>==<version>` to update both `pyproject.toml` and `uv.lock` atomically, then run `uv run pip-audit` to check for vulnerabilities.
- **Using `requests`/`aiohttp` for HTTP**: This project has its own HTTP client (`endpoint_client/http.py`) using `httptools`. AI defaults to `requests` or `aiohttp` — these should not appear in production code (test dependencies are fine).
62 changes: 44 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,81 +13,107 @@ A high-performance benchmarking tool for LLM endpoints.
# Note: This repo will be migrated to https://github.com/mlcommons/endpoints
git clone https://github.com/mlcommons/endpoints.git
cd endpoints
```

This project uses [uv](https://docs.astral.sh/uv/) for dependency management. All dependencies are pinned in `uv.lock`.

```bash
# Install dependencies
uv sync

# For development (includes linting, testing, and type-checking tools)
uv sync --extra dev --extra test
uv run pre-commit install
```

# Create virtual environment
python3.12 -m venv venv
source venv/bin/activate
<details>
<summary>Using pip + venv instead (backward-compatible)</summary>

# As a user
pip install .
> **Note:** pip installs from `pyproject.toml` directly and does not use `uv.lock`. Dependency versions may differ.

# As a developer (with development and test extras)
```bash
python3.12 -m venv venv && source venv/bin/activate
pip install -e ".[dev,test]"
pre-commit install
```

After activating the venv, all commands below work without the `uv run` prefix.

</details>

### Basic Usage

```bash
# Show help
inference-endpoint --help
uv run inference-endpoint --help

# Show system information
inference-endpoint -v info
uv run inference-endpoint -v info

# Test endpoint connectivity
inference-endpoint probe \
uv run inference-endpoint probe \
--endpoints http://your-endpoint:8000 \
--model Qwen/Qwen3-8B

# Run offline benchmark (max throughput - uses all dataset samples)
inference-endpoint benchmark offline \
uv run inference-endpoint benchmark offline \
--endpoints http://your-endpoint:8000 \
--model Qwen/Qwen3-8B \
--dataset tests/datasets/dummy_1k.jsonl

# Run online benchmark (sustained QPS - requires --target-qps, --load-pattern)
inference-endpoint benchmark online \
uv run inference-endpoint benchmark online \
--endpoints http://your-endpoint:8000 \
--model Qwen/Qwen3-8B \
--dataset tests/datasets/dummy_1k.jsonl \
--load-pattern poisson \
--target-qps 100

# With explicit sample count
inference-endpoint benchmark offline \
uv run inference-endpoint benchmark offline \
--endpoints http://your-endpoint:8000 \
--model Qwen/Qwen3-8B \
--dataset tests/datasets/dummy_1k.jsonl \
--num-samples 5000

# ... or activate the venv to skip the `uv run` prefix:
# source .venv/bin/activate
# inference-endpoint --help
# inference-endpoint benchmark offline --endpoints URL --model NAME --dataset PATH
```

### Running Locally

```bash
# Start local echo server
python3 -m inference_endpoint.testing.echo_server --port 8765 &
uv run python -m inference_endpoint.testing.echo_server --port 8765 &

# Test with dummy dataset (included in repo)
inference-endpoint benchmark offline \
uv run inference-endpoint benchmark offline \
--endpoints http://localhost:8765 \
--model Qwen/Qwen3-8B \
--dataset tests/datasets/dummy_1k.jsonl

# Stop echo server
pkill -f echo_server

# ... or with an activated venv (source .venv/bin/activate):
# python -m inference_endpoint.testing.echo_server --port 8765 &
# inference-endpoint benchmark offline --endpoints http://localhost:8765 --model Qwen/Qwen3-8B --dataset tests/datasets/dummy_1k.jsonl
```

See [Local Testing Guide](docs/LOCAL_TESTING.md) for detailed instructions.

### Running Tests and Examples

```bash
# Install test dependencies
pip install ".[test]"
uv run pytest -m "not performance and not run_explicitly"
uv run pytest -m unit
uv run pytest --cov=src --cov-report=html

# Run tests (excluding performance and explicit-run tests)
pytest -m "not performance and not run_explicitly"
# ... or with an activated venv (source .venv/bin/activate):
# pytest -m "not performance and not run_explicitly"
# pytest -m unit

# Run examples: follow instructions in examples/*/README.md
```
Expand Down
33 changes: 16 additions & 17 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,20 @@
[build-system]
requires = ["setuptools==78.1.1", "wheel==0.46.3"]
build-backend = "setuptools.build_meta"
requires = ["uv_build>=0.7.6,<0.8"]
build-backend = "uv_build"
Comment on lines 1 to +3
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Switching the build backend to uv_build changes how sdists/wheels are produced, but the CI workflows only run uv sync + pytest and don't exercise the packaging path. To catch packaging regressions early, consider adding a CI step that builds the distribution artifacts and performs a minimal smoke test (e.g., install the built wheel and import/run the CLI).

Copilot uses AI. Check for mistakes.

[tool.uv]
index-url = "https://pypi.org/simple"
environments = [
"sys_platform == 'linux' and platform_machine == 'x86_64'",
"sys_platform == 'linux' and platform_machine == 'aarch64'",
"sys_platform == 'darwin' and platform_machine == 'x86_64'",
"sys_platform == 'darwin' and platform_machine == 'arm64'",
]

[tool.uv-build]
module-root = "src"
data = {"inference_endpoint" = ["config/templates/*.yaml"]}
exclude = ["evaluation/livecodebench/_server.py"]

[project]
name = "inference-endpoint"
Expand Down Expand Up @@ -112,21 +126,6 @@ Documentation = "https://github.com/mlperf/inference-endpoint#readme"
Repository = "https://github.com/mlperf/inference-endpoint.git"
Issues = "https://github.com/mlperf/inference-endpoint/issues"

[tool.setuptools.packages.find]
where = ["src"]

[tool.setuptools.package-dir]
"" = "src"

[tool.setuptools.package-data]
inference_endpoint = ["config/templates/*.yaml"]

[tool.setuptools.exclude-package-data]
"inference_endpoint.evaluation.livecodebench" = ["_server.py"]

[tool.autopep8]
max_line_length = 88

[tool.ruff]
target-version = "py312"
line-length = 88
Expand Down
15 changes: 10 additions & 5 deletions scripts/Dockerfile.dev
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,25 @@

FROM python:3.12.11-slim

# Copy uv binary from official image
COPY --from=ghcr.io/astral-sh/uv:0.7.6 /uv /uvx /bin/

# Set environment variables
ENV PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1 \
PIP_NO_CACHE_DIR=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1
UV_COMPILE_BYTECODE=1 \
UV_LINK_MODE=copy

# Install system dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends build-essential procps \
apt-get install -y --no-install-recommends build-essential procps \
&& rm -rf /var/lib/apt/lists/*

RUN mkdir /mnt/inference-endpoint
WORKDIR /mnt/inference-endpoint
COPY pyproject.toml .

# Copy lockfile + project metadata first for Docker layer caching
COPY pyproject.toml uv.lock .python-version ./
COPY src/ ./src/

# Create a non-root user for security
Expand All @@ -33,4 +38,4 @@ RUN if ! getent group ${GROUP_ID}; then \
USER appuser
ENV PATH="/home/appuser/.local/bin:$PATH"

RUN pip install -e .[dev,test]
RUN uv sync --frozen --extra dev --extra test
Loading
Loading