Skip to content

Commit 46c6c48

Browse files
author
Wen Liang
committed
feat(quota): support per‑client and anonymous server‑side request quotas
Unrestricted API usage can lead to runaway costs and fragmented client-side throttling logic. This commit introduces a built-in quota mechanism at the server level, enabling operators to centrally enforce per-client and anonymous rate limits—without needing external proxies or client changes. This helps contain compute costs, enforces fair usage, and simplifies deployment and monitoring of Llama Stack services. Quotas are fully opt-in and have no effect unless explicitly configured. Currently, SQLite is the only supported KV store. If quotas are configured but authentication is disabled, authenticated limits will gracefully fall back to anonymous limits. Highlights: - Adds `QuotaMiddleware` to enforce request quotas: - Uses bearer token as client ID if present; otherwise falls back to IP address - Tracks requests in KV store with per-key TTL expiration - Returns HTTP 429 if a client exceeds their quota - Extends `ServerConfig` with a `quota` section: - `kvstore`: configuration for the backend (currently only SQLite) - `anonymous_max_requests`: per-period cap for unauthenticated clients - `authenticated_max_requests`: per-period cap for authenticated clients - `period`: duration of the quota window (currently only `day` is supported) - Adds full test coverage with FastAPI `TestClient` and custom middleware injection Behavior changes: - Quotas are disabled by default unless explicitly configured - Anonymous users get a conservative default quota; authenticated clients can be given more generous limits To enable per-client request quotas in `run.yaml`, add: ```yaml server: port: 8321 auth: provider_type: custom config: endpoint: https://auth.example.com/validate quota: kvstore: type: sqlite db_path: ./quotas.db anonymous_max_requests: 100 authenticated_max_requests: 1000 period: day ``` Signed-off-by: Wen Liang <[email protected]>
1 parent 7aae8fa commit 46c6c48

File tree

5 files changed

+299
-1
lines changed

5 files changed

+299
-1
lines changed

llama_stack/distribution/datatypes.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
from llama_stack.apis.vector_dbs import VectorDB, VectorDBInput
2626
from llama_stack.apis.vector_io import VectorIO
2727
from llama_stack.providers.datatypes import Api, ProviderSpec
28-
from llama_stack.providers.utils.kvstore.config import KVStoreConfig
28+
from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig
2929

3030
LLAMA_STACK_BUILD_CONFIG_VERSION = "2"
3131
LLAMA_STACK_RUN_CONFIG_VERSION = "2"
@@ -235,6 +235,19 @@ class AuthenticationConfig(BaseModel):
235235
)
236236

237237

238+
class QuotaPeriod(str, Enum):
    """Supported lengths of the window over which request quotas are counted."""

    # The only period currently defined; serialized as the string "day".
    DAY = "day"
240+
241+
242+
class QuotaConfig(BaseModel):
    """
    Settings for server-side request quotas.

    Both request caps must be non-negative: a negative cap would make every
    request exceed its quota, so such a value is rejected at validation time
    instead of silently blocking all traffic.
    """

    # Backend used to persist the per-client request counters.
    kvstore: SqliteKVStoreConfig = Field(description="Config for KV store backend (SQLite only for now)")
    # Cap applied to clients that are not authenticated.
    anonymous_max_requests: int = Field(
        default=100,
        ge=0,
        description="Max requests for unauthenticated clients per period",
    )
    # Cap applied to authenticated clients.
    authenticated_max_requests: int = Field(
        default=1000,
        ge=0,
        description="Max requests for authenticated clients per period",
    )
    # Length of the counting window.
    period: QuotaPeriod = Field(default=QuotaPeriod.DAY, description="Quota period to set")
249+
250+
238251
class ServerConfig(BaseModel):
239252
port: int = Field(
240253
default=8321,
@@ -262,6 +275,10 @@ class ServerConfig(BaseModel):
262275
default=False,
263276
description="Disable IPv6 support",
264277
)
278+
quota: QuotaConfig | None = Field(
279+
default=None,
280+
description="Per client quota request configuration",
281+
)
265282

266283

267284
class StackRunConfig(BaseModel):

llama_stack/distribution/server/auth.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,10 @@ async def __call__(self, scope, receive, send):
113113
"namespaces": [token],
114114
}
115115

116+
# Store the client ID in the request scope so that downstream middleware (like QuotaMiddleware)
117+
# can identify the requester and enforce per-client rate limits.
118+
scope["authenticated_client_id"] = token
119+
116120
# Store attributes in request scope
117121
scope["user_attributes"] = user_attributes
118122
logger.debug(f"Authentication successful: {len(scope['user_attributes'])} attributes")
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the terms described in the LICENSE file in
5+
# the root directory of this source tree.
6+
7+
import json
8+
import time
9+
from datetime import datetime, timedelta, timezone
10+
11+
from starlette.types import ASGIApp, Receive, Scope, Send
12+
13+
from llama_stack.log import get_logger
14+
from llama_stack.providers.utils.kvstore.api import KVStore
15+
from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig
16+
from llama_stack.providers.utils.kvstore.kvstore import kvstore_impl
17+
18+
logger = get_logger(name=__name__, category="quota")
19+
20+
21+
class QuotaMiddleware:
    """
    ASGI middleware that enforces separate request quotas for authenticated and
    anonymous clients within a fixed time window.

    - A request is treated as authenticated when the ASGI scope carries a truthy
      `authenticated_client_id` entry; that value identifies the client and
      `authenticated_max_requests` applies.
    - Otherwise the client is identified by the first element of `scope["client"]`
      (or the literal "anonymous" when the scope has no client entry) and
      `anonymous_max_requests` applies.

    Requests are counted per client and per window in a KV store. HTTP 429 is
    returned once a client exceeds its quota, and HTTP 500 is returned when the
    KV store cannot be read or written. Non-HTTP scopes are passed through
    without being counted.
    """

    def __init__(
        self,
        app: ASGIApp,
        kv_config: KVStoreConfig,
        anonymous_max_requests: int,
        authenticated_max_requests: int,
        window_seconds: int = 86400,
    ):
        """
        :param app: the wrapped ASGI application.
        :param kv_config: configuration of the KV store holding the counters.
        :param anonymous_max_requests: per-window cap for anonymous clients.
        :param authenticated_max_requests: per-window cap for authenticated clients.
        :param window_seconds: length of the quota window in seconds.
        """
        self.app = app
        self.kv_config = kv_config
        # Created lazily on the first HTTP request (see _get_kv).
        self.kv: KVStore | None = None
        self.anonymous_max_requests = anonymous_max_requests
        self.authenticated_max_requests = authenticated_max_requests
        self.window_seconds = window_seconds

        if isinstance(self.kv_config, SqliteKVStoreConfig):
            logger.warning(
                "QuotaMiddleware: Using SQLite backend. Expiry/TTL is not enforced; cleanup is manual. "
                f"window_seconds={self.window_seconds}"
            )

    async def _get_kv(self) -> KVStore:
        """Return the KV store, creating it from `kv_config` on first use."""
        if self.kv is None:
            self.kv = await kvstore_impl(self.kv_config)
        return self.kv

    async def __call__(self, scope: Scope, receive: Receive, send: Send):
        """Count the request against the caller's quota, then forward or reject it."""
        if scope["type"] != "http":
            # Only HTTP requests are subject to quotas.
            return await self.app(scope, receive, send)

        # Pick the counter key and the limit based on whether the request is authenticated.
        auth_id = scope.get("authenticated_client_id")
        if auth_id:
            key_id = auth_id
            limit = self.authenticated_max_requests
        else:
            # Fall back to the client's IP address.
            client = scope.get("client")
            key_id = client[0] if client else "anonymous"
            limit = self.anonymous_max_requests

        # Fixed windows: all requests in the same window share one counter key.
        current_window = int(time.time() // self.window_seconds)
        key = f"quota:{key_id}:{current_window}"
        # The counter is useless once its window is over, so it expires exactly at
        # the window end. The expiration is passed on EVERY write: a write without
        # it could otherwise drop the expiry recorded by the first request.
        window_end = datetime.fromtimestamp((current_window + 1) * self.window_seconds, tz=timezone.utc)

        try:
            kv = await self._get_kv()
            # NOTE(review): get followed by set is not atomic; concurrent requests
            # from the same client may under-count.
            prev = await kv.get(key) or "0"
            count = int(prev) + 1
            await kv.set(key, str(count), expiration=window_end)
        except Exception:
            # Fail closed: if usage cannot be tracked, the request is rejected.
            logger.exception("Failed to access KV store for quota")
            return await self._send_error(send, 500, "Quota service error")

        if count > limit:
            # NOTE(review): key_id is whatever upstream stored as authenticated_client_id;
            # if that is a raw bearer token it ends up in the KV key and in this log line.
            logger.warning(
                "Quota exceeded for client %s: %d/%d",
                key_id,
                count,
                limit,
            )
            return await self._send_error(send, 429, "Quota exceeded")

        return await self.app(scope, receive, send)

    async def _send_error(self, send: Send, status: int, message: str):
        """Send a complete JSON error response of the form {"error": {"message": ...}}."""
        body = json.dumps({"error": {"message": message}}).encode()
        await send(
            {
                "type": "http.response.start",
                "status": status,
                "headers": [
                    [b"content-type", b"application/json"],
                    # Explicit length so the server does not need chunked encoding.
                    [b"content-length", str(len(body)).encode()],
                ],
            }
        )
        await send({"type": "http.response.body", "body": body})

llama_stack/distribution/server/server.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060

6161
from .auth import AuthenticationMiddleware
6262
from .endpoints import get_all_api_endpoints
63+
from .quota import QuotaMiddleware
6364

6465
REPO_ROOT = Path(__file__).parent.parent.parent.parent
6566

@@ -432,6 +433,35 @@ def main(args: argparse.Namespace | None = None):
432433
if config.server.auth:
433434
logger.info(f"Enabling authentication with provider: {config.server.auth.provider_type.value}")
434435
app.add_middleware(AuthenticationMiddleware, auth_config=config.server.auth)
436+
else:
437+
if config.server.quota:
438+
quota = config.server.quota
439+
logger.warning(
440+
"Configured authenticated_max_requests (%d) but no auth is enabled; "
441+
"falling back to anonymous_max_requests (%d) for all the requests",
442+
quota.authenticated_max_requests,
443+
quota.anonymous_max_requests,
444+
)
445+
446+
if config.server.quota:
447+
logger.info("Enabling quota middleware for authenticated and anonymous clients")
448+
449+
quota = config.server.quota
450+
anonymous_max_requests = quota.anonymous_max_requests
451+
# if auth is disabled, use the anonymous max requests
452+
authenticated_max_requests = quota.authenticated_max_requests if config.server.auth else anonymous_max_requests
453+
454+
kv_config = quota.kvstore
455+
window_map = {"day": 86400}
456+
window_seconds = window_map[quota.period.value]
457+
458+
app.add_middleware(
459+
QuotaMiddleware,
460+
kv_config=kv_config,
461+
anonymous_max_requests=anonymous_max_requests,
462+
authenticated_max_requests=authenticated_max_requests,
463+
window_seconds=window_seconds,
464+
)
435465

436466
try:
437467
impls = asyncio.run(construct_stack(config))

tests/unit/server/test_quota.py

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the terms described in the LICENSE file in
5+
# the root directory of this source tree.
6+
7+
import os
8+
9+
import pytest
10+
from fastapi import FastAPI, Request
11+
from fastapi.testclient import TestClient
12+
from starlette.middleware.base import BaseHTTPMiddleware
13+
14+
from llama_stack.distribution.datatypes import QuotaConfig, QuotaPeriod
15+
from llama_stack.distribution.server.quota import QuotaMiddleware
16+
from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
17+
18+
TEST_DB_PATH = "./quotas_test.db"
19+
20+
21+
@pytest.fixture(autouse=True)
def clean_sqlite_db():
    """
    Remove the test DB file before and after each test.

    Cleaning up before the test guarantees a fresh set of counters; cleaning up
    afterwards keeps the last test from leaving the DB file behind in the
    working directory.
    """

    def _remove_db():
        try:
            os.remove(TEST_DB_PATH)
        except FileNotFoundError:
            # Nothing to clean up: the DB file was never created or is already gone.
            pass

    _remove_db()
    yield
    _remove_db()
28+
29+
30+
class InjectClientIDMiddleware(BaseHTTPMiddleware):
    """
    Test stand-in for AuthenticationMiddleware: stamps every request's scope
    with a fixed 'authenticated_client_id'.
    """

    def __init__(self, app, client_id="client1"):
        super().__init__(app)
        # Identifier written into the scope of each request passing through.
        self.client_id = client_id

    async def dispatch(self, request: Request, call_next):
        # Mark the request as authenticated before handing it to the wrapped app.
        request.scope["authenticated_client_id"] = self.client_id
        response = await call_next(request)
        return response
42+
43+
44+
def build_quota_config() -> QuotaConfig:
    """Return the quota settings shared by the tests: 1 anonymous and 2 authenticated requests per day."""
    settings = {
        "kvstore": SqliteKVStoreConfig(db_path=TEST_DB_PATH),
        "anonymous_max_requests": 1,
        "authenticated_max_requests": 2,
        "period": QuotaPeriod.DAY,
    }
    return QuotaConfig(**settings)
51+
52+
53+
@pytest.fixture(scope="function")
def auth_app(request):
    """
    Build a FastAPI app wrapped by QuotaMiddleware and, outside of it,
    InjectClientIDMiddleware, so every request reaches the quota layer as an
    authenticated client whose ID is derived from the running test's name.
    """
    api = FastAPI()

    @api.get("/test")
    async def test_endpoint():
        return {"message": "ok"}

    settings = build_quota_config()
    quota_layer = QuotaMiddleware(
        api,
        kv_config=settings.kvstore,
        anonymous_max_requests=settings.anonymous_max_requests,
        authenticated_max_requests=settings.authenticated_max_requests,
        window_seconds=86400,
    )
    # The injector must be the outer layer so the client ID is set before the quota check runs.
    return InjectClientIDMiddleware(quota_layer, client_id=f"client_{request.node.name}")
78+
79+
80+
def test_authenticated_quota_allows_up_to_limit(auth_app):
    """Two requests (the authenticated limit) must both succeed."""
    http = TestClient(auth_app)
    for _ in range(2):
        assert http.get("/test").status_code == 200
84+
85+
86+
def test_authenticated_quota_blocks_after_limit(auth_app):
    """The third request of an authenticated client (limit 2) must be rejected with 429."""
    client = TestClient(auth_app)
    # The requests within the limit must succeed; otherwise a 429 on the last
    # request would not prove that the limit itself triggered the rejection.
    assert client.get("/test").status_code == 200
    assert client.get("/test").status_code == 200

    resp = client.get("/test")
    assert resp.status_code == 429
    assert resp.json()["error"]["message"] == "Quota exceeded"
93+
94+
95+
def test_anonymous_quota_allows_up_to_limit():
    """A single anonymous request (the anonymous limit) must succeed."""
    api = FastAPI()

    @api.get("/test")
    async def test_endpoint():
        return {"message": "ok"}

    settings = build_quota_config()
    # No client-ID injection here, so the request is counted as anonymous.
    limited_app = QuotaMiddleware(
        api,
        kv_config=settings.kvstore,
        anonymous_max_requests=settings.anonymous_max_requests,
        authenticated_max_requests=settings.authenticated_max_requests,
        window_seconds=86400,
    )

    response = TestClient(limited_app).get("/test")
    assert response.status_code == 200
114+
115+
116+
def test_anonymous_quota_blocks_after_limit():
    """The second request of an anonymous client (limit 1) must be rejected with 429."""
    inner_app = FastAPI()

    @inner_app.get("/test")
    async def test_endpoint():
        return {"message": "ok"}

    quota = build_quota_config()

    # No client-ID injection here, so requests are counted as anonymous.
    app = QuotaMiddleware(
        inner_app,
        kv_config=quota.kvstore,
        anonymous_max_requests=quota.anonymous_max_requests,
        authenticated_max_requests=quota.authenticated_max_requests,
        window_seconds=86400,
    )

    client = TestClient(app)
    # The request within the limit must succeed; otherwise a 429 on the next
    # request would not prove that the limit itself triggered the rejection.
    assert client.get("/test").status_code == 200

    resp = client.get("/test")
    assert resp.status_code == 429
    assert resp.json()["error"]["message"] == "Quota exceeded"

0 commit comments

Comments
 (0)