Skip to content

Commit 66774be

Browse files
Wen Liangliangwen12year
Wen Liang
authored and committed
feat(quota): support per‑client and anonymous server‑side request quotas
Unrestricted usage can lead to runaway costs and fragmented client‑side workarounds. This commit introduces a native quota mechanism to the server, giving operators a unified, centrally managed throttle for both authenticated and anonymous requests—without needing extra proxies or custom client logic. This helps contain cloud‑compute expenses, enables fine‑grained usage control, and simplifies deployment and monitoring of Llama Stack services.

Quotas are fully opt‑in and have no effect unless explicitly configured. Note that authentication must be enabled for `authenticated_max_requests` to take effect; without it, quotas still work, but every client is treated as anonymous. Only `sqlite` is supported as a backend; any other `type` will be rejected.

Highlights:
- Adds `QuotaMiddleware` to enforce per‑client and anonymous quotas:
  - If a bearer token is present, uses that client ID; otherwise falls back to the client’s IP address
  - Tracks usage in a SQLite‑backed KV store with per‑key TTL
  - Returns HTTP 429 when the quota is exceeded
- Extends `ServerConfig` with a `quota` section that now supports:
  - `anonymous_max_requests` (max requests per period for unauthenticated clients)
  - `authenticated_max_requests` (max requests per period for authenticated clients)
  - `period` (length of the quota window; only "day" is supported for now)
  - `db_path` (path to the SQLite file)
- If `authenticated_max_requests` is configured but no auth provider is present, the server falls back to `anonymous_max_requests` for all requests.
Behavior changes:
- Quotas are disabled by default unless explicitly configured
- SQLite defaults to `./quotas.db` if no DB path is set
- Anonymous users get a low default quota; authenticated users can be given a higher one

To enable per-client request quotas in `run.yaml`, add:

```
server:
  port: 8321
  auth:
    provider_type: custom
    config:
      endpoint: https://auth.example.com/validate
  quota:
    type: sqlite
    config:
      anonymous_max_requests: 100
      authenticated_max_requests: 1000
      db_path: ./quotas.db
      period: day
```

Signed-off-by: Wen Liang <[email protected]>
1 parent 8e316c9 commit 66774be

File tree

5 files changed

+300
-0
lines changed

5 files changed

+300
-0
lines changed

llama_stack/distribution/datatypes.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,28 @@ class AuthenticationConfig(BaseModel):
234234
)
235235

236236

237+
class QuotaPeriod(str, Enum):
    """Length of a quota window, expressed as a string-valued enum."""

    # The only period defined so far.
    DAY = "day"
239+
240+
241+
class QuotaType(str, Enum):
    """Backend used to store quota counters, expressed as a string-valued enum."""

    # The only backend defined so far.
    SQLITE = "sqlite"
243+
244+
245+
class QuotaSqliteConfig(BaseModel):
    """Settings for request quotas backed by a SQLite file."""

    # Per-period request ceiling applied to clients without an authenticated ID.
    anonymous_max_requests: int = Field(default=100, description="Max requests for unauthenticated clients per period")
    # Per-period request ceiling applied to authenticated clients.
    authenticated_max_requests: int = Field(
        default=1000, description="Max requests for authenticated clients per period"
    )
    # Location of the SQLite file holding the counters.
    db_path: str = Field(default="./quotas.db", description="Path to the SQLite DB file")
    # Length of the counting window.
    period: QuotaPeriod = Field(default=QuotaPeriod.DAY, description="Quota period to set")
252+
253+
254+
class QuotaConfig(BaseModel):
    """Top-level quota configuration: a backend type plus that backend's settings."""

    # Required; there is no default backend.
    type: QuotaType = Field(description="Quota backend type. Only 'sqlite' is supported at this time")
    # Backend-specific settings (limits, DB path, period).
    config: QuotaSqliteConfig
257+
258+
237259
class ServerConfig(BaseModel):
238260
port: int = Field(
239261
default=8321,
@@ -261,6 +283,10 @@ class ServerConfig(BaseModel):
261283
default=False,
262284
description="Disable IPv6 support",
263285
)
286+
quota: QuotaConfig | None = Field(
287+
default=None,
288+
description="Per client quota request configuration",
289+
)
264290

265291

266292
class StackRunConfig(BaseModel):

llama_stack/distribution/server/auth.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ async def __call__(self, scope, receive, send):
113113
"namespaces": [token],
114114
}
115115

116+
scope["authenticated_client_id"] = token
117+
116118
# Store attributes in request scope
117119
scope["user_attributes"] = user_attributes
118120
logger.debug(f"Authentication successful: {len(scope['user_attributes'])} attributes")
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the terms described in the LICENSE file in
5+
# the root directory of this source tree.
6+
7+
import json
8+
import time
9+
from datetime import datetime, timedelta, timezone
10+
11+
from starlette.types import ASGIApp, Receive, Scope, Send
12+
13+
from llama_stack.log import get_logger
14+
from llama_stack.providers.utils.kvstore.api import KVStore
15+
from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig
16+
from llama_stack.providers.utils.kvstore.kvstore import kvstore_impl
17+
18+
logger = get_logger(name=__name__, category="quota")
19+
20+
21+
class QuotaMiddleware:
    """
    ASGI middleware that enforces separate quotas for authenticated and anonymous clients
    within a configurable time window.

    - For authenticated requests, the client ID is read from
      `scope["authenticated_client_id"]`. This middleware does not parse the
      `Authorization` header itself, so something upstream must populate that key.
    - For anonymous requests, it falls back to the IP address of the client
      (or the literal "anonymous" when the scope carries no client address).

    Requests are counted in a KV store (e.g., SQLite) under the key
    `quota:<client>:<window index>`, and HTTP 429 is returned once a client
    exceeds its quota. A failure while talking to the KV store is answered
    with HTTP 500.

    NOTE(review): the counter is updated with a separate `get` followed by a
    `set`, so concurrent requests from the same client may be under-counted.
    """

    def __init__(
        self,
        app: ASGIApp,
        kv_config: KVStoreConfig,
        anonymous_max_requests: int,
        authenticated_max_requests: int,
        window_seconds: int = 86400,
    ):
        """
        Store the wrapped app and the quota settings.

        :param app: downstream ASGI application that receives allowed requests
        :param kv_config: configuration used to create the KV store on first use
        :param anonymous_max_requests: per-window limit for clients without an authenticated ID
        :param authenticated_max_requests: per-window limit for clients with an authenticated ID
        :param window_seconds: length of one quota window in seconds
        """
        self.app = app
        self.kv_config = kv_config
        # Created lazily by _get_kv() on the first metered request.
        self.kv: KVStore | None = None
        self.anonymous_max_requests = anonymous_max_requests
        self.authenticated_max_requests = authenticated_max_requests
        self.window_seconds = window_seconds

        if isinstance(self.kv_config, SqliteKVStoreConfig):
            logger.warning(
                "QuotaMiddleware: Using SQLite backend. Expiry/TTL is not enforced; cleanup is manual. "
                f"window_seconds={self.window_seconds}"
            )

    async def _get_kv(self) -> KVStore:
        """Return the KV store, creating it from `kv_config` on first use."""
        if self.kv is None:
            self.kv = await kvstore_impl(self.kv_config)
        return self.kv

    async def __call__(self, scope: Scope, receive: Receive, send: Send):
        """
        Count the request against the caller's quota and either forward it to the
        wrapped app or answer with 429 (quota exceeded) / 500 (KV store failure).
        """
        # Only HTTP requests are metered; every other scope type passes straight through.
        if scope["type"] != "http":
            return await self.app(scope, receive, send)

        # pick key & limit based on auth
        auth_id = scope.get("authenticated_client_id")
        if auth_id:
            key_id = auth_id
            limit = self.authenticated_max_requests
        else:
            # fallback to IP
            client = scope.get("client")
            key_id = client[0] if client else "anonymous"
            limit = self.anonymous_max_requests

        current_window = int(time.time() // self.window_seconds)
        key = f"quota:{key_id}:{current_window}"
        # The key embeds the window index, so it is never read again once the
        # window closes; expire it exactly at the window boundary.
        expiration = datetime.fromtimestamp((current_window + 1) * self.window_seconds, tz=timezone.utc)

        try:
            kv = await self._get_kv()
            prev = await kv.get(key) or "0"
            count = int(prev) + 1
            # Send the expiry on every write, not just the first one: a write that
            # omits it may clear the previously stored expiry, depending on how the
            # backend upserts rows (TODO confirm against the KVStore implementation).
            await kv.set(key, str(count), expiration=expiration)
        except Exception:
            logger.exception("Failed to access KV store for quota")
            return await self._send_error(send, 500, "Quota service error")

        if count > limit:
            logger.warning(
                "Quota exceeded for client %s: %d/%d",
                key_id,
                count,
                limit,
            )
            return await self._send_error(send, 429, "Quota exceeded")

        return await self.app(scope, receive, send)

    async def _send_error(self, send: Send, status: int, message: str):
        """Send a complete JSON error response of the form {"error": {"message": ...}}."""
        await send(
            {
                "type": "http.response.start",
                "status": status,
                "headers": [[b"content-type", b"application/json"]],
            }
        )
        body = json.dumps({"error": {"message": message}}).encode()
        await send({"type": "http.response.body", "body": body})

llama_stack/distribution/server/server.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
from llama_stack.providers.inline.telemetry.meta_reference.telemetry import (
5252
TelemetryAdapter,
5353
)
54+
from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
5455
from llama_stack.providers.utils.telemetry.tracing import (
5556
CURRENT_TRACE_CONTEXT,
5657
end_trace,
@@ -60,6 +61,7 @@
6061

6162
from .auth import AuthenticationMiddleware
6263
from .endpoints import get_all_api_endpoints
64+
from .quota import QuotaMiddleware
6365

6466
REPO_ROOT = Path(__file__).parent.parent.parent.parent
6567

@@ -432,6 +434,37 @@ def main(args: argparse.Namespace | None = None):
432434
if config.server.auth:
433435
logger.info(f"Enabling authentication with provider: {config.server.auth.provider_type.value}")
434436
app.add_middleware(AuthenticationMiddleware, auth_config=config.server.auth)
437+
else:
438+
if config.server.quota:
439+
qc = config.server.quota.config
440+
logger.warning(
441+
"Configured authenticated_max_requests (%d) but no auth is enabled; "
442+
"falling back to anonymous_max_requests (%d) for all the requests",
443+
qc.authenticated_max_requests,
444+
qc.anonymous_max_requests,
445+
)
446+
447+
if config.server.quota:
448+
logger.info("Enabling per-client quota middleware")
449+
450+
quota_conf = config.server.quota.config
451+
anonymous_max_requests = quota_conf.anonymous_max_requests
452+
# if auth is disabled, use the anonymous max requests
453+
authenticated_max_requests = (
454+
quota_conf.authenticated_max_requests if config.server.auth else anonymous_max_requests
455+
)
456+
457+
kv_config = SqliteKVStoreConfig(db_path=quota_conf.db_path)
458+
window_map = {"day": 86400}
459+
window_seconds = window_map[quota_conf.period.value]
460+
461+
app.add_middleware(
462+
QuotaMiddleware,
463+
kv_config=kv_config,
464+
anonymous_max_requests=anonymous_max_requests,
465+
authenticated_max_requests=authenticated_max_requests,
466+
window_seconds=window_seconds,
467+
)
435468

436469
try:
437470
impls = asyncio.run(construct_stack(config))

tests/unit/server/test_quota.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the terms described in the LICENSE file in
5+
# the root directory of this source tree.
6+
7+
import os
8+
9+
import pytest
10+
from fastapi import FastAPI, Request
11+
from fastapi.testclient import TestClient
12+
from starlette.middleware.base import BaseHTTPMiddleware
13+
14+
from llama_stack.distribution.server.quota import QuotaMiddleware
15+
from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
16+
17+
18+
@pytest.fixture(autouse=True)
def clean_sqlite_db():
    """
    Remove the quotas_test.db file before and after each test, so no state leaks
    between tests and nothing is left on disk once the run finishes.
    """
    db_path = "./quotas_test.db"

    def _remove_db():
        if os.path.exists(db_path):
            os.remove(db_path)

    _remove_db()
    yield
    # Teardown: the test that just ran may have created the file.
    _remove_db()
26+
27+
28+
class InjectClientIDMiddleware(BaseHTTPMiddleware):
    """
    Middleware that injects 'authenticated_client_id' to mimic AuthenticationMiddleware.
    """

    def __init__(self, app, client_id="client1"):
        super().__init__(app)
        # Value written into every request scope by dispatch().
        self.client_id = client_id

    async def dispatch(self, request: Request, call_next):
        # Set the scope key before handing the request to the wrapped app.
        request.scope["authenticated_client_id"] = self.client_id
        return await call_next(request)
40+
41+
42+
@pytest.fixture(scope="function")
def auth_app(request):
    """
    Create a FastAPI app with both InjectClientIDMiddleware and QuotaMiddleware
    for authenticated user tests. Each test gets a unique client_id.

    InjectClientIDMiddleware is the outer layer, so the client ID is already in
    the scope when QuotaMiddleware sees the request.
    """
    inner_app = FastAPI()

    @inner_app.get("/test")
    async def test_endpoint():
        return {"message": "ok"}

    # Derive the ID from the test name so each test counts against its own key.
    client_id = f"client_{request.node.name}"
    app = InjectClientIDMiddleware(
        QuotaMiddleware(
            inner_app,
            kv_config=SqliteKVStoreConfig(db_path="./quotas_test.db"),
            anonymous_max_requests=1,
            authenticated_max_requests=2,
            window_seconds=60,
        ),
        client_id=client_id,
    )
    return app
66+
67+
68+
def test_authenticated_quota_allows_up_to_limit(auth_app):
    """Both requests within the authenticated limit succeed."""
    client = TestClient(auth_app)
    # Authenticated limit is 2
    resp1 = client.get("/test")
    assert resp1.status_code == 200
    resp2 = client.get("/test")
    assert resp2.status_code == 200
75+
76+
77+
def test_authenticated_quota_blocks_after_limit(auth_app):
    """The third request from an authenticated client (limit 2) is rejected with 429."""
    client = TestClient(auth_app)
    # Use up the two allowed requests.
    client.get("/test")
    client.get("/test")
    resp3 = client.get("/test")
    assert resp3.status_code == 429
    assert resp3.json()["error"]["message"] == "Quota exceeded"
84+
85+
86+
def test_anonymous_quota_allows_up_to_limit():
    """
    Confirm anonymous requests use the anonymous_max_requests limit.
    """
    inner_app = FastAPI()

    @inner_app.get("/test")
    async def test_endpoint():
        return {"message": "ok"}

    # No client-ID injection here, so QuotaMiddleware treats the caller as anonymous.
    app = QuotaMiddleware(
        inner_app,
        kv_config=SqliteKVStoreConfig(db_path="./quotas_test.db"),
        anonymous_max_requests=1,
        authenticated_max_requests=2,
        window_seconds=60,
    )

    client = TestClient(app)
    # Anonymous limit is 1
    resp1 = client.get("/test")
    assert resp1.status_code == 200
108+
109+
110+
def test_anonymous_quota_blocks_after_limit():
    """The second request from an anonymous client (limit 1) is rejected with 429."""
    inner_app = FastAPI()

    @inner_app.get("/test")
    async def test_endpoint():
        return {"message": "ok"}

    # No client-ID injection here, so QuotaMiddleware treats the caller as anonymous.
    app = QuotaMiddleware(
        inner_app,
        kv_config=SqliteKVStoreConfig(db_path="./quotas_test.db"),
        anonymous_max_requests=1,
        authenticated_max_requests=2,
        window_seconds=60,
    )

    client = TestClient(app)
    # Use up the single allowed request.
    client.get("/test")
    resp2 = client.get("/test")
    assert resp2.status_code == 429
    assert resp2.json()["error"]["message"] == "Quota exceeded"

0 commit comments

Comments
 (0)