feat(quota): add server‑side per‑client request quotas

liangwen12year · Wen Liang · commit 572571a83d93 · 2025-05-02T18:45:06.000-04:00
Usage without limits can lead to runaway costs and fragmented
client‑side workarounds.  By building a native quota mechanism into the
server, operators gain a single, centrally managed throttle for
per‑client requests—no extra proxies or bespoke client logic required.
This helps contain cloud‑compute expenses, provides fine‑grained control
over usage, and simplifies deployment and monitoring of Llama Stack
services. Quotas remain opt‑in and fully configurable, ensuring zero
impact unless explicitly enabled.

- Add `QuotaMiddleware` (llama_stack/distribution/server/quota.py)
  • Reads `Authorization: Bearer <client_id>`
  • Tracks daily counts in Redis
  • Enforces `quota_requests_per_day` over a `quota_window_seconds` window
  • Returns HTTP 429 when exceeded

- Extend `ServerConfig` with three new fields:
  • quota_redis_url
  • quota_requests_per_day
  • quota_window_seconds

- Wire middleware into server startup (`server.py`) and CLI entrypoint
  (`llama_stack/cli/stack/run.py`), gated on `quota_redis_url`.
- Add CLI flags `--quota-redis-url`, `--quota-requests-per-day`, and
  `--quota-window-seconds` and ensure they override YAML config.
- Leave quotas disabled by default when `quota_redis_url` is unset.

To enable per‑client request quotas, add these three settings under the
`server:` section of your `run.yaml`. Set the `quota_redis_url` to your
Redis connection string to activate per‑client quotas; leave it blank or
omit it to disable quotas. Use `quota_requests_per_day` to define the
maximum number of requests each client may make in the window, and
`quota_window_seconds` to specify the length of that window in seconds
(for example, 86400 for 24 hours).

```
server:
  port: 8321
  quota_redis_url: redis://localhost:6379/0
  quota_requests_per_day: 1000
  quota_window_seconds: 86400
```

Signed-off-by: Wen Liang <wenliang@redhat.com>
diff --git a/docs/source/distributions/building_distro.md b/docs/source/distributions/building_distro.md
@@ -269,10 +269,19 @@ After this step is successful, you should be able to find the built container im
 ### Running your Stack server
 Now, let's start the Llama Stack Distribution Server. You will need the YAML configuration file which was written out at the end by the `llama stack build` step.
 
-```
+```bash
 llama stack run -h
-usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--disable-ipv6] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE] [--tls-certfile TLS_CERTFILE]
+usage: llama stack run [-h]
+                       [--port PORT]
+                       [--image-name IMAGE_NAME]
+                       [--disable-ipv6]
+                       [--env KEY=VALUE]
+                       [--tls-keyfile TLS_KEYFILE]
+                       [--tls-certfile TLS_CERTFILE]
                        [--image-type {conda,container,venv}]
+                       [--quota-redis-url QUOTA_REDIS_URL]
+                       [--quota-requests-per-day QUOTA_REQUESTS_PER_DAY]
+                       [--quota-window-seconds QUOTA_WINDOW_SECONDS]
                        config
 
 Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.
@@ -293,6 +302,12 @@ options:
                         Path to TLS certificate file for HTTPS (default: None)
   --image-type {conda,container,venv}
                         Image Type used during the build. This can be either conda or container or venv. (default: conda)
+  --quota-redis-url QUOTA_REDIS_URL
+                        Redis URL for quota tracking; omit to disable quotas.
+  --quota-requests-per-day QUOTA_REQUESTS_PER_DAY
+                        Max requests each client may make per window (default: 1000).
+  --quota-window-seconds QUOTA_WINDOW_SECONDS
+                        Quota window length in seconds (default: 86400 = 24 h).
 
 ```
 
diff --git a/llama_stack/cli/stack/run.py b/llama_stack/cli/stack/run.py
@@ -75,6 +75,23 @@ def _add_arguments(self):
             help="Image Type used during the build. This can be either conda or container or venv.",
             choices=[e.value for e in ImageType],
         )
+        self.parser.add_argument(
+            "--quota-redis-url",
+            type=str,
+            help="Redis URL for quota tracking (enables quotas)",
+        )
+        self.parser.add_argument(
+            "--quota-requests-per-day",
+            type=int,
+            default=None,
+            help="Max requests per client per day",
+        )
+        self.parser.add_argument(
+            "--quota-window-seconds",
+            type=int,
+            default=None,
+            help="Time window for the daily quota, in seconds",
+        )
 
     # If neither image type nor image name is provided, but at the same time
     # the current environment has conda breadcrumbs, then assume what the user
@@ -144,6 +161,10 @@ def _run_stack_run_cmd(self, args: argparse.Namespace) -> None:
 
             # Build the server args from the current args passed to the CLI
             server_args = argparse.Namespace()
+            # Propagate quota flags into server_main
+            server_args.quota_redis_url = args.quota_redis_url
+            server_args.quota_requests_per_day = args.quota_requests_per_day
+            server_args.quota_window_seconds = args.quota_window_seconds
             for arg in vars(args):
                 # If this is a function, avoid passing it
                 # "args" contains:
diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py
@@ -253,6 +253,20 @@ class ServerConfig(BaseModel):
         default=None,
         description="Authentication configuration for the server",
     )
+    quota_redis_url: str | None = Field(
+        default=None,
+        description="Redis URL for quota tracking (e.g. redis://localhost:6379/0). If unset, quotas are disabled.",
+    )
+    quota_requests_per_day: int = Field(
+        default=1000,
+        description="Default maximum number of requests allowed per client per day",
+        ge=1,
+    )
+    quota_window_seconds: int = Field(
+        default=86400,
+        description="Time window in seconds for the daily quota (default: 24h)",
+        ge=1,
+    )
 
 
 class StackRunConfig(BaseModel):
diff --git a/llama_stack/distribution/server/quota.py b/llama_stack/distribution/server/quota.py
@@ -0,0 +1,78 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import json
+from datetime import datetime, timezone
+
+import redis.asyncio as aioredis
+from starlette.types import ASGIApp, Receive, Scope, Send
+
+from llama_stack.log import get_logger
+
+logger = get_logger(name=__name__, category="quota")
+
+
+class QuotaMiddleware:
+    """
+    ASGI middleware enforcing per-client daily request quotas.
+
+    Expects Authorization: Bearer <client_id> header.
+    Tracks counts in Redis; returns HTTP 429 when limit is exceeded.
+    """
+
+    def __init__(
+        self,
+        app: ASGIApp,
+        redis_url: str = "redis://localhost:6379/0",
+        default_requests_per_day: int = 1000,
+        window_seconds: int = 86400,
+    ):
+        self.app = app
+        self.redis = aioredis.from_url(redis_url, encoding="utf-8", decode_responses=True)
+        self.default_limit = default_requests_per_day
+        self.window = window_seconds
+
+    async def __call__(self, scope: Scope, receive: Receive, send: Send):
+        if scope["type"] == "http":
+            # Extract API key from Authorization header
+            headers = dict(scope.get("headers", []))
+            auth = headers.get(b"authorization", b"").decode()
+            if not auth or not auth.startswith("Bearer "):
+                return await self._send_error(send, 401, "Missing or invalid API key")
+
+            client_id = auth.split("Bearer ", 1)[1].strip()
+            key = f"quota:{client_id}:{datetime.now(timezone.utc).date().isoformat()}"
+
+            try:
+                count = await self.redis.incr(key)
+                if count == 1:
+                    await self.redis.expire(key, self.window)
+            except Exception:
+                logger.exception("Error accessing Redis for quota")
+                return await self._send_error(send, 500, "Quota service error")
+
+            if count > self.default_limit:
+                logger.warning(
+                    "Quota exceeded for client %s: %d/%d",
+                    client_id,
+                    count,
+                    self.default_limit,
+                )
+                return await self._send_error(send, 429, "Quota exceeded")
+
+        # Pass through to downstream app
+        return await self.app(scope, receive, send)
+
+    async def _send_error(self, send: Send, status: int, message: str):
+        await send(
+            {
+                "type": "http.response.start",
+                "status": status,
+                "headers": [[b"content-type", b"application/json"]],
+            }
+        )
+        body = json.dumps({"error": {"message": message}}).encode()
+        await send({"type": "http.response.body", "body": body})
diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py
@@ -57,6 +57,7 @@
 )
 
 from .auth import AuthenticationMiddleware
+from .quota import QuotaMiddleware
 from .endpoints import get_all_api_endpoints
 
 REPO_ROOT = Path(__file__).parent.parent.parent.parent
@@ -401,6 +402,13 @@ def main(args: argparse.Namespace | None = None):
         config = replace_env_vars(config_contents)
         config = StackRunConfig(**config)
 
+        if getattr(args, "quota_redis_url", None):
+            config.server.quota_redis_url = args.quota_redis_url
+        if getattr(args, "quota_requests_per_day", None) is not None:
+            config.server.quota_requests_per_day = args.quota_requests_per_day
+        if getattr(args, "quota_window_seconds", None) is not None:
+            config.server.quota_window_seconds = args.quota_window_seconds
+
     # now that the logger is initialized, print the line about which type of config we are using.
     logger.info(log_line)
 
@@ -422,6 +430,18 @@ def main(args: argparse.Namespace | None = None):
         logger.info(f"Enabling authentication with provider: {config.server.auth.provider_type.value}")
         app.add_middleware(AuthenticationMiddleware, auth_config=config.server.auth)
 
+
+    # Add per-client quota enforcement. The middleware is installed only when
+    # `quota_redis_url` is configured; otherwise quotas stay disabled.
+    if config.server.quota_redis_url:
+        logger.info("Enabling per-client quota middleware")
+        app.add_middleware(
+            QuotaMiddleware,
+            redis_url=config.server.quota_redis_url,
+            default_requests_per_day=config.server.quota_requests_per_day,
+            window_seconds=config.server.quota_window_seconds,
+        )
+
     try:
         impls = asyncio.run(construct_stack(config))
     except InvalidProviderError as e:
diff --git a/llama_stack/templates/dev/run.yaml b/llama_stack/templates/dev/run.yaml
@@ -431,3 +431,6 @@ tool_groups:
   provider_id: rag-runtime
 server:
   port: 8321
+  quota_redis_url: ""
+  quota_requests_per_day: 1000
+  quota_window_seconds: 86400
diff --git a/pyproject.toml b/pyproject.toml
@@ -40,6 +40,7 @@ dependencies = [
     "pillow",
     "h11>=0.16.0",
     "kubernetes",
+    "redis>=4.4.0",
 ]
 
 [project.optional-dependencies]
diff --git a/requirements.txt b/requirements.txt
@@ -2,6 +2,7 @@
 #    uv export --frozen --no-hashes --no-emit-project --output-file=requirements.txt
 annotated-types==0.7.0
 anyio==4.8.0
+async-timeout==5.0.1 ; python_full_version < '3.11.3'
 attrs==25.1.0
 blobfile==3.0.0
 cachetools==5.5.2
@@ -49,6 +50,7 @@ python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
 pytz==2025.1
 pyyaml==6.0.2
+redis==6.0.0
 referencing==0.36.2
 regex==2024.11.6
 requests==2.32.3
diff --git a/tests/unit/server/test_quota.py b/tests/unit/server/test_quota.py
@@ -0,0 +1,96 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+# tests/unit/server/test_quota.py
+
+import pytest
+import redis.asyncio as aioredis
+from fastapi import FastAPI
+from fastapi.testclient import TestClient
+
+from llama_stack.distribution.server.quota import QuotaMiddleware
+
+
+@pytest.fixture(autouse=True)
+def fake_redis(monkeypatch):
+    """
+    Replace aioredis.from_url with a fake in-memory Redis for quota tests.
+    """
+
+    class FakeRedis:
+        def __init__(self):
+            self._store = {}
+
+        async def incr(self, key):
+            v = self._store.get(key, 0) + 1
+            self._store[key] = v
+            return v
+
+        async def expire(self, key, seconds):
+            # no-op TTL for tests
+            return True
+
+    def fake_from_url(url, encoding="utf-8", decode_responses=True):
+        # Return our FakeRedis instance synchronously
+        return FakeRedis()
+
+    monkeypatch.setattr(aioredis, "from_url", fake_from_url)
+
+
+@pytest.fixture
+def app():
+    """
+    Create a FastAPI app with QuotaMiddleware mounted.
+    Use a small limit (2 requests) and short window (60s) for testing.
+    """
+    app = FastAPI()
+    app.add_middleware(
+        QuotaMiddleware,
+        redis_url="redis://localhost:6379/0",
+        default_requests_per_day=2,
+        window_seconds=60,
+    )
+
+    @app.get("/test")
+    def test_endpoint():
+        return {"message": "ok"}
+
+    return app
+
+
+def test_quota_allows_up_to_limit(app):
+    client = TestClient(app)
+    headers = {"Authorization": "Bearer client1"}
+
+    # First two requests should pass
+    resp1 = client.get("/test", headers=headers)
+    assert resp1.status_code == 200
+    assert resp1.json() == {"message": "ok"}
+
+    resp2 = client.get("/test", headers=headers)
+    assert resp2.status_code == 200
+    assert resp2.json() == {"message": "ok"}
+
+
+def test_quota_blocks_after_limit(app):
+    client = TestClient(app)
+    headers = {"Authorization": "Bearer client1"}
+
+    # Exceed the limit: 3rd request should be throttled
+    client.get("/test", headers=headers)
+    client.get("/test", headers=headers)
+    resp3 = client.get("/test", headers=headers)
+    assert resp3.status_code == 429
+    assert resp3.json()["error"]["message"] == "Quota exceeded"
+
+
+def test_missing_auth_header_returns_401(app):
+    client = TestClient(app)
+
+    # No Authorization header → 401
+    resp = client.get("/test")
+    assert resp.status_code == 401
+    assert "Missing or invalid API key" in resp.json()["error"]["message"]
diff --git a/uv.lock b/uv.lock

@@ -40,6 +40,7 @@ dependencies = [
     "pillow",
     "h11>=0.16.0",
     "kubernetes",
+    "redis>=4.4.0",
 ]
 
 [project.optional-dependencies]