feat(monitoring): add llm oriented metrics to grafana dashboard (#768)

leoguillaume · web-flow · commit 525e3fe390b2 · 2026-03-03T18:33:02.000+01:00
* feat(monitoring): add llm oriented metrics to grafana dashboard

* chore(monitoring): move metrics definition to utils file

* Update unit coverage badge

* chore(monitoring): lint code

---------

Co-authored-by: leoguillaume <leoguillaume@users.noreply.github.com>
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
diff --git a/.github/badges/coverage.json b/.github/badges/coverage.json
@@ -1 +1 @@
-{"schemaVersion":1,"label":"coverage","message":"50.19%","color":"red"}
+{"schemaVersion":1,"label":"coverage","message":"49.58%","color":"red"}
diff --git a/api/clients/model/_basemodelprovider.py b/api/clients/model/_basemodelprovider.py
@@ -113,7 +113,6 @@ def _get_usage(self, request_content: RequestContent, response_data: dict | list
         tokenizer = getattr(global_context, "tokenizer", None)
         if tokenizer and request_content.endpoint in tokenizer.USAGE_ENDPOINTS:
             try:
-                completion_tokens = 0
                 prompt_tokens = tokenizer.get_prompt_tokens(endpoint=request_content.endpoint, body=request_content.json)
                 completion_tokens = tokenizer.get_completion_tokens(endpoint=request_content.endpoint, response_data=response_data)
                 total_tokens = prompt_tokens + completion_tokens
@@ -149,10 +148,10 @@ def _format_request(self, request_content: RequestContent) -> RequestContent:
         Format a request to a provider model. This method can be overridden by a subclass to add additional headers or parameters. This method format the requested endpoint thanks the ENDPOINT_TABLE attribute.
 
         Args:
-            content(RequestContent): The request content to format.
+            request_content(RequestContent): The request content to format.
 
         Returns:
-            content(RequestContent): The formatted request content.
+            request_content(RequestContent): The formatted request content.
         """
         if "model" in request_content.json:
             request_content.json["model"] = self.model_name
@@ -221,7 +220,8 @@ def _format_response(self, request_content: RequestContent, response: httpx.Resp
 
         return response
 
-    async def _ensure_timeseries_exists(self, redis_client: AsyncRedis, key: str) -> None:
+    @staticmethod
+    async def _ensure_timeseries_exists(redis_client: AsyncRedis, key: str) -> None:
         """
         Ensure a time series exists with proper retention configuration.
 
diff --git a/api/endpoints/monitoring.py b/api/endpoints/monitoring.py
@@ -3,24 +3,43 @@
 from fastapi import Depends, FastAPI
 import prometheus_client
 from prometheus_client import CollectorRegistry, multiprocess
-from prometheus_fastapi_instrumentator import Instrumentator
+from prometheus_fastapi_instrumentator import Instrumentator, metrics
 from starlette.responses import Response
 
 from api.helpers._accesscontroller import AccessController
 from api.schemas.admin.roles import PermissionType
+from api.utils.monitoring import (
+    inference_output_tokens_per_second,
+    inference_requests_duration_seconds,
+    inference_requests_total,
+    inference_tokens_total,
+    inference_ttft_milliseconds,
+)
 from api.utils.variables import RouterName
 
 
-def setup_prometheus(app: FastAPI, include_in_schema: bool = True) -> None:
-    app.instrumentator = Instrumentator().instrument(app=app)
+def setup_prometheus(app: FastAPI, metric_namespace: str = "ogl", include_in_schema: bool = True) -> None:
+    app.instrumentator = (
+        Instrumentator()
+        .instrument(app=app)
+        .add(
+            metrics.default(metric_namespace=metric_namespace),
+            inference_output_tokens_per_second(metric_namespace=metric_namespace),
+            inference_requests_total(metric_namespace=metric_namespace),
+            inference_requests_duration_seconds(metric_namespace=metric_namespace),
+            inference_ttft_milliseconds(metric_namespace=metric_namespace),
+            inference_tokens_total(metric_namespace=metric_namespace),
+        )
+        .expose(app)
+    )
 
     @app.get(
         path="/metrics",
         tags=[RouterName.MONITORING.title()],
         dependencies=[Depends(dependency=AccessController(permissions=[PermissionType.READ_METRIC]))],
         include_in_schema=include_in_schema,
     )
-    def metrics() -> Response:
+    def get_metrics() -> Response:
         if os.environ.get("PROMETHEUS_MULTIPROC_DIR"):
             registry = CollectorRegistry()
             multiprocess.MultiProcessCollector(registry)
diff --git a/api/helpers/load_balancing/_leastbusyloadbalancingstrategy.py b/api/helpers/load_balancing/_leastbusyloadbalancingstrategy.py
@@ -20,7 +20,6 @@ def __init__(self, redis_client: AsyncRedis | Redis, load_balancing_metric: Metr
         Get a provider to handle the request based on the specified routing strategy.
 
         Args:
-            candidates (list[int]): The list of provider candidates (provider IDs) to choose from
             redis_client (AsyncRedis): Redis client instance, required for least busy strategy
             load_balancing_metric (Metric): The type of metric to use for performance evaluation
 
diff --git a/api/utils/hooks_decorator.py b/api/utils/hooks_decorator.py
@@ -189,7 +189,7 @@ async def update_budget(usage: Usage):
                 # Update the budget
                 update_stmt = update(User).where(User.id == user_id).values(budget=new_budget, updated=func.now()).returning(User.budget)
 
-                result = await postgres_session.execute(update_stmt)
+                await postgres_session.execute(update_stmt)
 
         except Exception as e:
             logger.exception(f"Failed to update budget for user {user_id}: {e}")
diff --git a/api/utils/monitoring.py b/api/utils/monitoring.py
@@ -0,0 +1,206 @@
+from collections.abc import Callable
+
+from prometheus_client import Counter, Histogram
+from prometheus_fastapi_instrumentator.metrics import Info
+
+from api.utils.context import request_context
+
+
+def _build_metric_name(namespace: str, name: str) -> str:
+    return f"{namespace}_{name}" if namespace else name
+
+
+def inference_requests_total(metric_namespace: str = "") -> Callable[[Info], None]:
+    metric_name = _build_metric_name(metric_namespace, "inference_requests_total")
+    metric = Counter(
+        metric_name,
+        "Total number of LLM requests.",
+        labelnames=("endpoint", "model", "status_code"),
+    )
+
+    def instrumentation(info: Info) -> None:
+        try:
+            context = request_context.get()
+            model = context.router_name
+            endpoint = context.endpoint
+            if model and endpoint:
+                metric.labels(endpoint=endpoint, model=model, status_code=info.modified_status).inc()
+        except Exception:
+            pass
+
+    return instrumentation
+
+
+def inference_requests_duration_seconds(metric_namespace: str = "") -> Callable[[Info], None]:
+    metric_name = _build_metric_name(metric_namespace, "inference_requests_duration_seconds")
+    metric = Histogram(
+        metric_name,
+        "Duration of LLM requests in seconds.",
+        labelnames=("endpoint", "model", "status_code"),
+        buckets=(
+            0.05,
+            0.1,
+            0.2,
+            0.3,
+            0.4,
+            0.5,
+            0.75,
+            1,
+            1.5,
+            2,
+            2.5,
+            3,
+            3.5,
+            4,
+            4.5,
+            5,
+            6,
+            7,
+            8,
+            9,
+            10,
+            15,
+            20,
+            25,
+            30,
+            45,
+            60,
+            75,
+            90,
+            105,
+            120,
+            150,
+            180,
+            210,
+            240,
+            270,
+            300,
+        ),
+    )
+
+    def instrumentation(info: Info) -> None:
+        try:
+            context = request_context.get()
+            model = context.router_name
+            endpoint = context.endpoint
+            latency = context.latency
+            if model and endpoint and latency is not None:
+                metric.labels(
+                    endpoint=endpoint,
+                    model=model,
+                    status_code=info.modified_status,
+                ).observe(latency / 1000)
+        except Exception:
+            pass
+
+    return instrumentation
+
+
+def inference_ttft_milliseconds(metric_namespace: str = "") -> Callable[[Info], None]:
+    metric_name = _build_metric_name(metric_namespace, "inference_ttft_milliseconds")
+    metric = Histogram(
+        metric_name,
+        "Time to first token for streaming LLM responses in milliseconds.",
+        labelnames=("endpoint", "model", "status_code"),
+        buckets=(
+            5,
+            10,
+            20,
+            30,
+            50,
+            75,
+            100,
+            150,
+            200,
+            300,
+            500,
+            750,
+            1000,
+            1500,
+            2000,
+            3000,
+            5000,
+            7500,
+            10000,
+            15000,
+            20000,
+            25000,
+            30000,
+            45000,
+            60000,
+            75000,
+            90000,
+            105000,
+            120000,
+            135000,
+            150000,
+            165000,
+            180000,
+            210000,
+            240000,
+            270000,
+            300000,
+        ),
+    )
+
+    def instrumentation(info: Info) -> None:
+        try:
+            context = request_context.get()
+            model = context.router_name
+            endpoint = context.endpoint
+            ttft = context.ttft
+            if model and endpoint and ttft is not None:
+                metric.labels(endpoint=endpoint, model=model, status_code=info.modified_status).observe(ttft)
+        except Exception:
+            pass
+
+    return instrumentation
+
+
+def inference_output_tokens_per_second(metric_namespace: str = "") -> Callable[[Info], None]:
+    metric_name = _build_metric_name(metric_namespace, "inference_output_tokens_per_second")
+    metric = Histogram(
+        metric_name,
+        "Output generation speed in tokens per second (completion tokens / request duration, TTFT included).",
+        labelnames=("endpoint", "model"),
+        buckets=(5, 10, 20, 30, 50, 75, 85, 90, 95, 100, 105, 110, 115, 125, 150, 175, 200, 250, 300, 400, 500, 750, 1000),
+    )
+
+    def instrumentation(info: Info) -> None:
+        try:
+            context = request_context.get()
+            model = context.router_name
+            endpoint = context.endpoint
+            usage = context.usage
+            latency = context.latency
+            if model and endpoint and usage and latency and usage.completion_tokens:
+                metric.labels(endpoint=endpoint, model=model).observe(usage.completion_tokens / (latency / 1000))
+        except Exception:
+            pass
+
+    return instrumentation
+
+
+def inference_tokens_total(metric_namespace: str = "") -> Callable[[Info], None]:
+    metric_name = _build_metric_name(metric_namespace, "inference_tokens_total")
+    metric = Counter(
+        metric_name,
+        "Total number of tokens consumed (prompt and completion).",
+        labelnames=("endpoint", "model", "type"),
+    )
+
+    def instrumentation(info: Info) -> None:
+        try:
+            context = request_context.get()
+            model = context.router_name
+            endpoint = context.endpoint
+            usage = context.usage
+            if model and endpoint and usage is not None:
+                if usage.prompt_tokens:
+                    metric.labels(endpoint=endpoint, model=model, type="prompt").inc(usage.prompt_tokens)
+                if usage.completion_tokens:
+                    metric.labels(endpoint=endpoint, model=model, type="completion").inc(usage.completion_tokens)
+        except Exception:
+            pass
+
+    return instrumentation

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-{"schemaVersion":1,"label":"coverage","message":"50.19%","color":"red"}`
	`1`	`+{"schemaVersion":1,"label":"coverage","message":"49.58%","color":"red"}`