Skip to content
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from fastapi import FastAPI, Request
import sentry_sdk
from starlette.middleware.sessions import SessionMiddleware
from starlette.responses import JSONResponse

from api.endpoints.monitoring import setup_prometheus
from api.schemas.core.context import RequestContext
Expand Down Expand Up @@ -89,7 +88,3 @@ def _setup_monitoring(app: FastAPI, configuration: Configuration) -> None:

if configuration.settings.monitoring_prometheus_enabled:
setup_prometheus(app, include_in_schema=include_in_schema)

@app.get(path="/health", tags=[RouterName.MONITORING.title()], include_in_schema=include_in_schema)
def health() -> JSONResponse:
return JSONResponse(content={"status": "ok"}, status_code=200)
6 changes: 3 additions & 3 deletions api/clients/model/_basemodelprovider.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,6 @@ def _get_usage(self, request_content: RequestContent, response_data: dict | list
tokenizer = getattr(global_context, "tokenizer", None)
if tokenizer and request_content.endpoint in tokenizer.USAGE_ENDPOINTS:
try:
completion_tokens = 0
prompt_tokens = tokenizer.get_prompt_tokens(endpoint=request_content.endpoint, body=request_content.json)
completion_tokens = tokenizer.get_completion_tokens(endpoint=request_content.endpoint, response_data=response_data)
total_tokens = prompt_tokens + completion_tokens
Expand Down Expand Up @@ -154,7 +153,7 @@ def _format_request(self, request_content: RequestContent) -> RequestContent:
Format a request to a provider model. This method can be overridden by a subclass to add additional headers or parameters. It formats the requested endpoint using the ENDPOINT_TABLE attribute.

Args:
content(RequestContent): The request content to format.
request_content(RequestContent): The request content to format.

Returns:
content(RequestContent): The formatted request content.
Expand Down Expand Up @@ -226,7 +225,8 @@ def _format_response(self, request_content: RequestContent, response: httpx.Resp

return response

async def _ensure_timeseries_exists(self, redis_client: AsyncRedis, key: str) -> None:
@staticmethod
async def _ensure_timeseries_exists(redis_client: AsyncRedis, key: str) -> None:
"""
Ensure a time series exists with proper retention configuration.

Expand Down
11 changes: 11 additions & 0 deletions api/endpoints/health.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from fastapi import APIRouter
from starlette.responses import JSONResponse

from api.utils.variables import RouterName

router = APIRouter(tags=[RouterName.HEALTH.title()])


@router.get(path="/health")
def health() -> JSONResponse:
    """Liveness probe endpoint.

    Returns:
        JSONResponse: HTTP 200 with the JSON body ``{"status": "ok"}``.
    """
    # Static payload: the handler performs no checks of its own; it only
    # reports that the application is up and able to answer requests.
    payload = {"status": "ok"}
    return JSONResponse(content=payload, status_code=200)
15 changes: 14 additions & 1 deletion api/endpoints/monitoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,25 @@
from starlette.responses import Response

from api.helpers._accesscontroller import AccessController
from api.helpers._metricsmiddleware import (
inference_requests_duration_seconds,
inference_requests_total,
inference_tokens_total,
inference_ttft_milliseconds,
)
from api.schemas.admin.roles import PermissionType
from api.utils.variables import RouterName


def setup_prometheus(app: FastAPI, include_in_schema: bool = True) -> None:
app.instrumentator = Instrumentator().instrument(app=app)
app.instrumentator = (
Instrumentator()
.instrument(app=app)
.add(inference_requests_total())
.add(inference_requests_duration_seconds())
.add(inference_ttft_milliseconds())
.add(inference_tokens_total())
)

@app.get(
path="/metrics",
Expand Down
104 changes: 104 additions & 0 deletions api/helpers/_metricsmiddleware.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from collections.abc import Callable

from prometheus_client import Counter, Histogram
from prometheus_fastapi_instrumentator.metrics import Info

from api.utils.context import request_context


def inference_requests_total() -> Callable[[Info], None]:
metric = Counter(
"inference_requests_total",
"Total number of LLM requests.",
labelnames=("endpoint", "model", "status_code"),
)

def instrumentation(info: Info) -> None:
try:
context = request_context.get()
model = context.router_name
endpoint = context.endpoint
if model and endpoint:
metric.labels(
endpoint=endpoint,
model=model,
status_code=info.modified_status,
).inc()
except Exception:
Comment thread Fixed

Check notice

Code scanning / CodeQL

Empty except Note

'except' clause does nothing but pass and there is no explanatory comment.

Copilot Autofix

AI 3 months ago

In general, empty except blocks should be replaced with handling that either (a) narrows the exception type and/or (b) logs the error and, if appropriate, re-raises or returns a safe default. For metrics middleware, we typically want to ensure that exceptions in metrics code never interfere with request processing, but we should still log them so they can be diagnosed.

The best fix here is to keep the try/except around the metric updates, but replace the except Exception: pass blocks with a handler that logs the exception, scoped clearly as a metrics failure. Since this is FastAPI/Prometheus code, using the standard library logging module is appropriate and doesn’t introduce external dependencies. We’ll add a module-level logger (e.g. logger = logging.getLogger(__name__)) and in each except Exception: block call logger.exception(...) with a short message explaining which metric failed. This preserves the existing behavior of not raising beyond the instrumentation function while eliminating the silent failure.

Concretely:

  • In api/helpers/_metricsmiddleware.py, add import logging and a logger = logging.getLogger(__name__) definition near the top.
  • In inference_requests_total.instrumentation, replace the except Exception: pass with except Exception: logger.exception("Failed to record inference_requests_total metric").
  • In inference_requests_duration_seconds.instrumentation, replace similarly with a message like "Failed to record inference_requests_duration_seconds metric".
  • In inference_output_tokens_per_second.instrumentation, replace with "Failed to record inference_output_tokens_per_second metric".

No new methods are needed beyond the logger definition; no change in function signatures or existing metric logic is required.

Suggested changeset 1
api/helpers/_metricsmiddleware.py

Autofix patch

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/api/helpers/_metricsmiddleware.py b/api/helpers/_metricsmiddleware.py
--- a/api/helpers/_metricsmiddleware.py
+++ b/api/helpers/_metricsmiddleware.py
@@ -1,11 +1,14 @@
 from collections.abc import Callable
+import logging
 
 from prometheus_client import Counter, Histogram
 from prometheus_fastapi_instrumentator.metrics import Info
 
 from api.utils.context import request_context
 
+logger = logging.getLogger(__name__)
 
+
 def _build_metric_name(namespace: str, name: str) -> str:
     return f"{namespace}_{name}" if namespace else name
 
@@ -30,7 +26,7 @@
                     status_code=info.modified_status,
                 ).inc()
         except Exception:
-            pass
+            logger.exception("Failed to record inference_requests_total metric")
 
     return instrumentation
 
@@ -95,7 +91,7 @@
                     status_code=info.modified_status,
                 ).observe(latency / 1000)
         except Exception:
-            pass
+            logger.exception("Failed to record inference_requests_duration_seconds metric")
 
     return instrumentation
 
@@ -184,7 +180,7 @@
             if model and endpoint and usage and latency and usage.completion_tokens:
                 metric.labels(endpoint=endpoint, model=model).observe(usage.completion_tokens / (latency / 1000))
         except Exception:
-            pass
+            logger.exception("Failed to record inference_output_tokens_per_second metric")
 
     return instrumentation
 
EOF
@@ -1,11 +1,14 @@
from collections.abc import Callable
import logging

from prometheus_client import Counter, Histogram
from prometheus_fastapi_instrumentator.metrics import Info

from api.utils.context import request_context

logger = logging.getLogger(__name__)


def _build_metric_name(namespace: str, name: str) -> str:
return f"{namespace}_{name}" if namespace else name

@@ -30,7 +26,7 @@
status_code=info.modified_status,
).inc()
except Exception:
pass
logger.exception("Failed to record inference_requests_total metric")

return instrumentation

@@ -95,7 +91,7 @@
status_code=info.modified_status,
).observe(latency / 1000)
except Exception:
pass
logger.exception("Failed to record inference_requests_duration_seconds metric")

return instrumentation

@@ -184,7 +180,7 @@
if model and endpoint and usage and latency and usage.completion_tokens:
metric.labels(endpoint=endpoint, model=model).observe(usage.completion_tokens / (latency / 1000))
except Exception:
pass
logger.exception("Failed to record inference_output_tokens_per_second metric")

return instrumentation

Copilot is powered by AI and may make mistakes. Always verify output.
pass

return instrumentation


def inference_requests_duration_seconds() -> Callable[[Info], None]:
metric = Histogram(
"inference_requests_duration_seconds",
"Duration of LLM requests in seconds.",
labelnames=("endpoint", "model", "status_code"),
)

def instrumentation(info: Info) -> None:
try:
context = request_context.get()
model = context.router_name
endpoint = context.endpoint
latency = context.latency
if model and endpoint and latency is not None:
metric.labels(
endpoint=endpoint,
model=model,
status_code=info.modified_status,
).observe(latency / 1000)
except Exception:
Comment thread Fixed

Check notice

Code scanning / CodeQL

Empty except Note

'except' clause does nothing but pass and there is no explanatory comment.

Copilot Autofix

AI 3 months ago

General approach: keep the “do not break the request due to metrics failures” behavior, but avoid completely silent exception handling. Add a brief comment stating that errors in metrics should not affect the main flow and log the exception in a non-intrusive way (e.g., via the standard logging module).

Concrete fix:

  • In api/helpers/_metricsmiddleware.py, add an import for the standard-library logging module.
  • Replace the two except Exception: pass blocks inside:
    • inference_requests_duration_seconds(...).instrumentation
    • inference_ttft_milliseconds(...).instrumentation
  • With except Exception: blocks that:
    • include a short comment explaining that metrics errors are intentionally ignored for request safety, and
    • log the exception with logging.getLogger(__name__).exception(...), e.g. logging.getLogger(__name__).exception("Failed to record inference request duration metric").

This preserves existing functionality (no exception propagates to the caller), but prevents completely silent failures and documents the intent.

Specific locations:

  • Add import logging near the top of api/helpers/_metricsmiddleware.py.
  • Modify lines 97–98 and 162–163 accordingly.

No additional non-standard dependencies are needed; logging is from the Python standard library.


Suggested changeset 1
api/helpers/_metricsmiddleware.py

Autofix patch

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/api/helpers/_metricsmiddleware.py b/api/helpers/_metricsmiddleware.py
--- a/api/helpers/_metricsmiddleware.py
+++ b/api/helpers/_metricsmiddleware.py
@@ -4,6 +4,7 @@
 from prometheus_fastapi_instrumentator.metrics import Info
 
 from api.utils.context import request_context
+import logging
 
 
 def _build_metric_name(namespace: str, name: str) -> str:
@@ -95,7 +96,10 @@
                     status_code=info.modified_status,
                 ).observe(latency / 1000)
         except Exception:
-            pass
+            # Metrics collection must not interfere with request handling; log and continue.
+            logging.getLogger(__name__).exception(
+                "Failed to record inference request duration metric"
+            )
 
     return instrumentation
 
@@ -160,7 +164,10 @@
                     status_code=info.modified_status,
                 ).observe(ttft)
         except Exception:
-            pass
+            # Metrics collection must not interfere with request handling; log and continue.
+            logging.getLogger(__name__).exception(
+                "Failed to record inference TTFT metric"
+            )
 
     return instrumentation
 
EOF
@@ -4,6 +4,7 @@
from prometheus_fastapi_instrumentator.metrics import Info

from api.utils.context import request_context
import logging


def _build_metric_name(namespace: str, name: str) -> str:
@@ -95,7 +96,10 @@
status_code=info.modified_status,
).observe(latency / 1000)
except Exception:
pass
# Metrics collection must not interfere with request handling; log and continue.
logging.getLogger(__name__).exception(
"Failed to record inference request duration metric"
)

return instrumentation

@@ -160,7 +164,10 @@
status_code=info.modified_status,
).observe(ttft)
except Exception:
pass
# Metrics collection must not interfere with request handling; log and continue.
logging.getLogger(__name__).exception(
"Failed to record inference TTFT metric"
)

return instrumentation

Copilot is powered by AI and may make mistakes. Always verify output.
pass

return instrumentation


def inference_ttft_milliseconds() -> Callable[[Info], None]:
metric = Histogram(
"inference_ttft_milliseconds",
"Time to first token for streaming LLM responses in milliseconds.",
labelnames=("endpoint", "model", "status_code"),
)

def instrumentation(info: Info) -> None:
try:
context = request_context.get()
model = context.router_name
endpoint = context.endpoint
ttft = context.ttft
if model and endpoint and ttft is not None:
metric.labels(
endpoint=endpoint,
model=model,
status_code=info.modified_status,
).observe(ttft)
except Exception:
Comment thread Fixed
Comment thread Fixed

Check notice

Code scanning / CodeQL

Empty except Note

'except' clause does nothing but pass and there is no explanatory comment.

Copilot Autofix

AI 3 months ago

To fix the problem, keep the broad except Exception to protect the main request handling from metric failures, but replace the empty body with minimal logging that records the error. This preserves existing behavior (exceptions are not re-raised) while avoiding silent failure. Since we must not change existing imports except to add well-known libraries, the least intrusive approach is to use the standard-library logging module.

Concretely, in api/helpers/_metricsmiddleware.py:

  1. Add import logging near the top of the file alongside the existing imports.

  2. In each instrumentation inner function that currently has:

    except Exception:
        pass

    replace it with a logging call, for example:

    except Exception:
        logging.getLogger(__name__).exception(
            "Error while recording %s metric", "<metric_name>"
        )

    where <metric_name> is a short identifier like "inference_requests_total", "inference_ttft_milliseconds", or "inference_tokens_total" corresponding to the function.

This way, any unexpected issues in metric collection are visible in logs, but they still do not interfere with normal request processing. No additional methods or helper functions are required beyond the standard logging import.

Suggested changeset 1
api/helpers/_metricsmiddleware.py

Autofix patch

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/api/helpers/_metricsmiddleware.py b/api/helpers/_metricsmiddleware.py
--- a/api/helpers/_metricsmiddleware.py
+++ b/api/helpers/_metricsmiddleware.py
@@ -4,6 +4,7 @@
 from prometheus_fastapi_instrumentator.metrics import Info
 
 from api.utils.context import request_context
+import logging
 
 
 def _build_metric_name(namespace: str, name: str) -> str:
@@ -30,7 +31,9 @@
                     status_code=info.modified_status,
                 ).inc()
         except Exception:
-            pass
+            logging.getLogger(__name__).exception(
+                "Error while recording inference_requests_total metric"
+            )
 
     return instrumentation
 
@@ -160,7 +163,9 @@
                     status_code=info.modified_status,
                 ).observe(ttft)
         except Exception:
-            pass
+            logging.getLogger(__name__).exception(
+                "Error while recording inference_ttft_milliseconds metric"
+            )
 
     return instrumentation
 
@@ -209,6 +214,8 @@
                 if usage.completion_tokens:
                     metric.labels(endpoint=endpoint, model=model, type="completion").inc(usage.completion_tokens)
         except Exception:
-            pass
+            logging.getLogger(__name__).exception(
+                "Error while recording inference_tokens_total metric"
+            )
 
     return instrumentation
EOF
@@ -4,6 +4,7 @@
from prometheus_fastapi_instrumentator.metrics import Info

from api.utils.context import request_context
import logging


def _build_metric_name(namespace: str, name: str) -> str:
@@ -30,7 +31,9 @@
status_code=info.modified_status,
).inc()
except Exception:
pass
logging.getLogger(__name__).exception(
"Error while recording inference_requests_total metric"
)

return instrumentation

@@ -160,7 +163,9 @@
status_code=info.modified_status,
).observe(ttft)
except Exception:
pass
logging.getLogger(__name__).exception(
"Error while recording inference_ttft_milliseconds metric"
)

return instrumentation

@@ -209,6 +214,8 @@
if usage.completion_tokens:
metric.labels(endpoint=endpoint, model=model, type="completion").inc(usage.completion_tokens)
except Exception:
pass
logging.getLogger(__name__).exception(
"Error while recording inference_tokens_total metric"
)

return instrumentation
Copilot is powered by AI and may make mistakes. Always verify output.
pass

return instrumentation


def inference_tokens_total() -> Callable[[Info], None]:
metric = Counter(
"inference_tokens_total",
"Total number of tokens consumed (prompt and completion).",
labelnames=("endpoint", "model", "type"),
)

def instrumentation(info: Info) -> None:
try:
context = request_context.get()
model = context.router_name
endpoint = context.endpoint
usage = context.usage
if model and endpoint and usage is not None:
if usage.prompt_tokens:
metric.labels(endpoint=endpoint, model=model, type="prompt").inc(usage.prompt_tokens)
if usage.completion_tokens:
metric.labels(endpoint=endpoint, model=model, type="completion").inc(usage.completion_tokens)
except Exception:

Check notice

Code scanning / CodeQL

Empty except Note

'except' clause does nothing but pass and there is no explanatory comment.

Copilot Autofix

AI 3 months ago

In general, the fix is to stop silently swallowing all exceptions. For non-critical metrics code, the usual pattern is: keep the broad except Exception (so metrics never break requests) but add lightweight logging in the handler so that failures are visible. This maintains existing behavior (no exception propagation) but avoids losing information.

The best fix here is to:

  • Keep the try/except Exception: structure so that metrics failures never affect the main application.
  • In each except block, call a logger to record the exception with context (e.g., which instrumentation function failed).
  • Reuse a single module-level logger (using Python’s standard logging module) so that the rest of the system can route these logs appropriately.

Concretely in api/helpers/_metricsmiddleware.py:

  • Add import logging at the top and define logger = logging.getLogger(__name__) after the imports.
  • For each of the four instrumentation functions shown, replace except Exception:\n pass with except Exception:\n logger.exception("..."), using a message that identifies the specific metric (e.g., "Error recording inference_requests_total metric"). This keeps external behavior the same (no raised exceptions), but ensures errors are visible.

No additional third‑party dependencies are needed; we use Python’s built‑in logging module.

Suggested changeset 1
api/helpers/_metricsmiddleware.py

Autofix patch

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/api/helpers/_metricsmiddleware.py b/api/helpers/_metricsmiddleware.py
--- a/api/helpers/_metricsmiddleware.py
+++ b/api/helpers/_metricsmiddleware.py
@@ -1,11 +1,14 @@
 from collections.abc import Callable
 
+import logging
 from prometheus_client import Counter, Histogram
 from prometheus_fastapi_instrumentator.metrics import Info
 
 from api.utils.context import request_context
 
+logger = logging.getLogger(__name__)
 
+
 def _build_metric_name(namespace: str, name: str) -> str:
     return f"{namespace}_{name}" if namespace else name
 
@@ -30,7 +27,7 @@
                     status_code=info.modified_status,
                 ).inc()
         except Exception:
-            pass
+            logger.exception("Error recording inference_requests_total metric")
 
     return instrumentation
 
@@ -160,7 +157,7 @@
                     status_code=info.modified_status,
                 ).observe(ttft)
         except Exception:
-            pass
+            logger.exception("Error recording inference_ttft_milliseconds metric")
 
     return instrumentation
 
@@ -184,7 +181,7 @@
             if model and endpoint and usage and latency and usage.completion_tokens:
                 metric.labels(endpoint=endpoint, model=model).observe(usage.completion_tokens / (latency / 1000))
         except Exception:
-            pass
+            logger.exception("Error recording inference_output_tokens_per_second metric")
 
     return instrumentation
 
@@ -209,6 +206,6 @@
                 if usage.completion_tokens:
                     metric.labels(endpoint=endpoint, model=model, type="completion").inc(usage.completion_tokens)
         except Exception:
-            pass
+            logger.exception("Error recording inference_tokens_total metric")
 
     return instrumentation
EOF
@@ -1,11 +1,14 @@
from collections.abc import Callable

import logging
from prometheus_client import Counter, Histogram
from prometheus_fastapi_instrumentator.metrics import Info

from api.utils.context import request_context

logger = logging.getLogger(__name__)


def _build_metric_name(namespace: str, name: str) -> str:
return f"{namespace}_{name}" if namespace else name

@@ -30,7 +27,7 @@
status_code=info.modified_status,
).inc()
except Exception:
pass
logger.exception("Error recording inference_requests_total metric")

return instrumentation

@@ -160,7 +157,7 @@
status_code=info.modified_status,
).observe(ttft)
except Exception:
pass
logger.exception("Error recording inference_ttft_milliseconds metric")

return instrumentation

@@ -184,7 +181,7 @@
if model and endpoint and usage and latency and usage.completion_tokens:
metric.labels(endpoint=endpoint, model=model).observe(usage.completion_tokens / (latency / 1000))
except Exception:
pass
logger.exception("Error recording inference_output_tokens_per_second metric")

return instrumentation

@@ -209,6 +206,6 @@
if usage.completion_tokens:
metric.labels(endpoint=endpoint, model=model, type="completion").inc(usage.completion_tokens)
except Exception:
pass
logger.exception("Error recording inference_tokens_total metric")

return instrumentation
Copilot is powered by AI and may make mistakes. Always verify output.
pass

return instrumentation
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ def __init__(self, redis_client: AsyncRedis | Redis, load_balancing_metric: Metr
Get a provider to handle the request based on the specified routing strategy.

Args:
candidates (list[int]): The list of provider candidates (provider IDs) to choose from
redis_client (AsyncRedis): Redis client instance, required for least busy strategy
load_balancing_metric (Metric): The type of metric to use for performance evaluation

Expand Down
2 changes: 1 addition & 1 deletion api/utils/hooks_decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ async def update_budget(usage: Usage):
# Update the budget
update_stmt = update(User).where(User.id == user_id).values(budget=new_budget, updated=func.now()).returning(User.budget)

result = await postgres_session.execute(update_stmt)
await postgres_session.execute(update_stmt)

except Exception as e:
logger.exception(f"Failed to update budget for user {user_id}: {e}")
Expand Down
1 change: 1 addition & 0 deletions api/utils/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class RouterName(StrEnum):
COLLECTIONS = ("collections", "api.endpoints.collections")
DOCUMENTS = ("documents", "api.endpoints.documents")
EMBEDDINGS = ("embeddings", "api.endpoints.embeddings")
HEALTH = ("health", "api.endpoints.health")
ME = ("me", "api.endpoints.me")
MODELS = ("models", "api.infrastructure.fastapi.endpoints.models")
MONITORING = ("monitoring", None)
Expand Down
Loading