Commit 521cb0e
[Serve] Add replica utilization metric for Ray Serve (ray-project#60758)
### Summary

Adds a new `ray_serve_replica_utilization_percent` Gauge metric that measures what percentage of a replica's capacity is being used over a rolling time window. This metric is useful for capacity planning and for identifying underutilized or overloaded replicas.

**Formula:** `(total_user_code_execution_time / (window_duration × max_ongoing_requests)) × 100`

### Changes

- **New metric**: `ray_serve_replica_utilization_percent` with tags `deployment`, `replica`, `application`.
- **New module**: `rolling_window_accumulator.py` - a lock-free, thread-safe rolling window implementation using thread-local storage for minimal overhead on the request hot path (~0.5µs per `add()`).
- **Configurable** via environment variables:
  - `RAY_SERVE_REPLICA_UTILIZATION_WINDOW_S` (default: 600s)
  - `RAY_SERVE_REPLICA_UTILIZATION_REPORT_INTERVAL_S` (default: 10s)
  - `RAY_SERVE_REPLICA_UTILIZATION_NUM_BUCKETS` (default: 60)

### Test plan

- [x] Unit tests for `RollingWindowAccumulator` (32 tests covering single-threaded, multi-threaded, edge cases, and thread isolation)
- [x] End-to-end integration test in `test_metrics.py`
- [x] Benchmark confirms sub-microsecond overhead with no degradation under concurrent load

<img width="285" height="168" alt="image" src="https://github.com/user-attachments/assets/ff59d071-56cc-4aab-8547-22b9780a318e" />

### Related issue

Closes ray-project#60755

---------

Signed-off-by: abrar <abrar@anyscale.com>
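For a concrete sense of the formula, here is a small standalone sketch. The function name and the numbers are illustrative only, not part of the PR:

```python
def replica_utilization_percent(
    total_user_code_time_ms: float,
    window_duration_s: float,
    max_ongoing_requests: int,
) -> float:
    """(total_user_code_execution_time / (window_duration * max_ongoing_requests)) * 100."""
    max_capacity_ms = window_duration_s * 1000 * max_ongoing_requests
    if max_capacity_ms <= 0:
        return 0.0
    # Cap at 100%: heavily overlapping requests can push raw utilization above capacity.
    return min(total_user_code_time_ms / max_capacity_ms * 100, 100.0)


# A replica with max_ongoing_requests=5 that executed 150s of user code over a
# 600s window sits at roughly 5% utilization: 150_000 / (600_000 * 5) * 100.
print(replica_utilization_percent(150_000, 600, 5))
```

Note that the result is a fraction of *theoretical* capacity, so a replica serving mostly I/O-bound requests can look underutilized even while fully loaded.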
1 parent aec74f0 commit 521cb0e

File tree

7 files changed: +1029, -0 lines changed

doc/source/serve/monitoring.md

Lines changed: 1 addition & 0 deletions
@@ -662,6 +662,7 @@ These metrics track request throughput, errors, and latency at the replica level
  | Metric | Type | Tags | Description |
  |--------|------|------|-------------|
  | `ray_serve_replica_processing_queries` **[D]** | Gauge | `deployment`, `replica`, `application` | Current number of requests being processed by the replica. |
+ | `ray_serve_replica_utilization_percent` **[D]** | Gauge | `deployment`, `replica`, `application` | Percentage of replica capacity used over a rolling window. Calculated as total user code execution time divided by maximum capacity (`window_duration × max_ongoing_requests`). Useful for capacity planning and identifying underutilized or overloaded replicas. Configure with `RAY_SERVE_REPLICA_UTILIZATION_WINDOW_S` (default: 600s), `RAY_SERVE_REPLICA_UTILIZATION_REPORT_INTERVAL_S` (default: 10s), and `RAY_SERVE_REPLICA_UTILIZATION_NUM_BUCKETS` (default: 60). |
  | `ray_serve_deployment_request_counter_total` **[D]** | Counter | `deployment`, `replica`, `route`, `application` | Total number of requests processed by the replica. |
  | `ray_serve_deployment_processing_latency_ms` **[D]** | Histogram | `deployment`, `replica`, `route`, `application` | Histogram of request processing time in milliseconds (excludes queue wait time). |
  | `ray_serve_deployment_error_counter_total` **[D]** | Counter | `deployment`, `replica`, `route`, `application` | Total number of exceptions raised while processing requests. |

python/ray/serve/_private/constants.py

Lines changed: 14 additions & 0 deletions
@@ -165,6 +165,20 @@
     DEFAULT_BATCH_UTILIZATION_BUCKETS_PERCENT,
 )

+#: Replica utilization metric configuration.
+#: Rolling window duration for calculating replica utilization (in seconds).
+RAY_SERVE_REPLICA_UTILIZATION_WINDOW_S = float(
+    get_env_str("RAY_SERVE_REPLICA_UTILIZATION_WINDOW_S", "600")
+)
+#: Interval for reporting the replica utilization metric (in seconds).
+RAY_SERVE_REPLICA_UTILIZATION_REPORT_INTERVAL_S = float(
+    get_env_str("RAY_SERVE_REPLICA_UTILIZATION_REPORT_INTERVAL_S", "10")
+)
+#: Number of buckets for the rolling window (determines granularity).
+RAY_SERVE_REPLICA_UTILIZATION_NUM_BUCKETS = int(
+    get_env_str("RAY_SERVE_REPLICA_UTILIZATION_NUM_BUCKETS", "60")
+)
+
 #: Histogram buckets for actual batch size.
 DEFAULT_BATCH_SIZE_BUCKETS = [
     1,
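The constants above follow a common pattern: read the raw value from the environment as a string, then coerce it to the right type. A rough standalone equivalent using `os.environ` directly (the `_env_float` helper here is hypothetical, standing in for Serve's `get_env_str`):

```python
import os


def _env_float(name: str, default: str) -> float:
    # Read the env var as a string (falling back to the default), then coerce.
    return float(os.environ.get(name, default))


# Override one setting, leave the other at its default.
os.environ["RAY_SERVE_REPLICA_UTILIZATION_WINDOW_S"] = "300"
window_s = _env_float("RAY_SERVE_REPLICA_UTILIZATION_WINDOW_S", "600")
report_interval_s = _env_float("RAY_SERVE_REPLICA_UTILIZATION_REPORT_INTERVAL_S", "10")
print(window_s, report_interval_s)  # 300.0 10.0
```

Because the real constants are evaluated at module import time, these env vars must be set before Ray Serve is imported to take effect.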

python/ray/serve/_private/replica.py

Lines changed: 78 additions & 0 deletions
@@ -70,6 +70,9 @@
     RAY_SERVE_RECORD_AUTOSCALING_STATS_TIMEOUT_S,
     RAY_SERVE_REPLICA_AUTOSCALING_METRIC_RECORD_INTERVAL_S,
     RAY_SERVE_REPLICA_GRPC_MAX_MESSAGE_LENGTH,
+    RAY_SERVE_REPLICA_UTILIZATION_NUM_BUCKETS,
+    RAY_SERVE_REPLICA_UTILIZATION_REPORT_INTERVAL_S,
+    RAY_SERVE_REPLICA_UTILIZATION_WINDOW_S,
     RAY_SERVE_REQUEST_PATH_LOG_BUFFER_SIZE,
     RAY_SERVE_RUN_SYNC_IN_THREADPOOL,
     RAY_SERVE_RUN_SYNC_IN_THREADPOOL_WARNING,

@@ -121,6 +124,7 @@
 from ray.serve._private.metrics_utils import InMemoryMetricsStore, MetricsPusher
 from ray.serve._private.proxy_request_response import ResponseStatus
 from ray.serve._private.replica_response_generator import ReplicaResponseGenerator
+from ray.serve._private.rolling_window_accumulator import RollingWindowAccumulator
 from ray.serve._private.serialization import RPCSerializer
 from ray.serve._private.task_consumer import TaskConsumerWrapper
 from ray.serve._private.thirdparty.get_asgi_route_name import (

@@ -299,6 +303,7 @@ def __init__(
         event_loop: asyncio.BaseEventLoop,
         autoscaling_config: Optional[AutoscalingConfig],
         ingress: bool,
+        max_ongoing_requests: int,
     ):
         self._replica_id = replica_id
         self._deployment_id = replica_id.deployment_id

@@ -309,6 +314,7 @@
             SERVE_CONTROLLER_NAME, namespace=SERVE_NAMESPACE
         )
         self._num_ongoing_requests = 0
+        self._max_ongoing_requests = max_ongoing_requests
         # Store event loop for scheduling async tasks from sync context
         self._event_loop = event_loop or asyncio.get_event_loop()

@@ -384,6 +390,26 @@
             boundaries=REQUEST_LATENCY_BUCKETS_MS,
         )

+        # Replica utilization tracking with rolling window.
+        # Tracks total user code execution time over a rolling window to calculate
+        # utilization as: user_code_time / (window_duration * max_ongoing_requests).
+        self._user_code_time_accumulator = RollingWindowAccumulator(
+            window_duration_s=RAY_SERVE_REPLICA_UTILIZATION_WINDOW_S,
+            num_buckets=RAY_SERVE_REPLICA_UTILIZATION_NUM_BUCKETS,
+        )
+        self._replica_utilization_gauge = metrics.Gauge(
+            "serve_replica_utilization_percent",
+            description=(
+                "Percentage of replica capacity utilized by user code execution "
+                "over a rolling window. Calculated as: "
+                "user_code_time / (window_duration * max_ongoing_requests)."
+            ),
+        )
+        self._utilization_report_interval_s = (
+            RAY_SERVE_REPLICA_UTILIZATION_REPORT_INTERVAL_S
+        )
+        self._event_loop.create_task(self._report_utilization_forever())
+
         self.set_autoscaling_config(autoscaling_config)

         if self._is_direct_ingress:

@@ -701,8 +727,56 @@ def get_num_ongoing_requests(self) -> int:
         """Get current total queue length of requests for this replica."""
         return self._num_ongoing_requests

+    def set_max_ongoing_requests(self, max_ongoing_requests: int) -> None:
+        """Update max_ongoing_requests when deployment config changes."""
+        self._max_ongoing_requests = max_ongoing_requests
+
+    async def _report_utilization_forever(self) -> None:
+        """Background task to emit utilization gauge continuously."""
+        consecutive_errors = 0
+        while True:
+            try:
+                await asyncio.sleep(self._utilization_report_interval_s)
+                utilization = self._calculate_utilization()
+                self._replica_utilization_gauge.set(utilization)
+                consecutive_errors = 0
+            except Exception:
+                logger.exception("Unexpected error reporting utilization metrics.")
+
+                # Exponential backoff starting at 1s and capping at 10s.
+                backoff_time_s = min(10, 2**consecutive_errors)
+                consecutive_errors += 1
+                await asyncio.sleep(backoff_time_s)
+
+    def _calculate_utilization(self) -> float:
+        """Calculate current utilization percentage based on rolling window.
+
+        Utilization is calculated as:
+            user_code_time / (window_duration * max_ongoing_requests)
+
+        This represents the percentage of the replica's theoretical maximum
+        capacity that was used for executing user code.
+        """
+        total_user_code_time_ms = self._user_code_time_accumulator.get_total()
+
+        # Max capacity = window_duration_ms * max_ongoing_requests
+        window_duration_ms = RAY_SERVE_REPLICA_UTILIZATION_WINDOW_S * 1000
+        max_capacity_ms = window_duration_ms * self._max_ongoing_requests
+
+        if max_capacity_ms > 0:
+            utilization_percent = (total_user_code_time_ms / max_capacity_ms) * 100
+            # Cap at 100% (can theoretically exceed if requests overlap heavily)
+            utilization_percent = min(utilization_percent, 100.0)
+        else:
+            utilization_percent = 0.0
+
+        return utilization_percent
+
     def record_request_metrics(self, *, route: str, latency_ms: float, was_error: bool):
         """Records per-request metrics."""
+        # Track latency for utilization calculation (rolling window).
+        self._user_code_time_accumulator.add(latency_ms)
+
         if self._cached_metrics_enabled:
             self._cached_latencies[route].append(latency_ms)
             if was_error:

@@ -948,6 +1022,7 @@ def __init__(
             event_loop=self._event_loop,
             autoscaling_config=self._deployment_config.autoscaling_config,
             ingress=ingress,
+            max_ongoing_requests=self._deployment_config.max_ongoing_requests,
         )

         # Start event loop monitoring for the replica's main event loop.

@@ -1460,6 +1535,9 @@ async def reconfigure(
             self._metrics_manager.set_autoscaling_config(
                 deployment_config.autoscaling_config
             )
+            self._metrics_manager.set_max_ongoing_requests(
+                deployment_config.max_ongoing_requests
+            )
         if logging_config_changed:
             self._configure_logger_and_profilers(deployment_config.logging_config)
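The reporting loop in `_report_utilization_forever` backs off exponentially on repeated failures via `min(10, 2**consecutive_errors)`. A minimal standalone sketch of that schedule (the `backoff_schedule` helper is illustrative, not part of the Serve code):

```python
def backoff_schedule(num_failures: int, cap_s: float = 10.0) -> list:
    """Sleep times for n consecutive failures: 2**k seconds, capped at cap_s."""
    return [min(cap_s, 2.0**k) for k in range(num_failures)]


# The first failure waits 1s, then 2s, 4s, 8s; after that the cap holds at 10s.
print(backoff_schedule(6))  # [1.0, 2.0, 4.0, 8.0, 10.0, 10.0]
```

Resetting `consecutive_errors` to 0 on each successful report means a transient failure does not permanently slow the metric down.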

python/ray/serve/_private/rolling_window_accumulator.py

Lines changed: 219 additions & 0 deletions (new file)

import threading
import time
from typing import List, Optional


class _ThreadBuckets:
    """Per-thread bucket storage for the rolling window accumulator.

    Each thread gets its own instance to avoid lock contention on the hot path.
    """

    # Performance optimization: avoid creating a per-instance __dict__.
    __slots__ = ("buckets", "current_bucket_idx", "last_rotation_time")

    def __init__(self, num_buckets: int):
        self.buckets = [0.0] * num_buckets
        self.current_bucket_idx = 0
        self.last_rotation_time = time.time()


class _ThreadLocalRef(threading.local):
    """Thread-local reference to the thread's _ThreadBuckets instance."""

    def __init__(self):
        super().__init__()
        # By subclassing threading.local, each thread sees its own `data` attribute.
        self.data: Optional[_ThreadBuckets] = None


class RollingWindowAccumulator:
    """Tracks cumulative values over a rolling time window.

    Uses bucketing for memory efficiency: divides the window into N buckets
    and rotates them as time passes. This allows efficient tracking of values
    over a sliding window without storing individual data points.

    Uses thread-local storage for lock-free writes on the hot path (add()).
    Only get_total() requires synchronization to aggregate across threads.

    Example:
        # Create a 10-minute rolling window with 60 buckets (10s each)
        accumulator = RollingWindowAccumulator(
            window_duration_s=600.0,
            num_buckets=60,
        )

        # Add values (lock-free, safe from multiple threads)
        accumulator.add(100.0)
        accumulator.add(50.0)

        # Get total (aggregates across all threads)
        total = accumulator.get_total()

    Thread Safety:
        - add() is lock-free after the first call from each thread
        - get_total() acquires a lock to aggregate across threads
        - Safe to call from multiple threads concurrently
    """

    def __init__(
        self,
        window_duration_s: float,
        num_buckets: int = 60,
    ):
        """Initialize the rolling window accumulator.

        Args:
            window_duration_s: Total duration of the rolling window in seconds.
                Values older than this are automatically expired.
            num_buckets: Number of buckets to divide the window into. More buckets
                give finer granularity but use slightly more memory. Default is 60,
                which for a 10-minute window gives 10-second granularity.

        Raises:
            ValueError: If window_duration_s <= 0 or num_buckets <= 0.
        """
        if window_duration_s <= 0:
            raise ValueError(
                f"window_duration_s must be positive, got {window_duration_s}"
            )
        if num_buckets <= 0:
            raise ValueError(f"num_buckets must be positive, got {num_buckets}")

        self._window_duration_s = window_duration_s
        self._num_buckets = num_buckets
        self._bucket_duration_s = window_duration_s / num_buckets

        # Thread-local reference to per-thread bucket data.
        self._local = _ThreadLocalRef()

        # Track all per-thread bucket instances for aggregation.
        self._all_thread_data: List[_ThreadBuckets] = []
        self._registry_lock = threading.Lock()

    @property
    def window_duration_s(self) -> float:
        """The total duration of the rolling window in seconds."""
        return self._window_duration_s

    @property
    def num_buckets(self) -> int:
        """The number of buckets in the rolling window."""
        return self._num_buckets

    @property
    def bucket_duration_s(self) -> float:
        """The duration of each bucket in seconds."""
        return self._bucket_duration_s

    def _ensure_initialized(self) -> _ThreadBuckets:
        """Ensure thread-local storage is initialized for the current thread.

        This is called on every add(), but the fast path (already initialized)
        is just a single attribute check with no locking.

        Returns:
            The _ThreadBuckets instance for the current thread.
        """
        data = self._local.data
        if data is not None:
            return data

        # Slow path: first call from this thread.
        data = _ThreadBuckets(self._num_buckets)
        self._local.data = data

        # Register for aggregation (only happens once per thread).
        with self._registry_lock:
            self._all_thread_data.append(data)

        return data

    def _rotate_buckets_if_needed(self, data: _ThreadBuckets) -> None:
        """Rotate buckets for the given thread's storage.

        Advances the current bucket index and clears old buckets as time passes.
        """
        now = time.time()
        elapsed = now - data.last_rotation_time
        buckets_to_advance = int(elapsed / self._bucket_duration_s)

        if buckets_to_advance > 0:
            if buckets_to_advance >= self._num_buckets:
                # All buckets have expired, reset everything.
                data.buckets = [0.0] * self._num_buckets
                data.current_bucket_idx = 0
            else:
                # Clear old buckets as we advance.
                for _ in range(buckets_to_advance):
                    data.current_bucket_idx = (
                        data.current_bucket_idx + 1
                    ) % self._num_buckets
                    data.buckets[data.current_bucket_idx] = 0.0

            data.last_rotation_time = now

    def add(self, value: float) -> None:
        """Add a value to the current bucket.

        This operation is lock-free for the calling thread after the first call.
        Safe to call from multiple threads concurrently.

        Args:
            value: The value to add to the accumulator.
        """
        # Fast path: just check if initialized (no lock).
        data = self._ensure_initialized()

        # Lock-free: only touches thread-local data.
        self._rotate_buckets_if_needed(data)
        data.buckets[data.current_bucket_idx] += value

    def get_total(self) -> float:
        """Get total value across all buckets in the window.

        This aggregates values from all threads that have called add().
        Expired buckets (older than window_duration_s) are not included.

        Note: Per-thread buckets are read without synchronizing against
        concurrent add() calls, so the total may be slightly inaccurate. This
        is acceptable for a utilization metric, which is not critical to
        system correctness: with the default 600s window and 10s report
        interval, the inaccuracy is less than 0.16%.

        Returns:
            The sum of all non-expired values in the rolling window.
        """
        total = 0.0
        now = time.time()

        with self._registry_lock:
            for data in self._all_thread_data:
                # Calculate which buckets are still valid for this thread's data.
                elapsed = now - data.last_rotation_time
                buckets_expired = int(elapsed / self._bucket_duration_s)

                if buckets_expired >= self._num_buckets:
                    # All buckets have expired for this thread.
                    continue

                # Sum buckets that haven't expired. Buckets form a circular
                # buffer with current_bucket_idx being the most recent, so we
                # walk backwards and skip buckets that have expired.
                for i in range(self._num_buckets - buckets_expired):
                    idx = (data.current_bucket_idx - i) % self._num_buckets
                    total += data.buckets[idx]

        return total

    def get_num_registered_threads(self) -> int:
        """Get the number of threads that have called add().

        Useful for debugging and testing.

        Returns:
            The number of threads registered with this accumulator.
        """
        with self._registry_lock:
            return len(self._all_thread_data)
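The bucket-rotation idea can be exercised in isolation. Below is a simplified single-threaded sketch with an injectable clock for determinism; `MiniRollingWindow` is a toy stand-in for illustration, not the Ray class above:

```python
class MiniRollingWindow:
    """Single-threaded sketch of a bucketed rolling window."""

    def __init__(self, window_s: float, num_buckets: int, clock):
        self.bucket_s = window_s / num_buckets
        self.num_buckets = num_buckets
        self.buckets = [0.0] * num_buckets
        self.idx = 0
        self.clock = clock
        self.last_rotation = clock()

    def _rotate(self):
        # Advance the circular buffer, zeroing buckets that fell out of the window.
        advance = int((self.clock() - self.last_rotation) / self.bucket_s)
        if advance >= self.num_buckets:
            self.buckets = [0.0] * self.num_buckets
            self.idx = 0
        else:
            for _ in range(advance):
                self.idx = (self.idx + 1) % self.num_buckets
                self.buckets[self.idx] = 0.0
        if advance > 0:
            self.last_rotation = self.clock()

    def add(self, value: float):
        self._rotate()
        self.buckets[self.idx] += value

    def total(self) -> float:
        self._rotate()
        return sum(self.buckets)


now = [0.0]  # mutable fake clock
w = MiniRollingWindow(window_s=60.0, num_buckets=6, clock=lambda: now[0])
w.add(5.0)        # lands in the first 10s bucket
now[0] = 30.0     # three buckets later, the value is still inside the 60s window
print(w.total())  # 5.0
now[0] = 70.0     # past the full window: the value's bucket gets cleared
print(w.total())  # 0.0
```

Unlike this toy, the real class keeps one such buffer per thread and only sums them under a lock, which is what makes `add()` lock-free on the hot path.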

python/ray/serve/tests/BUILD.bazel

Lines changed: 4 additions & 0 deletions
@@ -244,6 +244,10 @@ py_test_module_list(
     timeout = "long",
     env = {
         "RAY_SERVE_ROUTER_QUEUE_LEN_GAUGE_THROTTLE_S": "0",
+        "RAY_SERVE_RUN_SYNC_IN_THREADPOOL": "1",
+        "RAY_SERVE_REPLICA_UTILIZATION_WINDOW_S": "1",
+        "RAY_SERVE_REPLICA_UTILIZATION_REPORT_INTERVAL_S": "1",
+        "RAY_SERVE_REPLICA_UTILIZATION_NUM_BUCKETS": "10",
     },
     files = [
         "test_deploy_app.py",
