ray-project · harshit-anyscale · Feb 6, 2026 · Feb 6, 2026
@@ -74,6 +74,14 @@ Ray Serve allows you to fine-tune the backoff behavior of the request router, wh
 - `RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER`: The multiplier applied to the backoff time after each retry. Default is `2`.
 - `RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S`: The maximum backoff time (in seconds) between retries. Default is `0.5`.
 
+### Set timeouts while probing replicas for queue length
+
+Ray Serve's request router probes replicas for their queue lengths to make intelligent load balancing decisions. You can tune the following environment variables to optimize this behavior for your workload:
+
+- `RAY_SERVE_QUEUE_LENGTH_RESPONSE_DEADLINE_S`: The initial timeout (in seconds) to wait for a replica to respond with its queue length. Default is `0.1`.
+- `RAY_SERVE_MAX_QUEUE_LENGTH_RESPONSE_DEADLINE_S`: The maximum timeout (in seconds) for queue length responses. When replicas repeatedly miss the deadline, the router increases it exponentially, up to this value. Default is `1.0`.
+- `RAY_SERVE_QUEUE_LENGTH_CACHE_TIMEOUT_S`: How long (in seconds) cached queue length information from replicas is considered valid. After this timeout, the cache entry expires and the router must probe the replica again. Default is `10.0`.
+
 ### Configure locality-based routing
 
 Ray Serve routes requests to replicas based on locality to reduce network latency. The system applies locality routing in two scenarios: proxy-to-replica communication (HTTP/gRPC requests) and inter-deployment communication (replica-to-replica calls through `DeploymentHandle`).

@@ -343,6 +343,16 @@ def from_proto(cls, proto: DeploymentConfigProto):
                 else:
                     data["request_router_config"]["request_router_kwargs"] = {}
 
+            # Proto3 reports unset fields as 0.0 (e.g. from older controllers during
+            # rolling upgrades). Map falsy values to None so RequestRouterConfig's
+            # validators apply the defaults; an explicit 0 is treated as unset too.
+            if not data["request_router_config"].get("initial_backoff_s"):
+                data["request_router_config"]["initial_backoff_s"] = None
+            if not data["request_router_config"].get("backoff_multiplier"):
+                data["request_router_config"]["backoff_multiplier"] = None
+            if not data["request_router_config"].get("max_backoff_s"):
+                data["request_router_config"]["max_backoff_s"] = None
+
             data["request_router_config"] = RequestRouterConfig(
                 **data["request_router_config"]
             )

@@ -259,6 +259,9 @@ def get_env_bool(name: str, default: str) -> bool:
 # Environment variables that are fully deprecated and will be ignored.
+# NOTE(review): constants named after the three RAY_SERVE_ROUTER_RETRY_* entries
+# below are still used as fallback defaults by RequestRouterConfig and
+# RequestRouter -- confirm "will be ignored" is accurate for them.
 _fully_deprecated_env_vars = {
     "RAY_SERVE_HTTP_KEEP_ALIVE_TIMEOUT_S": "http_options.keep_alive_timeout_s",
+    "RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S": "request_router_config.initial_backoff_s",
+    "RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER": "request_router_config.backoff_multiplier",
+    "RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S": "request_router_config.max_backoff_s",
 }
 
 

@@ -445,11 +445,6 @@ def _fulfill_next_pending_request(
 class RequestRouter(ABC):
     """Abstract interface for a request router (how the router calls it)."""
 
-    """Backoff parameters for request router."""
-    initial_backoff_s = RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S
-    backoff_multiplier = RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER
-    max_backoff_s = RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S
-
     # Deadline for replicas to respond with their queue length. If the response isn't
     # received within this deadline, the replica will not be considered.
     # If this deadline is repeatedly missed, it will be exponentially increased up to
@@ -478,6 +473,9 @@ def __init__(
         create_replica_wrapper_func: Optional[
             Callable[[RunningReplicaInfo], RunningReplica]
         ] = None,
+        initial_backoff_s: float = RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S,
+        backoff_multiplier: float = RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER,
+        max_backoff_s: float = RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S,
         *args,
         **kwargs,
     ):
@@ -488,6 +486,11 @@ def __init__(
         self._create_replica_wrapper_func = create_replica_wrapper_func
         self._get_curr_time_s = get_curr_time_s if get_curr_time_s else time.time
 
+        # Backoff parameters for request routing, from RequestRouterConfig.
+        self.initial_backoff_s = initial_backoff_s
+        self.backoff_multiplier = backoff_multiplier
+        self.max_backoff_s = max_backoff_s
+
         # Current replicas available to be routed.
         # Updated via `update_replicas`.
         self._replica_id_set: Set[ReplicaID] = set()

@@ -42,6 +42,7 @@
     RAY_SERVE_PROXY_PREFER_LOCAL_AZ_ROUTING,
     SERVE_LOGGER_NAME,
 )
+from ray.serve._private.constants_utils import warn_if_deprecated_env_var_set
 from ray.serve._private.event_loop_monitoring import EventLoopMonitor
 from ray.serve._private.long_poll import LongPollClient, LongPollNamespace
 from ray.serve._private.metrics_utils import (
@@ -495,6 +496,12 @@ async def create_event() -> asyncio.Event:
 
 
 class AsyncioRouter:
+
+    # Backoff parameters for request routing; assigned in update_deployment_config.
+    _initial_backoff_s: float
+    _backoff_multiplier: float
+    _max_backoff_s: float
+
     def __init__(
         self,
         controller_handle: ActorHandle,
@@ -639,6 +646,9 @@ def request_router(self) -> Optional[RequestRouter]:
                 prefer_local_node_routing=self._prefer_local_node_routing,
                 prefer_local_az_routing=RAY_SERVE_PROXY_PREFER_LOCAL_AZ_ROUTING,
                 self_availability_zone=self._availability_zone,
+                initial_backoff_s=self._initial_backoff_s,
+                backoff_multiplier=self._backoff_multiplier,
+                max_backoff_s=self._max_backoff_s,
             )
             request_router.initialize_state(**(self._request_router_kwargs))
 
@@ -684,6 +694,19 @@ def update_deployment_config(self, deployment_config: DeploymentConfig):
         self._request_router_kwargs = (
             deployment_config.request_router_config.request_router_kwargs
         )
+
+        # Warn if deprecated env vars are set
+        warn_if_deprecated_env_var_set("RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S")
+        warn_if_deprecated_env_var_set("RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER")
+        warn_if_deprecated_env_var_set("RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S")
+
+        self._initial_backoff_s = (
+            deployment_config.request_router_config.initial_backoff_s
+        )
+        self._backoff_multiplier = (
+            deployment_config.request_router_config.backoff_multiplier
+        )
+        self._max_backoff_s = deployment_config.request_router_config.max_backoff_s
         self._metrics_manager.update_deployment_config(
             deployment_config,
             curr_num_replicas=len(self.request_router.curr_replicas),

@@ -30,6 +30,9 @@
     DEFAULT_REQUEST_ROUTING_STATS_TIMEOUT_S,
     DEFAULT_TARGET_ONGOING_REQUESTS,
     DEFAULT_UVICORN_KEEP_ALIVE_TIMEOUT_S,
+    RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER,
+    RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S,
+    RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S,
     SERVE_LOGGER_NAME,
 )
 from ray.serve._private.utils import validate_ssl_config
@@ -248,6 +251,28 @@ class RequestRouterConfig(BaseModel):
         ),
     )
 
+    initial_backoff_s: Optional[float] = Field(
+        default=None,
+        description=(
+            "Initial backoff time (in seconds) before retrying to route a request "
+            "to a replica. Defaults to 0.025."
+        ),
+    )
+
+    backoff_multiplier: Optional[float] = Field(
+        default=None,
+        description=(
+            "Multiplier applied to the backoff time after each retry. " "Defaults to 2."
+        ),
+    )
+
+    max_backoff_s: Optional[float] = Field(
+        default=None,
+        description=(
+            "Maximum backoff time (in seconds) between retries. " "Defaults to 0.5."
+        ),
+    )
+
     @validator("request_router_kwargs", always=True)
     def request_router_kwargs_json_serializable(cls, v):
         if isinstance(v, bytes):
@@ -262,6 +287,18 @@ def request_router_kwargs_json_serializable(cls, v):
 
         return v
 
+    @validator("initial_backoff_s", always=True)
+    def set_initial_backoff_s_default(cls, v):
+        """Return `v`, or RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S when `v` is None."""
+        return v if v is not None else RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S
+
+    @validator("backoff_multiplier", always=True)
+    def set_backoff_multiplier_default(cls, v):
+        """Return `v`, or RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER when `v` is None."""
+        return v if v is not None else RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER
+
+    @validator("max_backoff_s", always=True)
+    def set_max_backoff_s_default(cls, v):
+        """Return `v`, or RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S when `v` is None."""
+        return v if v is not None else RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S
+
     def __init__(self, **kwargs: dict[str, Any]):
         """Initialize RequestRouterConfig with the given parameters.
 

@@ -15,6 +15,9 @@
 from ray.serve._private.constants import (
     DEFAULT_AUTOSCALING_POLICY_NAME,
     DEFAULT_GRPC_PORT,
+    RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER,
+    RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S,
+    RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S,
 )
 from ray.serve._private.request_router import PowerOfTwoChoicesRequestRouter
 from ray.serve._private.utils import DEFAULT
@@ -1134,6 +1137,27 @@ def test_optional_field(self):
         assert "initial_replicas" not in result
 
 
+def test_request_router_config_backoff_params_proto_zero_defaults():
+    """Test that unset (None) backoff params fall back to the constant defaults.
+
+    Proto3 fields default to 0.0 when not set. During rolling upgrades,
+    older controllers may send configs without backoff fields, and
+    `from_proto` maps those falsy values to None. This test covers only the
+    None -> default step; it does not build a proto or call `from_proto`.
+    """
+    # Pass None explicitly, which is what `from_proto` substitutes for zeros.
+    config = RequestRouterConfig(
+        initial_backoff_s=None,
+        backoff_multiplier=None,
+        max_backoff_s=None,
+    )
+
+    # Verify that None values are replaced with the constant defaults.
+    assert config.initial_backoff_s == RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S
+    assert config.backoff_multiplier == RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER
+    assert config.max_backoff_s == RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S
+
+
 if __name__ == "__main__":
     import sys
 

@@ -19,7 +19,12 @@
     ReplicaID,
     RequestMetadata,
 )
-from ray.serve._private.constants import RAY_SERVE_QUEUE_LENGTH_CACHE_TIMEOUT_S
+from ray.serve._private.constants import (
+    RAY_SERVE_QUEUE_LENGTH_CACHE_TIMEOUT_S,
+    RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER,
+    RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S,
+    RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S,
+)
 from ray.serve._private.replica_result import ReplicaResult
 from ray.serve._private.request_router import (
     PendingRequest,
@@ -2034,5 +2039,44 @@ async def test_rank_replicas_via_multiplex(
     ]
 
 
+def test_request_router_backoff_params_default():
+    """Test that backoff params equal the RAY_SERVE_ROUTER_RETRY_* constants when not passed."""
+    # No backoff keyword arguments are supplied to the constructor here.
+    router = PowerOfTwoChoicesRequestRouter(
+        deployment_id=DeploymentID(name="TEST_DEPLOYMENT"),
+        handle_source=DeploymentHandleSource.REPLICA,
+        self_node_id=ROUTER_NODE_ID,
+        self_actor_id="fake-actor-id",
+        self_actor_handle=None,
+        get_curr_time_s=TIMER.time,
+    )
+
+    assert router.initial_backoff_s == RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S
+    assert router.backoff_multiplier == RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER
+    assert router.max_backoff_s == RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S
+
+
+def test_request_router_backoff_params_custom():
+    """Test that custom backoff params are properly set on the RequestRouter."""
+    # Arbitrary values passed explicitly as constructor keyword arguments.
+    custom_initial_backoff = 0.1
+    custom_multiplier = 5
+    custom_max_backoff = 3.0
+
+    router = PowerOfTwoChoicesRequestRouter(
+        deployment_id=DeploymentID(name="TEST_DEPLOYMENT"),
+        handle_source=DeploymentHandleSource.REPLICA,
+        self_node_id=ROUTER_NODE_ID,
+        self_actor_id="fake-actor-id",
+        self_actor_handle=None,
+        get_curr_time_s=TIMER.time,
+        initial_backoff_s=custom_initial_backoff,
+        backoff_multiplier=custom_multiplier,
+        max_backoff_s=custom_max_backoff,
+    )
+
+    # Each router attribute must equal the value passed to the constructor.
+    assert router.initial_backoff_s == custom_initial_backoff
+    assert router.backoff_multiplier == custom_multiplier
+    assert router.max_backoff_s == custom_max_backoff
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main(["-v", "-s", __file__]))