ray-project · harshit-anyscale · Feb 6, 2026 · Feb 6, 2026 · gemini-code-assist · Feb 6, 2026
@@ -74,6 +74,14 @@ Ray Serve allows you to fine-tune the backoff behavior of the request router, wh
 - `RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER`: The multiplier applied to the backoff time after each retry. Default is `2`.
 - `RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S`: The maximum backoff time (in seconds) between retries. Default is `0.5`.
 
+### Set timeouts while probing replicas for queue length
+
+Ray Serve's request router probes replicas for their queue lengths to make intelligent load balancing decisions. You can tune the following environment variables to optimize this behavior for your workload:
+
+- `RAY_SERVE_QUEUE_LENGTH_RESPONSE_DEADLINE_S`: The initial timeout (in seconds) for waiting for replicas to respond with their queue length information. Default is `0.1`.
+- `RAY_SERVE_MAX_QUEUE_LENGTH_RESPONSE_DEADLINE_S`: The maximum timeout (in seconds) for queue length responses. When retrying with exponential backoff, the deadline increases but is capped at this value. Default is `1.0`.
+- `RAY_SERVE_QUEUE_LENGTH_CACHE_TIMEOUT_S`: How long (in seconds) cached queue length information from replicas is considered valid. After this timeout, the cache entry expires and the router must probe the replica again. Default is `10.0`.
+
 ### Configure locality-based routing
 
 Ray Serve routes requests to replicas based on locality to reduce network latency. The system applies locality routing in two scenarios: proxy-to-replica communication (HTTP/gRPC requests) and inter-deployment communication (replica-to-replica calls through `DeploymentHandle`).

@@ -259,6 +259,9 @@ def get_env_bool(name: str, default: str) -> bool:
 # Environment variables that are fully deprecated and will be ignored.
 _fully_deprecated_env_vars = {
     "RAY_SERVE_HTTP_KEEP_ALIVE_TIMEOUT_S": "http_options.keep_alive_timeout_s",
+    "RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S": "request_router_config.initial_backoff_s",
+    "RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER": "request_router_config.backoff_multiplier",
+    "RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S": "request_router_config.max_backoff_s",
 }
 
 

@@ -445,11 +445,6 @@ def _fulfill_next_pending_request(
 class RequestRouter(ABC):
     """Abstract interface for a request router (how the router calls it)."""
 
-    """Backoff parameters for request router."""
-    initial_backoff_s = RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S
-    backoff_multiplier = RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER
-    max_backoff_s = RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S
-
     # Deadline for replicas to respond with their queue length. If the response isn't
     # received within this deadline, the replica will not be considered.
     # If this deadline is repeatedly missed, it will be exponentially increased up to
@@ -478,6 +473,9 @@ def __init__(
         create_replica_wrapper_func: Optional[
             Callable[[RunningReplicaInfo], RunningReplica]
         ] = None,
+        initial_backoff_s: float = RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S,
+        backoff_multiplier: float = RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER,
+        max_backoff_s: float = RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S,
         *args,
         **kwargs,
     ):
@@ -488,6 +486,11 @@ def __init__(
         self._create_replica_wrapper_func = create_replica_wrapper_func
         self._get_curr_time_s = get_curr_time_s if get_curr_time_s else time.time
 
+        # Backoff parameters for request routing, from RequestRouterConfig.
+        self.initial_backoff_s = initial_backoff_s
+        self.backoff_multiplier = backoff_multiplier
+        self.max_backoff_s = max_backoff_s
+
         # Current replicas available to be routed.
         # Updated via `update_replicas`.
         self._replica_id_set: Set[ReplicaID] = set()

@@ -42,6 +42,7 @@
     RAY_SERVE_PROXY_PREFER_LOCAL_AZ_ROUTING,
     SERVE_LOGGER_NAME,
 )
+from ray.serve._private.constants_utils import warn_if_deprecated_env_var_set
 from ray.serve._private.event_loop_monitoring import EventLoopMonitor
 from ray.serve._private.long_poll import LongPollClient, LongPollNamespace
 from ray.serve._private.metrics_utils import (
@@ -495,6 +496,12 @@ async def create_event() -> asyncio.Event:
 
 
 class AsyncioRouter:
+
+    # Backoff parameters for request routing.
+    _initial_backoff_s: float
+    _backoff_multiplier: float
+    _max_backoff_s: float
+
     def __init__(
         self,
         controller_handle: ActorHandle,
@@ -639,6 +646,9 @@ def request_router(self) -> Optional[RequestRouter]:
                 prefer_local_node_routing=self._prefer_local_node_routing,
                 prefer_local_az_routing=RAY_SERVE_PROXY_PREFER_LOCAL_AZ_ROUTING,
                 self_availability_zone=self._availability_zone,
+                initial_backoff_s=self._initial_backoff_s,
+                backoff_multiplier=self._backoff_multiplier,
+                max_backoff_s=self._max_backoff_s,
             )
             request_router.initialize_state(**(self._request_router_kwargs))
 
@@ -684,6 +694,19 @@ def update_deployment_config(self, deployment_config: DeploymentConfig):
         self._request_router_kwargs = (
             deployment_config.request_router_config.request_router_kwargs
         )
+
+        # Warn if deprecated env vars are set
+        warn_if_deprecated_env_var_set("RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S")
+        warn_if_deprecated_env_var_set("RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER")
+        warn_if_deprecated_env_var_set("RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S")
+
+        self._initial_backoff_s = (
+            deployment_config.request_router_config.initial_backoff_s
+        )
+        self._backoff_multiplier = (
+            deployment_config.request_router_config.backoff_multiplier
+        )
+        self._max_backoff_s = deployment_config.request_router_config.max_backoff_s
         self._metrics_manager.update_deployment_config(
             deployment_config,
             curr_num_replicas=len(self.request_router.curr_replicas),

@@ -30,6 +30,9 @@
     DEFAULT_REQUEST_ROUTING_STATS_TIMEOUT_S,
     DEFAULT_TARGET_ONGOING_REQUESTS,
     DEFAULT_UVICORN_KEEP_ALIVE_TIMEOUT_S,
+    RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER,
+    RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S,
+    RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S,
     SERVE_LOGGER_NAME,
 )
 from ray.serve._private.utils import validate_ssl_config
@@ -248,6 +251,33 @@ class RequestRouterConfig(BaseModel):
         ),
     )
 
+    initial_backoff_s: float = Field(
+        default=RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S,
+        description=(
+            "Initial backoff time (in seconds) before retrying to route a request "
+            "to a replica. Defaults to RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S "
+            "environment variable, or 0.025 if not set."
+        ),
+    )
+
+    backoff_multiplier: float = Field(
+        default=RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER,
+        description=(
+            "Multiplier applied to the backoff time after each retry. "
+            "Defaults to RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER "
+            "environment variable, or 2 if not set."
+        ),
+    )
+
+    max_backoff_s: float = Field(
+        default=RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S,
+        description=(
+            "Maximum backoff time (in seconds) between retries. "
+            "Defaults to RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S "
+            "environment variable, or 0.5 if not set."
+        ),
+    )
+
     @validator("request_router_kwargs", always=True)
     def request_router_kwargs_json_serializable(cls, v):
         if isinstance(v, bytes):

@@ -19,7 +19,12 @@
     ReplicaID,
     RequestMetadata,
 )
-from ray.serve._private.constants import RAY_SERVE_QUEUE_LENGTH_CACHE_TIMEOUT_S
+from ray.serve._private.constants import (
+    RAY_SERVE_QUEUE_LENGTH_CACHE_TIMEOUT_S,
+    RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER,
+    RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S,
+    RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S,
+)
 from ray.serve._private.replica_result import ReplicaResult
 from ray.serve._private.request_router import (
     PendingRequest,
@@ -2034,5 +2039,44 @@ async def test_rank_replicas_via_multiplex(
     ]
 
 
+def test_request_router_backoff_params_default():
+    """Test that backoff params use env var defaults when not specified."""
+    router = PowerOfTwoChoicesRequestRouter(
+        deployment_id=DeploymentID(name="TEST_DEPLOYMENT"),
+        handle_source=DeploymentHandleSource.REPLICA,
+        self_node_id=ROUTER_NODE_ID,
+        self_actor_id="fake-actor-id",
+        self_actor_handle=None,
+        get_curr_time_s=TIMER.time,
+    )
+
+    assert router.initial_backoff_s == RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S
+    assert router.backoff_multiplier == RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER
+    assert router.max_backoff_s == RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S
+
+
+def test_request_router_backoff_params_custom():
+    """Test that custom backoff params are properly set on the RequestRouter."""
+    custom_initial_backoff = 0.1
+    custom_multiplier = 5
+    custom_max_backoff = 3.0
+
+    router = PowerOfTwoChoicesRequestRouter(
+        deployment_id=DeploymentID(name="TEST_DEPLOYMENT"),
+        handle_source=DeploymentHandleSource.REPLICA,
+        self_node_id=ROUTER_NODE_ID,
+        self_actor_id="fake-actor-id",
+        self_actor_handle=None,
+        get_curr_time_s=TIMER.time,
+        initial_backoff_s=custom_initial_backoff,
+        backoff_multiplier=custom_multiplier,
+        max_backoff_s=custom_max_backoff,
+    )
+
+    assert router.initial_backoff_s == custom_initial_backoff
+    assert router.backoff_multiplier == custom_multiplier
+    assert router.max_backoff_s == custom_max_backoff
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main(["-v", "-s", __file__]))
@@ -1343,5 +1343,118 @@ def release_lock():
             assign_request_future.exception()
 
 
+@pytest.mark.asyncio
+class TestAsyncioRouterBackoffConfig:
+    """Test that backoff config flows from DeploymentConfig to RequestRouter."""
+
+    async def test_update_deployment_config_sets_backoff_params(self):
+        """Test that update_deployment_config extracts backoff params from config."""
+        from ray.serve.config import RequestRouterConfig
+
+        fake_request_router = FakeRequestRouter(use_queue_len_cache=False)
+        router = AsyncioRouter(
+            controller_handle=Mock(),
+            deployment_id=DeploymentID(name="test-deployment"),
+            handle_id="test-handle-id",
+            self_actor_id="test-node-id",
+            handle_source=DeploymentHandleSource.UNKNOWN,
+            event_loop=get_or_create_event_loop(),
+            enable_strict_max_ongoing_requests=False,
+            request_router=fake_request_router,
+            node_id="test-node-id",
+            availability_zone="test-az",
+            prefer_local_node_routing=False,
+            _request_router_initialized_event=asyncio.Event(),
+        )
+
+        # Create a DeploymentConfig with custom backoff params
+        custom_initial_backoff = 0.15
+        custom_multiplier = 4
+        custom_max_backoff = 2.5
+
+        deployment_config = DeploymentConfig.from_default(
+            request_router_config=RequestRouterConfig(
+                initial_backoff_s=custom_initial_backoff,
+                backoff_multiplier=custom_multiplier,
+                max_backoff_s=custom_max_backoff,
+            )
+        )
+
+        # Update the router with the config
+        router.update_deployment_config(deployment_config)
+
+        # Verify the backoff params were stored on the router
+        assert router._initial_backoff_s == custom_initial_backoff
+        assert router._backoff_multiplier == custom_multiplier
+        assert router._max_backoff_s == custom_max_backoff
+
+    async def test_deprecated_env_vars_emit_warning(self):
+        """Test that deprecated backoff env vars emit deprecation warnings."""
+        import os
+        import warnings
+
+        from ray.serve.config import RequestRouterConfig
+
+        fake_request_router = FakeRequestRouter(use_queue_len_cache=False)
+        router = AsyncioRouter(
+            controller_handle=Mock(),
+            deployment_id=DeploymentID(name="test-deployment"),
+            handle_id="test-handle-id",
+            self_actor_id="test-node-id",
+            handle_source=DeploymentHandleSource.UNKNOWN,
+            event_loop=get_or_create_event_loop(),
+            enable_strict_max_ongoing_requests=False,
+            request_router=fake_request_router,
+            node_id="test-node-id",
+            availability_zone="test-az",
+            prefer_local_node_routing=False,
+            _request_router_initialized_event=asyncio.Event(),
+        )
+
+        deployment_config = DeploymentConfig.from_default(
+            request_router_config=RequestRouterConfig()
+        )
+
+        # Set deprecated env vars
+        env_vars_to_test = [
+            "RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S",
+            "RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER",
+            "RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S",
+        ]
+
+        original_values = {}
+        for env_var in env_vars_to_test:
+            original_values[env_var] = os.environ.get(env_var)
+            os.environ[env_var] = "1"
+
+        try:
+            with warnings.catch_warnings(record=True) as w:
+                warnings.simplefilter("always")
+                router.update_deployment_config(deployment_config)
+
+                # Check that deprecation warnings were emitted for each env var
+                deprecation_warnings = [
+                    warning
+                    for warning in w
+                    if issubclass(warning.category, DeprecationWarning)
+                ]
+                assert len(deprecation_warnings) == 3
+
+                warning_messages = [
+                    str(warning.message) for warning in deprecation_warnings
+                ]
+                for env_var in env_vars_to_test:
+                    assert any(
+                        env_var in msg for msg in warning_messages
+                    ), f"Expected deprecation warning for {env_var}"
+        finally:
+            # Restore original env var values
+            for env_var in env_vars_to_test:
+                if original_values[env_var] is None:
+                    os.environ.pop(env_var, None)
+                else:
+                    os.environ[env_var] = original_values[env_var]
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main(["-v", "-s", __file__]))
@@ -123,6 +123,15 @@ message RequestRouterConfig {
 
   // kwargs which Ray Serve passes to the router class' initialize_state method.
   bytes request_router_kwargs = 5;
+
+  // Initial backoff time (in seconds) before retrying to route a request.
+  double initial_backoff_s = 6;
+
+  // Multiplier applied to the backoff time after each retry.
+  double backoff_multiplier = 7;
+
+  // Maximum backoff time (in seconds) between retries.
+  double max_backoff_s = 8;
 }
 //[End] ROUTING CONFIG