Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions doc/source/serve/advanced-guides/performance.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,14 @@ Ray Serve allows you to fine-tune the backoff behavior of the request router, wh
- `RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER`: The multiplier applied to the backoff time after each retry. Default is `2`.
- `RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S`: The maximum backoff time (in seconds) between retries. Default is `0.5`.

### Set timeouts while probing replicas for queue length

Ray Serve's request router probes replicas for their queue lengths to make intelligent load balancing decisions. You can tune the following environment variables to optimize this behavior for your workload:

- `RAY_SERVE_QUEUE_LENGTH_RESPONSE_DEADLINE_S`: The initial timeout (in seconds) for waiting for replicas to respond with their queue length information. Default is `0.1`.
- `RAY_SERVE_MAX_QUEUE_LENGTH_RESPONSE_DEADLINE_S`: The maximum timeout (in seconds) for queue length responses. If replicas repeatedly miss the response deadline, the router increases the deadline exponentially, but never beyond this value. Default is `1.0`.
- `RAY_SERVE_QUEUE_LENGTH_CACHE_TIMEOUT_S`: How long (in seconds) cached queue length information from replicas is considered valid. After this timeout, the cache entry expires and the router must probe the replica again. Default is `10.0`.

### Configure locality-based routing

Ray Serve routes requests to replicas based on locality to reduce network latency. The system applies locality routing in two scenarios: proxy-to-replica communication (HTTP/gRPC requests) and inter-deployment communication (replica-to-replica calls through `DeploymentHandle`).
Expand Down
10 changes: 10 additions & 0 deletions python/ray/serve/_private/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,16 @@ def from_proto(cls, proto: DeploymentConfigProto):
else:
data["request_router_config"]["request_router_kwargs"] = {}

# Replace falsy proto defaults with None so Pydantic uses its defaults.
# This is important during rolling upgrades when older controllers
# send configs without these fields (proto3 defaults to 0.0).
if not data["request_router_config"].get("initial_backoff_s"):
data["request_router_config"]["initial_backoff_s"] = None
if not data["request_router_config"].get("backoff_multiplier"):
data["request_router_config"]["backoff_multiplier"] = None
if not data["request_router_config"].get("max_backoff_s"):
data["request_router_config"]["max_backoff_s"] = None

data["request_router_config"] = RequestRouterConfig(
**data["request_router_config"]
)
Expand Down
3 changes: 3 additions & 0 deletions python/ray/serve/_private/constants_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,9 @@ def get_env_bool(name: str, default: str) -> bool:
# Environment variables that are fully deprecated and will be ignored.
# Each key is the name of a deprecated environment variable; each value is a
# dotted config path (presumably the option that replaces it -- confirm
# against the code that builds the deprecation warning).
# NOTE(review): the RAY_SERVE_ROUTER_RETRY_* constants are still used as the
# fallback defaults for the RequestRouterConfig backoff fields, so confirm
# that "will be ignored" is accurate for the three router entries below.
_fully_deprecated_env_vars = {
    "RAY_SERVE_HTTP_KEEP_ALIVE_TIMEOUT_S": "http_options.keep_alive_timeout_s",
    "RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S": "request_router_config.initial_backoff_s",
    "RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER": "request_router_config.backoff_multiplier",
    "RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S": "request_router_config.max_backoff_s",
}


Expand Down
13 changes: 8 additions & 5 deletions python/ray/serve/_private/request_router/request_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,11 +445,6 @@ def _fulfill_next_pending_request(
class RequestRouter(ABC):
"""Abstract interface for a request router (how the router calls it)."""

"""Backoff parameters for request router."""
initial_backoff_s = RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S
backoff_multiplier = RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER
max_backoff_s = RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S

# Deadline for replicas to respond with their queue length. If the response isn't
# received within this deadline, the replica will not be considered.
# If this deadline is repeatedly missed, it will be exponentially increased up to
Expand Down Expand Up @@ -478,6 +473,9 @@ def __init__(
create_replica_wrapper_func: Optional[
Callable[[RunningReplicaInfo], RunningReplica]
] = None,
initial_backoff_s: float = RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S,
backoff_multiplier: float = RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER,
max_backoff_s: float = RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S,
*args,
**kwargs,
):
Expand All @@ -488,6 +486,11 @@ def __init__(
self._create_replica_wrapper_func = create_replica_wrapper_func
self._get_curr_time_s = get_curr_time_s if get_curr_time_s else time.time

# Backoff parameters for request routing, from RequestRouterConfig.
self.initial_backoff_s = initial_backoff_s
self.backoff_multiplier = backoff_multiplier
self.max_backoff_s = max_backoff_s

# Current replicas available to be routed.
# Updated via `update_replicas`.
self._replica_id_set: Set[ReplicaID] = set()
Expand Down
23 changes: 23 additions & 0 deletions python/ray/serve/_private/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
RAY_SERVE_PROXY_PREFER_LOCAL_AZ_ROUTING,
SERVE_LOGGER_NAME,
)
from ray.serve._private.constants_utils import warn_if_deprecated_env_var_set
from ray.serve._private.event_loop_monitoring import EventLoopMonitor
from ray.serve._private.long_poll import LongPollClient, LongPollNamespace
from ray.serve._private.metrics_utils import (
Expand Down Expand Up @@ -495,6 +496,12 @@ async def create_event() -> asyncio.Event:


class AsyncioRouter:

# Backoff parameters for request routing.
_initial_backoff_s: float
_backoff_multiplier: float
_max_backoff_s: float

def __init__(
self,
controller_handle: ActorHandle,
Expand Down Expand Up @@ -639,6 +646,9 @@ def request_router(self) -> Optional[RequestRouter]:
prefer_local_node_routing=self._prefer_local_node_routing,
prefer_local_az_routing=RAY_SERVE_PROXY_PREFER_LOCAL_AZ_ROUTING,
self_availability_zone=self._availability_zone,
initial_backoff_s=self._initial_backoff_s,
backoff_multiplier=self._backoff_multiplier,
max_backoff_s=self._max_backoff_s,
)
request_router.initialize_state(**(self._request_router_kwargs))

Expand Down Expand Up @@ -684,6 +694,19 @@ def update_deployment_config(self, deployment_config: DeploymentConfig):
self._request_router_kwargs = (
deployment_config.request_router_config.request_router_kwargs
)

# Warn if deprecated env vars are set
warn_if_deprecated_env_var_set("RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S")
warn_if_deprecated_env_var_set("RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER")
warn_if_deprecated_env_var_set("RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S")

self._initial_backoff_s = (
deployment_config.request_router_config.initial_backoff_s
)
self._backoff_multiplier = (
deployment_config.request_router_config.backoff_multiplier
)
self._max_backoff_s = deployment_config.request_router_config.max_backoff_s
self._metrics_manager.update_deployment_config(
deployment_config,
curr_num_replicas=len(self.request_router.curr_replicas),
Expand Down
37 changes: 37 additions & 0 deletions python/ray/serve/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@
DEFAULT_REQUEST_ROUTING_STATS_TIMEOUT_S,
DEFAULT_TARGET_ONGOING_REQUESTS,
DEFAULT_UVICORN_KEEP_ALIVE_TIMEOUT_S,
RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER,
RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S,
RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S,
SERVE_LOGGER_NAME,
)
from ray.serve._private.utils import validate_ssl_config
Expand Down Expand Up @@ -248,6 +251,28 @@ class RequestRouterConfig(BaseModel):
),
)

# Left as None when unset; the `initial_backoff_s` validator on this model
# substitutes RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S in that case.
initial_backoff_s: Optional[float] = Field(
    default=None,
    description=(
        "Initial backoff time (in seconds) before retrying to route "
        "a request to a replica. Defaults to 0.025."
    ),
)

# Left as None when unset; the `backoff_multiplier` validator on this model
# substitutes RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER in that case.
backoff_multiplier: Optional[float] = Field(
    default=None,
    description=(
        "Multiplier applied to the backoff time after each retry. Defaults to 2."
    ),
)

# Left as None when unset; the `max_backoff_s` validator on this model
# substitutes RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S in that case.
max_backoff_s: Optional[float] = Field(
    default=None,
    description=(
        "Maximum backoff time (in seconds) between retries. Defaults to 0.5."
    ),
)

@validator("request_router_kwargs", always=True)
def request_router_kwargs_json_serializable(cls, v):
if isinstance(v, bytes):
Expand All @@ -262,6 +287,18 @@ def request_router_kwargs_json_serializable(cls, v):

return v

@validator("initial_backoff_s", always=True)
def set_initial_backoff_s_default(cls, v):
    """Return `v`, or RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S when `v` is None."""
    if v is None:
        return RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S
    return v

@validator("backoff_multiplier", always=True)
def set_backoff_multiplier_default(cls, v):
    """Return `v`, or RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER when `v` is None."""
    if v is None:
        return RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER
    return v

@validator("max_backoff_s", always=True)
def set_max_backoff_s_default(cls, v):
    """Return `v`, or RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S when `v` is None."""
    if v is None:
        return RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S
    return v

def __init__(self, **kwargs: dict[str, Any]):
"""Initialize RequestRouterConfig with the given parameters.

Expand Down
24 changes: 24 additions & 0 deletions python/ray/serve/tests/unit/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
from ray.serve._private.constants import (
DEFAULT_AUTOSCALING_POLICY_NAME,
DEFAULT_GRPC_PORT,
RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER,
RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S,
RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S,
)
from ray.serve._private.request_router import PowerOfTwoChoicesRequestRouter
from ray.serve._private.utils import DEFAULT
Expand Down Expand Up @@ -1134,6 +1137,27 @@ def test_optional_field(self):
assert "initial_replicas" not in result


def test_request_router_config_backoff_params_proto_zero_defaults():
    """Test that ``None`` backoff params resolve to the default constants.

    Proto3 fields default to 0.0 when not set. During rolling upgrades,
    older controllers may send configs without backoff fields, and
    ``from_proto`` in ``ray/serve/_private/config.py`` replaces those falsy
    values with ``None`` before constructing ``RequestRouterConfig``.

    This test does not go through the proto path itself: it passes ``None``
    directly and verifies that ``RequestRouterConfig`` substitutes the
    ``RAY_SERVE_ROUTER_RETRY_*`` defaults, so a zero backoff (and the tight
    retry loop it would cause) never reaches the router.
    """
    # Pass None explicitly, which is what `from_proto` substitutes for
    # zero-valued (unset) proto fields.
    config = RequestRouterConfig(
        initial_backoff_s=None,
        backoff_multiplier=None,
        max_backoff_s=None,
    )

    # Verify that None values are replaced with proper defaults
    assert config.initial_backoff_s == RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S
    assert config.backoff_multiplier == RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER
    assert config.max_backoff_s == RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S


if __name__ == "__main__":
import sys

Expand Down
46 changes: 45 additions & 1 deletion python/ray/serve/tests/unit/test_pow_2_request_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,12 @@
ReplicaID,
RequestMetadata,
)
from ray.serve._private.constants import RAY_SERVE_QUEUE_LENGTH_CACHE_TIMEOUT_S
from ray.serve._private.constants import (
RAY_SERVE_QUEUE_LENGTH_CACHE_TIMEOUT_S,
RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER,
RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S,
RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S,
)
from ray.serve._private.replica_result import ReplicaResult
from ray.serve._private.request_router import (
PendingRequest,
Expand Down Expand Up @@ -2034,5 +2039,44 @@ async def test_rank_replicas_via_multiplex(
]


def test_request_router_backoff_params_default():
    """Backoff attributes equal the RAY_SERVE_ROUTER_RETRY_* constants when omitted."""
    default_router = PowerOfTwoChoicesRequestRouter(
        deployment_id=DeploymentID(name="TEST_DEPLOYMENT"),
        handle_source=DeploymentHandleSource.REPLICA,
        self_node_id=ROUTER_NODE_ID,
        self_actor_id="fake-actor-id",
        self_actor_handle=None,
        get_curr_time_s=TIMER.time,
    )

    # Compare all three backoff settings against the constants in one shot.
    observed = (
        default_router.initial_backoff_s,
        default_router.backoff_multiplier,
        default_router.max_backoff_s,
    )
    expected = (
        RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S,
        RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER,
        RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S,
    )
    assert observed == expected


def test_request_router_backoff_params_custom():
    """Explicit backoff keyword arguments end up on the router unchanged."""
    overrides = {
        "initial_backoff_s": 0.1,
        "backoff_multiplier": 5,
        "max_backoff_s": 3.0,
    }

    configured_router = PowerOfTwoChoicesRequestRouter(
        deployment_id=DeploymentID(name="TEST_DEPLOYMENT"),
        handle_source=DeploymentHandleSource.REPLICA,
        self_node_id=ROUTER_NODE_ID,
        self_actor_id="fake-actor-id",
        self_actor_handle=None,
        get_curr_time_s=TIMER.time,
        **overrides,
    )

    # Each override must be readable back from the attribute of the same name.
    for attr_name, expected_value in overrides.items():
        assert getattr(configured_router, attr_name) == expected_value


if __name__ == "__main__":
    # Run this file's tests verbosely with output capture disabled, and
    # propagate pytest's exit status to the shell.
    exit_code = pytest.main(["-v", "-s", __file__])
    sys.exit(exit_code)
Loading