Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions doc/source/serve/advanced-guides/performance.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,14 @@ Ray Serve allows you to fine-tune the backoff behavior of the request router, wh
- `RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER`: The multiplier applied to the backoff time after each retry. Default is `2`.
- `RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S`: The maximum backoff time (in seconds) between retries. Default is `0.5`.

### Set timeouts while probing replicas for queue length

Ray Serve's request router probes replicas for their queue lengths to make intelligent load balancing decisions. You can tune the following environment variables to optimize this behavior for your workload:

- `RAY_SERVE_QUEUE_LENGTH_RESPONSE_DEADLINE_S`: The initial timeout (in seconds) for waiting for replicas to respond with their queue length information. Default is `0.1`.
- `RAY_SERVE_MAX_QUEUE_LENGTH_RESPONSE_DEADLINE_S`: The maximum timeout (in seconds) for queue length responses. When retrying with exponential backoff, the deadline increases but is capped at this value. Default is `1.0`.
- `RAY_SERVE_QUEUE_LENGTH_CACHE_TIMEOUT_S`: How long (in seconds) cached queue length information from replicas is considered valid. After this timeout, the cache entry expires and the router must probe the replica again. Default is `10.0`.

### Configure locality-based routing

Ray Serve routes requests to replicas based on locality to reduce network latency. The system applies locality routing in two scenarios: proxy-to-replica communication (HTTP/gRPC requests) and inter-deployment communication (replica-to-replica calls through `DeploymentHandle`).
Expand Down
3 changes: 3 additions & 0 deletions python/ray/serve/_private/constants_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,9 @@ def get_env_bool(name: str, default: str) -> bool:
# Environment variables that are fully deprecated and will be ignored.
_fully_deprecated_env_vars = {
"RAY_SERVE_HTTP_KEEP_ALIVE_TIMEOUT_S": "http_options.keep_alive_timeout_s",
"RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S": "request_router_config.initial_backoff_s",
"RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER": "request_router_config.backoff_multiplier",
"RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S": "request_router_config.max_backoff_s",
}


Expand Down
13 changes: 8 additions & 5 deletions python/ray/serve/_private/request_router/request_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,11 +445,6 @@ def _fulfill_next_pending_request(
class RequestRouter(ABC):
"""Abstract interface for a request router (how the router calls it)."""

"""Backoff parameters for request router."""
initial_backoff_s = RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S
backoff_multiplier = RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER
max_backoff_s = RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S

# Deadline for replicas to respond with their queue length. If the response isn't
# received within this deadline, the replica will not be considered.
# If this deadline is repeatedly missed, it will be exponentially increased up to
Expand Down Expand Up @@ -478,6 +473,9 @@ def __init__(
create_replica_wrapper_func: Optional[
Callable[[RunningReplicaInfo], RunningReplica]
] = None,
initial_backoff_s: float = RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S,
backoff_multiplier: float = RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER,
max_backoff_s: float = RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S,
*args,
**kwargs,
):
Expand All @@ -488,6 +486,11 @@ def __init__(
self._create_replica_wrapper_func = create_replica_wrapper_func
self._get_curr_time_s = get_curr_time_s if get_curr_time_s else time.time

# Backoff parameters for request routing, from RequestRouterConfig.
self.initial_backoff_s = initial_backoff_s
self.backoff_multiplier = backoff_multiplier
self.max_backoff_s = max_backoff_s

# Current replicas available to be routed.
# Updated via `update_replicas`.
self._replica_id_set: Set[ReplicaID] = set()
Expand Down
23 changes: 23 additions & 0 deletions python/ray/serve/_private/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
RAY_SERVE_PROXY_PREFER_LOCAL_AZ_ROUTING,
SERVE_LOGGER_NAME,
)
from ray.serve._private.constants_utils import warn_if_deprecated_env_var_set
from ray.serve._private.event_loop_monitoring import EventLoopMonitor
from ray.serve._private.long_poll import LongPollClient, LongPollNamespace
from ray.serve._private.metrics_utils import (
Expand Down Expand Up @@ -495,6 +496,12 @@ async def create_event() -> asyncio.Event:


class AsyncioRouter:

# Backoff parameters for request routing.
_initial_backoff_s: float
_backoff_multiplier: float
_max_backoff_s: float

def __init__(
self,
controller_handle: ActorHandle,
Expand Down Expand Up @@ -639,6 +646,9 @@ def request_router(self) -> Optional[RequestRouter]:
prefer_local_node_routing=self._prefer_local_node_routing,
prefer_local_az_routing=RAY_SERVE_PROXY_PREFER_LOCAL_AZ_ROUTING,
self_availability_zone=self._availability_zone,
initial_backoff_s=self._initial_backoff_s,
backoff_multiplier=self._backoff_multiplier,
max_backoff_s=self._max_backoff_s,
)
request_router.initialize_state(**(self._request_router_kwargs))

Expand Down Expand Up @@ -684,6 +694,19 @@ def update_deployment_config(self, deployment_config: DeploymentConfig):
self._request_router_kwargs = (
deployment_config.request_router_config.request_router_kwargs
)

# Warn if deprecated env vars are set
warn_if_deprecated_env_var_set("RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S")
warn_if_deprecated_env_var_set("RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER")
warn_if_deprecated_env_var_set("RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S")

self._initial_backoff_s = (
deployment_config.request_router_config.initial_backoff_s
)
self._backoff_multiplier = (
deployment_config.request_router_config.backoff_multiplier
)
self._max_backoff_s = deployment_config.request_router_config.max_backoff_s
self._metrics_manager.update_deployment_config(
deployment_config,
curr_num_replicas=len(self.request_router.curr_replicas),
Expand Down
30 changes: 30 additions & 0 deletions python/ray/serve/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@
DEFAULT_REQUEST_ROUTING_STATS_TIMEOUT_S,
DEFAULT_TARGET_ONGOING_REQUESTS,
DEFAULT_UVICORN_KEEP_ALIVE_TIMEOUT_S,
RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER,
RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S,
RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S,
SERVE_LOGGER_NAME,
)
from ray.serve._private.utils import validate_ssl_config
Expand Down Expand Up @@ -248,6 +251,33 @@ class RequestRouterConfig(BaseModel):
),
)

initial_backoff_s: float = Field(
default=RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S,
description=(
"Initial backoff time (in seconds) before retrying to route a request "
"to a replica. Defaults to RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S "
"environment variable, or 0.025 if not set."
),
Comment on lines 256 to 259
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The description is a bit confusing. To make it clearer that the environment variable is being deprecated, consider rephrasing it. For example:

"Initial backoff time (in seconds) before retrying to route a request to a replica. Defaults to 0.025. This can be overridden by the deprecated `RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S` environment variable."

This applies to backoff_multiplier and max_backoff_s as well.

        description=(
            "Initial backoff time (in seconds) before retrying to route a request "
            "to a replica. Defaults to 0.025. This can be overridden by the deprecated `RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S` environment variable."
        ),

)

backoff_multiplier: float = Field(
default=RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER,
description=(
"Multiplier applied to the backoff time after each retry. "
"Defaults to RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER "
"environment variable, or 2 if not set."
),
Comment on lines 264 to 266
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

To make it clearer that the environment variable is being deprecated, consider rephrasing the description. For example:

"Multiplier applied to the backoff time after each retry. Defaults to 2. This can be overridden by the deprecated `RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER` environment variable."
        description=(
            "Multiplier applied to the backoff time after each retry. Defaults to 2. "
            "This can be overridden by the deprecated `RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER` environment variable."
        ),

)

max_backoff_s: float = Field(
default=RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S,
description=(
"Maximum backoff time (in seconds) between retries. "
"Defaults to RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S "
"environment variable, or 0.5 if not set."
),
Comment on lines 271 to 273
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

To make it clearer that the environment variable is being deprecated, consider rephrasing the description. For example:

"Maximum backoff time (in seconds) between retries. Defaults to 0.5. This can be overridden by the deprecated `RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S` environment variable."
        description=(
            "Maximum backoff time (in seconds) between retries. Defaults to 0.5. "
            "This can be overridden by the deprecated `RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S` environment variable."
        ),

)

@validator("request_router_kwargs", always=True)
def request_router_kwargs_json_serializable(cls, v):
if isinstance(v, bytes):
Expand Down
46 changes: 45 additions & 1 deletion python/ray/serve/tests/unit/test_pow_2_request_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,12 @@
ReplicaID,
RequestMetadata,
)
from ray.serve._private.constants import RAY_SERVE_QUEUE_LENGTH_CACHE_TIMEOUT_S
from ray.serve._private.constants import (
RAY_SERVE_QUEUE_LENGTH_CACHE_TIMEOUT_S,
RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER,
RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S,
RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S,
)
from ray.serve._private.replica_result import ReplicaResult
from ray.serve._private.request_router import (
PendingRequest,
Expand Down Expand Up @@ -2034,5 +2039,44 @@ async def test_rank_replicas_via_multiplex(
]


def test_request_router_backoff_params_default():
"""Test that backoff params use env var defaults when not specified."""
router = PowerOfTwoChoicesRequestRouter(
deployment_id=DeploymentID(name="TEST_DEPLOYMENT"),
handle_source=DeploymentHandleSource.REPLICA,
self_node_id=ROUTER_NODE_ID,
self_actor_id="fake-actor-id",
self_actor_handle=None,
get_curr_time_s=TIMER.time,
)

assert router.initial_backoff_s == RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S
assert router.backoff_multiplier == RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER
assert router.max_backoff_s == RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S


def test_request_router_backoff_params_custom():
"""Test that custom backoff params are properly set on the RequestRouter."""
custom_initial_backoff = 0.1
custom_multiplier = 5
custom_max_backoff = 3.0

router = PowerOfTwoChoicesRequestRouter(
deployment_id=DeploymentID(name="TEST_DEPLOYMENT"),
handle_source=DeploymentHandleSource.REPLICA,
self_node_id=ROUTER_NODE_ID,
self_actor_id="fake-actor-id",
self_actor_handle=None,
get_curr_time_s=TIMER.time,
initial_backoff_s=custom_initial_backoff,
backoff_multiplier=custom_multiplier,
max_backoff_s=custom_max_backoff,
)

assert router.initial_backoff_s == custom_initial_backoff
assert router.backoff_multiplier == custom_multiplier
assert router.max_backoff_s == custom_max_backoff


if __name__ == "__main__":
sys.exit(pytest.main(["-v", "-s", __file__]))
113 changes: 113 additions & 0 deletions python/ray/serve/tests/unit/test_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -1343,5 +1343,118 @@ def release_lock():
assign_request_future.exception()


@pytest.mark.asyncio
class TestAsyncioRouterBackoffConfig:
"""Test that backoff config flows from DeploymentConfig to RequestRouter."""

async def test_update_deployment_config_sets_backoff_params(self):
"""Test that update_deployment_config extracts backoff params from config."""
from ray.serve.config import RequestRouterConfig

fake_request_router = FakeRequestRouter(use_queue_len_cache=False)
router = AsyncioRouter(
controller_handle=Mock(),
deployment_id=DeploymentID(name="test-deployment"),
handle_id="test-handle-id",
self_actor_id="test-node-id",
handle_source=DeploymentHandleSource.UNKNOWN,
event_loop=get_or_create_event_loop(),
enable_strict_max_ongoing_requests=False,
request_router=fake_request_router,
node_id="test-node-id",
availability_zone="test-az",
prefer_local_node_routing=False,
_request_router_initialized_event=asyncio.Event(),
)

# Create a DeploymentConfig with custom backoff params
custom_initial_backoff = 0.15
custom_multiplier = 4
custom_max_backoff = 2.5

deployment_config = DeploymentConfig.from_default(
request_router_config=RequestRouterConfig(
initial_backoff_s=custom_initial_backoff,
backoff_multiplier=custom_multiplier,
max_backoff_s=custom_max_backoff,
)
)

# Update the router with the config
router.update_deployment_config(deployment_config)

# Verify the backoff params were stored on the router
assert router._initial_backoff_s == custom_initial_backoff
assert router._backoff_multiplier == custom_multiplier
assert router._max_backoff_s == custom_max_backoff

async def test_deprecated_env_vars_emit_warning(self):
"""Test that deprecated backoff env vars emit deprecation warnings."""
import os
import warnings

from ray.serve.config import RequestRouterConfig

fake_request_router = FakeRequestRouter(use_queue_len_cache=False)
router = AsyncioRouter(
controller_handle=Mock(),
deployment_id=DeploymentID(name="test-deployment"),
handle_id="test-handle-id",
self_actor_id="test-node-id",
handle_source=DeploymentHandleSource.UNKNOWN,
event_loop=get_or_create_event_loop(),
enable_strict_max_ongoing_requests=False,
request_router=fake_request_router,
node_id="test-node-id",
availability_zone="test-az",
prefer_local_node_routing=False,
_request_router_initialized_event=asyncio.Event(),
)

deployment_config = DeploymentConfig.from_default(
request_router_config=RequestRouterConfig()
)

# Set deprecated env vars
env_vars_to_test = [
"RAY_SERVE_ROUTER_RETRY_INITIAL_BACKOFF_S",
"RAY_SERVE_ROUTER_RETRY_BACKOFF_MULTIPLIER",
"RAY_SERVE_ROUTER_RETRY_MAX_BACKOFF_S",
]

original_values = {}
for env_var in env_vars_to_test:
original_values[env_var] = os.environ.get(env_var)
os.environ[env_var] = "1"

try:
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
router.update_deployment_config(deployment_config)

# Check that deprecation warnings were emitted for each env var
deprecation_warnings = [
warning
for warning in w
if issubclass(warning.category, DeprecationWarning)
]
assert len(deprecation_warnings) == 3

warning_messages = [
str(warning.message) for warning in deprecation_warnings
]
for env_var in env_vars_to_test:
assert any(
env_var in msg for msg in warning_messages
), f"Expected deprecation warning for {env_var}"
finally:
# Restore original env var values
for env_var in env_vars_to_test:
if original_values[env_var] is None:
os.environ.pop(env_var, None)
else:
os.environ[env_var] = original_values[env_var]


if __name__ == "__main__":
sys.exit(pytest.main(["-v", "-s", __file__]))
9 changes: 9 additions & 0 deletions src/ray/protobuf/serve.proto
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,15 @@ message RequestRouterConfig {

// kwargs which Ray Serve passes to the router class' initialize_state method.
bytes request_router_kwargs = 5;

// Initial backoff time (in seconds) before retrying to route a request.
double initial_backoff_s = 6;

// Multiplier applied to the backoff time after each retry.
double backoff_multiplier = 7;

// Maximum backoff time (in seconds) between retries.
double max_backoff_s = 8;
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PR lacks description explaining changes

Low Severity

⚠️ This PR needs a clearer title and/or description.

To help reviewers, please ensure your PR includes:

  • Title: A concise summary of the change
  • Description:
    • What problem does this solve?
    • How does this PR solve it?
    • Any relevant context for reviewers such as:
      • Why is the problem important to solve?
      • Why was this approach chosen over others?

See this list of PRs as examples for PRs that have gone above and beyond:

Fix in Cursor Fix in Web

}
//[End] ROUTING CONFIG

Expand Down