Skip to content

Commit db33c04

Browse files
kouroshHakhaclaude
andauthored
[serve] Add ControllerOptions for configurable controller runtime_env (#63352)
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 31510bc commit db33c04

13 files changed

Lines changed: 451 additions & 14 deletions

File tree

ci/lint/pydoclint-baseline.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1477,8 +1477,6 @@ python/ray/serve/api.py
14771477
DOC103: Function `start`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [**kwargs: ].
14781478
DOC201: Function `get_replica_context` does not have a return section in docstring
14791479
DOC201: Function `ingress` does not have a return section in docstring
1480-
DOC101: Function `run_many`: Docstring contains fewer arguments than in function signature.
1481-
DOC103: Function `run_many`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [_local_testing_mode: bool].
14821480
DOC101: Function `run`: Docstring contains fewer arguments than in function signature.
14831481
DOC103: Function `run`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [_local_testing_mode: bool].
14841482
DOC101: Function `multiplexed`: Docstring contains fewer arguments than in function signature.

doc/source/serve/api/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ See the [model composition guide](serve-model-composition) for how to update cod
9494
:toctree: doc/
9595
:template: autosummary/autopydantic.rst
9696
97+
serve.config.ControllerOptions
9798
serve.config.gRPCOptions
9899
serve.config.HTTPOptions
99100
serve.config.AutoscalingConfig

python/ray/dashboard/modules/serve/serve_head.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ async def put_all_applications(self, req: Request) -> Response:
170170
http_options=full_http_options,
171171
grpc_options=grpc_options,
172172
global_logging_config=config.logging_config,
173+
controller_options=config.controller_options,
173174
)
174175

175176
# Serve ignores HTTP options if it was already running when

python/ray/serve/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
status,
2727
)
2828
from ray.serve.batching import batch
29-
from ray.serve.config import HTTPOptions
29+
from ray.serve.config import ControllerOptions, HTTPOptions
3030
from ray.serve.utils import get_trace_context
3131

3232
except ModuleNotFoundError as e:
@@ -48,6 +48,7 @@
4848
"_run_many",
4949
"batch",
5050
"start",
51+
"ControllerOptions",
5152
"HTTPOptions",
5253
"get_replica_context",
5354
"get_deployment_actor",

python/ray/serve/_private/api.py

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
SERVE_NAMESPACE,
1616
)
1717
from ray.serve._private.default_impl import get_controller_impl
18-
from ray.serve.config import HTTPOptions, gRPCOptions
18+
from ray.serve.config import ControllerOptions, HTTPOptions, gRPCOptions
1919
from ray.serve.context import (
2020
_check_cached_client_alive,
2121
_get_global_client,
@@ -28,6 +28,22 @@
2828
logger = logging.getLogger(SERVE_LOGGER_NAME)
2929

3030

31+
def _coerce_controller_options(
32+
controller_options: Union[None, dict, ControllerOptions],
33+
) -> ControllerOptions:
34+
"""Normalize an optional dict / model into a validated ControllerOptions."""
35+
if controller_options is None:
36+
return ControllerOptions()
37+
if isinstance(controller_options, ControllerOptions):
38+
return controller_options
39+
if isinstance(controller_options, dict):
40+
return ControllerOptions.model_validate(controller_options)
41+
raise TypeError(
42+
"controller_options must be a dict, ControllerOptions, or None; got "
43+
f"{type(controller_options).__name__}."
44+
)
45+
46+
3147
def _check_http_options(
3248
client: ServeControllerClient, http_options: Union[dict, HTTPOptions]
3349
) -> None:
@@ -56,6 +72,7 @@ def _start_controller(
5672
http_options: Union[None, dict, HTTPOptions] = None,
5773
grpc_options: Union[None, dict, gRPCOptions] = None,
5874
global_logging_config: Union[None, dict, LoggingConfig] = None,
75+
controller_options: Union[None, dict, ControllerOptions] = None,
5976
**kwargs,
6077
) -> ActorHandle:
6178
"""Start Ray Serve controller.
@@ -95,7 +112,8 @@ def _start_controller(
95112
elif isinstance(global_logging_config, dict):
96113
global_logging_config = LoggingConfig(**global_logging_config)
97114

98-
controller_impl = get_controller_impl()
115+
controller_options = _coerce_controller_options(controller_options)
116+
controller_impl = get_controller_impl(controller_options=controller_options)
99117
controller = controller_impl.remote(
100118
http_options=http_options,
101119
grpc_options=grpc_options,
@@ -120,6 +138,7 @@ async def serve_start_async(
120138
http_options: Union[None, dict, HTTPOptions] = None,
121139
grpc_options: Union[None, dict, gRPCOptions] = None,
122140
global_logging_config: Union[None, dict, LoggingConfig] = None,
141+
controller_options: Union[None, dict, ControllerOptions] = None,
123142
**kwargs,
124143
) -> ServeControllerClient:
125144
"""Initialize a serve instance asynchronously.
@@ -134,6 +153,10 @@ async def serve_start_async(
134153

135154
usage_lib.record_library_usage("serve")
136155

156+
# Validate eagerly in the caller so a bad ``controller_options`` raises
157+
# locally rather than from the ``_start_controller`` Ray task.
158+
controller_options = _coerce_controller_options(controller_options)
159+
137160
client, _ = _check_cached_client_alive()
138161
if client is None:
139162
try:
@@ -143,7 +166,7 @@ async def serve_start_async(
143166
if client is not None:
144167
logger.info(
145168
f'Connecting to existing Serve app in namespace "{SERVE_NAMESPACE}".'
146-
" New http options will not be applied."
169+
" New http_options/controller_options will not be applied."
147170
)
148171
if http_options:
149172
_check_http_options(client, http_options)
@@ -152,7 +175,13 @@ async def serve_start_async(
152175
controller = (
153176
await ray.remote(_start_controller)
154177
.options(num_cpus=0)
155-
.remote(http_options, grpc_options, global_logging_config, **kwargs)
178+
.remote(
179+
http_options,
180+
grpc_options,
181+
global_logging_config,
182+
controller_options=controller_options,
183+
**kwargs,
184+
)
156185
)
157186

158187
client = ServeControllerClient(
@@ -167,6 +196,7 @@ def serve_start(
167196
http_options: Union[None, dict, HTTPOptions] = None,
168197
grpc_options: Union[None, dict, gRPCOptions] = None,
169198
global_logging_config: Union[None, dict, LoggingConfig] = None,
199+
controller_options: Union[None, dict, ControllerOptions] = None,
170200
**kwargs,
171201
) -> ServeControllerClient:
172202
"""Initialize a serve instance.
@@ -207,10 +237,17 @@ def serve_start(
207237
- grpc_servicer_functions(list): List of import paths for gRPC
208238
`add_servicer_to_server` functions to add to Serve's gRPC proxy.
209239
Default empty list, meaning not to start the gRPC server.
240+
controller_options: Optional ``ControllerOptions`` (or dict) for the
241+
Serve controller actor. Currently only ``runtime_env.env_vars``
242+
is honored; see ``ray.serve.config.ControllerOptions``. Only
243+
applied on first controller creation -- ignored if the controller
244+
is already running in this Ray cluster (a log line is emitted).
210245
"""
211246

212247
usage_lib.record_library_usage("serve")
213248

249+
controller_options = _coerce_controller_options(controller_options)
250+
214251
client, _ = _check_cached_client_alive()
215252
if client is None:
216253
try:
@@ -220,14 +257,18 @@ def serve_start(
220257
if client is not None:
221258
logger.info(
222259
f'Connecting to existing Serve app in namespace "{SERVE_NAMESPACE}".'
223-
" New http options will not be applied."
260+
" New http_options/controller_options will not be applied."
224261
)
225262
if http_options:
226263
_check_http_options(client, http_options)
227264
return client
228265

229266
controller = _start_controller(
230-
http_options, grpc_options, global_logging_config, **kwargs
267+
http_options,
268+
grpc_options,
269+
global_logging_config,
270+
controller_options=controller_options,
271+
**kwargs,
231272
)
232273

233274
client = ServeControllerClient(

python/ray/serve/_private/default_impl.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
inside_ray_client_context,
4343
resolve_deployment_response,
4444
)
45+
from ray.serve.config import ControllerOptions
4546
from ray.util.placement_group import PlacementGroup
4647

4748
# NOTE: Please read carefully before changing!
@@ -242,10 +243,17 @@ def get_proxy_handle(endpoint: DeploymentID, info: EndpointInfo):
242243
)
243244

244245

245-
def get_controller_impl():
246+
def get_controller_impl(controller_options: Optional[ControllerOptions] = None):
247+
"""Build the Ray actor class for the Serve controller.
248+
249+
``controller_options`` is the validated ``ControllerOptions`` model from
250+
``serve.start`` / ``serve.run`` / the YAML schema. Today only its
251+
``runtime_env`` field is consumed; future fields (num_cpus, resources,
252+
max_concurrency overrides) slot in here.
253+
"""
246254
from ray.serve._private.controller import ServeController
247255

248-
controller_impl = ray.remote(
256+
actor_options = dict(
249257
name=SERVE_CONTROLLER_NAME,
250258
namespace=SERVE_NAMESPACE,
251259
num_cpus=0,
@@ -255,9 +263,13 @@ def get_controller_impl():
255263
resources={HEAD_NODE_RESOURCE_NAME: 0.001},
256264
max_concurrency=CONTROLLER_MAX_CONCURRENCY,
257265
enable_task_events=RAY_SERVE_ENABLE_TASK_EVENTS,
258-
)(ServeController)
266+
)
267+
if controller_options is not None and controller_options.runtime_env:
268+
# The validator on ControllerOptions guarantees this is a dict
269+
# containing only the ``env_vars`` key with str->str entries.
270+
actor_options["runtime_env"] = controller_options.runtime_env
259271

260-
return controller_impl
272+
return ray.remote(**actor_options)(ServeController)
261273

262274

263275
def get_proxy_actor_class():

python/ray/serve/api.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
)
4242
from ray.serve.config import (
4343
AutoscalingConfig,
44+
ControllerOptions,
4445
DeploymentActorConfig,
4546
GangSchedulingConfig,
4647
HTTPOptions,
@@ -77,6 +78,7 @@ def start(
7778
http_options: Union[None, dict, HTTPOptions] = None,
7879
grpc_options: Union[None, dict, gRPCOptions] = None,
7980
logging_config: Union[None, dict, LoggingConfig] = None,
81+
controller_options: Union[None, dict, ControllerOptions] = None,
8082
**kwargs,
8183
):
8284
"""Start Serve on the cluster.
@@ -102,12 +104,19 @@ def start(
102104
class See `gRPCOptions` for supported options.
103105
logging_config: logging config options for the serve component (
104106
controller & proxy).
107+
controller_options: [EXPERIMENTAL] Options for the Serve controller actor.
108+
Currently scoped to a strictly-validated ``runtime_env.env_vars``
109+
(other ``runtime_env`` keys are rejected). See
110+
``ray.serve.config.ControllerOptions``. Only applied on first
111+
controller creation -- ignored if a Serve controller is already
112+
running in this Ray cluster.
105113
"""
106114
http_options = prepare_imperative_http_options(proxy_location, http_options)
107115
_private_api.serve_start(
108116
http_options=http_options,
109117
grpc_options=grpc_options,
110118
global_logging_config=logging_config,
119+
controller_options=controller_options,
111120
**kwargs,
112121
)
113122

@@ -727,6 +736,7 @@ def _run_many(
727736
wait_for_ingress_deployment_creation: bool = True,
728737
wait_for_applications_running: bool = True,
729738
_local_testing_mode: bool = False,
739+
controller_options: Union[None, dict, ControllerOptions] = None,
730740
) -> List[DeploymentHandle]:
731741
"""Run many applications and return the handles to their ingress deployments.
732742
@@ -787,6 +797,7 @@ def _run_many(
787797
client = _private_api.serve_start(
788798
http_options={"location": "EveryNode"},
789799
global_logging_config=None,
800+
controller_options=controller_options,
790801
)
791802

792803
# Record after Ray has been started.
@@ -814,6 +825,7 @@ def _run(
814825
logging_config: Optional[Union[Dict, LoggingConfig]] = None,
815826
_local_testing_mode: bool = False,
816827
external_scaler_enabled: bool = False,
828+
controller_options: Union[None, dict, ControllerOptions] = None,
817829
) -> DeploymentHandle:
818830
"""Run an application and return a handle to its ingress deployment.
819831
@@ -832,6 +844,7 @@ def _run(
832844
],
833845
wait_for_applications_running=_blocking,
834846
_local_testing_mode=_local_testing_mode,
847+
controller_options=controller_options,
835848
)[0]
836849

837850

@@ -842,6 +855,7 @@ def run_many(
842855
wait_for_ingress_deployment_creation: bool = True,
843856
wait_for_applications_running: bool = True,
844857
_local_testing_mode: bool = False,
858+
controller_options: Union[None, dict, ControllerOptions] = None,
845859
) -> List[DeploymentHandle]:
846860
"""Run many applications and return the handles to their ingress deployments.
847861
@@ -858,6 +872,13 @@ def run_many(
858872
`wait_for_ingress_deployment_creation=True`,
859873
because the ingress deployments must be created
860874
before the applications can be running.
875+
_local_testing_mode: Internal flag enabling in-process local testing
876+
mode. Not part of the public API.
877+
controller_options: [EXPERIMENTAL] Options for the Serve controller
878+
actor (e.g. ``runtime_env.env_vars`` for HAProxy / controller-side
879+
tunables). See ``ray.serve.config.ControllerOptions``. Only applied
880+
on first controller creation -- ignored if a Serve controller is
881+
already running in this Ray cluster.
861882
862883
Returns:
863884
List[DeploymentHandle]: A list of handles that can be used
@@ -868,6 +889,7 @@ def run_many(
868889
wait_for_ingress_deployment_creation=wait_for_ingress_deployment_creation,
869890
wait_for_applications_running=wait_for_applications_running,
870891
_local_testing_mode=_local_testing_mode,
892+
controller_options=controller_options,
871893
)
872894

873895
if blocking:
@@ -885,6 +907,7 @@ def run(
885907
logging_config: Optional[Union[Dict, LoggingConfig]] = None,
886908
_local_testing_mode: bool = False,
887909
external_scaler_enabled: bool = False,
910+
controller_options: Union[None, dict, ControllerOptions] = None,
888911
) -> DeploymentHandle:
889912
"""Run an application and return a handle to its ingress deployment.
890913
@@ -910,6 +933,11 @@ def run(
910933
be applied to all deployments which doesn't have logging config.
911934
external_scaler_enabled: Whether external autoscaling is enabled for
912935
this application.
936+
controller_options: [EXPERIMENTAL] Options for the Serve controller
937+
actor (e.g. ``runtime_env.env_vars`` for HAProxy / controller-side
938+
tunables). See ``ray.serve.config.ControllerOptions``. Only applied
939+
on first controller creation -- ignored if a Serve controller is
940+
already running in this Ray cluster.
913941
914942
Returns:
915943
DeploymentHandle: A handle that can be used to call the application.
@@ -921,6 +949,7 @@ def run(
921949
logging_config=logging_config,
922950
_local_testing_mode=_local_testing_mode,
923951
external_scaler_enabled=external_scaler_enabled,
952+
controller_options=controller_options,
924953
)
925954

926955
if blocking:

0 commit comments

Comments
 (0)