Skip to content

Commit 6c35629

Browse files
authored
[autoscaler v2] Interface between autoscaler and gcs (ray-project#34680)
Why are these changes needed? This PR introduces the interface between GCS and Autoscaler. Specifically it introduces 2 APIs GetClusterResourceState: Autoscaler will query this interface to get cluster resource usage, which includes nodes (state and resource utilization), as well as pending requests, which include ResourceRequest, GangResourceRequest, as well as ClusterResourceConstraint. For NodeState, it includes NodeStatus, which can transition from ALIVE -> DEAD, or ALIVE -> DRAIN_PENDING -> DRAINING -> DRAINED -> DEAD, or ALIVE -> DRAIN_PENDING -> DRAIN_FAILED. It also includes the instance_id that the autoscaler is aware of; this allows the autoscaler to do reconciliation if available. For ResourceRequest, it comes with a PlacementConstraint which only supports AntiAffinityConstraint today, with the semantics that the resource request can't be allocated on a node with the same label/value specified in the AntiAffinityConstraint. There is also GangResourceRequest, which has gang scheduling semantics where the requests in the gang should all be fulfilled atomically. ReportAutoscalingState: Autoscaler will also report its own state back to the cluster using this API, where it includes all instances (including pending launches), as well as infeasible requests. Instance state could transition from QUEUED -> REQUESTED -> BOOTSTRAPPING -> ALIVE -> TERMINATING -> DEAD. Two special states are TO_BE_PREEMPTED and TO_BE_DRAINED, where one is forced preemption and the other is collaborative draining (can be reversed). It also reports back requests that are infeasible, associated with a specific request version.
1 parent 17df2ef commit 6c35629

File tree

2 files changed

+236
-0
lines changed

2 files changed

+236
-0
lines changed

src/ray/protobuf/BUILD

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,3 +347,18 @@ cc_proto_library(
347347
name = "usage_cc_proto",
348348
deps = [":usage_proto"],
349349
)
350+
351+
# Proto library for the experimental autoscaler <-> GCS interface,
# plus its Python (gRPC) and C++ bindings.
proto_library(
    name = "autoscaler_proto",
    srcs = ["experimental/autoscaler.proto"],
)

python_grpc_compile(
    name = "autoscaler_py_proto",
    deps = [":autoscaler_proto"],
)

cc_proto_library(
    name = "autoscaler_cc_proto",
    deps = [":autoscaler_proto"],
)
Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
// Copyright 2023 The Ray Authors.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
syntax = "proto3";
16+
option cc_enable_arenas = true;
17+
18+
package ray.rpc;
19+
20+
// ============= Cluster Resources ====================
//
// The following messages describe the cluster resources that the
// autoscaler is interested in.
24+
25+
// Represents an anti-affinity constraint. A bundle with this constraint
// can't be allocated to a node that has a label with the same name and
// value. This is used to implement placement group anti-affinity.
//
// For placement group, the label_name is "_PG" (reserved),
// and the label_value is the placement group id.
message AntiAffinityConstraint {
  // Name of the label to repel from, e.g. the reserved "_PG".
  string label_name = 1;
  // Value of the label to repel from, e.g. a placement group id.
  string label_value = 2;
  // If true, the label will be created on the node
  // where the request with this constraint is scheduled.
  bool create_label_on_schedule = 3;
}
38+
39+
// A placement constraint on a resource request.
// Only anti-affinity is supported today.
message PlacementConstraint {
  // If set, the request must avoid nodes carrying the given label.
  AntiAffinityConstraint anti_affinity = 1;
}
42+
43+
// A single resource request (a bundle) to be scheduled.
message ResourceRequest {
  // Resource requirements for the request, e.g. {"CPU": 1, "GPU": 2}.
  map<string, double> resources_bundle = 1;
  // Placement constraints for the request. Multiple constraints
  // form AND semantics.
  repeated PlacementConstraint placement_constraints = 2;
}
50+
51+
// A resource request together with how many copies of it are pending.
// Used to deduplicate identical requests in snapshots.
message ResourceRequestByCount {
  // The deduplicated resource request.
  ResourceRequest request = 1;
  // Number of outstanding copies of this request.
  int64 count = 2;
}
55+
56+
// All bundles in the same gang resource request require gang
// allocation semantics: they should be allocated all or nothing.
message GangResourceRequest {
  // The bundles that must be allocated atomically (all or nothing).
  // (Original comment described this as a map from bundles to counts,
  // which did not match the field type.)
  repeated ResourceRequest requests = 1;
}
62+
63+
// Cluster resource constraint represents a minimal cluster size requirement;
// this is issued through ray.autoscaler.sdk.request_resources.
message ClusterResourceConstraint {
  // If not empty, the cluster should have the capacity (total resource) to
  // fit the min_resources.
  map<string, double> min_resources = 1;
  // If not empty, the cluster should have the capacity (total resource) to
  // fit the min_bundles.
  repeated ResourceRequest min_bundles = 2;
  // Id of the job that issued this constraint.
  string job_id = 3;
}
75+
76+
// State of a Ray node as seen by GCS, consumed by the autoscaler.
message NodeState {
  // Status of the node. Valid transitions (per the introducing commit):
  //   ALIVE -> DEAD
  //   ALIVE -> DRAIN_PENDING -> DRAINING -> DRAINED -> DEAD
  //   ALIVE -> DRAIN_PENDING -> DRAIN_FAILED
  // NOTE(review): proto3 convention is a <TYPE>_UNSPECIFIED zero value and
  // type-prefixed value names (e.g. NODE_STATUS_ALIVE) to avoid C++ scoping
  // collisions — consider renaming before this API is stabilized.
  enum NodeStatus {
    // Node is alive and schedulable.
    ALIVE = 0;
    // Node is dead.
    DEAD = 1;
    // Draining of the node has been requested but not yet started.
    DRAIN_PENDING = 2;
    // The requested drain could not be performed.
    DRAIN_FAILED = 3;
    // Node is being drained.
    DRAINING = 4;
    // Node is already drained, and ready to be removed.
    DRAINED = 5;
  }
  // The node id internal to Ray.
  string node_id = 11;

  // The instance id that the node is running on.
  // This is passed in when the node is registered.
  string instance_id = 12;

  // The available resources on the node.
  // Reserved resource names: CPU, GPU, MEMORY, OBJECT_STORE_MEMORY
  map<string, double> available_resources = 13;

  // The corresponding total resources on the node.
  map<string, double> total_resources = 14;

  // Dynamic labels associated with the node.
  // Reserved dynamic label names: _PG
  map<string, string> dynamic_labels = 15;

  // A monotonically increasing version of the node resource state.
  int64 node_state_version = 16;

  // The status of the node.
  NodeStatus status = 17;
}
115+
116+
// ============= Autoscaling State Service API =======================
//
// The autoscaler periodically calls the two snapshot APIs,
// GetClusterResourceState and ReportAutoscalingState.
// GetClusterResourceState returns a snapshot of the Ray state that the
// autoscaler is interested in, along with the
// cluster_resource_state_version (version).
//
// Separately, the autoscaler constantly makes decisions based on the
// latest Ray state, and also changes its own state based on information
// from the node provider. The autoscaler periodically reports its state
// to GCS through the ReportAutoscalingState API.
131+
// Request for AutoscalerStateService.GetClusterResourceState.
message GetClusterResourceStateRequest {
  // The last seen cluster resource state version. The default value is
  // reserved for the case where a previous scheduling state has never
  // been seen.
  int64 last_seen_cluster_resource_state_version = 1;
}
136+
137+
// Snapshot of the cluster resource state the autoscaler schedules against.
message GetClusterResourceStateReply {
  // A monotonically increasing version of the cluster resources.
  int64 cluster_resource_state_version = 1;
  // The last autoscaler state version GCS has seen (see
  // ReportAutoscalingStateRequest.autoscaler_state_version).
  int64 last_seen_autoscaler_state_version = 2;
  // Current cluster resources, one entry per node.
  repeated NodeState node_states = 3;
  // Resource requests pending scheduling, deduplicated by count.
  repeated ResourceRequestByCount pending_resource_requests = 4;
  // Gang (all-or-nothing) resource requests pending scheduling.
  repeated GangResourceRequest pending_gang_resource_requests = 5;
  // Cluster resource constraints.
  // There could be multiple constraints issued by different
  // jobs. The autoscaler must make sure all constraints are satisfied.
  repeated ClusterResourceConstraint cluster_resource_constraints = 6;
}
153+
154+
// A cloud instance managed by the autoscaler's instance manager.
message Instance {
  // Lifecycle status of the instance.
  // NOTE(review): proto3 convention is to type-prefix values
  // (e.g. INSTANCE_STATUS_RUNNING) to avoid C++ scoping collisions.
  enum InstanceStatus {
    // The unspecified state - most likely it is queued.
    INSTANCE_STATUS_UNSPECIFIED = 0;
    // Instance is starting. The first state update received from the
    // instance.
    STARTING = 1;
    // The instance is running - one of two states of a healthy instance.
    RUNNING = 2;
    // The instance is idle - one of two states of a healthy instance.
    IDLE = 3;
    // The instance is stopping - usually follows from the RUNNING, IDLE,
    // PREEMPT_REQUEST or DRAIN_REQUEST state.
    STOPPING = 4;
    // The instance is stopped - follows from the STOPPING state.
    STOPPED = 5;
    // The instance is in a bad state - but it is still able to send updates.
    FAILING = 6;
    // The subscribe service moves instances to this state if they
    // have been idle for too long. This allows the cluster manager to
    // make a final decision on whether or not to commence a drain
    // sequence for this instance.
    DRAIN_CONFIRMATION_PENDING = 7;
    // The instance should be drained; Ray should start the draining
    // process but could reject it if draining fails.
    DRAIN_REQUEST = 8;
    // The instance will be preempted by the instance manager, regardless
    // of whether it is drainable or not.
    PREEMPT_REQUEST = 9;
  }
  // A unique id for the instance that's generated by the
  // instance manager. This may be optional if
  // the instance hasn't been started yet.
  string instance_id = 11;
  // The status of the instance.
  InstanceStatus status = 12;
  // The node type of the instance (original comment said "node id",
  // which contradicts the field name — presumably this is the
  // autoscaler node type name; TODO confirm).
  string node_type = 13;
  // The corresponding total resources on the node.
  map<string, double> total_resources = 14;
  // Timestamp of the last state change.
  // NOTE(review): units are not specified here — confirm whether this is
  // seconds or milliseconds, and consider a unit suffix in the name.
  int64 timestamp_since_last_state_change = 15;
}
197+
198+
// Request for AutoscalerStateService.ReportAutoscalingState: the
// autoscaler's view of the cluster, reported back to GCS.
message ReportAutoscalingStateRequest {
  // The cluster resource state version this autoscaler state was
  // computed against.
  int64 last_seen_cluster_resource_state_version = 1;
  // A monotonically increasing version that identifies
  // the state of the autoscaler.
  // Note: for the same cluster resource state, the
  // autoscaler state might be different, since
  // the autoscaler's state could also be updated by
  // the node provider.
  int64 autoscaler_state_version = 2;
  // All instances the autoscaler manages, including pending launches.
  repeated Instance instances = 3;
  // Infeasible plain resource requests.
  repeated ResourceRequest infeasible_resource_requests = 4;
  // Infeasible gang resource requests.
  // Fix: this field was declared as `repeated ClusterResourceConstraint
  // infeasible_gange_resource_requests` — wrong element type (gang
  // requests are GangResourceRequest; constraints are already field 6)
  // and a misspelled name. Safe to correct before the API is published;
  // the field number is unchanged.
  repeated GangResourceRequest infeasible_gang_resource_requests = 5;
  // Infeasible cluster resource constraints.
  repeated ClusterResourceConstraint infeasible_cluster_resource_constraints = 6;
}
213+
214+
// Reply for ReportAutoscalingState. Empty for now; kept as a dedicated
// message so fields can be added later without breaking the RPC signature.
message ReportAutoscalingStateReply {}
215+
216+
// Service exposed by GCS for the autoscaler to exchange state snapshots.
service AutoscalerStateService {
  // Returns a snapshot of the cluster resource state (nodes, pending
  // requests, constraints) together with its version.
  rpc GetClusterResourceState(GetClusterResourceStateRequest)
      returns (GetClusterResourceStateReply);
  // Reports the autoscaler's own state (instances, infeasible requests)
  // back to GCS.
  rpc ReportAutoscalingState(ReportAutoscalingStateRequest)
      returns (ReportAutoscalingStateReply);
}

0 commit comments

Comments
 (0)