
Commit db198e2

Add RayService interpreter
Signed-off-by: seanlaii <[email protected]>
1 parent 2dac564 commit db198e2

File tree

12 files changed: +3215 −151 lines


pkg/resourceinterpreter/default/thirdparty/resourcecustomizations/ray.io/v1/RayCluster/customizations.yaml

Lines changed: 2 additions & 0 deletions
@@ -85,6 +85,8 @@ spec:
         desiredObj.status = {}
       end
 
+      desiredObj.spec = nil
+
       -- If only one item, use it directly
       if #statusItems == 1 then
         desiredObj.status = statusItems[1].status
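
For orientation, here is a minimal Lua sketch of an aggregateStatus script shaped like the snippet above, assuming Karmada's usual luaScript entry point AggregateStatus(desiredObj, statusItems). Only the desiredObj.spec = nil line and the single-item shortcut come from this diff; the summing loop is an illustrative assumption, not the file's actual implementation.

-- Minimal sketch only; the real customizations.yaml may differ.
function AggregateStatus(desiredObj, statusItems)
  if desiredObj.status == nil then
    desiredObj.status = {}
  end

  -- The aggregated object should only carry status, so drop spec (the change above).
  desiredObj.spec = nil

  -- If only one item, use it directly.
  if #statusItems == 1 then
    desiredObj.status = statusItems[1].status
    return desiredObj
  end

  -- Illustrative: sum a numeric counter across member clusters.
  local ready = 0
  for i = 1, #statusItems do
    local s = statusItems[i].status
    if s ~= nil and s.readyWorkerReplicas ~= nil then
      ready = ready + s.readyWorkerReplicas
    end
  end
  desiredObj.status.readyWorkerReplicas = ready
  return desiredObj
end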

pkg/resourceinterpreter/default/thirdparty/resourcecustomizations/ray.io/v1/RayCluster/testdata/aggregatestatus-test.yaml

Lines changed: 346 additions & 10 deletions
@@ -1,13 +1,16 @@
 # test case for aggregating status of RayCluster
-# case1. RayCluster with two status items
+# case1. RayCluster with two status items from different clusters
+# case2. RayCluster with single status item
+# case3. RayCluster with unhealthy cluster (replica failure)
 
 name: "RayCluster with two status items"
-description: "Test aggregating status of RayCluster with two status items"
+description: "Test aggregating status of RayCluster with two status items from different clusters"
 desiredObj:
   apiVersion: ray.io/v1
   kind: RayCluster
   metadata:
     name: sample
+    namespace: default
   spec:
     rayVersion: '2.46.0'
     headGroupSpec:
@@ -40,14 +43,15 @@ desiredObj:
       template:
         spec:
           containers:
-          image: rayproject/ray:2.46.0
-          resources:
-            limits:
-              cpu: 1
-              memory: 1G
-            requests:
-              cpu: 1
-              memory: 1G
+          - name: ray-worker
+            image: rayproject/ray:2.46.0
+            resources:
+              limits:
+                cpu: 1
+                memory: 1G
+              requests:
+                cpu: 1
+                memory: 1G
 statusItems:
 - applied: true
   clusterName: member1
@@ -137,3 +141,335 @@ statusItems:
       ready: "2025-09-21T03:56:50Z"
 operation: AggregateStatus
 output:
+  aggregatedStatus:
+    apiVersion: ray.io/v1
+    kind: RayCluster
+    metadata:
+      name: sample
+      namespace: default
+    status:
+      conditions:
+      - lastTransitionTime: "2025-09-21T03:55:30Z"
+        message: ""
+        reason: HeadPodRunningAndReady
+        status: "True"
+        type: HeadPodReady
+      - lastTransitionTime: "2025-09-21T03:55:45Z"
+        message: All Ray Pods are ready for the first time
+        reason: AllPodRunningAndReadyFirstTime
+        status: "True"
+        type: RayClusterProvisioned
+      - lastTransitionTime: "2025-09-21T03:54:44Z"
+        message: ""
+        reason: RayClusterSuspended
+        status: "False"
+        type: RayClusterSuspended
+      - lastTransitionTime: "2025-09-21T03:54:44Z"
+        message: ""
+        reason: RayClusterSuspending
+        status: "False"
+        type: RayClusterSuspending
+      - lastTransitionTime: "2025-09-21T03:56:30Z"
+        message: ""
+        reason: HeadPodRunningAndReady
+        status: "True"
+        type: HeadPodReady
+      - lastTransitionTime: "2025-09-21T03:56:50Z"
+        message: All Ray Pods are ready for the first time
+        reason: AllPodRunningAndReadyFirstTime
+        status: "True"
+        type: RayClusterProvisioned
+      - lastTransitionTime: "2025-09-21T03:54:50Z"
+        message: ""
+        reason: RayClusterSuspended
+        status: "False"
+        type: RayClusterSuspended
+      - lastTransitionTime: "2025-09-21T03:54:50Z"
+        message: ""
+        reason: RayClusterSuspending
+        status: "False"
+        type: RayClusterSuspending
+      readyWorkerReplicas: 4
+      availableWorkerReplicas: 3
+      maxWorkerReplicas: 15
+      minWorkerReplicas: 3
+      desiredWorkerReplicas: 4
+      desiredCPU: "6"
+      desiredGPU: "1"
+      desiredMemory: 8G
+      desiredTPU: "0"
+      lastUpdateTime: "2025-09-21T03:56:50Z"
+      endpoints:
+        client: "10001"
+        dashboard: "8265"
+        gcs-server: "6379"
+        metrics: "8080"
+      head:
+        podIP: 10.244.0.6
+        podName: sample-head-9cvfc
+        serviceIP: 10.244.0.6
+        serviceName: sample-head-svc
+      state: ready
+      stateTransitionTimes:
+        ready: "2025-09-21T03:55:45Z"
+
+---
+name: "RayCluster with single status item"
+description: "Test aggregating status of RayCluster with single status item"
+desiredObj:
+  apiVersion: ray.io/v1
+  kind: RayCluster
+  metadata:
+    name: single-cluster
+    namespace: default
+  spec:
+    rayVersion: '2.46.0'
+    headGroupSpec:
+      template:
+        spec:
+          containers:
+          - name: ray-head
+            image: rayproject/ray:2.46.0
+            resources:
+              requests:
+                cpu: 500m
+                memory: 1G
+    workerGroupSpecs:
+    - replicas: 2
+      groupName: small-group
+      template:
+        spec:
+          containers:
+          - name: ray-worker
+            image: rayproject/ray:2.46.0
+statusItems:
+- applied: true
+  clusterName: member1
+  status:
+    availableWorkerReplicas: 2
+    conditions:
+    - lastTransitionTime: "2025-09-22T10:00:00Z"
+      message: ""
+      reason: HeadPodRunningAndReady
+      status: "True"
+      type: HeadPodReady
+    - lastTransitionTime: "2025-09-22T10:01:00Z"
+      message: All Ray Pods are ready for the first time
+      reason: AllPodRunningAndReadyFirstTime
+      status: "True"
+      type: RayClusterProvisioned
+    - lastTransitionTime: "2025-09-22T10:01:00Z"
+      message: ""
+      reason: NoReplicaFailure
+      status: "False"
+      type: RayClusterReplicaFailure
+    desiredCPU: "2.5"
+    desiredMemory: 3G
+    desiredWorkerReplicas: 2
+    endpoints:
+      client: "10001"
+      dashboard: "8265"
+      gcs-server: "6379"
+    head:
+      podIP: 10.244.1.5
+      podName: single-cluster-head-abc12
+      serviceIP: 10.96.1.100
+      serviceName: single-cluster-head-svc
+    lastUpdateTime: "2025-09-22T10:01:00Z"
+    maxWorkerReplicas: 2
+    minWorkerReplicas: 2
+    readyWorkerReplicas: 2
+    state: ready
+operation: AggregateStatus
+output:
+  aggregatedStatus:
+    apiVersion: ray.io/v1
+    kind: RayCluster
+    metadata:
+      name: single-cluster
+      namespace: default
+    status:
+      availableWorkerReplicas: 2
+      conditions:
+      - lastTransitionTime: "2025-09-22T10:00:00Z"
+        message: ""
+        reason: HeadPodRunningAndReady
+        status: "True"
+        type: HeadPodReady
+      - lastTransitionTime: "2025-09-22T10:01:00Z"
+        message: All Ray Pods are ready for the first time
+        reason: AllPodRunningAndReadyFirstTime
+        status: "True"
+        type: RayClusterProvisioned
+      - lastTransitionTime: "2025-09-22T10:01:00Z"
+        message: ""
+        reason: NoReplicaFailure
+        status: "False"
+        type: RayClusterReplicaFailure
+      desiredCPU: "2.5"
+      desiredMemory: 3G
+      desiredWorkerReplicas: 2
+      endpoints:
+        client: "10001"
+        dashboard: "8265"
+        gcs-server: "6379"
+      head:
+        podIP: 10.244.1.5
+        podName: single-cluster-head-abc12
+        serviceIP: 10.96.1.100
+        serviceName: single-cluster-head-svc
+      lastUpdateTime: "2025-09-22T10:01:00Z"
+      maxWorkerReplicas: 2
+      minWorkerReplicas: 2
+      readyWorkerReplicas: 2
+      state: ready
+
+---
+name: "RayCluster with unhealthy cluster"
+description: "Test aggregating status when one cluster has replica failure"
+desiredObj:
+  apiVersion: ray.io/v1
+  kind: RayCluster
+  metadata:
+    name: unhealthy-cluster
+    namespace: production
+  spec:
+    rayVersion: '2.46.0'
+    headGroupSpec:
+      template:
+        spec:
+          containers:
+          - name: ray-head
+            image: rayproject/ray:2.46.0
+    workerGroupSpecs:
+    - replicas: 5
+      groupName: large-group
+      template:
+        spec:
+          containers:
+          - name: ray-worker
+            image: rayproject/ray:2.46.0
+            resources:
+              requests:
+                cpu: 2
+                memory: 4G
+statusItems:
+- applied: true
+  clusterName: member1
+  status:
+    availableWorkerReplicas: 5
+    conditions:
+    - lastTransitionTime: "2025-09-23T12:00:00Z"
+      message: ""
+      reason: HeadPodRunningAndReady
+      status: "True"
+      type: HeadPodReady
+    - lastTransitionTime: "2025-09-23T12:05:00Z"
+      message: All Ray Pods are ready
+      reason: AllPodRunningAndReady
+      status: "True"
+      type: RayClusterProvisioned
+    desiredCPU: "11"
+    desiredGPU: "0"
+    desiredMemory: 21G
+    desiredTPU: "0"
+    desiredWorkerReplicas: 5
+    endpoints:
+      client: "10001"
+      dashboard: "8265"
+      gcs-server: "6379"
+    head:
+      podIP: 10.244.2.10
+      podName: unhealthy-cluster-head-xyz
+      serviceIP: 10.96.2.200
+      serviceName: unhealthy-cluster-head-svc
+    lastUpdateTime: "2025-09-23T12:05:00Z"
+    maxWorkerReplicas: 5
+    minWorkerReplicas: 0
+    readyWorkerReplicas: 5
+    state: ready
+- applied: true
+  clusterName: member2
+  status:
+    availableWorkerReplicas: 2
+    conditions:
+    - lastTransitionTime: "2025-09-23T12:00:30Z"
+      message: ""
+      reason: HeadPodRunningAndReady
+      status: "True"
+      type: HeadPodReady
+    - lastTransitionTime: "2025-09-23T12:03:00Z"
+      message: Some Ray Pods are not ready
+      reason: SomePodNotReady
+      status: "False"
+      type: RayClusterProvisioned
+    - lastTransitionTime: "2025-09-23T12:03:00Z"
+      message: "3 replicas failed to start"
+      reason: WorkerReplicasFailed
+      status: "True"
+      type: RayClusterReplicaFailure
+    desiredCPU: "11"
+    desiredGPU: "0"
+    desiredMemory: 21G
+    desiredTPU: "0"
+    desiredWorkerReplicas: 5
+    lastUpdateTime: "2025-09-23T12:10:00Z"
+    maxWorkerReplicas: 5
+    minWorkerReplicas: 0
+    readyWorkerReplicas: 2
+    state: unhealthy
+operation: AggregateStatus
+output:
+  aggregatedStatus:
+    apiVersion: ray.io/v1
+    kind: RayCluster
+    metadata:
+      name: unhealthy-cluster
+      namespace: production
+    status:
+      conditions:
+      - lastTransitionTime: "2025-09-23T12:00:00Z"
+        message: ""
+        reason: HeadPodRunningAndReady
+        status: "True"
+        type: HeadPodReady
+      - lastTransitionTime: "2025-09-23T12:05:00Z"
+        message: All Ray Pods are ready
+        reason: AllPodRunningAndReady
+        status: "True"
+        type: RayClusterProvisioned
+      - lastTransitionTime: "2025-09-23T12:00:30Z"
+        message: ""
+        reason: HeadPodRunningAndReady
+        status: "True"
+        type: HeadPodReady
+      - lastTransitionTime: "2025-09-23T12:03:00Z"
+        message: Some Ray Pods are not ready
+        reason: SomePodNotReady
+        status: "False"
+        type: RayClusterProvisioned
+      - lastTransitionTime: "2025-09-23T12:03:00Z"
+        message: "3 replicas failed to start"
+        reason: WorkerReplicasFailed
+        status: "True"
+        type: RayClusterReplicaFailure
+      readyWorkerReplicas: 7
+      availableWorkerReplicas: 7
+      desiredWorkerReplicas: 10
+      desiredCPU: "22"
+      desiredGPU: "0"
+      desiredMemory: 42G
+      desiredTPU: "0"
+      maxWorkerReplicas: 10
+      minWorkerReplicas: 0
+      lastUpdateTime: "2025-09-23T12:10:00Z"
+      endpoints:
+        client: "10001"
+        dashboard: "8265"
+        gcs-server: "6379"
+      head:
+        podIP: 10.244.2.10
+        podName: unhealthy-cluster-head-xyz
+        serviceIP: 10.96.2.200
+        serviceName: unhealthy-cluster-head-svc
+      state: ready
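
In the third case the expected aggregatedStatus is the field-wise sum of the two member statuses with their condition lists concatenated: readyWorkerReplicas 5 + 2 = 7, desiredWorkerReplicas 5 + 5 = 10, desiredCPU "11" + "11" = "22", and desiredMemory 21G + 21G = 42G, while endpoints and head are taken from the member that reports them. A tiny Lua sketch of that quantity arithmetic, illustrative only (addQuantity is a hypothetical helper, not part of the commit):

-- Hypothetical helper; real quantity arithmetic (Mi, Gi, m, ...) needs a proper parser.
local function addQuantity(a, b)
  local aNum, unit = string.match(a, "^([%d%.]+)(%a*)$")
  local bNum = string.match(b, "^([%d%.]+)")
  return tostring(tonumber(aNum) + tonumber(bNum)) .. unit
end

print(addQuantity("21G", "21G"))  -- "42G"  (desiredMemory)
print(addQuantity("11", "11"))    -- "22"   (desiredCPU)
print(5 + 2)                      -- 7      (readyWorkerReplicas)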
