Skip to content

Commit 2ce1e71

Browse files
committed
Add ray experiments
remove creds
1 parent 49662f3 commit 2ce1e71

File tree

2 files changed

+952
-0
lines changed

2 files changed

+952
-0
lines changed
+155
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
---
# AppWrapper "finetuneflan" (namespace: default).
# Wraps two generic items:
#   1. a RayCluster with one head pod and a two-replica GPU worker group;
#   2. an OpenShift Route pointing at the Service "finetuneflan-head-svc"
#      on its port named "dashboard".
apiVersion: mcad.ibm.com/v1beta1
kind: AppWrapper
metadata:
  name: finetuneflan
  namespace: default
spec:
  priority: 9
  resources:
    GenericItems:
      # --- Item 1: the RayCluster -------------------------------------------
      - custompodresources:
          # Same values as the ray-head container's resources below
          # (1 replica, no GPU).
          - limits:
              cpu: 2
              memory: 8G
              nvidia.com/gpu: 0
            replicas: 1
            requests:
              cpu: 2
              memory: 8G
              nvidia.com/gpu: 0
          # Same values as the worker ("machine-learning") container's
          # resources below (2 replicas, 1 GPU each).
          - limits:
              cpu: 2
              memory: 8G
              nvidia.com/gpu: 1
            replicas: 2
            requests:
              cpu: 1
              memory: 2G
              nvidia.com/gpu: 1
        generictemplate:
          apiVersion: ray.io/v1alpha1
          kind: RayCluster
          metadata:
            labels:
              appwrapper.mcad.ibm.com: finetuneflan
              controller-tools.k8s.io: '1.0'
            name: finetuneflan
            namespace: default
          spec:
            # Autoscaler options are set, but in-tree autoscaling is disabled
            # just below (enableInTreeAutoscaling: false).
            autoscalerOptions:
              idleTimeoutSeconds: 60
              imagePullPolicy: Always
              resources:
                limits:
                  cpu: 500m
                  memory: 512Mi
                requests:
                  cpu: 500m
                  memory: 512Mi
              upscalingMode: Default
            enableInTreeAutoscaling: false
            headGroupSpec:
              rayStartParams:
                block: 'true'
                dashboard-host: '0.0.0.0'
                num-gpus: '0'
              serviceType: ClusterIP
              template:
                spec:
                  containers:
                    - env:
                        # Pod IP exposed to the container via the downward API.
                        - name: MY_POD_IP
                          valueFrom:
                            fieldRef:
                              fieldPath: status.podIP
                      image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103
                      imagePullPolicy: Always
                      lifecycle:
                        # Run `ray stop` before the container is terminated.
                        preStop:
                          exec:
                            command:
                              - /bin/sh
                              - -c
                              - ray stop
                      name: ray-head
                      ports:
                        - containerPort: 6379
                          name: gcs
                        # "dashboard" is the port name the Route below targets.
                        - containerPort: 8265
                          name: dashboard
                        - containerPort: 10001
                          name: client
                      resources:
                        limits:
                          cpu: 2
                          memory: 8G
                          nvidia.com/gpu: 0
                        requests:
                          cpu: 2
                          memory: 8G
                          nvidia.com/gpu: 0
            # NOTE(review): the image tags in this file say "ray2.1.0" while
            # rayVersion is 1.12.0 -- confirm which one is intended.
            rayVersion: '1.12.0'
            workerGroupSpecs:
              # min == max == replicas, so the group size is pinned at 2.
              - groupName: small-group-finetuneflan
                maxReplicas: 2
                minReplicas: 2
                rayStartParams:
                  block: 'true'
                  num-gpus: '1'
                replicas: 2
                template:
                  metadata:
                    # NOTE(review): "key: value" looks like leftover template
                    # placeholder data -- confirm whether it is needed.
                    annotations:
                      key: value
                    labels:
                      key: value
                  spec:
                    containers:
                      - env:
                          - name: MY_POD_IP
                            valueFrom:
                              fieldRef:
                                fieldPath: status.podIP
                        image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103
                        lifecycle:
                          preStop:
                            exec:
                              command:
                                - /bin/sh
                                - -c
                                - ray stop
                        name: machine-learning
                        resources:
                          limits:
                            cpu: 2
                            memory: 8G
                            nvidia.com/gpu: 1
                          requests:
                            cpu: 1
                            memory: 2G
                            nvidia.com/gpu: 1
                    initContainers:
                      # Blocks worker start-up until the DNS name built from
                      # $RAY_IP and the pod's namespace resolves.
                      # NOTE(review): $RAY_IP is not set in this manifest;
                      # presumably injected by the operator -- confirm.
                      - command:
                          - sh
                          - -c
                          # Folded scalar: the two lines join with one space.
                          - >-
                            until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local;
                            do echo waiting for myservice; sleep 2; done
                        image: busybox:1.28
                        name: init-myservice
        replicas: 1
      # --- Item 2: Route to the Ray dashboard -------------------------------
      - generictemplate:
          apiVersion: route.openshift.io/v1
          kind: Route
          metadata:
            labels:
              odh-ray-cluster-service: finetuneflan-head-svc
            name: ray-dashboard-finetuneflan
            namespace: default
          spec:
            port:
              targetPort: dashboard
            to:
              # NOTE(review): this Service is not defined in this file;
              # presumably created for the RayCluster head -- confirm the name.
              kind: Service
              name: finetuneflan-head-svc
        # Generic-item replica count; same `replicas` key as item 1.
        replicas: 1
    Items: []

0 commit comments

Comments
 (0)