-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathglue-cluster.yaml
166 lines (164 loc) · 6.37 KB
/
glue-cluster.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
apiVersion: cluster.ray.io/v1
kind: RayCluster
metadata:
name: glue-cluster
spec:
# The maximum number of workers nodes to launch in addition to the head node.
maxWorkers: 10
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscalingSpeed: 0.0
# If a node is idle for this many minutes, it will be removed.
idleTimeoutMinutes: 99999
# Specify the pod type for the ray head node (as configured below).
headPodType: head-node
# Specify the allowed pod types for this ray cluster and the resources they provide.
podTypes:
- name: head-node
# Minimum number of Ray workers of this Pod type.
minWorkers: 0
# Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
maxWorkers: 0
rayResources: {"GPU": 0}
podConfig:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: head-
spec:
restartPolicy: Never
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- name: ray-node
image: projectcodeflare/codeflare-glue:latest
env:
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
name: glue-s3-creds
key: AWS_ACCESS_KEY_ID
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
name: glue-s3-creds
key: AWS_SECRET_ACCESS_KEY
- name: ENDPOINT_URL
valueFrom:
secretKeyRef:
name: glue-s3-creds
key: ENDPOINT_URL
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ['trap : TERM INT; sleep infinity & wait;']
ports:
- containerPort: 6379 # Redis port
- containerPort: 10001 # Used by Ray Client
- containerPort: 8265 # Used by Ray Dashboard
- containerPort: 5001 # Used by iperf
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: "2"
memory: "32G"
ephemeral-storage: "60G"
nvidia.com/gpu: "0"
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
cpu: "2"
memory: "32G"
ephemeral-storage: "60G"
nvidia.com/gpu: "0"
- name: worker-node
# Minimum number of Ray workers of this Pod type.
minWorkers: 8
# Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
maxWorkers: 8
# User-specified custom resources for use by Ray.
# (Ray detects CPU and GPU from pod spec resource requests and limits, so no need to fill those here.)
rayResources: {"foo": 1, "bar": 0}
podConfig:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: worker-
spec:
restartPolicy: Never
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- name: ray-node
imagePullPolicy: Always
image: projectcodeflare/codeflare-glue:latest
env:
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
name: glue-s3-creds
key: AWS_ACCESS_KEY_ID
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
name: glue-s3-creds
key: AWS_SECRET_ACCESS_KEY
- name: ENDPOINT_URL
valueFrom:
secretKeyRef:
name: glue-s3-creds
key: ENDPOINT_URL
command: ["/bin/bash", "-c", "--"]
args: ["trap : TERM INT; sleep infinity & wait;"]
# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: "8"
memory: "16G"
nvidia.com/gpu: "1"
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
cpu: "8"
memory: "16G"
nvidia.com/gpu: "1"
# Commands to start Ray on the head node. You don't need to change this.
# Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward.
headStartRayCommands:
- ray stop
- ulimit -n 65536; ray start --head --no-monitor --dashboard-host 0.0.0.0
# Commands to start Ray on worker nodes. You don't need to change this.
workerStartRayCommands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379