# A unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 2
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 10
# The initial number of worker nodes to launch in addition to the head
# node. When the cluster is first brought up (or when it is refreshed with a
# subsequent `ray up`) this number of nodes will be started.
initial_workers: 2
# Whether or not to autoscale aggressively. If this is enabled, then whenever
# the autoscaler would start more workers, it starts at least enough to bring
# the cluster up to initial_workers.
autoscaling_mode: default
# The autoscaler will scale up the cluster to this target fraction of resource
# usage. For example, if a cluster of 10 nodes is 100% busy and
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
# can be decreased to increase the aggressiveness of upscaling.
# This value must be less than 1.0 for scaling to happen.
target_utilization_fraction: 0.8
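# For concreteness, the example above works out to 10 / 0.8 = 12.5, rounded
# up to 13 nodes: the busy node count divided by the target fraction.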
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Kubernetes resources that need to be configured for the autoscaler to be
# able to manage the Ray cluster. If any of the provided resources don't
# exist, the autoscaler will attempt to create them. If this fails, you may
# not have the required permissions and will have to request them to be
# created by your cluster administrator.
provider:
  type: kubernetes
  # Exposing external IP addresses for ray pods isn't currently supported.
  use_internal_ips: true
  # Namespace to use for all resources created.
  namespace: 58jmk88t9it
# Kubernetes pod config for the head node pod.
head_node:
  apiVersion: v1
  kind: Pod
  metadata:
    # Automatically generates a name for the pod with this prefix.
    generateName: ray-head-
    # Must match the head node service selector above if a head node
    # service is required.
    labels:
      component: ray-head
  spec:
    # Change this if you altered the autoscaler_service_account above
    # or want to provide your own.
    serviceAccountName: 58jmk88t9it-writer
    # Restarting the head node automatically is not currently supported.
    # If the head node goes down, `ray up` must be run again.
    restartPolicy: Never
    # This volume allocates shared memory for Ray to use for its plasma
    # object store. If you do not provide this, Ray will fall back to
    # /tmp, which will cause slowdowns if it is not a shared memory volume.
    volumes:
      - name: dshm
        emptyDir:
          medium: Memory
    containers:
      - name: ray-node
        imagePullPolicy: Always
        # You are free (and encouraged) to use your own container image,
        # but it should have the following installed:
        #   - rsync (used for `ray rsync` commands and file mounts)
        #   - screen (used for `ray attach`)
        #   - kubectl (used by the autoscaler to manage worker pods)
        image: lchu/horovod-ray:v3
        # Do not change this command - it keeps the pod alive until it is
        # explicitly killed.
        command: ["/bin/bash", "-c", "--"]
        args: ["trap : TERM INT; sleep infinity & wait;"]
        ports:
          - containerPort: 6379   # Redis port.
          - containerPort: 6380   # Redis port.
          - containerPort: 6381   # Redis port.
          - containerPort: 12345  # Ray internal communication.
          - containerPort: 12346  # Ray internal communication.
        # This volume allocates shared memory for Ray to use for its plasma
        # object store. If you do not provide this, Ray will fall back to
        # /tmp, which will cause slowdowns if it is not a shared memory volume.
        volumeMounts:
          - mountPath: /dev/shm
            name: dshm
        resources:
          requests:
            cpu: 2
            memory: 8Gi
            ephemeral-storage: 2Gi
          limits:
            # The maximum memory that this pod is allowed to use. The
            # limit will be detected by ray and split to use 10% for
            # redis, 30% for the shared memory object store, and the
            # rest for application memory. If this limit is not set and
            # the object store size is not set manually, ray will
            # allocate a very large object store in each pod that may
            # cause problems for other pods.
            memory: 8Gi
            cpu: 2
            ephemeral-storage: 2Gi
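        # Read against the 8Gi limit above, that split works out to roughly
        # 0.8Gi for redis, 2.4Gi for the plasma object store, and about
        # 4.8Gi left for application memory.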
        env:
          # This is used in the head_start_ray_commands below so that
          # Ray can spawn the correct number of processes. Omitting this
          # may lead to degraded performance.
          - name: MY_CPU_REQUEST
            valueFrom:
              resourceFieldRef:
                resource: requests.cpu
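        # With the 2-CPU request above, MY_CPU_REQUEST should resolve to 2,
        # so the head start command further down effectively runs
        # `ray start --head --num-cpus=2 ...`.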
# Kubernetes pod config for worker node pods.
worker_nodes:
  apiVersion: v1
  kind: Pod
  metadata:
    # Automatically generates a name for the pod with this prefix.
    generateName: ray-worker-
    # Must match the worker node service selector above if a worker node
    # service is required.
    labels:
      component: ray-worker
  spec:
    serviceAccountName: default
    # Worker nodes will be managed automatically by the head node, so
    # do not change the restart policy.
    restartPolicy: Never
    # This volume allocates shared memory for Ray to use for its plasma
    # object store. If you do not provide this, Ray will fall back to
    # /tmp, which will cause slowdowns if it is not a shared memory volume.
    volumes:
      - name: dshm
        emptyDir:
          medium: Memory
    containers:
      - name: ray-node
        imagePullPolicy: Always
        # You are free (and encouraged) to use your own container image,
        # but it should have the following installed:
        #   - rsync (used for `ray rsync` commands and file mounts)
        image: lchu/horovod-ray:v3
        # Do not change this command - it keeps the pod alive until it is
        # explicitly killed.
        command: ["/bin/bash", "-c", "--"]
        args: ["trap : TERM INT; sleep infinity & wait;"]
        ports:
          - containerPort: 12345  # Ray internal communication.
          - containerPort: 12346  # Ray internal communication.
        # This volume allocates shared memory for Ray to use for its plasma
        # object store. If you do not provide this, Ray will fall back to
        # /tmp, which will cause slowdowns if it is not a shared memory volume.
        volumeMounts:
          - mountPath: /dev/shm
            name: dshm
        resources:
          requests:
            cpu: 2
            memory: 8Gi
            ephemeral-storage: 2Gi
          limits:
            # This memory limit will be detected by ray and split into
            # 30% for plasma, and 70% for workers.
            memory: 8Gi
            cpu: 2
            ephemeral-storage: 2Gi
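        # With the 8Gi limit above, that is roughly 2.4Gi for the plasma
        # object store and 5.6Gi for the Ray workers.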
        env:
          # This is used in the worker_start_ray_commands below so that
          # Ray can spawn the correct number of processes. Omitting this
          # may lead to degraded performance.
          - name: MY_CPU_REQUEST
            valueFrom:
              resourceFieldRef:
                resource: requests.cpu
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
  # "/path1/on/remote/machine": "/path1/on/local/machine",
  # "/path2/on/remote/machine": "/path2/on/local/machine",
}
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
# Note that Kubernetes can only port-forward the dashboard if webui-host is
# set to 0.0.0.0 (the flag is left commented out in the command below).
head_start_ray_commands:
  - ray stop
  - ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml # --webui-host 0.0.0.0
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
  - ray stop
  - ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
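
# ----------------------------------------------------------------------------
# Rough usage sketch (not part of the config): assuming this file is saved
# locally as ray_cluster.yaml and kubectl already points at the target
# cluster, the cluster is typically driven with the Ray autoscaler CLI:
#
#   ray up ray_cluster.yaml       # launch the head pod and start autoscaling
#   ray attach ray_cluster.yaml   # open a shell on the head node (via screen)
#   ray down ray_cluster.yaml     # tear the cluster back down
#
# If --webui-host is uncommented in head_start_ray_commands above, the
# dashboard can be reached with something like
#   kubectl -n 58jmk88t9it port-forward <ray-head-pod> 8265:8265
# where 8265 is Ray's default dashboard port in recent releases and
# <ray-head-pod> stands for the generated head pod name.
# ----------------------------------------------------------------------------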