#####################################################
# THIS FILE IS AUTOGENERATED #
# Update the template or the script, not this file! #
#####################################################
# Unique Identifier for the Head Node + Workers
cluster_name: marin-us-central2-vllm
# Maximum Workers (excluding Head Node)
max_workers: 1024
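# A typical lifecycle for this cluster with the Ray CLI (sketch; the path below
# assumes this file is checked out locally, adjust to wherever it actually lives):
#   ray up     marin-us-central2-vllm.yaml   # launch the head node + autoscaler
#   ray attach marin-us-central2-vllm.yaml   # SSH into the head node
#   ray down   marin-us-central2-vllm.yaml   # tear the cluster down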
auth:
  ssh_private_key: ~/.ssh/marin_ray_cluster.pem
  ssh_public_key: ~/.ssh/marin_ray_cluster.pub
  ssh_user: ray
# Configure GCP
provider:
  type: gcp
  region: us-central2
  availability_zone: us-central2-b
  project_id: hai-gcp-models
docker:
  image: "us-central2-docker.pkg.dev/hai-gcp-models/marin/marin_vllm:1bc975e12"
  container_name: "ray_docker"
  pull_before_run: true
  worker_run_options:
    # always restart the worker if it happens to crash
    - --restart=on-failure
    - --rm=0  # Ray likes to add --rm, which doesn't work with --restart=on-failure
    - --privileged
    - --ulimit memlock=-1:-1
    - --ulimit nofile=1048576:1048576
    - --shm-size=200gb
    - -v
    - "/tmp:/tmp"
    - -e MARIN_PREFIX=gs://marin-us-central2
    - -e BUCKET=marin-us-central2
    - -e MARIN_LOCAL_CACHE_DIR=/tmp/marin-cache
    - -e AUTOSCALER_HEARTBEAT_TIMEOUT_S=600
    - -e TPU_MIN_LOG_LEVEL=3
    - -e TPU_STDERR_LOG_LEVEL=3
    - -e TPU_LOG_DIR=disabled
    # this lets the worker run docker commands and have them run as sibling containers
    - -v "/var/run/docker.sock:/var/run/docker.sock"
    - -e RAY_AUTH_MODE=token
    - -e RAY_AUTH_TOKEN_PATH=/home/ray/.ray/auth_token
  head_run_options:
    - --privileged
    - -v "/tmp:/tmp"
    - --ulimit nofile=1048576:1048576
    - -e RAY_TPU_MAX_CONCURRENT_ACTIVE_CONNECTIONS=64
    - -e MARIN_PREFIX=gs://marin-us-central2
    - -e BUCKET=marin-us-central2
    - -e MARIN_LOCAL_CACHE_DIR=/tmp/marin-cache
    - -e AUTOSCALER_HEARTBEAT_TIMEOUT_S=600
    - -e TPU_MIN_LOG_LEVEL=3
    - -e TPU_STDERR_LOG_LEVEL=3
    - -e TPU_LOG_DIR=disabled
    - -e RAY_AUTH_MODE=token
    - -e RAY_AUTH_TOKEN_PATH=/home/ray/.ray/auth_token
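# Note: because /var/run/docker.sock is mounted above, docker commands issued
# inside the Ray container talk to the *host* daemon, so launched containers are
# siblings of ray_docker rather than nested inside it. A quick sanity check
# (hypothetical session): run `docker ps` from within the container and confirm
# ray_docker itself shows up in the listing.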
initialization_commands:
  - which docker || (curl -fsSL https://get.docker.com -o get-docker.sh; sudo sh get-docker.sh; sudo usermod -aG docker $USER; sudo systemctl restart docker -f)
  - yes | gcloud auth configure-docker us-central2-docker.pkg.dev
  # always run this because ray doesn't run with sudo
  - sudo usermod -aG docker $USER
  # We want to launch docker containers from inside docker, which means we need to loosen the permissions
  # on the docker socket. This isn't the best security practice, but it's the easiest way to get this working.
  - sudo chmod 666 /var/run/docker.sock
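# initialization_commands run on the host VM before the docker container is
# started (hence the docker install above), whereas setup_commands below run
# inside the container.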
setup_commands:
  # set the GCP project because it's not injected by default
  - gcloud config set project hai-gcp-models
  - gcloud config set compute/region us-central2
  - gcloud config set compute/zone us-central2-b
  - mkdir -p $HOME/.cache/huggingface
  - mkdir -p $HOME/.cache/openai
  - mkdir -p $HOME/.ssh && gcloud compute project-info describe --format="value(commonInstanceMetadata.items[?key==\"ssh-keys\"].value)" > $HOME/.ssh/authorized_keys && chmod 600 $HOME/.ssh/authorized_keys
  - gcloud secrets versions access latest --secret=RAY_CLUSTER_PUBLIC_KEY > $HOME/.ssh/marin_ray_cluster.pub
  - gcloud secrets versions access latest --secret=HF_TOKEN > $HOME/.cache/huggingface/token
  - gcloud secrets versions access latest --secret=OPENAI_API_KEY > $HOME/.cache/openai/token
  - mkdir -p $HOME/.ray
  - gcloud secrets versions access latest --secret=RAY_AUTH_TOKEN > $HOME/.ray/auth_token
  - chmod 600 $HOME/.ray/auth_token
  - mkdir -p /tmp/marin-cache
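# The secret fetches above assume the VM's service account can read these
# secrets. If they fail with a permission error, a grant roughly like the
# following is needed (hypothetical service-account name):
#   gcloud secrets add-iam-policy-binding HF_TOKEN \
#     --member="serviceAccount:ray-cluster@hai-gcp-models.iam.gserviceaccount.com" \
#     --role="roles/secretmanager.secretAccessor"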
# cf https://github.com/ray-project/ray/blob/0bc6ec86ffd0fc0d4e43fb339ffe0ac03ee5531b/python/ray/autoscaler/_private/constants.py#L66
# AUTOSCALER_HEARTBEAT_TIMEOUT_S defaults to 30s, which is much too short for our use case,
# so we raise it to 600s via the -e flags in the docker run options above.
worker_setup_commands:
  # Delete any old ray session data.
  # We keep the /tmp/ray directory in case we are running on the head node.
  # This shouldn't be a concern on new clusters where driver jobs are disabled on the head node.
  - rm -rf /tmp/ray/session_*/runtime_resources/
# Set Head Node == `head_default`
head_node_type: head_default
# List of Available Node Types
available_node_types:
  # Head Node =>> On-Demand, sets Min/Max Workers = 0 (Prevent Scheduling Tasks on Head Node)
  head_default:
    min_workers: 0
    max_workers: 0
    resources: {"CPU": 0, "head_node": 1}
    # GCP-Specific Configuration; by default, Ray will configure unspecified fields (e.g., subnets, ssh-keys)
    # => Ref: https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
    node_config:
      machineType: n2-standard-8
      # Create a Persistent Boot Disk w/ 200 GB
      disks:
        - boot: true
          autoDelete: true
          type: PERSISTENT
          initializeParams:
            diskSizeGb: 200
            # Set Source Image =>> Ubuntu 22.04 Base VM
            sourceImage: projects/ubuntu-os-cloud/global/images/family/ubuntu-2204-lts
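  # TPU worker pools below: each v4 TPU VM host has 4 chips, which is why every
  # pool advertises TPU: 4 (Ray resources are declared per worker node, not as a
  # per-slice total).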
  tpu_worker:
    max_workers: 1024
    min_workers: 1
    node_config:
      acceleratorType: v4-8
      runtimeVersion: tpu-ubuntu2204-base
      schedulingConfig:
        preemptible: true
    resources:
      CPU: 120
      TPU: 4
  tpu_slice_v4_16:
    max_workers: 1024
    min_workers: 0
    node_config:
      acceleratorType: v4-16
      runtimeVersion: tpu-ubuntu2204-base
      schedulingConfig:
        preemptible: true
    resources:
      CPU: 120
      TPU: 4
  tpu_slice_v4_32:
    max_workers: 1024
    min_workers: 0
    node_config:
      acceleratorType: v4-32
      runtimeVersion: tpu-ubuntu2204-base
      schedulingConfig:
        preemptible: true
    resources:
      CPU: 120
      TPU: 4
  tpu_slice_v4_64:
    max_workers: 1024
    min_workers: 0
    node_config:
      acceleratorType: v4-64
      runtimeVersion: tpu-ubuntu2204-base
      schedulingConfig:
        preemptible: true
    resources:
      CPU: 120
      TPU: 4
  tpu_slice_v4_128:
    max_workers: 1024
    min_workers: 0
    node_config:
      acceleratorType: v4-128
      runtimeVersion: tpu-ubuntu2204-base
      schedulingConfig:
        preemptible: true
    resources:
      CPU: 120
      TPU: 4
  tpu_slice_v4_256:
    max_workers: 1024
    min_workers: 0
    node_config:
      acceleratorType: v4-256
      runtimeVersion: tpu-ubuntu2204-base
      schedulingConfig:
        preemptible: true
    resources:
      CPU: 120
      TPU: 4
  tpu_slice_v4_512:
    max_workers: 1024
    min_workers: 0
    node_config:
      acceleratorType: v4-512
      runtimeVersion: tpu-ubuntu2204-base
      schedulingConfig:
        preemptible: true
    resources:
      CPU: 120
      TPU: 4
  tpu_slice_v4_1024:
    max_workers: 1024
    min_workers: 0
    node_config:
      acceleratorType: v4-1024
      runtimeVersion: tpu-ubuntu2204-base
      schedulingConfig:
        preemptible: true
    resources:
      CPU: 120
      TPU: 4
  tpu_slice_v4_2048:
    max_workers: 1024
    min_workers: 0
    node_config:
      acceleratorType: v4-2048
      runtimeVersion: tpu-ubuntu2204-base
      schedulingConfig:
        preemptible: true
    resources:
      CPU: 120
      TPU: 4
  tpu_slice_v4_4096:
    max_workers: 1024
    min_workers: 0
    node_config:
      acceleratorType: v4-4096
      runtimeVersion: tpu-ubuntu2204-base
      schedulingConfig:
        preemptible: true
    resources:
      CPU: 120
      TPU: 4
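# To land work on one of these pools from Ray code, request the custom resources
# declared above. A minimal sketch (assumes a Ray client connected to this
# cluster; the function name is hypothetical):
#   @ray.remote(resources={"TPU": 4})
#   def run_vllm_shard():
#       ...
# The autoscaler then provisions a matching TPU node type to satisfy the request.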