#####################################################
# THIS FILE IS AUTOGENERATED #
# Update the template or the script, not this file! #
#####################################################
# Unique Identifier for the Head Node + Workers
cluster_name: marin-us-central2-vllm
# Maximum Workers (excluding Head Node)
max_workers: 1024
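# A typical lifecycle for this cluster with the Ray CLI (sketch; the path below
# assumes this file is checked out locally, adjust to wherever it actually lives):
#   ray up     marin-us-central2-vllm.yaml   # launch the head node + autoscaler
#   ray attach marin-us-central2-vllm.yaml   # SSH into the head node
#   ray down   marin-us-central2-vllm.yaml   # tear the cluster down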
auth:
  ssh_private_key: ~/.ssh/marin_ray_cluster.pem
  ssh_public_key: ~/.ssh/marin_ray_cluster.pub
  ssh_user: ray
# Configure GCP
provider:
  type: gcp
  region: us-central2
  availability_zone: us-central2-b
  project_id: hai-gcp-models
docker:
  image: "us-central2-docker.pkg.dev/hai-gcp-models/marin/marin_vllm:1bc975e12"
  container_name: "ray_docker"
  pull_before_run: true
  worker_run_options:
    # always restart the worker if it happens to crash
    - --restart=on-failure
    - --rm=0  # Ray likes to add --rm, which doesn't work with --restart=on-failure
    - --privileged
    - --ulimit memlock=-1:-1
    - --ulimit nofile=1048576:1048576
    - --shm-size=200gb
    - -v
    - "/tmp:/tmp"
    - -e MARIN_PREFIX=gs://marin-us-central2
    - -e BUCKET=marin-us-central2
    - -e MARIN_LOCAL_CACHE_DIR=/tmp/marin-cache
    - -e AUTOSCALER_HEARTBEAT_TIMEOUT_S=600
    - -e TPU_MIN_LOG_LEVEL=3
    - -e TPU_STDERR_LOG_LEVEL=3
    - -e TPU_LOG_DIR=disabled
    # this lets the worker run docker commands and have them run as sibling containers
    - -v "/var/run/docker.sock:/var/run/docker.sock"
    - -e RAY_AUTH_MODE=token
    - -e RAY_AUTH_TOKEN_PATH=/home/ray/.ray/auth_token
  head_run_options:
    - --privileged
    - -v "/tmp:/tmp"
    - --ulimit nofile=1048576:1048576
    - -e RAY_TPU_MAX_CONCURRENT_ACTIVE_CONNECTIONS=64
    - -e MARIN_PREFIX=gs://marin-us-central2
    - -e BUCKET=marin-us-central2
    - -e MARIN_LOCAL_CACHE_DIR=/tmp/marin-cache
    - -e AUTOSCALER_HEARTBEAT_TIMEOUT_S=600
    - -e TPU_MIN_LOG_LEVEL=3
    - -e TPU_STDERR_LOG_LEVEL=3
    - -e TPU_LOG_DIR=disabled
    - -e RAY_AUTH_MODE=token
    - -e RAY_AUTH_TOKEN_PATH=/home/ray/.ray/auth_token
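# Note: because /var/run/docker.sock is mounted above, docker commands issued
# inside the Ray container talk to the *host* daemon, so launched containers are
# siblings of ray_docker rather than nested inside it. A quick sanity check
# (hypothetical session): run `docker ps` from within the container and confirm
# ray_docker itself shows up in the listing.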
initialization_commands:
  - which docker || (curl -fsSL https://get.docker.com -o get-docker.sh; sudo sh get-docker.sh; sudo usermod -aG docker $USER; sudo systemctl restart docker -f)
  - yes | gcloud auth configure-docker us-central2-docker.pkg.dev
  # always run this because ray doesn't run with sudo
  - sudo usermod -aG docker $USER
  # We want to launch docker containers from inside docker, which means we need to loosen the permissions
  # on the docker socket. This isn't the best security practice, but it's the easiest way to get this working.
  - sudo chmod 666 /var/run/docker.sock
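# initialization_commands run on the host VM before the docker container is
# started (hence the docker install above), whereas setup_commands below run
# inside the container.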
setup_commands:
  # set the GCP project because it's not injected by default
  - gcloud config set project hai-gcp-models
  - gcloud config set compute/region us-central2
  - gcloud config set compute/zone us-central2-b
  - mkdir -p $HOME/.cache/huggingface
  - mkdir -p $HOME/.cache/openai
  - mkdir -p $HOME/.ssh && gcloud compute project-info describe --format="value(commonInstanceMetadata.items[?key==\"ssh-keys\"].value)" > $HOME/.ssh/authorized_keys && chmod 600 $HOME/.ssh/authorized_keys
  - gcloud secrets versions access latest --secret=RAY_CLUSTER_PUBLIC_KEY > $HOME/.ssh/marin_ray_cluster.pub
  - gcloud secrets versions access latest --secret=HF_TOKEN > $HOME/.cache/huggingface/token
  - gcloud secrets versions access latest --secret=OPENAI_API_KEY > $HOME/.cache/openai/token
  - mkdir -p $HOME/.ray
  - gcloud secrets versions access latest --secret=RAY_AUTH_TOKEN > $HOME/.ray/auth_token
  - chmod 600 $HOME/.ray/auth_token
  - mkdir -p /tmp/marin-cache
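# The secret fetches above assume the VM's service account can read these
# secrets. If they fail with a permission error, a grant roughly like the
# following is needed (hypothetical service-account name):
#   gcloud secrets add-iam-policy-binding HF_TOKEN \
#     --member="serviceAccount:ray-cluster@hai-gcp-models.iam.gserviceaccount.com" \
#     --role="roles/secretmanager.secretAccessor"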
# cf https://github.com/ray-project/ray/blob/0bc6ec86ffd0fc0d4e43fb339ffe0ac03ee5531b/python/ray/autoscaler/_private/constants.py#L66
# AUTOSCALER_HEARTBEAT_TIMEOUT_S defaults to 30s, which is much too short for our use case,
# so we raise it to 600s via the -e flags in the docker run options above.
worker_setup_commands:
  # Delete any old ray session data.
  # We keep the /tmp/ray directory in case we are running on the head node.
  # This shouldn't be a concern on new clusters where driver jobs are disabled on the head node.
  - rm -rf /tmp/ray/session_*/runtime_resources/
# Set Head Node == `head_default`
head_node_type: head_default
# List of Available Node Types
available_node_types:
  # Head Node =>> On-Demand, sets Min/Max Workers = 0 (Prevent Scheduling Tasks on Head Node)
  head_default:
    min_workers: 0
    max_workers: 0
    resources: {"CPU": 0, "head_node": 1}
    # GCP-Specific Configuration; by default, Ray will configure unspecified fields (e.g., subnets, ssh-keys)
    # => Ref: https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
    node_config:
      machineType: n2-standard-8
      # Create a Persistent Boot Disk w/ 200 GB
      disks:
        - boot: true
          autoDelete: true
          type: PERSISTENT
          initializeParams:
            diskSizeGb: 200
            # Set Source Image =>> Ubuntu 22.04 Base VM
            sourceImage: projects/ubuntu-os-cloud/global/images/family/ubuntu-2204-lts
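  # TPU worker pools below: each v4 TPU VM host has 4 chips, which is why every
  # pool advertises TPU: 4 (Ray resources are declared per worker node, not as a
  # per-slice total).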
  tpu_worker:
    max_workers: 1024
    min_workers: 1
    node_config:
      acceleratorType: v4-8
      runtimeVersion: tpu-ubuntu2204-base
      schedulingConfig:
        preemptible: true
    resources:
      CPU: 120
      TPU: 4
  tpu_slice_v4_16:
    max_workers: 1024
    min_workers: 0
    node_config:
      acceleratorType: v4-16
      runtimeVersion: tpu-ubuntu2204-base
      schedulingConfig:
        preemptible: true
    resources:
      CPU: 120
      TPU: 4
  tpu_slice_v4_32:
    max_workers: 1024
    min_workers: 0
    node_config:
      acceleratorType: v4-32
      runtimeVersion: tpu-ubuntu2204-base
      schedulingConfig:
        preemptible: true
    resources:
      CPU: 120
      TPU: 4
  tpu_slice_v4_64:
    max_workers: 1024
    min_workers: 0
    node_config:
      acceleratorType: v4-64
      runtimeVersion: tpu-ubuntu2204-base
      schedulingConfig:
        preemptible: true
    resources:
      CPU: 120
      TPU: 4
  tpu_slice_v4_128:
    max_workers: 1024
    min_workers: 0
    node_config:
      acceleratorType: v4-128
      runtimeVersion: tpu-ubuntu2204-base
      schedulingConfig:
        preemptible: true
    resources:
      CPU: 120
      TPU: 4
  tpu_slice_v4_256:
    max_workers: 1024
    min_workers: 0
    node_config:
      acceleratorType: v4-256
      runtimeVersion: tpu-ubuntu2204-base
      schedulingConfig:
        preemptible: true
    resources:
      CPU: 120
      TPU: 4
  tpu_slice_v4_512:
    max_workers: 1024
    min_workers: 0
    node_config:
      acceleratorType: v4-512
      runtimeVersion: tpu-ubuntu2204-base
      schedulingConfig:
        preemptible: true
    resources:
      CPU: 120
      TPU: 4
  tpu_slice_v4_1024:
    max_workers: 1024
    min_workers: 0
    node_config:
      acceleratorType: v4-1024
      runtimeVersion: tpu-ubuntu2204-base
      schedulingConfig:
        preemptible: true
    resources:
      CPU: 120
      TPU: 4
  tpu_slice_v4_2048:
    max_workers: 1024
    min_workers: 0
    node_config:
      acceleratorType: v4-2048
      runtimeVersion: tpu-ubuntu2204-base
      schedulingConfig:
        preemptible: true
    resources:
      CPU: 120
      TPU: 4
  tpu_slice_v4_4096:
    max_workers: 1024
    min_workers: 0
    node_config:
      acceleratorType: v4-4096
      runtimeVersion: tpu-ubuntu2204-base
      schedulingConfig:
        preemptible: true
    resources:
      CPU: 120
      TPU: 4
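# To land work on one of these pools from Ray code, request the custom resources
# declared above. A minimal sketch (assumes a Ray client connected to this
# cluster; the function name is hypothetical):
#   @ray.remote(resources={"TPU": 4})
#   def run_vllm_shard():
#       ...
# The autoscaler then provisions a matching TPU node type to satisfy the request.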