-
Notifications
You must be signed in to change notification settings - Fork 423
Expand file tree
/
Copy pathsingle_node.sky.yaml
More file actions
33 lines (28 loc) · 977 Bytes
/
single_node.sky.yaml
File metadata and controls
33 lines (28 loc) · 977 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# SkyPilot task definition: single-node GSM8K GRPO run (num_nodes: 1,
# scheduler.type=local) launched from the repository root (workdir: .).
name: areal-test-skypilot

resources:
  accelerators: A100:2
  # NOTE(review): presumably tears the cluster down (rather than only stopping
  # it) after 10 idle minutes — confirm against the SkyPilot autostop docs.
  autostop:
    idle_minutes: 10
    down: true
  cpus: 8+
  memory: 32GB+
  disk_size: 256GB
  image_id: docker:ghcr.io/inclusionai/areal-runtime:v1.0.2-sglang

# Keep in sync with cluster.n_nodes in the `run` command below.
num_nodes: 1

file_mounts:
  # Should be consistent with the storage paths set in gsm8k_grpo_ray.yaml
  # NOTE(review): the `run` command below passes examples/math/gsm8k_grpo.yaml
  # as --config — confirm which config file this comment is meant to track.
  /storage:
    # Placeholder bucket; replace with your own. Other schemes:
    #   gs://, https://<azure_storage_account>.blob.core.windows.net/<container>,
    #   r2://, cos://<region>/<bucket>, oci://<bucket_name>
    source: s3://my-bucket/
    # MOUNT or COPY or MOUNT_CACHED. Defaults to MOUNT. Optional.
    mode: MOUNT

workdir: .

# Single shell command split with backslash continuations; do not insert
# comments or blank lines between the continued lines.
# $SKYPILOT_NUM_GPUS_PER_NODE is read from the environment at run time
# (presumably exported by SkyPilot — confirm against its env-var docs).
run: |
  python3 examples/math/gsm8k_rl.py \
    --config examples/math/gsm8k_grpo.yaml \
    scheduler.type=local \
    experiment_name=gsm8k-grpo \
    trial_name=trial0 \
    cluster.n_nodes=1 \
    cluster.n_gpus_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \
    rollout.backend=sglang:d1 \
    actor.backend=fsdp:d1 \
    train_dataset.batch_size=4 \
    actor.mb_spec.max_tokens_per_mb=4096