-
Notifications
You must be signed in to change notification settings - Fork 131
Expand file tree
/
Copy pathgeneformer_pretrain.yaml
More file actions
67 lines (66 loc) · 2.12 KB
/
geneformer_pretrain.yaml
File metadata and controls
67 lines (66 loc) · 2.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
---
# Performance-scope JET config for Geneformer (10M) pretraining benchmarks.
scope: perf
# Wall-clock limit for the job, in seconds.
time_limit: 3600
key_segments:
  # Keys to be renamed (str) or excluded (false) from the run identifier.
  # By default, all args under script_args are included.
  data_path: false
  val_check_interval: false
  lr: false
script_args:
  # All arguments referenced in the script string must be specified here.
  # Arguments not referenced in the script string must have the 'arg' field specified.
  # See jet/core/configs.py for the specification of the configuration class.
  workspace: /workspace/bionemo2
  data_path: /data/cellxgene_scdl
  model: geneformer
  variant: train
  config_name: 10M
  precision: [bf16-mixed]
  gpus: 8
  max_steps: 1000
  lr: 0.001
  val_check_interval: 500
  acc_grad: 1
  # Cartesian product axes: each entry below is combined with the args above
  # to produce one benchmark configuration.
  products:
    - nodes: 1
      batch_size: 32
    - nodes: 2
      batch_size: 32
# NOTE: ${{...}} is format-escaping — it renders as a literal ${...} shell
# expansion at runtime, while single-brace ${var} is substituted from
# script_args before submission.
script: |-
  COPY_FLAG="/tmp/copy_done_${{SLURMD_NODENAME}}";
  NEW_DATA_PATH="/dev/shm/data_path_${{SLURMD_NODENAME}}";
  if [ "$SLURM_LOCALID" = "0" ]; then
    df -h;
    echo $NEW_DATA_PATH;
    time cp -r ${data_path}/ $NEW_DATA_PATH;
    touch $COPY_FLAG
  fi
  # All ranks wait until install flag file appears
  while [ ! -f $COPY_FLAG ]; do
    sleep 1
  done
  WANDB_API_KEY=$BIONEMO_WANDB_API_KEY ${variant}_${model} \
    --data-dir $NEW_DATA_PATH \
    --experiment-name ${batch_size}bs_${nodes}node_${gpus}gpu_${max_steps}s_${precision}prec \
    --num-gpus ${gpus} \
    --save-last-checkpoint \
    --num-nodes ${nodes} \
    --val-check-interval ${val_check_interval} \
    --num-dataset-workers 8 \
    --num-steps ${max_steps} \
    --seq-length 2048 \
    --limit-val-batches 8 \
    --micro-batch-size ${batch_size} \
    --resume-if-exists \
    --log-every-n-steps 50 \
    --create-tflops-callback \
    --lr ${lr} \
    --create-tensorboard-logger \
    --result-dir=${tensorboard_dir} \
    --wandb-group=${model}_${variant}_${config_name}__${target} \
    --wandb-project ${wandb_project_name} \
    --wandb-job-type=${pipeline_label} \
    --cosine-rampup-frac 0.004331629559040111 \
    --cosine-hold-frac 0.021658147795200554 \
    --accumulate-grad-batches ${acc_grad} \
    --precision ${precision} \
    --disable-checkpointing;