Skip to content
Merged
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 85 additions & 0 deletions ci/benchmarks/partial-conv/evo2_finetuning.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
scope: partial-conv
time_limit: 14400
key_segments:
  # Modify keys to be renamed (str) or excluded (false) from run identifier.
  # By default, all args under script_args are included.
  # NOTE: booleans written as canonical lowercase `false` (yamllint `truthy`);
  # this parses to the same boolean as the previous `False` spelling under
  # both YAML 1.1 and 1.2 loaders.
  dataset_config: false
  dataset_dir: false
  data_base_path: false
  num_workers: false
  limit_val_batches: false
  val_check_interval: false
  experiment_name: false
  workspace: false
  restore_from_checkpoint_path: false
script_args:
  # All arguments referenced in the script string must be specified here.
  # Arguments not referenced in the script string must have the 'arg' field specified.
  # See jet/core/configs.py for the specification of the configuration class
  # --- paths (excluded from the run identifier via key_segments) ---
  workspace: /workspace/bionemo2
  data_base_path: /data/evo2
  restore_from_checkpoint_path: checkpoints/nemo2_evo2_1b_8k
  # --- topology / model selection ---
  nodes: [1]
  model: evo2
  model_size: 1b
  # --- data loading / validation cadence ---
  num_workers: 1
  limit_val_batches: 20
  dataset_config: training_data_config.yaml
  dataset_dir: preprocessed_data
  val_check_interval: 5
  # --- training hyperparameters ---
  seq_length: 8192
  warmup_steps: 10
  activation_checkpoint_layers: 2
  lr: 0.000015
  min_lr: 0.0000149
  accumulate_grad_batches: 4
  max_steps: 10
  gpus: 1
  clip_grad: 250
  weight_decay: 0.001
  attention_dropout: 0.01
  hidden_dropout: 0.01
  # NOTE(review): stop_steps (200) exceeds max_steps (10), so the
  # --early-stop-on-step threshold can never be reached in this run —
  # confirm this is intentional for the benchmark.
  stop_steps: 200
  batch_size: 2
  variant: finetune
  precision: fp8
products:
  # Each entry overrides script_args to produce a distinct run
  # (presumably expanded into a job matrix by JET — verify against
  # jet/core/configs.py).
  - variant: finetune
    lora_enabled: ""  # empty string → no extra flag in the command line
    task: finetune_from_ckpt
    experiment_name: evo2-finetune
  - variant: lora_finetune
    lora_enabled: "--lora-finetune"  # expands as an extra CLI flag
    task: lora_finetune_from_ckpt
    experiment_name: evo2-lora-finetune
# Launch command template; ${...} placeholders are substituted from
# script_args / products before execution. The block-scalar content is the
# runtime command and is intentionally left unmodified.
# NOTE(review): ${tensorboard_dir}, ${wandb_project_name} and ${target} are
# referenced below but not defined under script_args — presumably injected
# by the launcher environment; confirm, since the comment in script_args
# says all referenced arguments must be specified there.
script: |-
  WANDB_API_KEY=$BIONEMO_WANDB_API_KEY train_${model} \
  -d ${data_base_path}/${dataset_config} \
  --dataset-dir=${data_base_path}/${dataset_dir} \
  --ckpt-dir=${data_base_path}/${restore_from_checkpoint_path} \
  ${lora_enabled} \
  --model-size=${model_size} \
  --max-steps=${max_steps} \
  --experiment-name=${experiment_name}_${batch_size}bs_${nodes}node_${gpus}gpu_${max_steps}s \
  --lr=${lr} \
  --min-lr=${min_lr} \
  --warmup-steps=${warmup_steps} \
  --result-dir=${tensorboard_dir} \
  --micro-batch-size=${batch_size} \
  --grad-acc-batches=${accumulate_grad_batches} \
  --limit-val-batches=${limit_val_batches} \
  --seq-length=${seq_length} \
  --clip-grad=${clip_grad} \
  --wd=${weight_decay} \
  --attention-dropout=${attention_dropout} \
  --hidden-dropout=${hidden_dropout} \
  --num-layers 4 \
  --hybrid-override-pattern SDH* \
  --devices=${gpus} \
  --num-nodes=${nodes} \
  --val-check-interval=${val_check_interval} \
  --wandb-project=${wandb_project_name} \
  --wandb-group=${model}_${variant}_${model_size}_${task}_${target} \
  --create-tensorboard-logger \
  --activation-checkpoint-recompute-num-layers=${activation_checkpoint_layers} \
  --disable-checkpointing \
  --early-stop-on-step=${stop_steps};
Loading