Skip to content
Merged
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 85 additions & 0 deletions ci/benchmarks/partial-conv/evo2_finetuning.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
scope: partial-conv
time_limit: 14400
key_segments:
  # Modify keys to be renamed (str) or excluded (false) from run identifier.
  # By default, all args under script_args are included.
  # NOTE: booleans written as canonical lowercase `false` (yamllint `truthy`);
  # this parses to the same boolean as the previous `False` spelling under
  # both YAML 1.1 and 1.2 loaders.
  dataset_config: false
  dataset_dir: false
  data_base_path: false
  num_workers: false
  limit_val_batches: false
  val_check_interval: false
  experiment_name: false
  workspace: false
  restore_from_checkpoint_path: false
script_args:
  # All arguments referenced in the script string must be specified here.
  # Arguments not referenced in the script string must have the 'arg' field specified.
  # See jet/core/configs.py for the specification of the configuration class
  # --- paths (excluded from the run identifier via key_segments) ---
  workspace: /workspace/bionemo2
  data_base_path: /data/evo2
  restore_from_checkpoint_path: checkpoints/nemo2_evo2_1b_8k
  # --- topology / model selection ---
  nodes: [1]
  model: evo2
  model_size: 1b
  # --- data loading / validation cadence ---
  num_workers: 1
  limit_val_batches: 20
  dataset_config: training_data_config.yaml
  dataset_dir: preprocessed_data
  val_check_interval: 5
  # --- training hyperparameters ---
  seq_length: 8192
  warmup_steps: 10
  activation_checkpoint_layers: 2
  lr: 0.000015
  min_lr: 0.0000149
  accumulate_grad_batches: 4
  max_steps: 10
  gpus: 1
  clip_grad: 250
  weight_decay: 0.001
  attention_dropout: 0.01
  hidden_dropout: 0.01
  # NOTE(review): stop_steps (200) exceeds max_steps (10), so the
  # --early-stop-on-step threshold can never be reached in this run —
  # confirm this is intentional for the benchmark.
  stop_steps: 200
  batch_size: 2
  variant: finetune
  precision: fp8
products:
  # Each entry overrides script_args to produce a distinct run
  # (presumably expanded into a job matrix by JET — verify against
  # jet/core/configs.py).
  - variant: finetune
    lora_enabled: ""  # empty string → no extra flag in the command line
    task: finetune_from_ckpt
    experiment_name: evo2-finetune
  - variant: lora_finetune
    lora_enabled: "--lora-finetune"  # expands as an extra CLI flag
    task: lora_finetune_from_ckpt
    experiment_name: evo2-lora-finetune
# Launch command template; ${...} placeholders are substituted from
# script_args / products before execution. The block-scalar content is the
# runtime command and is intentionally left unmodified.
# NOTE(review): ${tensorboard_dir}, ${wandb_project_name} and ${target} are
# referenced below but not defined under script_args — presumably injected
# by the launcher environment; confirm, since the comment in script_args
# says all referenced arguments must be specified there.
script: |-
  WANDB_API_KEY=$BIONEMO_WANDB_API_KEY train_${model} \
  -d ${data_base_path}/${dataset_config} \
  --dataset-dir=${data_base_path}/${dataset_dir} \
  --ckpt-dir=${data_base_path}/${restore_from_checkpoint_path} \
  ${lora_enabled} \
  --model-size=${model_size} \
  --max-steps=${max_steps} \
  --experiment-name=${experiment_name}_${batch_size}bs_${nodes}node_${gpus}gpu_${max_steps}s \
  --lr=${lr} \
  --min-lr=${min_lr} \
  --warmup-steps=${warmup_steps} \
  --result-dir=${tensorboard_dir} \
  --micro-batch-size=${batch_size} \
  --grad-acc-batches=${accumulate_grad_batches} \
  --limit-val-batches=${limit_val_batches} \
  --seq-length=${seq_length} \
  --clip-grad=${clip_grad} \
  --wd=${weight_decay} \
  --attention-dropout=${attention_dropout} \
  --hidden-dropout=${hidden_dropout} \
  --num-layers 4 \
  --hybrid-override-pattern SDH* \
  --devices=${gpus} \
  --num-nodes=${nodes} \
  --val-check-interval=${val_check_interval} \
  --wandb-project=${wandb_project_name} \
  --wandb-group=${model}_${variant}_${model_size}_${task}_${target} \
  --create-tensorboard-logger \
  --activation-checkpoint-recompute-num-layers=${activation_checkpoint_layers} \
  --disable-checkpointing \
  --early-stop-on-step=${stop_steps};
Loading