Skip to content

Commit e9b27f7

Browse files
authored
add config overrides for llama recipe (#1343)
Updates the hydra config to let us pass model size information via the hydra configs. Signed-off-by: Peter St. John <pstjohn@nvidia.com>
1 parent b9b916e commit e9b27f7

File tree

6 files changed

+60
-67
lines changed

6 files changed

+60
-67
lines changed

bionemo-recipes/recipes/llama3_native_te/example_checkpoint/config.json

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,25 @@
1212
"AutoModelForSequenceClassification": "llama3_nv.NVLlamaForSequenceClassification",
1313
"AutoModelForTokenClassification": "llama3_nv.NVLlamaForTokenClassification"
1414
},
15+
"bos_token_id": 128000,
1516
"dtype": "bfloat16",
17+
"eos_token_id": [
18+
128001,
19+
128008,
20+
128009
21+
],
1622
"head_dim": 64,
1723
"hidden_act": "silu",
18-
"hidden_size": 384,
24+
"hidden_size": 2048,
1925
"initializer_range": 0.02,
20-
"intermediate_size": 1536,
26+
"intermediate_size": 8192,
2127
"max_position_embeddings": 131072,
2228
"mlp_bias": false,
2329
"model_type": "llama",
24-
"num_attention_heads": 6,
25-
"num_hidden_layers": 2,
26-
"num_key_value_heads": 6,
30+
"num_attention_heads": 32,
31+
"num_hidden_layers": 16,
32+
"num_key_value_heads": 8,
33+
"pretraining_tp": 1,
2734
"rms_norm_eps": 1e-05,
2835
"rope_scaling": {
2936
"factor": 32.0,
@@ -33,12 +40,8 @@
3340
"rope_type": "llama3"
3441
},
3542
"rope_theta": 500000.0,
36-
"tie_word_embeddings": false,
43+
"tie_word_embeddings": true,
3744
"transformers_version": "4.57.1",
3845
"use_cache": true,
39-
"vocab_size": 256,
40-
"bos_token_id": 2,
41-
"eos_token_id": 0,
42-
"pad_token_id": 1,
43-
"attn_input_format": "bshd"
46+
"vocab_size": 128256
4447
}

bionemo-recipes/recipes/llama3_native_te/hydra_config/L0_convergence.yaml

Lines changed: 15 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -11,65 +11,50 @@ defaults:
1111
# Use tiny Llama config for fast convergence testing
1212
model_tag: ./example_checkpoint
1313

14-
# Training steps - enough to see convergence on small dataset
15-
num_train_steps: 1000
14+
num_train_steps: 270_000
1615

17-
# Dataset configuration - use small test dataset
1816
dataset:
19-
tokenizer_path: ./example_checkpoint # Tokenizer included in checkpoint directory
20-
micro_batch_size: 1 # Conservative for single GPU
21-
num_workers: 2
22-
max_seq_length: 8192 # Full Llama3 context length
23-
stride: 400 # 400bp overlap for 8K context
24-
buffer_size: 10_000 # Smaller buffer for faster iteration
25-
use_lazy_tokenization: true
26-
use_stateful_dataloader: false # Until https://github.com/pytorch/pytorch/pull/163102 is resolved with torchdata.
17+
micro_batch_size: 1 # Conservative for single GPU
2718
load_dataset_kwargs:
28-
path: "parquet"
29-
data_files: "genomic_sequences_2mb.parquet" # 2MB convergence test data in recipe directory
19+
path: "arcinstitute/opengenome2"
20+
data_dir: "json/pretraining_or_both_phases"
3021
split: "train"
31-
streaming: true # Use streaming to avoid loading entire dataset into memory
22+
streaming: true # Use streaming to avoid loading entire dataset into memory
3223

33-
# Optimizer - higher LR for faster convergence on small model
3424
adamw_kwargs:
35-
lr: 5e-4 # Higher than default for faster convergence
25+
lr: 5e-4
3626
fused: true
3727
betas: [0.9, 0.98]
3828
eps: 1e-8
3929
weight_decay: 0.01
4030

41-
# Learning rate scheduler
4231
lr_scheduler_kwargs:
43-
num_warmup_steps: 100 # Quick warmup (10% of training)
44-
num_training_steps: 1000
32+
num_warmup_steps: 20_000
33+
num_training_steps: 500_000
4534

46-
# Checkpoint configuration - disabled for fast convergence testing
4735
checkpoint:
48-
ckpt_dir: null # No checkpoints
49-
save_final_model: false # Don't save final model
50-
resume_from_checkpoint: false # Start fresh for convergence test
51-
save_every_n_steps: null # No intermediate checkpoints
36+
ckpt_dir: null # No checkpoints
37+
save_final_model: false # Don't save final model
38+
resume_from_checkpoint: false # Start fresh for convergence test
39+
save_every_n_steps: null # No intermediate checkpoints
5240

53-
# Logging - frequent logging to track convergence
5441
logger:
55-
frequency: 10 # Log every 10 steps
42+
frequency: 100
5643

5744
# WandB configuration
5845
wandb_init_args:
5946
project: "llama3-genomic-convergence"
6047
name: "tiny-llama-convergence-test"
61-
mode: "online" # Online mode for real-time dashboard
48+
mode: "online"
6249
tags:
6350
- convergence-test
6451
- tiny-model
6552
- 1M-params
6653
- 8192-context
6754

68-
# Meta device and torch compile
6955
use_meta_device: false
70-
use_torch_compile: false # Disable for debugging
56+
use_torch_compile: false
7157

72-
# FP8 configuration - disabled for convergence testing
7358
fp8_config:
7459
enabled: false
7560
fp8_recipe: transformer_engine.common.recipe.DelayedScaling

bionemo-recipes/recipes/llama3_native_te/hydra_config/L0_sanity.yaml

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,42 +3,39 @@ defaults:
33
- _self_
44

55
# Training config
6-
model_tag: ./example_checkpoint # Use tiny Llama config for testing (4 layers, 384 hidden, ~9.6M params)
6+
model_tag: ./example_checkpoint  # Use tiny Llama config for testing (overridden below via config_kwargs: 2 layers, 384 hidden)
7+
8+
config_kwargs:
9+
num_hidden_layers: 2
10+
hidden_size: 384
11+
intermediate_size: 1536
12+
num_attention_heads: 6
13+
num_key_value_heads: 6
14+
715
num_train_steps: 250
816

917
# We want this on in CI/CD to validate that the script runs successfully with torch.compile.
10-
use_torch_compile: false # Disable for faster startup during testing
18+
use_torch_compile: true  # Enabled in CI/CD to validate the script runs successfully with torch.compile
1119

1220
dataset:
13-
tokenizer_path: ./example_checkpoint # Tokenizer included in checkpoint directory
14-
micro_batch_size: 1 # Small batch size for limited GPU memory
15-
num_workers: 1
16-
max_seq_length: 1024 # Smaller window for testing
17-
stride: 100 # Smaller stride for testing
18-
buffer_size: 10_000 # Smaller buffer for testing
19-
use_lazy_tokenization: true
20-
use_stateful_dataloader: false # Until https://github.com/pytorch/pytorch/pull/163102 is resolved with torchdata.
21+
micro_batch_size: 1 # Small batch size for limited GPU memory
2122
load_dataset_kwargs:
2223
path: "parquet"
2324
split: "train"
2425
data_files: "test_genomic_sequences.parquet" # Use local test file in recipe directory
25-
26+
streaming: True
2627

2728
# WandB config
2829
wandb_init_args:
2930
name: "llama3_8B_genomic_sanity"
3031
mode: "offline"
31-
project: null # Set to null by default, override with +wandb_init_args.project=your-project
3232

3333
# Learning rate scheduler config
3434
lr_scheduler_kwargs:
35-
num_warmup_steps: 10 # Shorter warmup for quick testing
36-
num_training_steps: 250 # Match num_train_steps
35+
num_warmup_steps: 10 # Shorter warmup for quick testing
3736

3837
checkpoint:
3938
ckpt_dir: null
40-
resume_from_checkpoint: true
41-
save_every_n_steps: 50
4239
save_final_model: false
4340

4441
logger:

bionemo-recipes/recipes/llama3_native_te/hydra_config/defaults.yaml

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
11
# Training config
2-
model_tag: ??? # E.g., meta-llama/Meta-Llama-3-8B or a local path
2+
model_tag: ??? # E.g., meta-llama/Llama-3.2-1B or a local path
3+
config_kwargs: # Arguments to pass to the AutoConfig.from_pretrained method
4+
trust_remote_code: true
5+
vocab_size: 256 # Overrides to the default config that comes from meta-llama/Llama-3.2-1B
6+
tie_word_embeddings: false
7+
eos_token_id: 0
8+
pad_token_id: 1
9+
bos_token_id: 2
10+
311
num_train_steps: ???
412

513
# TODO: Once BIONEMO-2583 and BIONEMO-2719 are fixed, enable this by default and simplify training scripts to remove the
@@ -14,23 +22,23 @@ use_torch_compile: false
1422
use_gradient_checkpointing: false
1523

1624
dataset:
17-
tokenizer_path: ./example_checkpoint # Set to the path of your tokenizer (e.g., ./example_checkpoint)
25+
tokenizer_path: ${model_tag} # Set to the path of your tokenizer (e.g., ./example_checkpoint)
1826
micro_batch_size: 8
1927
num_workers: 1
20-
max_seq_length: 8192 # Window size for genomic sequences
21-
stride: 200 # Overlap for windowing
22-
buffer_size: 500_000 # Shuffle buffer size
28+
max_seq_length: 8192 # Window size for genomic sequences
29+
stride: 200 # Overlap for windowing
30+
buffer_size: 500_000 # Shuffle buffer size
2331
use_lazy_tokenization: true
24-
use_stateful_dataloader: false # Until https://github.com/pytorch/pytorch/pull/163102 is resolved with torchdata.
32+
use_stateful_dataloader: false # Until https://github.com/pytorch/pytorch/pull/163102 is resolved with torchdata.
2533
load_dataset_kwargs:
26-
path: "parquet"
34+
path: ???
2735
split: "train"
2836
streaming: True
2937

3038
# WandB config
3139
wandb_init_args:
3240
name: ???
33-
project: null # Optional: set to your wandb project name
41+
project: null # Optional: set to your wandb project name
3442

3543
# mFSDP config
3644
fully_shard_kwargs:

bionemo-recipes/recipes/llama3_native_te/train_ddp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def main(args: DictConfig) -> float | None:
5959
)
6060

6161
# Create an empty Llama3 model with a causal language model head, e.g. "meta-llama/Meta-Llama-3-8B".
62-
config = AutoConfig.from_pretrained(args.model_tag, trust_remote_code=True, dtype=torch.bfloat16)
62+
config = AutoConfig.from_pretrained(args.model_tag, dtype=torch.bfloat16, **args.config_kwargs)
6363
# Use SDPA (Scaled Dot-Product Attention) to avoid materializing large causal masks
6464
# config.attn_implementation = "sdpa"
6565

bionemo-recipes/recipes/llama3_native_te/train_fsdp2.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def main(args: DictConfig) -> float | None: # noqa: C901
6666
)
6767

6868
# Create an empty Llama3 model with a causal language model head, e.g. "meta-llama/Meta-Llama-3-8B".
69-
config = AutoConfig.from_pretrained(args.model_tag, trust_remote_code=True, dtype=torch.bfloat16)
69+
config = AutoConfig.from_pretrained(args.model_tag, dtype=torch.bfloat16, **args.config_kwargs)
7070
# Use SDPA (Scaled Dot-Product Attention) to avoid materializing large causal masks
7171
# config.attn_implementation = "sdpa"
7272

0 commit comments

Comments
 (0)