@@ -36,9 +36,10 @@ def create_tokenized_dataset(
     load_dataset_kwargs: dict,
     max_seq_length: int = 8192,
     stride: int = 200,
-    buffer_size: int = 500_000,
+    buffer_size: int = 5_000,
     use_lazy_tokenization: bool = True,
     text_column: str = "text",
+    tokenize_batch_size: int = 100,
 ):
     """Create a tokenized dataset with windowing.

@@ -51,20 +52,28 @@ def create_tokenized_dataset(
         buffer_size: The buffer size for shuffle.
         use_lazy_tokenization: Whether to use datasets.set_transform for tokenization.
         text_column: Name of the column containing genomic sequences (default: "text").
+        tokenize_batch_size: The batch size for tokenization.

     Returns:
         Tuple of (tokenized_dataset, tokenizer).
     """
     logger.info(f"Loading dataset with kwargs: {load_dataset_kwargs}")
     dataset = datasets.load_dataset(**load_dataset_kwargs)
-    logger.info(f"Loaded dataset: {dataset}")

     if isinstance(dataset, datasets.IterableDataset):
-        dataset = datasets.distributed.split_dataset_by_node(
-            dataset,
-            rank=distributed_config.rank,
-            world_size=distributed_config.world_size,
-        )
+        # Hugging Face's `split_dataset_by_node` is quite sensitive to the total number of shards -- if the number of
+        # shards is not perfectly divisible by the world size, it falls back to loading the same shards on every node
+        # and using strided sampling so that each node still yields different examples. This can be quite inefficient
+        # with large numbers of shards and workers, so we use `dataset.shard` whenever there are enough shards.
+        if distributed_config.world_size > dataset.num_shards:
+            logger.info(f"Sharding dataset with {dataset.num_shards} shards with split_dataset_by_node")
+            dataset = datasets.distributed.split_dataset_by_node(
+                dataset, rank=distributed_config.rank, world_size=distributed_config.world_size
+            )
+        else:
+            logger.info(f"Sharding dataset with {dataset.num_shards} shards with dataset.shard")
+            dataset = dataset.shard(num_shards=distributed_config.world_size, index=distributed_config.rank)
+
     dataset = dataset.shuffle(seed=42, buffer_size=buffer_size)

     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
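
The sharding branch above is easier to see on a toy streaming dataset. The following is a minimal sketch, not part of the commit: the generator-backed `IterableDataset`, the shard names, and the `world_size`/`rank` values are made up for illustration, and it assumes a `datasets` release where `IterableDataset` exposes `num_shards` and `shard()` (as the new code does).

```python
import datasets
from datasets.distributed import split_dataset_by_node

def gen(shards):
    # each entry in `shards` behaves like one file-backed shard of the stream
    for shard in shards:
        for i in range(4):
            yield {"text": f"{shard}-{i}"}

# passing a list through gen_kwargs makes the stream shardable: num_shards == len(list)
ds = datasets.IterableDataset.from_generator(
    gen, gen_kwargs={"shards": [f"shard{i}" for i in range(6)]}
)

world_size, rank = 4, 0
if world_size > ds.num_shards:
    # too few shards: every rank reads the same shards and keeps every world_size-th example
    ds_rank = split_dataset_by_node(ds, rank=rank, world_size=world_size)
else:
    # enough shards: each rank reads a disjoint subset of the shards
    ds_rank = ds.shard(num_shards=world_size, index=rank)

print(ds.num_shards, [ex["text"] for ex in ds_rank][:4])
```
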
@@ -86,33 +95,11 @@ def tokenize_with_windowing(examples):
         # Using dataset.map on a non-streaming dataset will automatically perform and cache the transform
         tokenized_dataset = dataset.with_transform(tokenize_with_windowing)
     else:
-        # WORKAROUND for OpenGenome2 inconsistent schema:
-        # OpenGenome2 has inconsistent schemas across shards - some have 'record' column, some don't.
-        # This causes dataset.column_names to be None for streaming IterableDataset.
-        #
-        # For IterableDataset with None column_names (OpenGenome2):
-        #   - Must explicitly list columns to remove: [text_column, "record"]
-        #   - IterableDataset.map() handles missing columns gracefully
-        #
-        # For regular Dataset (non-streaming, or streaming with consistent schema like ESM2):
-        #   - Use dataset.column_names (which is available and accurate)
-        #   - Dataset.map() raises error if column doesn't exist
-        #
-        # TODO: Remove this workaround once Arc Institute fixes OpenGenome2 schema consistency.
-        # When all shards have the same columns, dataset.column_names will work for both cases.
-        if isinstance(dataset, datasets.IterableDataset) and dataset.column_names is None:
-            # Streaming dataset: column_names may be None due to inconsistent schema
-            columns_to_remove = [text_column, "record"]
-        else:
-            # Non-streaming dataset: use actual column names
-            columns_to_remove = dataset.column_names
-
-        logger.info(f"Applying dataset.map with columns to remove: {columns_to_remove}")
-
-        tokenized_dataset = dataset.map(
+        tokenized_dataset = dataset.select_columns(text_column).map(
             tokenize_with_windowing,
             batched=True,
-            remove_columns=columns_to_remove,
+            batch_size=tokenize_batch_size,
+            remove_columns=[text_column],
         )

     return tokenized_dataset, tokenizer
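
For context, the `tokenize_with_windowing` closure being mapped here is defined earlier in the file and is not shown in this diff. A plausible shape for it, as a hedged sketch only, is a batched transform that leans on the tokenizer's built-in overflow/stride windowing (this path requires a fast tokenizer); `tokenizer`, `text_column`, `max_seq_length`, and `stride` are the enclosing function's parameters.

```python
def tokenize_with_windowing(examples):
    # one input document can yield several windows of at most max_seq_length tokens,
    # with consecutive windows overlapping by `stride` tokens
    tokenized = tokenizer(
        examples[text_column],
        max_length=max_seq_length,
        stride=stride,
        truncation=True,
        return_overflowing_tokens=True,
    )
    # optional: drop the window -> source-row index if downstream code doesn't use it
    tokenized.pop("overflow_to_sample_mapping", None)
    return tokenized
```
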
@@ -124,6 +111,7 @@ def create_bshd_dataloader(
     load_dataset_kwargs: dict,
     micro_batch_size: int,
     num_workers: int = 1,
+    prefetch_factor: int = 4,
     max_seq_length: int = 8192,
     stride: int = 200,
     seed: int = 42,
@@ -142,6 +130,7 @@ def create_bshd_dataloader(
         load_dataset_kwargs: Keyword arguments to pass to `load_dataset`.
         micro_batch_size: The batch size per device.
         num_workers: The number of workers to use for the dataloader.
+        prefetch_factor: The prefetch factor to use for the dataloader.
         max_seq_length: The maximum length of sequences (window size).
         stride: The stride for windowing (overlap = stride tokens).
         seed: The seed to use for the distributed sampler and data collator.
@@ -164,6 +153,7 @@ def create_bshd_dataloader(
         buffer_size=buffer_size,
         use_lazy_tokenization=use_lazy_tokenization,
         text_column=text_column,
+        tokenize_batch_size=micro_batch_size * prefetch_factor,
     )

     if isinstance(tokenized_dataset, datasets.IterableDataset):
@@ -207,6 +197,7 @@ def create_bshd_dataloader(
         num_workers=num_workers,
         pin_memory=True if not use_stateful_dataloader else False,
         persistent_workers=num_workers > 0,
+        prefetch_factor=prefetch_factor if num_workers > 0 else None,
     )

     return train_dataloader, tokenized_dataset if sampler is None else sampler
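
One note on the new `prefetch_factor` plumbing: recent PyTorch versions only accept a non-None `prefetch_factor` when worker processes exist, which is why both dataloaders guard it with `num_workers > 0`. A minimal sketch of the pattern, with placeholder dataset and batch-size arguments:

```python
from torch.utils.data import DataLoader

def build_loader(dataset, batch_size, num_workers, prefetch_factor=4):
    return DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        # each worker keeps `prefetch_factor` batches ready; with num_workers=0 it must stay None
        prefetch_factor=prefetch_factor if num_workers > 0 else None,
    )
```
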
@@ -219,6 +210,7 @@ def create_thd_dataloader(
     micro_batch_size: int | None = None,
     token_micro_batch_size: int | None = None,
     num_workers: int = 1,
+    prefetch_factor: int = 4,
     max_seq_length: int = 8192,
     stride: int = 200,
     buffer_size: int = 500_000,
@@ -238,6 +230,7 @@ def create_thd_dataloader(
         token_micro_batch_size: The maximum number of tokens per batch. If None, the micro_batch_size * max_seq_length
             will be used. Defaults to None.
         num_workers: The number of workers to use for the dataloader.
+        prefetch_factor: The prefetch factor to use for the dataloader.
         max_seq_length: The maximum length of sequences (window size).
         stride: The stride for windowing (overlap = stride tokens).
         seed: The seed to use for the distributed sampler and data collator.
@@ -292,6 +285,7 @@ def create_thd_dataloader(
         num_workers=num_workers,
         pin_memory=True if not use_stateful_dataloader else False,
         persistent_workers=num_workers > 0,
+        prefetch_factor=prefetch_factor if num_workers > 0 else None,
     )

     return train_dataloader, tokenized_dataset