11 changes: 11 additions & 0 deletions bionemo-recipes/recipes/esm2_native_te/perf_logger.py
@@ -61,6 +61,8 @@ def __init__(self, dist_config: DistributedConfig, args: DictConfig):
"train/perplexity": torchmetrics.text.Perplexity(ignore_index=-100),
"train/gpu_memory_allocated_max_gb": torchmetrics.MaxMetric(),
"train/gpu_memory_allocated_mean_gb": torchmetrics.MeanMetric(),
"train/avg_sequence_length_unpadded": torchmetrics.MeanMetric(),
"train/avg_sequence_length_padded": torchmetrics.MeanMetric(),
}

self.metrics = torchmetrics.MetricCollection(metrics_dict)
@@ -94,6 +96,12 @@ def log_step(
# 1 is the padding token for ESM-2.
num_unpadded_tokens = batch["input_ids"][batch["input_ids"] != 1].numel()

# Track average sequence lengths (unpadded vs. padded) per batch; useful when sanity-checking TFLOPS numbers.
batch_size = batch["input_ids"].shape[0]
avg_seq_length_unpadded = num_unpadded_tokens / batch_size
avg_seq_length_padded = num_tokens / batch_size

self.min_loss = min(self.min_loss, outputs.loss.item())
step_time, self.previous_step_time = time.perf_counter() - self.previous_step_time, time.perf_counter()

@@ -103,6 +111,9 @@
self.metrics["train/step_time"].update(step_time)
self.metrics["train/tokens_per_second"].update(num_tokens / step_time)
self.metrics["train/unpadded_tokens_per_second"].update(num_unpadded_tokens / step_time)
# Log the average sequence lengths used for TFLOPS accounting.
self.metrics["train/avg_sequence_length_unpadded"].update(avg_seq_length_unpadded)
self.metrics["train/avg_sequence_length_padded"].update(avg_seq_length_padded)

# Handle sequence packing for torchmetrics calculation.
if outputs.logits.dim() < 3:
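The new avg_sequence_length_unpadded / avg_sequence_length_padded metrics make it possible to sanity-check throughput in model-FLOPs terms, which the in-code comments refer to as TFLOPS debugging. Below is a minimal sketch of that conversion, assuming the common ~6 * params FLOPs-per-token estimate plus a quadratic attention term; the helper name, signature, layer/hidden-size values, and example numbers are illustrative and not part of this PR:

def estimate_tflops_per_gpu(
    tokens_per_second: float,
    num_params: float,
    num_layers: int,
    hidden_size: int,
    avg_seq_length: float,
) -> float:
    # Hypothetical helper (not in this PR). FLOPs per token is approximated as
    # 6 * num_params for the dense layers (forward + backward) plus
    # 12 * num_layers * hidden_size * seq_len for the attention score terms.
    flops_per_token = 6.0 * num_params + 12.0 * num_layers * hidden_size * avg_seq_length
    return tokens_per_second * flops_per_token / 1e12

# Illustrative comparison: evaluating the estimate with the unpadded average
# sequence length versus the padded one separates useful throughput from
# throughput spent on padding tokens (all numbers below are made up).
useful_tflops = estimate_tflops_per_gpu(9_000, 15e9, 48, 5120, 850.0)
padded_tflops = estimate_tflops_per_gpu(11_000, 15e9, 48, 5120, 1024.0)
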
@@ -82,43 +82,47 @@ use_distributed_checkpoint_fsdp2: false
############################################################
products:
# TE bshd perf, FSDP2
- config: L1_15B_perf_test
task_cmd: train_fsdp2
parallelism_strategy: fsdp2
thd_enabled: false
wandb_name: "esm2_native_15b__fsdp2__${now:%Y%m%d-%H%M%S}__${gitsha:}"
job_name: "esm2-native-15b-fsdp2"
# - config: L1_15B_perf_test
# task_cmd: train_fsdp2
# parallelism_strategy: fsdp2
# thd_enabled: false
# micro_batch_size: 8
# wandb_name: "esm2_native_15b__fsdp2__${now:%Y%m%d-%H%M%S}__${gitsha:}"
# job_name: "esm2-native-15b-fsdp2"
# TE thd perf, FSDP2
- config: L1_15B_perf_test
task_cmd: train_fsdp2
parallelism_strategy: fsdp2
thd_enabled: true
wandb_name: "esm2_native_15b__fsdp2__thd__${now:%Y%m%d-%H%M%S}__${gitsha:}"
job_name: "esm2-native-15b-fsdp2-thd"
# - config: L1_15B_perf_test
# task_cmd: train_fsdp2
# parallelism_strategy: fsdp2
# thd_enabled: true
# micro_batch_size: 8
# wandb_name: "esm2_native_15b__fsdp2__thd__${now:%Y%m%d-%H%M%S}__${gitsha:}"
# job_name: "esm2-native-15b-fsdp2-thd"
# TE bshd perf, MFSDP
- config: L1_15B_perf_test
task_cmd: train_mfsdp
parallelism_strategy: mfsdp
thd_enabled: false
micro_batch_size: 8
wandb_name: "esm2_native_15b__mfsdp__${now:%Y%m%d-%H%M%S}__${gitsha:}"
job_name: "esm2-native-15b-mfsdp"
job_name: "esm2-native-15b-mfsdp-bs8"
# TE thd perf, MFSDP
- config: L1_15B_perf_test
task_cmd: train_mfsdp
parallelism_strategy: mfsdp
thd_enabled: true
micro_batch_size: 8
wandb_name: "esm2_native_15b__mfsdp__thd__${now:%Y%m%d-%H%M%S}__${gitsha:}"
job_name: "esm2-native-15b-mfsdp-thd"
job_name: "esm2-native-15b-mfsdp-thd-bs8"
# OSS performance baseline, until torch.compile is figured out
- config: L1_15B_perf_test
model_tag: facebook/esm2_t48_15B_UR50D
task_cmd: train_fsdp2
parallelism_strategy: fsdp2
thd_enabled: false
use_torch_compile: false
micro_batch_size: 4
wandb_name: "esm2_native_15b__fsdp2__baseline__${now:%Y%m%d-%H%M%S}__${gitsha:}"
job_name: "esm2-native-15b-fsdp2-baseline"
# - config: L1_15B_perf_test
# model_tag: facebook/esm2_t48_15B_UR50D
# task_cmd: train_fsdp2
# parallelism_strategy: fsdp2
# thd_enabled: false
# use_torch_compile: false
# micro_batch_size: 4
# wandb_name: "esm2_native_15b__fsdp2__baseline__${now:%Y%m%d-%H%M%S}__${gitsha:}"
# job_name: "esm2-native-15b-fsdp2-baseline"

############################################################
# run script
@@ -136,8 +140,9 @@ run_script: |
--master_addr=$MASTER_ADDR \
--master_port=$MASTER_PORT \
${task_cmd}.py \
wandb_init_args.mode=${wandb_init_args.mode} \
+wandb_init_args.project=${wandb_init_args.project} \
--config-name ${config}.yaml \
+wandb_init_args.mode=${wandb_init_args.mode} \
wandb_init_args.project=${wandb_init_args.project} \
+wandb_init_args.group=${wandb_init_args.group} \
+wandb_init_args.job_type=${wandb_init_args.job_type} \
wandb_init_args.name=${wandb_name} \
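For reference on the wandb_init_args overrides in the run script: Hydra treats a bare key=value override as modifying a key that already exists in the composed config, while +key=value adds a key the config does not define, and each form errors out if used the other way around. A small sketch using Hydra's compose API (the config path and values are illustrative, not taken from this repo's layout):

from hydra import compose, initialize

with initialize(version_base=None, config_path="conf"):
    # Override a key that the config already defines.
    cfg = compose(config_name="L1_15B_perf_test", overrides=["wandb_init_args.name=my-run"])
    # Add a key the config does not define; without the leading '+' this raises an error.
    cfg = compose(config_name="L1_15B_perf_test", overrides=["+wandb_init_args.group=perf-sweep"])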