Address second round of Claude review

Separius · Separius · commit 7d791db061b2 · 2026-05-11T06:44:23.000-07:00
- training_loop: skip train_dataloader on non-master ranks; batch the per-iter
  pickle-based all_gather_object to log-chunk boundaries; init input_ids on
  device.
- bypass_checkpoint_utils: atomic latest-symlink replacement via tmp+rename.
- sewing_kit/utils: rewrite batched_normalized_mse_loss to relative-L2
  (sum((x-t)^2) / (sum(t^2)+eps)); drop epsilon-offset-inside-MSE trick.
- child_init: hand each pruning mixin its own keys_to_remove copy, merge after,
  so composition order can't corrupt the state dict.
- puzzletron_nas_plugin: warn with the offending directory path when reusing
  existing activation scores / pruned checkpoints.
- pruning_utils: probe llm_config in _lm_attrs for InternVL-style configs.
- hydra_utils: route 5-arg warmup_steps calls with a fractional 4th arg to
  the legacy 4-arg path to avoid ZeroDivisionError.
- tests: relative-L2 properties (scale-invariance, zero-both, finiteness on
  zero target); new regression test pinning FunctionTarget kwarg dispatch.

Signed-off-by: Sepehr Sameni <ssameni@nvidia.com>
diff --git a/modelopt/torch/puzzletron/bypass_distillation/bypass_checkpoint_utils.py b/modelopt/torch/puzzletron/bypass_distillation/bypass_checkpoint_utils.py
@@ -15,6 +15,7 @@
 
 """Checkpoint utilities for bypass distillation."""
 
+import os
 import re
 from collections import OrderedDict
 from pathlib import Path
@@ -222,10 +223,15 @@ def save_bypass_checkpoint(
     save_checkpoint_from_shards(model=model, checkpoint_dir=checkpoint_dir, descriptor=descriptor)
 
     if dist.is_master():
-        # Create 'latest' symlink
+        # Create 'latest' symlink via tmp-symlink + atomic rename so concurrent
+        # readers on a shared filesystem never observe a missing `latest`. The
+        # plain unlink + symlink_to pair leaves a brief window where the link
+        # doesn't exist; Path.replace (== os.replace) is atomic on POSIX.
         latest_symlink = Path(cfg.bypass.experiment_dir) / "latest"
-        latest_symlink.unlink(missing_ok=True)
-        latest_symlink.symlink_to(checkpoint_dir.name)
+        tmp_symlink = latest_symlink.with_name(f".latest_tmp_{os.getpid()}")
+        tmp_symlink.unlink(missing_ok=True)
+        tmp_symlink.symlink_to(checkpoint_dir.name)
+        tmp_symlink.replace(latest_symlink)
         # Save config args json
         json_dump(cfg.bypass, checkpoint_dir / "args.json")
         # Save completed file
diff --git a/modelopt/torch/puzzletron/bypass_distillation/training_loop.py b/modelopt/torch/puzzletron/bypass_distillation/training_loop.py
@@ -57,7 +57,7 @@
 
 from .bypass_checkpoint_utils import find_latest_run_dir, load_local_state, save_bypass_checkpoint
 from .bypass_utils import get_distributed_modules_ownership, set_experiment_dir, set_experiment_id
-from .data_classes import GlobalRank, IterNum, IterStatistics, LocalTrainingStats, TimeToSaveSignal
+from .data_classes import GlobalRank, IterNum, IterStatistics, TimeToSaveSignal
 from .stitched_model_factory import StitchedModuleDescriptor, StitchedModulesProcessOwnership
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -118,6 +118,30 @@ def launch_bypass_distillation(hydra_cfg: DictConfig) -> None:
     mprint("Bypass distillation sweep completed")
 
 
+def _flush_loss_buffer(
+    local_buffer: dict[int, dict[str, float]],
+    stitched_losses_history: Optional[dict[int, dict[str, float]]],
+) -> None:
+    """All-gather buffered per-iter losses and merge into master's history.
+
+    Pickle-based ``all_gather_object`` was previously called on every micro-batch;
+    batching to log-chunk boundaries reduces that cost ~``iters_per_log_chunk``×.
+    All ranks must call this so the collective doesn't deadlock; only master
+    actually accumulates into ``stitched_losses_history``.
+    """
+    if not local_buffer:
+        return
+    gathered: list[Optional[dict[int, dict[str, float]]]] = [None] * dist.size()
+    torch.distributed.all_gather_object(gathered, local_buffer)
+    if dist.is_master():
+        assert stitched_losses_history is not None
+        for rank_buf in gathered:
+            if rank_buf is None:
+                continue
+            for it, losses in rank_buf.items():
+                stitched_losses_history.setdefault(it, {}).update(losses)
+
+
 def train(
     cfg: DictConfig,
     descriptor: ModelDescriptor,
@@ -126,7 +150,7 @@ def train(
     teacher_stitched_model: StitchedModule,
     stitched_module_descriptors: OrderedDict[str, StitchedModuleDescriptor],
     stitched_modules_process_ownership: StitchedModulesProcessOwnership,
-    train_dataloader: DataLoader,
+    train_dataloader: Optional[DataLoader],
     val_dataloader: Optional[DataLoader],
     student_model_config: PretrainedConfig,
     skip_first_batches: int = 0,
@@ -211,13 +235,18 @@ def train(
         f"Grad scaling status: {'enabled' if cfg.bypass.training.use_grad_scaling else 'disabled'}"
     )
 
-    train_iterator = iter(train_dataloader)
+    # Only master consumes the dataloader — `next(train_iterator)` is gated by
+    # `if dist.is_master()` further down. Building the iterator (or running
+    # skip_first_batches against it) on non-master ranks wastes startup time
+    # and memory proportional to the dataset, since each such rank tokenizes
+    # the full corpus only to throw it away.
+    train_iterator = iter(train_dataloader) if dist.is_master() else None
 
     # Advance past the first `skip_first_batches` batches before the training loop
     # starts. Used either to skip a known-bad batch range during debugging, or to
     # roll the data iterator forward when resuming a run (model + optimizer state
     # are restored from the checkpoint, but the dataloader itself starts fresh).
-    if skip_first_batches > 0:
+    if dist.is_master() and skip_first_batches > 0:
         mprint(f"Skipping first {skip_first_batches} batches before training")
         for _ in range(skip_first_batches):
             next(train_iterator)
@@ -233,8 +262,21 @@ def train(
     best_steps_by_name: dict[str, int] = dict(cfg.bypass.get("best_steps_by_name", {}))
     # Anchor for the "Δ from initial" column: per-block loss from the first log chunk.
     initial_losses_by_name: dict[str, float] = dict(cfg.bypass.get("initial_losses_by_name", {}))
-    # Buffer variables
-    input_ids = torch.zeros(1, 1, dtype=torch.int64)
+
+    # log_interval is in optimizer-step units; multiply by grad_accum to land in
+    # micro-batch units, which is what the per-iter loss collection counts.
+    iters_per_log_chunk = (
+        cfg.bypass.training.log_interval * cfg.bypass.training.grad_accumulation_steps
+    )
+    # Per-rank local buffer of {iter_num: {block_name: loss}}. We accumulate
+    # losses locally on every rank and only combine them via all_gather_object
+    # at log-chunk boundaries — the object collective is pickle-based and
+    # was previously the per-iter sync cost. See `_flush_loss_buffer` above.
+    local_losses_buffer: dict[int, dict[str, float]] = {}
+    # Buffer variables. Initialise on the active device so non-master ranks
+    # never hand a CPU tensor to a downstream GPU op if the master-only-fetch
+    # invariant is ever relaxed (today only master replaces this in the loop).
+    input_ids = torch.zeros(1, 1, dtype=torch.int64, device=device)
 
     aprint(
         f"previous rank: {str(prev_rank):<5} next rank: {str(next_rank):<5} {owned_stitched_module_indices=}"
@@ -247,6 +289,11 @@ def train(
         # and incremented at the END of each iteration, so we must use `>` (not `>=`)
         # to ensure step `max_steps` itself runs before exiting.
         if cfg.bypass.step_num > cfg.bypass.training.max_steps:
+            # Drain any residual buffered losses (< log-chunk boundary) so the
+            # final partial chunk's stats reach master and can be logged before
+            # the function returns. Must run on every rank — collective op.
+            _flush_loss_buffer(local_losses_buffer, stitched_losses_history)
+            local_losses_buffer.clear()
             if (
                 cfg.bypass.model.model_overrides.save_checkpoint_when_done
                 and not cfg.bypass.disable_checkpoint_save
@@ -386,25 +433,17 @@ def train(
         else:
             iter_stitched_module_losses = {}
 
-        # Collect losses from all ranks using all_gather_object
-        local_training_stats = LocalTrainingStats(
-            iter_num=cfg.bypass.iter_num,
-            stitched_module_losses=iter_stitched_module_losses,
-        )
-        all_training_stats = [None] * dist.size()
-        torch.distributed.all_gather_object(all_training_stats, local_training_stats)
-
-        if dist.is_master():
-            if cfg.bypass.iter_num == resumed_iter_num:
-                mprint(f"Starting from iter {cfg.bypass.iter_num}")
+        if dist.is_master() and cfg.bypass.iter_num == resumed_iter_num:
+            mprint(f"Starting from iter {cfg.bypass.iter_num}")
 
-            # Merge all stats into the losses history
-            assert stitched_losses_history is not None
-            merged_losses: dict[str, float] = {}
-            for stats in all_training_stats:
-                if stats is not None:
-                    merged_losses.update(stats.stitched_module_losses)
-            stitched_losses_history[cfg.bypass.iter_num] = merged_losses
+        # Buffer this rank's per-block losses locally. The cross-rank gather
+        # happens only at log-chunk boundaries (`_flush_loss_buffer`), which
+        # cuts the pickle-based all_gather_object cost from one per iteration
+        # down to one per `iters_per_log_chunk` micro-batches.
+        local_losses_buffer[cfg.bypass.iter_num] = iter_stitched_module_losses
+        if len(local_losses_buffer) >= iters_per_log_chunk:
+            _flush_loss_buffer(local_losses_buffer, stitched_losses_history)
+            local_losses_buffer.clear()
 
         cfg.bypass.token_count += cfg.bypass.training.tokens_per_iter
         iter_t1 = time.time()
@@ -441,11 +480,9 @@ def train(
         # Logging
         if dist.is_master():
             assert stitched_losses_history is not None
-            # log_interval is in optimizer-step units; the underlying history is
-            # per-iter (micro-batch), so the chunk window is grad_accum × wider.
-            iters_per_log_chunk = (
-                cfg.bypass.training.log_interval * cfg.bypass.training.grad_accumulation_steps
-            )
+            # `iters_per_log_chunk` is computed once before the loop (in
+            # micro-batch units = log_interval × grad_accum) and reused for
+            # both the gather-batching threshold and this log drain.
             while len(stitched_losses_history) >= iters_per_log_chunk:
                 lowest_iter = next(iter(stitched_losses_history.keys()))
 
@@ -830,23 +867,37 @@ def run_bypassed_training(cfg: DictConfig):
             load_streaming_fn if not cfg.bypass.data.load_from_disk else load_from_disk_fn
         )
 
-        train_dataloader = create_train_dataloader(
-            seed=seed,
-            tokenizer=tokenizer,
-            block_size=cfg.bypass.data.block_size,
-            dataset_path=cfg.dataset_path,
-            content_field=cfg.bypass.data.data_column,
-            fim_rate=cfg.bypass.data.fim_rate,
-            fim_spm_rate=cfg.bypass.data.fim_spm_rate,
-            micro_batch_size=cfg.bypass.training.micro_batch_size,
-            load_dataset_fn=load_dataset_fn,
-            keep_in_memory=cfg.bypass.data.keep_in_memory,
-            source_datasets_to_discard=cfg.bypass.data.get("source_datasets_to_discard", tuple()),
-            bos_rate=cfg.bypass.data.bos_rate,
-            shuffle_seed=cfg.bypass.data.shuffle_train_data_seed,
-        )
+        # Only master ever fetches from the train dataloader (training_loop.train
+        # gates `next(train_iterator)` on `dist.is_master()`), so skip the
+        # potentially-large HF dataset load + tokenisation on non-master ranks.
+        if dist.is_master():
+            train_dataloader = create_train_dataloader(
+                seed=seed,
+                tokenizer=tokenizer,
+                block_size=cfg.bypass.data.block_size,
+                dataset_path=cfg.dataset_path,
+                content_field=cfg.bypass.data.data_column,
+                fim_rate=cfg.bypass.data.fim_rate,
+                fim_spm_rate=cfg.bypass.data.fim_spm_rate,
+                micro_batch_size=cfg.bypass.training.micro_batch_size,
+                load_dataset_fn=load_dataset_fn,
+                keep_in_memory=cfg.bypass.data.keep_in_memory,
+                source_datasets_to_discard=cfg.bypass.data.get(
+                    "source_datasets_to_discard", tuple()
+                ),
+                bos_rate=cfg.bypass.data.bos_rate,
+                shuffle_seed=cfg.bypass.data.shuffle_train_data_seed,
+            )
+        else:
+            train_dataloader = None
 
         val_dataloader = None
+        # Note: val_dataloader is kept constructed on every rank even though only
+        # master reads from it inside calculate_losses_pipeline. The validation
+        # block uses `val_dataloader is not None` as a "validation enabled" gate
+        # that must agree across ranks — and calculate_losses_pipeline itself is
+        # pipeline-parallel and requires every rank to enter it. Skipping
+        # construction on non-master ranks would break those invariants.
         if not cfg.bypass.disable_validation:
             val_dataloader = create_validation_dataloader(
                 accelerator=None,
diff --git a/modelopt/torch/puzzletron/pruning/pruning_utils.py b/modelopt/torch/puzzletron/pruning/pruning_utils.py
@@ -72,11 +72,11 @@ def _lm_attrs(config):
 
     VL configs nest language-model fields like ``num_attention_heads``, ``head_dim``,
     and ``hidden_size`` under a sub-config. The attribute name varies by family —
-    ``text_config`` (Qwen3-VL, Llava, Idefics) and ``language_config`` (Llama-4 and
-    a handful of others) are both common. Probe both before falling back to the
-    raw config.
+    ``text_config`` (Qwen3-VL, Llava, Idefics), ``language_config`` (Llama-4 and a
+    handful of others), and ``llm_config`` (InternVL and friends) are all common.
+    Probe each before falling back to the raw config.
     """
-    for attr in ("text_config", "language_config"):
+    for attr in ("text_config", "language_config", "llm_config"):
         sub = getattr(config, attr, None)
         if sub is not None:
             return sub
diff --git a/modelopt/torch/puzzletron/puzzletron_nas_plugin.py b/modelopt/torch/puzzletron/puzzletron_nas_plugin.py
@@ -219,7 +219,9 @@ def convert_puzzletron_model(model: nn.Module, config: PuzzletronConfig) -> Conv
     activations_log_dir = Path(hydra_cfg.pruning.activations_log_dir)
     if activations_log_dir.exists() and any(activations_log_dir.glob("rank_*.pth")):
         mprint(
-            f"Puzzletron Progress {score_step}/{N}: pruning activation scores already exist, skipping scoring"
+            f"Puzzletron Progress {score_step}/{N}: pruning activation scores already "
+            f"exist at {activations_log_dir} — delete this directory to re-score with "
+            f"the current config."
         )
         dist.barrier()
     else:
@@ -231,7 +233,9 @@ def convert_puzzletron_model(model: nn.Module, config: PuzzletronConfig) -> Conv
     if dist.is_master():
         if pruned_ckpts_dir.exists() and any(pruned_ckpts_dir.iterdir()):
             mprint(
-                f"Puzzletron Progress {prune_step}/{N}: pruned checkpoints already exist, skipping pruning"
+                f"Puzzletron Progress {prune_step}/{N}: pruned checkpoints already "
+                f"exist at {pruned_ckpts_dir} — delete this directory to re-prune with "
+                f"the current config."
             )
         else:
             mprint(
diff --git a/modelopt/torch/puzzletron/sewing_kit/utils.py b/modelopt/torch/puzzletron/sewing_kit/utils.py
@@ -35,7 +35,6 @@
 import torch._dynamo
 import torch.distributed
 import torch.nn as nn
-import torch.nn.functional as F
 import torch.utils._pytree as pytree
 from torch import Tensor
 from torch._subclasses import FakeTensor, FakeTensorMode
@@ -483,29 +482,14 @@ def batched_normalized_mse_loss(
     epsilon: float = 1e-6,
     batch_dims: Sequence[int] = (0,),
 ) -> torch.Tensor:
-    """Like normalized_mse_loss, but normalization is done on non-batch dims, then averaged.
-
-    Useful when activations within a batch item should be normalized independently
-    rather than normalizing across the full batch.
-
-    Note: this slightly diverges from the original Puzzle implementation. With
-    per-batch-element normalization, an all-zero target slice produces a
-    denominator of ``epsilon ** 2 ~= 1e-12``, which then explodes the loss for
-    that slice (the global-reduction variant in ``normalized_mse_loss`` dilutes
-    it across non-zero elements, hiding the issue). We clamp the denominator
-    to a floor of ``epsilon`` so the per-element minimum matches the intent of
-    the epsilon term. The clamp only triggers on near-zero target slices —
-    typical activations are unaffected.
-
-    The denominator uses ``MSE(target, epsilon_tensor)`` rather than
-    ``mean(target ** 2)`` for consistency with ``normalized_mse_loss``; the
-    ``clamp(min=epsilon)`` below already handles zero-target slices, so the
-    epsilon offset inside the MSE is redundant but harmless at ``1e-6``.
+    """Per-batch-element relative-L2 loss.
+
+    For each batch element, computes ``||input - target||^2 / (||target||^2 + eps)``
+    over the non-batch dims, then averages across batch elements. The additive
+    ``epsilon`` in the denominator handles all-zero target slices without a hard
+    clamp; the loss stays approximately scale-invariant when ``||target||^2 >> eps``.
     """
-    norm_dims = list(set(range(input.ndim)) - set(batch_dims))
-    norm_of_target_vectors = F.mse_loss(
-        target, torch.zeros_like(target) + epsilon, reduction="none"
-    ).mean(norm_dims)
-    norm_of_target_vectors = norm_of_target_vectors.clamp(min=epsilon)
-    loss = F.mse_loss(input, target, reduction="none").mean(norm_dims) / norm_of_target_vectors
-    return loss.mean()
+    norm_dims = [d for d in range(input.ndim) if d not in batch_dims]
+    num = ((input - target) ** 2).sum(dim=norm_dims)
+    den = (target**2).sum(dim=norm_dims) + epsilon
+    return (num / den).mean()
diff --git a/modelopt/torch/puzzletron/tools/bypassed_training/child_init.py b/modelopt/torch/puzzletron/tools/bypassed_training/child_init.py
@@ -86,9 +86,14 @@ def _process_single_layer(
     # Delegate to pruning_mixin if available (supports a single mixin or a list of mixins).
     # When the bypass factory composes multiple mixins (e.g. experts_removal + kv_heads),
     # it passes them as a list so each can contribute its slice of the layer state dict.
+    # Each mixin gets its own copy of keys_to_remove and the copies are merged afterward,
+    # so ordering between mixins can't corrupt the state dict even if a future pair of
+    # mixins ever happens to touch overlapping keys.
     if pruning_mixin is not None:
         _mixins = pruning_mixin if isinstance(pruning_mixin, list) else [pruning_mixin]
+        merged_keys_to_remove = dict(keys_to_remove)
         for _mixin in _mixins:
+            mixin_keys = dict(keys_to_remove)
             _layer_out = _mixin.prune_single_layer(
                 layer_idx=layer_idx,
                 parent_state_dict=parent_state_dict,
@@ -104,10 +109,11 @@ def _process_single_layer(
                 is_original_mha=is_original_mha,
                 head_size=head_size,
                 hidden_size=hidden_size,
-                keys_to_remove=keys_to_remove,
+                keys_to_remove=mixin_keys,
             )
             layer_out_state_dict.update(_layer_out)
-        return layer_out_state_dict, keys_to_remove
+            merged_keys_to_remove.update(mixin_keys)
+        return layer_out_state_dict, merged_keys_to_remove
 
     # Legacy inline processing (fallback when no pruning_mixin)
 
diff --git a/modelopt/torch/puzzletron/tools/hydra_utils.py b/modelopt/torch/puzzletron/tools/hydra_utils.py
@@ -54,6 +54,12 @@ def _warmup_steps_resolver(*args):
     if len(args) == 4:
         t, b, m, p = args
         return warmup_steps(t, b, m, pct=p)
+    # A 5-arg call where the 4th arg is a fractional float almost certainly
+    # means `pct` landed in the `grad_accum` slot — `int(0.05) == 0` would
+    # later raise ZeroDivisionError inside `warmup_steps`. Treat it as legacy.
+    if len(args) == 5 and isinstance(args[3], float) and args[3] < 1.0:
+        t, b, m, p, _ = args
+        return warmup_steps(t, b, m, pct=p)
     return warmup_steps(*args)
 
 
diff --git a/tests/unit/torch/puzzletron/test_bypass_losses.py b/tests/unit/torch/puzzletron/test_bypass_losses.py
diff --git a/tests/unit/torch/puzzletron/test_sewing_kit_function_target_kwargs.py b/tests/unit/torch/puzzletron/test_sewing_kit_function_target_kwargs.py