NVIDIA-NeMo
diff --git a/src/megatron/bridge/data/energon/hf_task_encoder.py b/src/megatron/bridge/data/energon/hf_task_encoder.py
Lines changed: 13 additions & 28 deletions
diff --git a/src/megatron/bridge/data/hf_datasets/conversation_dataset.py b/src/megatron/bridge/data/hf_datasets/conversation_dataset.py
Lines changed: 12 additions & 33 deletions
diff --git a/src/megatron/bridge/data/hf_datasets/provider.py b/src/megatron/bridge/data/hf_datasets/provider.py
Lines changed: 2 additions & 15 deletions
diff --git a/src/megatron/bridge/data/hf_datasets/text_collate.py b/src/megatron/bridge/data/hf_datasets/text_collate.py
Lines changed: 11 additions & 3 deletions
diff --git a/src/megatron/bridge/data/sequence_packing.py b/src/megatron/bridge/data/sequence_packing.py
Lines changed: 5 additions & 23 deletions
diff --git a/src/megatron/bridge/data/vlm_batching.py b/src/megatron/bridge/data/vlm_batching.py
Lines changed: 3 additions & 3 deletions
diff --git a/src/megatron/bridge/models/gemma_vl/data/collate_fn.py b/src/megatron/bridge/models/gemma_vl/data/collate_fn.py
Lines changed: 2 additions & 2 deletions
diff --git a/src/megatron/bridge/models/glm_vl/data/collate_fn.py b/src/megatron/bridge/models/glm_vl/data/collate_fn.py
Lines changed: 7 additions & 2 deletions
diff --git a/src/megatron/bridge/models/kimi_vl/data/collate_fn.py b/src/megatron/bridge/models/kimi_vl/data/collate_fn.py
Lines changed: 7 additions & 2 deletions
@@ -20,7 +20,6 @@
 """
 
 import dataclasses
-import inspect
 from dataclasses import dataclass, field
 from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple
 
@@ -100,7 +99,7 @@ def __init__(
         visual_keys: Sequence[str] = ("pixel_values",),
         min_pixels: Optional[int] = None,
         max_pixels: Optional[int] = None,
-        collate_fn: Callable[[list, Any], dict[str, Any]] | None = None,
+        collate_fn: Callable[..., dict[str, Any]] | None = None,
         pad_to_max_length: bool = False,
         pad_to_multiple_of: int = 128,
         enable_in_batch_packing: bool = False,
@@ -127,31 +126,6 @@ def __init__(
                 )
             self._collate_impl = COLLATE_FNS[collate_key]
 
-    def _supported_collate_kwargs(self) -> dict[str, Any]:
-        """Return encoder options accepted by the selected collate function."""
-        try:
-            parameters = inspect.signature(self._collate_impl).parameters
-        except (TypeError, ValueError):
-            return {}
-
-        accepts_kwargs = any(param.kind == inspect.Parameter.VAR_KEYWORD for param in parameters.values())
-        candidates: dict[str, Any] = {
-            "visual_keys": self.visual_keys,
-            "sequence_length": self.seq_length,
-            "pad_to_max_length": self.pad_to_max_length,
-            "pad_to_multiple_of": self.pad_to_multiple_of,
-            "pack_sequences": self.enable_in_batch_packing,
-            "pack_sequences_pad_to_multiple_of": self.in_batch_packing_pad_to_multiple_of,
-        }
-        if self.min_pixels is not None:
-            candidates["min_pixels"] = self.min_pixels
-        if self.max_pixels is not None:
-            candidates["max_pixels"] = self.max_pixels
-
-        if accepts_kwargs:
-            return candidates
-        return {key: value for key, value in candidates.items() if key in parameters}
-
     def encode_sample(self, sample: ChatMLSample) -> HFEnergonSample:
         """Normalize a single ChatML sample into a HF-style collate example.
 
@@ -185,7 +159,18 @@ def collate_fn(self, examples: list[dict[str, Any]]) -> dict[str, Any]:
             The exact batch dictionary returned by the selected HF collate
             function for this processor type.
         """
-        return self._collate_impl(examples, self.processor, **self._supported_collate_kwargs())
+        return self._collate_impl(
+            examples,
+            self.processor,
+            visual_keys=self.visual_keys,
+            min_pixels=self.min_pixels,
+            max_pixels=self.max_pixels,
+            sequence_length=self.seq_length,
+            pad_to_max_length=self.pad_to_max_length,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            pack_sequences=self.enable_in_batch_packing,
+            in_batch_packing_pad_to_multiple_of=self.in_batch_packing_pad_to_multiple_of,
+        )
 
     # ------------------------------------------------------------------
     # batch
 
@@ -36,7 +36,7 @@ def __init__(
         base_examples: List[Dict[str, Any]],
         target_length: int,
         processor: Any,
-        collate_impl: Optional[Callable[[list, Any], Dict[str, torch.Tensor]]] = None,
+        collate_impl: Optional[Callable[..., Dict[str, torch.Tensor]]] = None,
         sequence_length: int | None = None,
         pad_to_max_length: bool = False,
         pad_to_multiple_of: int = 128,
@@ -49,48 +49,27 @@ def __init__(
         self._processor = processor
         # Choose collate implementation by processor type name when not provided
         collate_key = type(processor).__name__ if processor is not None else "default"
-        if collate_impl is not None:
-            selected_impl = collate_impl
-        else:
+        if collate_impl is None:
             from megatron.bridge.data.vlm_datasets.collate import COLLATE_FNS
 
             if collate_key not in COLLATE_FNS:
                 raise ValueError(
                     f"No conversation collate function registered for processor type '{collate_key}'. "
                     "Add it to COLLATE_FNS or pass collate_impl explicitly."
                 )
-            selected_impl = COLLATE_FNS[collate_key]
+            collate_impl = COLLATE_FNS[collate_key]
+        assert collate_impl is not None
 
-        # If in-batch packing is requested, bind the selected collate's packing
-        # kwargs via functools.partial so the DataLoader just calls f(batch, processor).
-        import inspect
-        from functools import partial
-
-        sig = inspect.signature(selected_impl)
-        collate_kwargs: dict[str, Any] = {}
-        if sequence_length is not None and "sequence_length" in sig.parameters:
-            collate_kwargs["sequence_length"] = sequence_length
-            if "pad_to_max_length" in sig.parameters:
-                collate_kwargs["pad_to_max_length"] = pad_to_max_length
-            if "pad_to_multiple_of" in sig.parameters:
-                collate_kwargs["pad_to_multiple_of"] = pad_to_multiple_of
-
-        if enable_in_batch_packing:
-            if "pack_sequences" in sig.parameters:
-                collate_kwargs["pack_sequences"] = True
-                if "pack_sequences_pad_to_multiple_of" in sig.parameters:
-                    collate_kwargs["pack_sequences_pad_to_multiple_of"] = in_batch_packing_pad_to_multiple_of
-            else:
-                raise ValueError(
-                    f"Collate function {getattr(selected_impl, '__name__', selected_impl)} "
-                    f"does not accept in-batch packing. Use a collate that supports packing "
-                    f"(e.g. nemotron_omni_collate_fn)."
-                )
-        if collate_kwargs:
-            selected_impl = partial(selected_impl, **collate_kwargs)
+        collate_kwargs: dict[str, Any] = {
+            "sequence_length": sequence_length,
+            "pad_to_max_length": pad_to_max_length,
+            "pad_to_multiple_of": pad_to_multiple_of,
+            "pack_sequences": enable_in_batch_packing,
+            "in_batch_packing_pad_to_multiple_of": in_batch_packing_pad_to_multiple_of,
+        }
 
         def _bound_collate(batch: list) -> Dict[str, torch.Tensor]:
-            return selected_impl(batch, self._processor)  # type: ignore[call-arg]
+            return collate_impl(batch, self._processor, **collate_kwargs)
 
         self.collate_fn = _bound_collate
 
 
@@ -14,7 +14,6 @@
 
 """Provider that builds conversation datasets from HuggingFace datasets."""
 
-import inspect
 import logging
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
@@ -78,7 +77,7 @@ class HFConversationDatasetProvider(DatasetProvider):
     do_test: bool = True
 
     # Optional collate override. If None, inferred from processor type.
-    collate_impl: Optional[Callable[[list, Any], Dict[str, torch.Tensor]]] = None
+    collate_impl: Optional[Callable[..., Dict[str, torch.Tensor]]] = None
 
     # Keep parity with GPTDatasetConfig usage in batching utilities
     skip_getting_attention_mask_from_dataset: bool = True
@@ -99,18 +98,6 @@ class HFConversationDatasetProvider(DatasetProvider):
     # ConfigContainer fills this from model CP/SP constraints when available.
     in_batch_packing_pad_to_multiple_of: int = 1
 
-    def _collate_supports_packing(self, processor: Any) -> bool:
-        collate_key = type(processor).__name__ if processor is not None else "default"
-        if self.collate_impl is not None:
-            selected_impl = self.collate_impl
-        else:
-            from megatron.bridge.data.vlm_datasets.collate import COLLATE_FNS
-
-            selected_impl = COLLATE_FNS.get(collate_key)
-        if selected_impl is None:
-            return False
-        return "pack_sequences" in inspect.signature(selected_impl).parameters
-
     def _get_maker(self) -> Callable[..., List[Dict[str, Any]]]:
         return get_hf_dataset_maker(self.maker_name)
 
@@ -139,7 +126,7 @@ def _build_split_dataset(
             sequence_length=self.seq_length,
             pad_to_max_length=self.pad_to_max_length,
             pad_to_multiple_of=self.pad_to_multiple_of,
-            enable_in_batch_packing=self.enable_in_batch_packing and self._collate_supports_packing(processor),
+            enable_in_batch_packing=self.enable_in_batch_packing,
             in_batch_packing_pad_to_multiple_of=self.in_batch_packing_pad_to_multiple_of,
         )
 
 
@@ -156,11 +156,13 @@ def text_chat_collate_fn(
     processor: Any,
     *,
     max_length: int | None = None,
+    sequence_length: int | None = None,
     pad_to_max_length: bool = False,
+    pad_to_multiple_of: int = 128,
     warn_on_all_masked: bool = True,
     ignore_index: int = IGNORE_INDEX,
     pack_sequences: bool = False,
-    pack_sequences_pad_to_multiple_of: int = 1,
+    in_batch_packing_pad_to_multiple_of: int = 1,
 ) -> dict[str, Any]:
     """Collate text-only HF chat examples using the shared assistant-mask path.
 
@@ -170,20 +172,26 @@ def text_chat_collate_fn(
         processor: A HF tokenizer or processor. It must expose
             ``apply_chat_template`` directly or through ``processor.tokenizer``.
         max_length: Optional tokenizer truncation length.
+        sequence_length: Optional tokenizer truncation length used by
+            conversation-dataset providers.
         pad_to_max_length: If set with ``max_length``, pad every row to
             ``max_length`` instead of the longest row in the batch.
+        pad_to_multiple_of: Accepted for parity with VLM collate functions.
         warn_on_all_masked: Forwarded to assistant-mask construction.
         ignore_index: Label ignore value for masked targets.
         pack_sequences: If True, flatten the padded microbatch and emit
             packed-sequence metadata for GPT-style training steps.
-        pack_sequences_pad_to_multiple_of: Optional per-sequence length multiple
+        in_batch_packing_pad_to_multiple_of: Optional per-sequence length multiple
             used when ``pack_sequences`` inserts padding for CP/SP constraints.
 
     Returns:
         Batch dictionary with VLM-style ``input_ids`` and GPT-style ``tokens``
         aliases, shifted ``labels`` and ``loss_mask``, ``position_ids``, and
         optional tokenizer fields such as ``attention_mask``.
     """
+    del pad_to_multiple_of
+
+    max_length = max_length if max_length is not None else sequence_length
     tokenizer = get_processor_tokenizer(processor)
     conversations = [_normalize_text_conversation(example) for example in examples]
     rendered_texts = [_render_chat(conversation, processor, tokenizer) for conversation in conversations]
@@ -232,6 +240,6 @@ def text_chat_collate_fn(
             batch,
             pad_token_id=int(pad_token_id),
             ignore_index=ignore_index,
-            pad_to_multiple_of=pack_sequences_pad_to_multiple_of,
+            pad_to_multiple_of=in_batch_packing_pad_to_multiple_of,
         )
     return batch
@@ -16,16 +16,12 @@
 
 from __future__ import annotations
 
-import logging
 from collections.abc import MutableMapping
 from typing import Any
 
 import torch
 
 
-logger = logging.getLogger(__name__)
-
-
 def _sequence_lengths(tokens: torch.Tensor, *, pad_token_id: int, padding_mask: torch.Tensor | None) -> list[int]:
     lengths = []
     batch_size, seq_len = tokens.shape
@@ -202,25 +198,11 @@ def pack_batch_sequences(
         "attention_mask": padding_mask,
         "position_ids": position_ids,
     }
-    try:
-        pack_padded_sequences_in_batch(
-            batch,
-            pad_token_id=pad_token_id,
-            pad_to_multiple_of=pad_to_multiple_of,
-        )
-    except ValueError as exc:
-        if str(exc) != "Cannot pack a batch with no non-padding tokens.":
-            raise
-        logger.warning("No valid sequences found in batch, skipping packing")
-        return (
-            tokens[:, :0],
-            labels[:, :0] if labels is not None else None,
-            loss_mask[:, :0] if loss_mask is not None else None,
-            attention_mask,
-            position_ids[:, :0],
-            torch.tensor([0], dtype=torch.int32, device=tokens.device),
-            torch.tensor(0, dtype=torch.int32, device=tokens.device),
-        )
+    pack_padded_sequences_in_batch(
+        batch,
+        pad_token_id=pad_token_id,
+        pad_to_multiple_of=pad_to_multiple_of,
+    )
 
     return (
         batch["input_ids"],
 
@@ -107,7 +107,7 @@ def prepare_vlm_batch_for_training(
     pad_to_max_length: bool = False,
     pad_to_multiple_of: int = 128,
     pack_sequences: bool = False,
-    pack_sequences_pad_to_multiple_of: int = 1,
+    in_batch_packing_pad_to_multiple_of: int = 1,
     pad_token_id: int = 0,
     ignore_index: int = IGNORE_INDEX,
 ) -> None:
@@ -125,7 +125,7 @@ def prepare_vlm_batch_for_training(
             ``pad_to_max_length`` is false.
         pack_sequences: If true, flatten the microbatch and emit packed-sequence
             metadata instead of returning a padded attention mask.
-        pack_sequences_pad_to_multiple_of: Per-sequence packed length multiple
+        in_batch_packing_pad_to_multiple_of: Per-sequence packed length multiple
             for CP/SP constraints.
         pad_token_id: Token value for inserted padding.
         ignore_index: Label value for inserted padding.
@@ -147,7 +147,7 @@ def prepare_vlm_batch_for_training(
             batch,
             pad_token_id=pad_token_id,
             ignore_index=ignore_index,
-            pad_to_multiple_of=pack_sequences_pad_to_multiple_of,
+            pad_to_multiple_of=in_batch_packing_pad_to_multiple_of,
             tokens_key=token_key,
         )
         # Legacy VLM packing always carried both padded and unpadded metadata,
 
@@ -39,7 +39,7 @@ def gemma3_vl_collate_fn(
     pad_to_max_length: bool = False,
     pad_to_multiple_of: int = 128,
     pack_sequences: bool = False,
-    pack_sequences_pad_to_multiple_of: int = 1,
+    in_batch_packing_pad_to_multiple_of: int = 1,
 ) -> dict[str, torch.Tensor]:
     """Collate function for Gemma3 VL models."""
     skipped_tokens = extract_skipped_token_ids(processor)
@@ -113,7 +113,7 @@ def gemma3_vl_collate_fn(
         pad_to_max_length=pad_to_max_length,
         pad_to_multiple_of=pad_to_multiple_of,
         pack_sequences=pack_sequences,
-        pack_sequences_pad_to_multiple_of=pack_sequences_pad_to_multiple_of,
+        in_batch_packing_pad_to_multiple_of=in_batch_packing_pad_to_multiple_of,
         ignore_index=IGNORE_INDEX,
     )
     return batch
 
@@ -28,11 +28,14 @@ def glm4v_collate_fn(
     examples: list,
     processor,
     *,
+    visual_keys: object = None,
+    min_pixels: int | None = None,
+    max_pixels: int | None = None,
     sequence_length: int | None = None,
     pad_to_max_length: bool = False,
     pad_to_multiple_of: int = 128,
     pack_sequences: bool = False,
-    pack_sequences_pad_to_multiple_of: int = 1,
+    in_batch_packing_pad_to_multiple_of: int = 1,
 ) -> dict[str, torch.Tensor]:
     """Collate function for GLM-4.5V model.
 
@@ -42,6 +45,8 @@ def glm4v_collate_fn(
     defaults).  We wrap all visual tensors — including ``mm_token_type_ids`` — in
     :class:`GenericVisualInputs` so they flow through ``vlm_step.py`` to the model.
     """
+    del visual_keys, min_pixels, max_pixels
+
     skipped_tokens = extract_skipped_token_ids(processor)
 
     batch = processor.apply_chat_template(
@@ -88,7 +93,7 @@ def glm4v_collate_fn(
         pad_to_max_length=pad_to_max_length,
         pad_to_multiple_of=pad_to_multiple_of,
         pack_sequences=pack_sequences,
-        pack_sequences_pad_to_multiple_of=pack_sequences_pad_to_multiple_of,
+        in_batch_packing_pad_to_multiple_of=in_batch_packing_pad_to_multiple_of,
         ignore_index=IGNORE_INDEX,
     )
 
 
@@ -138,11 +138,14 @@ def kimi_k25_vl_collate_fn(
     processor,
     max_length: int | None = None,
     *,
+    visual_keys: object = None,
+    min_pixels: int | None = None,
+    max_pixels: int | None = None,
     sequence_length: int | None = None,
     pad_to_max_length: bool = False,
     pad_to_multiple_of: int = 128,
     pack_sequences: bool = False,
-    pack_sequences_pad_to_multiple_of: int = 1,
+    in_batch_packing_pad_to_multiple_of: int = 1,
 ) -> dict[str, torch.Tensor]:
     """Collate function for Kimi K2.5 VL processors with pre-expanded image tokens.
 
@@ -152,6 +155,8 @@ def kimi_k25_vl_collate_fn(
     3. Pads all sequences to fixed max_length
     This ensures the model forward pass doesn't change sequence length dynamically.
     """
+    del visual_keys, min_pixels, max_pixels
+
     skipped_tokens = extract_skipped_token_ids(processor)
 
     # Get media token ID
@@ -310,7 +315,7 @@ def kimi_k25_vl_collate_fn(
         pad_to_max_length=pad_to_max_length,
         pad_to_multiple_of=pad_to_multiple_of,
         pack_sequences=pack_sequences,
-        pack_sequences_pad_to_multiple_of=pack_sequences_pad_to_multiple_of,
+        in_batch_packing_pad_to_multiple_of=in_batch_packing_pad_to_multiple_of,
         ignore_index=IGNORE_INDEX,
     )
     return result