Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions nemo_automodel/_transformers/model_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@
import torch
from huggingface_hub import snapshot_download
from transformers import AutoConfig, PretrainedConfig

try:
from huggingface_hub.errors import StrictDataclassClassValidationError
except ImportError:
StrictDataclassClassValidationError = ValueError
from transformers.modeling_utils import PreTrainedModel

# For models that still accesses config.pad_token_id after v5 removes it in PretrainedConfig
Expand Down Expand Up @@ -205,7 +210,7 @@ def get_hf_config(pretrained_model_name_or_path, attn_implementation, **kwargs):
trust_remote_code=trust_remote_code,
attn_implementation=attn_implementation,
)
except ValueError as e:
except (ValueError, StrictDataclassClassValidationError) as e:
err = str(e)
if "does not recognize this architecture" in err:
raise ValueError(
Expand All @@ -220,7 +225,9 @@ def get_hf_config(pretrained_model_name_or_path, attn_implementation, **kwargs):
) from e
# Some upstream configs (e.g. stepfun-ai/Step-3.5-Flash) ship
# layer_types longer than num_hidden_layers, which newer transformers
# versions reject during config instantiation. Fix the raw dict and retry.
# versions reject during config instantiation. huggingface_hub wraps
# the validator's ValueError in StrictDataclassClassValidationError
# (not a ValueError subclass), so both exception types must be caught.
if "num_hidden_layers" in err and ("layer_types" in err or "layer types" in err):
hf_config = _load_config_with_layer_types_fix(
pretrained_model_name_or_path,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@
from transformers import AutoTokenizer
from transformers.tokenization_utils_base import BatchEncoding

try:
from huggingface_hub.errors import StrictDataclassClassValidationError
except ImportError:
StrictDataclassClassValidationError = ValueError

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -326,7 +331,21 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, add_bos_token=Tru
add_bos_token: Whether to add BOS token (default: True)
add_eos_token: Whether to add EOS token (default: True)
"""
tokenizer = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
try:
tokenizer = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
except (ValueError, StrictDataclassClassValidationError) as e:
# AutoTokenizer.from_pretrained internally calls AutoConfig.from_pretrained,
# so configs whose layer_types length differs from num_hidden_layers (e.g.
# stepfun-ai/Step-3.5-Flash) trip validate_layer_type before the tokenizer
# is built. The tokenizer itself doesn't depend on layer_types, so relax
# the validator globally and retry.
err = str(e)
if "num_hidden_layers" not in err or ("layer_types" not in err and "layer types" not in err):
raise
from nemo_automodel._transformers.v4_patches.layer_types import relax_layer_types_validator

relax_layer_types_validator()
tokenizer = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)

# Convert TikToken-based tokenizers to fast (Rust-backed) tokenizers so that
# char_to_token() works natively for {% generation %} mask computation.
Expand Down
83 changes: 83 additions & 0 deletions nemo_automodel/_transformers/v4_patches/layer_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,8 +206,91 @@ def install_layer_types_patch_hook() -> bool:
return True


# Module-level latch: set True once relax_layer_types_validator() has run (or
# determined there is nothing to patch), so the relaxation happens at most once.
_VALIDATOR_RELAXED: bool = False


def _noop_validate_layer_type(self):  # noqa: ARG001
    """No-op replacement for ``PretrainedConfig.validate_layer_type``.

    Takes ``self`` (unused) so it is call-compatible with the validator it
    replaces, and always returns ``None`` instead of validating anything.
    """
    return None


# relax_layer_types_validator() identifies validator entries to swap by
# __name__, so the no-op must advertise the original validator's name.
_noop_validate_layer_type.__name__ = "validate_layer_type"


def relax_layer_types_validator() -> bool:
    """Neutralize ``PretrainedConfig.validate_layer_type`` everywhere it is registered.

    Certain published configs (e.g. ``stepfun-ai/Step-3.5-Flash``) carry a
    ``layer_types`` list whose length disagrees with ``num_hidden_layers``;
    recent transformers releases refuse to instantiate such configs, and
    ``huggingface_hub`` surfaces the failure as
    ``StrictDataclassClassValidationError``. Turning the validator into a
    no-op lets the config load as-is.

    Because the ``huggingface_hub`` ``@strict`` decorator snapshots validator
    references into each class's ``__class_validators__`` at decoration time,
    rebinding ``PretrainedConfig.validate_layer_type`` alone is not enough:
    every subclass decorated earlier still holds the old function. This
    helper therefore rewrites both the class attribute and the matching
    entries in ``__class_validators__`` across the entire live subclass
    tree; classes decorated afterwards inherit the no-op automatically via
    ``getattr(cls, "validate_layer_type")``.

    Idempotent and best-effort: any failure (attribute absent, transformers
    unavailable) is logged and swallowed rather than propagated.

    Returns:
        ``True`` if the patch was applied on this call, ``False`` otherwise.
    """
    global _VALIDATOR_RELAXED
    if _VALIDATOR_RELAXED:
        return False

    try:
        from transformers import PretrainedConfig
    except ImportError:
        logger.debug("[v4_patches.layer_types] transformers not importable; skipping relax.")
        return False
    except Exception as exc:
        logger.warning("[v4_patches.layer_types] transformers import failed: %s", exc)
        return False

    if not hasattr(PretrainedConfig, "validate_layer_type"):
        logger.debug("[v4_patches.layer_types] validate_layer_type missing; nothing to relax.")
        _VALIDATOR_RELAXED = True
        return False

    try:
        PretrainedConfig.validate_layer_type = _noop_validate_layer_type
    except Exception as exc:
        logger.warning("[v4_patches.layer_types] failed to rebind validate_layer_type: %s", exc)
        return False

    def _swap_registered_validators(klass) -> None:
        # Mutate the class's own __class_validators__ list in place so every
        # holder of that list object sees the no-op.
        registered = klass.__dict__.get("__class_validators__")
        if isinstance(registered, list):
            registered[:] = [
                _noop_validate_layer_type
                if getattr(entry, "__name__", None) == "validate_layer_type"
                else entry
                for entry in registered
            ]

    try:
        # Iterative walk over the live subclass tree (base included).
        pending = [PretrainedConfig]
        while pending:
            klass = pending.pop()
            _swap_registered_validators(klass)
            try:
                pending.extend(klass.__subclasses__())
            except Exception:
                # A misbehaving class keeps its subtree; siblings still proceed.
                continue
    except Exception as exc:
        logger.warning("[v4_patches.layer_types] subclass walk failed: %s", exc)
        return False

    _VALIDATOR_RELAXED = True
    logger.info("[v4_patches.layer_types] relaxed validate_layer_type to no-op")
    return True


# Public surface of this patch module; relax_layer_types_validator is the
# hook callers invoke to retry config loads that fail layer_types validation.
__all__ = [
    "DEFAULT_EXTRA_LAYER_TYPES",
    "install_layer_types_patch_hook",
    "patch_allowed_layer_types",
    "relax_layer_types_validator",
]
25 changes: 22 additions & 3 deletions nemo_automodel/recipes/vlm/finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,9 +363,28 @@ def build_dataloader(
try:
processor = AutoProcessor.from_pretrained(pretrained_model_name_or_path, **processor_kwargs)
except Exception as e:
# Some models do not provide an AutoProcessor
processor = None
logging.warning(f"AutoProcessor not available for {pretrained_model_name_or_path} ({e}). ")
# AutoProcessor.from_pretrained internally loads AutoConfig. Configs
# whose layer_types length differs from num_hidden_layers trip
# validate_layer_type. The processor itself doesn't depend on
# layer_types, so relax the validator and retry once before giving up.
err = str(e)
if "num_hidden_layers" in err and ("layer_types" in err or "layer types" in err):
from nemo_automodel._transformers.v4_patches.layer_types import (
relax_layer_types_validator,
)

relax_layer_types_validator()
try:
processor = AutoProcessor.from_pretrained(pretrained_model_name_or_path, **processor_kwargs)
except Exception as retry_exc:
processor = None
logging.warning(
f"AutoProcessor not available for {pretrained_model_name_or_path} ({retry_exc}). "
)
else:
# Some models do not provide an AutoProcessor
processor = None
logging.warning(f"AutoProcessor not available for {pretrained_model_name_or_path} ({e}). ")

chat_template_raw = cfg_ds.__dict__.pop("chat_template", None)
# Update chat_template if chat_template is given
Expand Down
45 changes: 42 additions & 3 deletions tests/unit_tests/_transformers/test_auto_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,47 @@ def test_cls_sep_pattern_preserved_when_tokens_present(self):
tok = NeMoAutoTokenizer.from_pretrained("dummy/model")
assert tok.special_tokens_pattern == "cls_sep"

def test_retry_on_layer_types_mismatch(self):
"""When AutoTokenizer fails because the underlying config has
layer_types longer than num_hidden_layers (e.g. stepfun-ai/Step-3.5-Flash),
the wrapper should relax the validator globally and retry."""
from huggingface_hub.errors import StrictDataclassClassValidationError

stub = _StubHFTokenizer()
calls = {"n": 0}

def fake_from_pretrained(pretrained_model_name_or_path, *args, **kwargs):
calls["n"] += 1
if calls["n"] == 1:
cause = ValueError("`num_hidden_layers` (45) must be equal to the number of layer types (48).")
raise StrictDataclassClassValidationError(validator="validate_layer_type", cause=cause)
return stub

with (
patch("transformers.AutoTokenizer.from_pretrained", side_effect=fake_from_pretrained),
patch(
"nemo_automodel._transformers.v4_patches.layer_types.relax_layer_types_validator",
return_value=True,
) as mock_relax,
):
tok = NeMoAutoTokenizer.from_pretrained("stepfun-ai/Step-3.5-Flash", trust_remote_code=True)
assert tok is not None
assert calls["n"] == 2
mock_relax.assert_called_once()

def test_unrelated_value_error_is_not_retried(self):
"""Unrelated ValueErrors should propagate without triggering the fix path."""
with (
patch(
"transformers.AutoTokenizer.from_pretrained",
side_effect=ValueError("totally unrelated tokenizer failure"),
),
patch("nemo_automodel._transformers.v4_patches.layer_types.relax_layer_types_validator") as mock_relax,
):
with pytest.raises(ValueError, match="totally unrelated"):
NeMoAutoTokenizer.from_pretrained("dummy/model")
mock_relax.assert_not_called()

def test_force_hf_passthrough(self):
stub = _StubHFTokenizer()
with patch("transformers.AutoTokenizer.from_pretrained", return_value=stub):
Expand Down Expand Up @@ -841,9 +882,7 @@ def test_converts_tiktoken_tokenizer(self):
mock_fast_tokenizer.is_fast = True

with (
patch(
"transformers.convert_slow_tokenizer.TikTokenConverter"
) as mock_converter_cls,
patch("transformers.convert_slow_tokenizer.TikTokenConverter") as mock_converter_cls,
patch(
"transformers.tokenization_utils_tokenizers.TokenizersBackend",
return_value=mock_fast_tokenizer,
Expand Down
19 changes: 19 additions & 0 deletions tests/unit_tests/_transformers/test_model_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,25 @@ def test_retry_on_layer_types_mismatch(self, mock_from_pretrained, _mock_trust,
call_kwargs = mock_fix.call_args[1]
assert call_kwargs["trust_remote_code"] is True

@patch("nemo_automodel._transformers.model_init._load_config_with_layer_types_fix")
@patch("nemo_automodel._transformers.model_init.resolve_trust_remote_code", return_value=True)
@patch("nemo_automodel._transformers.model_init.AutoConfig.from_pretrained")
def test_retry_on_strict_dataclass_validation_error(self, mock_from_pretrained, _mock_trust, mock_fix):
"""huggingface_hub wraps the validator ValueError in a non-ValueError error type."""
from huggingface_hub.errors import StrictDataclassClassValidationError

cause = ValueError("`num_hidden_layers` (45) must be equal to the number of layer types (48).")
mock_from_pretrained.side_effect = StrictDataclassClassValidationError(
validator="validate_layer_type", cause=cause
)
fixed_cfg = MagicMock()
mock_fix.return_value = fixed_cfg

result = get_hf_config("stepfun-ai/Step-3.5-Flash", "sdpa")

assert result is fixed_cfg
mock_fix.assert_called_once()

@patch("nemo_automodel._transformers.model_init._load_config_with_layer_types_fix")
@patch("nemo_automodel._transformers.model_init.resolve_trust_remote_code", return_value=False)
@patch("nemo_automodel._transformers.model_init.AutoConfig.from_pretrained")
Expand Down
59 changes: 59 additions & 0 deletions tests/unit_tests/_transformers/v4_patches/test_layer_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,14 @@ def isolated_layer_types_state():
"""Reset module globals and meta-path / sys.modules mutations after each test."""
saved_patched = lt_mod._PATCHED
saved_hook_installed = lt_mod._HOOK_INSTALLED
saved_validator_relaxed = lt_mod._VALIDATOR_RELAXED
saved_meta_path = list(sys.meta_path)
saved_transformers = sys.modules.get("transformers")
saved_cu = sys.modules.get(lt_mod._TARGET_MODULE)

lt_mod._PATCHED = False
lt_mod._HOOK_INSTALLED = False
lt_mod._VALIDATOR_RELAXED = False
# The package installs a finder at import time; strip it so each test can
# reason about the finders it inserts in isolation. The original list is
# restored verbatim on teardown.
Expand All @@ -50,6 +52,7 @@ def isolated_layer_types_state():
finally:
lt_mod._PATCHED = saved_patched
lt_mod._HOOK_INSTALLED = saved_hook_installed
lt_mod._VALIDATOR_RELAXED = saved_validator_relaxed
sys.meta_path[:] = saved_meta_path
_restore(sys.modules, "transformers", saved_transformers)
_restore(sys.modules, lt_mod._TARGET_MODULE, saved_cu)
Expand Down Expand Up @@ -215,3 +218,59 @@ def find_spec(self, fullname, path=None, target=None):
assert lt_mod._PATCHED is True
for extra in lt_mod.DEFAULT_EXTRA_LAYER_TYPES:
assert extra in fake_cu.ALLOWED_LAYER_TYPES


def _install_fake_pretrained_config_tree():
    """Register a minimal ``transformers`` stub exposing ``PretrainedConfig``.

    The stub base class and one subclass both carry an always-raising
    validator named ``validate_layer_type`` in ``__class_validators__``,
    mimicking the structure the relax helper has to rewrite.

    Returns:
        Tuple of ``(base class, subclass, original validator function)``.
    """

    def _failing_validator(self):
        raise ValueError("`num_hidden_layers` must equal number of layer types")

    # The relax helper matches validator entries by __name__, so the fake
    # validator must advertise the real validator's name.
    _failing_validator.__name__ = "validate_layer_type"

    class _FakePretrainedConfig:
        validate_layer_type = _failing_validator
        __class_validators__ = [_failing_validator]

    class _FakeChildConfig(_FakePretrainedConfig):
        __class_validators__ = [_failing_validator]

    stub_module = types.ModuleType("transformers")
    stub_module.__path__ = []
    stub_module.PretrainedConfig = _FakePretrainedConfig
    sys.modules["transformers"] = stub_module
    return _FakePretrainedConfig, _FakeChildConfig, _failing_validator


class TestRelaxLayerTypesValidator:
    """relax_layer_types_validator must neutralize the validator tree-wide."""

    def test_replaces_validator_on_base_and_subclass(self, isolated_layer_types_state):
        base_cls, child_cls, _original = _install_fake_pretrained_config_tree()

        assert lt_mod.relax_layer_types_validator() is True

        noop = lt_mod._noop_validate_layer_type
        assert base_cls.validate_layer_type is noop
        assert base_cls.__class_validators__[0] is noop
        assert child_cls.__class_validators__[0] is noop

        # Every registered validator must now run without raising.
        dummy = child_cls.__new__(child_cls)
        for registered in child_cls.__class_validators__:
            registered(dummy)

    def test_idempotent(self, isolated_layer_types_state):
        _install_fake_pretrained_config_tree()

        # First call applies the patch; the second reports nothing to do.
        assert lt_mod.relax_layer_types_validator() is True
        assert lt_mod.relax_layer_types_validator() is False

    def test_missing_attribute_is_tolerated(self, isolated_layer_types_state):
        class _Bare:
            pass

        # A PretrainedConfig without validate_layer_type must be a no-op.
        stub_module = types.ModuleType("transformers")
        stub_module.__path__ = []
        stub_module.PretrainedConfig = _Bare
        sys.modules["transformers"] = stub_module

        assert lt_mod.relax_layer_types_validator() is False
Loading
Loading