diff --git a/examples/models/nemotron_labs_diffusion/README.md b/examples/models/nemotron_labs_diffusion/README.md index f1fb2e424d..03a51e37fd 100644 --- a/examples/models/nemotron_labs_diffusion/README.md +++ b/examples/models/nemotron_labs_diffusion/README.md @@ -45,9 +45,9 @@ The CPT checkpoint from Stage 1 is passed via `checkpoint.pretrained_checkpoint` torchrun --nproc_per_node=8 examples/models/nemotron_labs_diffusion/ar_to_dlm.py \ --model-size 3b \ --hf-path mistralai/Ministral-3-3B-Base-2512 \ - --data-paths /path/to/dclm/merged_tokenized_text_document \ checkpoint.finetune=true \ - checkpoint.pretrained_checkpoint=/path/to/cpt_checkpoint + checkpoint.pretrained_checkpoint=/path/to/cpt_checkpoint \ + --data-paths /path/to/dclm/merged_tokenized_text_document ``` @@ -61,25 +61,24 @@ The script [`inference_nemotron_labs_diffusion.py`](inference_nemotron_labs_diff ```bash torchrun --nproc_per_node=4 examples/models/nemotron_labs_diffusion/inference_nemotron_labs_diffusion.py \ - --megatron-path /path/to/checkpoints/ar_to_dlm_8b \ - --hf-model mistralai/Ministral-3-8B-Base-2512 \ + --megatron-path /path/to/checkpoints/ar_to_dlm_3b/iter_xxxxxxx \ + --hf-model mistralai/Ministral-3-3B-Base-2512 \ --prompts "The capital of France is" \ - --gen-length 256 --block-length 32 --steps-per-block 32 \ - --tp 4 + --gen-length 256 --block-length 32 --steps-per-block 32 ``` ### AR mode ```bash python examples/models/nemotron_labs_diffusion/inference_nemotron_labs_diffusion.py \ - --megatron-path /path/to/checkpoints/ar_to_dlm_3b \ + --megatron-path /path/to/checkpoints/ar_to_dlm_3b/iter_xxxxxxx \ --hf-model mistralai/Ministral-3-3B-Base-2512 \ --mode ar \ --prompts "Once upon a time" \ --max-new-tokens 128 ``` -The `--tp` argument must match the tensor parallelism degree of the saved checkpoint (e.g. `--tp 4` for 8B checkpoints saved with TP=4). `--hf-model` is used for the tokenizer and model config only — weights are loaded from `--megatron-path`. 
+You can pass the `--tp` argument, but it must match the tensor parallelism degree of the saved checkpoint (e.g. `--tp 4` for 8B checkpoints saved with TP=4). `--hf-model` is used for the tokenizer and model config only — weights are loaded from `--megatron-path`. --- @@ -102,14 +101,6 @@ python examples/models/nemotron_labs_diffusion/convert_checkpoints.py import \ --torch-dtype bfloat16 ``` -For the 8B model (TP=4): -```bash -python examples/models/nemotron_labs_diffusion/convert_checkpoints.py import \ - --hf-model nvidia/Nemotron-Labs-Diffusion-8B \ - --megatron-path /path/to/checkpoints/hf_to_mb_8b \ - --torch-dtype bfloat16 -``` - The Megatron checkpoint is written under `--megatron-path` (e.g. `.../hf_to_mb_3b/iter_0000000/`). Use the parent directory for training with `checkpoint.load`. ### Export: Megatron → HuggingFace diff --git a/src/megatron/bridge/diffusion/conversion/nemotron_labs_diffusion/nemotron_labs_diffusion_bridge.py b/src/megatron/bridge/diffusion/conversion/nemotron_labs_diffusion/nemotron_labs_diffusion_bridge.py index 4081dcbb63..e8ed58d291 100644 --- a/src/megatron/bridge/diffusion/conversion/nemotron_labs_diffusion/nemotron_labs_diffusion_bridge.py +++ b/src/megatron/bridge/diffusion/conversion/nemotron_labs_diffusion/nemotron_labs_diffusion_bridge.py @@ -75,6 +75,25 @@ def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> "NemotronLabsDif # Auto-detect checkpoint format: VLM configs nest text params under text_config self._is_text_only = not hasattr(hf_config, "text_config") + # NemotronLabsDiffusionConfig (a trust_remote_code config) does not declare + # model-specific fields as dataclass fields. In transformers 5.x + # PretrainedConfig is a dataclass, so MLM's _convert_value_to_dict uses the + # dataclass-fields path and silently drops all model-specific attributes + # (hidden_size, rope_parameters, etc.). Adding to_cfg_dict to the class + # makes the serializer use PretrainedConfig.to_dict(), which captures everything. 
+ cfg_cls = type(hf_config) + if not hasattr(cfg_cls, "to_cfg_dict") and hasattr(hf_config, "to_dict"): + + def _to_cfg_dict(self): + cls = self.__class__ + return { + "_target_": f"{cls.__module__}.{cls.__qualname__}.from_dict", + "_call_": True, + "config_dict": self.to_dict(), + } + + cfg_cls.to_cfg_dict = _to_cfg_dict + return NemotronLabsDiffusionModelProvider( hidden_size=text_config.hidden_size, ffn_hidden_size=text_config.intermediate_size, diff --git a/src/megatron/bridge/models/conversion/auto_bridge.py b/src/megatron/bridge/models/conversion/auto_bridge.py index 585302c77c..81c2acbb8b 100644 --- a/src/megatron/bridge/models/conversion/auto_bridge.py +++ b/src/megatron/bridge/models/conversion/auto_bridge.py @@ -1085,6 +1085,11 @@ def load_megatron_model( except ImportError: raise ImportError("megatron.bridge.training is not available.") + if self.trust_remote_code: + from megatron.bridge.utils.instantiate_utils import register_allowed_target_prefix + + register_allowed_target_prefix("transformers_modules.") + checkpoint_path = Path(path) # Check for iter_* folders diff --git a/tests/unit_tests/diffusion/model/nemotron_labs_diffusion/conversion/test_nemotron_labs_diffusion_bridge.py b/tests/unit_tests/diffusion/model/nemotron_labs_diffusion/conversion/test_nemotron_labs_diffusion_bridge.py index 974fcb78d2..7ab48fdad5 100644 --- a/tests/unit_tests/diffusion/model/nemotron_labs_diffusion/conversion/test_nemotron_labs_diffusion_bridge.py +++ b/tests/unit_tests/diffusion/model/nemotron_labs_diffusion/conversion/test_nemotron_labs_diffusion_bridge.py @@ -234,3 +234,94 @@ def test_vlm_output_layer_mapping_uses_lm_head(self): ] assert len(out_mappings) == 1 assert out_mappings[0].hf_param == "language_model.lm_head.weight" + + +class _MockConfig: + """A mutable config class with to_dict, simulating a trust_remote_code PretrainedConfig.""" + + def __init__(self, **kwargs): + for k, v in kwargs.items(): + setattr(self, k, v) + + def to_dict(self): + return {k: v 
for k, v in self.__dict__.items() if not k.startswith("_")} + + +class TestToCfgDictMonkeyPatch: + """Tests for the to_cfg_dict monkey-patch in provider_bridge().""" + + def _make_mock_hf_config(self): + text_cfg = _MockConfig( + hidden_size=1024, + intermediate_size=4096, + num_hidden_layers=8, + tie_word_embeddings=False, + rope_parameters={"rope_theta": 10000.0}, + vocab_size=32000, + ) + hf_cfg = _MockConfig(text_config=text_cfg) + return hf_cfg + + def test_to_cfg_dict_added_when_config_has_to_dict(self): + """provider_bridge adds to_cfg_dict to config classes that have to_dict.""" + bridge = NemotronLabsDiffusionBridge() + hf_cfg = self._make_mock_hf_config() + hf = DummyHFPretrained(hf_cfg) + + assert not hasattr(_MockConfig, "to_cfg_dict") + bridge.provider_bridge(hf) + assert hasattr(_MockConfig, "to_cfg_dict") + + # Clean up monkey-patch so it doesn't leak to other tests + delattr(_MockConfig, "to_cfg_dict") + + def test_to_cfg_dict_returns_correct_target(self): + """to_cfg_dict must produce a _target_ using cls.__module__ and cls.__qualname__.""" + bridge = NemotronLabsDiffusionBridge() + hf_cfg = self._make_mock_hf_config() + hf = DummyHFPretrained(hf_cfg) + bridge.provider_bridge(hf) + + result = hf_cfg.to_cfg_dict() + expected_target = f"{_MockConfig.__module__}.{_MockConfig.__qualname__}.from_dict" + assert result["_target_"] == expected_target + assert result["_call_"] is True + assert "config_dict" in result + + delattr(_MockConfig, "to_cfg_dict") + + def test_to_cfg_dict_preserves_dynamic_attributes(self): + """to_cfg_dict must capture dynamic attributes like rope_parameters via to_dict.""" + bridge = NemotronLabsDiffusionBridge() + hf_cfg = self._make_mock_hf_config() + hf_cfg.llama_4_scaling_beta = 0.7 # dynamic attribute + hf = DummyHFPretrained(hf_cfg) + bridge.provider_bridge(hf) + + result = hf_cfg.to_cfg_dict() + assert result["config_dict"]["llama_4_scaling_beta"] == 0.7 + + delattr(_MockConfig, "to_cfg_dict") + + def 
test_to_cfg_dict_not_added_to_simplenamespace(self): + """SimpleNamespace has no to_dict, so to_cfg_dict must not be added.""" + bridge = NemotronLabsDiffusionBridge() + hf_cfg = _make_hf_config() # uses SimpleNamespace + hf = DummyHFPretrained(hf_cfg) + bridge.provider_bridge(hf) + + assert not hasattr(types.SimpleNamespace, "to_cfg_dict") + + def test_to_cfg_dict_not_added_twice(self): + """If to_cfg_dict already exists, provider_bridge must not overwrite it.""" + bridge = NemotronLabsDiffusionBridge() + hf_cfg = self._make_mock_hf_config() + hf = DummyHFPretrained(hf_cfg) + + sentinel = lambda self: {"sentinel": True} + _MockConfig.to_cfg_dict = sentinel + + bridge.provider_bridge(hf) + assert _MockConfig.to_cfg_dict is sentinel + + delattr(_MockConfig, "to_cfg_dict") diff --git a/tests/unit_tests/models/test_auto_bridge.py b/tests/unit_tests/models/test_auto_bridge.py index a3aee84966..4e922b5ad5 100644 --- a/tests/unit_tests/models/test_auto_bridge.py +++ b/tests/unit_tests/models/test_auto_bridge.py @@ -1235,6 +1235,7 @@ def test_load_megatron_model_basic(self): bridge = AutoBridge.__new__(AutoBridge) bridge.hf_pretrained = mock_hf_model + bridge.trust_remote_code = False with patch("megatron.bridge.training.model_load_save.load_megatron_model") as mock_load_megatron_model: from pathlib import Path @@ -1259,6 +1260,7 @@ def test_load_megatron_model_with_iter_folder(self): bridge = AutoBridge.__new__(AutoBridge) bridge.hf_pretrained = mock_hf_model + bridge.trust_remote_code = False with patch("megatron.bridge.training.model_load_save.load_megatron_model") as mock_load_megatron_model: from pathlib import Path @@ -1298,6 +1300,7 @@ def test_load_megatron_model_with_mp_overrides(self): bridge = AutoBridge.__new__(AutoBridge) bridge.hf_pretrained = mock_hf_model + bridge.trust_remote_code = False # Create model-parallel overrides mp_overrides = { @@ -1339,6 +1342,29 @@ def test_load_megatron_model_with_mp_overrides(self): assert call_args.args[0] == 
"checkpoint_path" # path argument assert "skip_temp_dist_context" in call_args.kwargs + def test_load_megatron_model_registers_prefix_when_trust_remote_code(self): + """Test that load_megatron_model registers transformers_modules prefix when trust_remote_code=True.""" + mock_hf_model = Mock(spec=PreTrainedCausalLM) + mock_config = Mock(spec=PretrainedConfig) + mock_config.architectures = ["LlamaForCausalLM"] + mock_hf_model.config = mock_config + + bridge = AutoBridge.__new__(AutoBridge) + bridge.hf_pretrained = mock_hf_model + bridge.trust_remote_code = True + + with patch("megatron.bridge.training.model_load_save.load_megatron_model") as mock_load_megatron_model: + with patch("megatron.bridge.utils.instantiate_utils.register_allowed_target_prefix") as mock_register: + from pathlib import Path + + with patch.object(Path, "iterdir") as mock_iterdir: + mock_load_megatron_model.return_value = Mock() + mock_iterdir.return_value = [] + + bridge.load_megatron_model("./checkpoint_path") + + mock_register.assert_called_once_with("transformers_modules.") + @patch("torch.distributed.is_available") @patch("torch.distributed.is_initialized") def test_save_hf_pretrained_uses_bridge_additional_file_patterns(self, mock_is_init, mock_is_avail):