Fixes based on Nemotron3 tests

kevalmorabia97 · kevalmorabia97 · commit f905b6ee1644 · 2026-05-01T23:41:37.000-07:00
Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
diff --git a/.github/workflows/example_tests.yml b/.github/workflows/example_tests.yml
@@ -86,7 +86,7 @@ jobs:
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: "nvcr.io/nvidia/nemo:26.02"
+      docker_image: "nvcr.io/nvidia/nemo:26.04"
       example: megatron_bridge
       timeout_minutes: 30
       pip_install_extras: "[hf,puzzletron,dev-test]"
diff --git a/examples/megatron_bridge/README.md b/examples/megatron_bridge/README.md
@@ -16,7 +16,7 @@ This directory contains examples of using Model Optimizer with [NeMo Megatron-Br
 
 ## Pre-Requisites
 
-Running these examples requires many additional dependencies to be installed (e.g., Megatron-Bridge, Megatron-core, etc.), hence we strongly recommend directly using the NeMo container (e.g., `nvcr.io/nvidia/nemo:26.02`) which has all the dependencies installed.
+Running these examples requires many additional dependencies to be installed (e.g., Megatron-Bridge, Megatron-core, etc.), hence we strongly recommend directly using the NeMo container (e.g., `nvcr.io/nvidia/nemo:26.04`) which has all the dependencies installed.
 
 To get the ModelOpt examples scripts, mount your Model-Optimizer repo to the container as follows:
 
@@ -26,7 +26,7 @@ if [ ! -d "${MODELOPT_DIR}" ]; then
   git clone https://github.com/NVIDIA/Model-Optimizer.git ${MODELOPT_DIR}
 fi
 
-export DOCKER_IMAGE=nvcr.io/nvidia/nemo:26.02
+export DOCKER_IMAGE=nvcr.io/nvidia/nemo:26.04
 docker run \
   --gpus all \
   --shm-size=16GB \
@@ -49,6 +49,12 @@ hf auth login --token <your token>
 > [!WARNING]
 > Use `python -m pip` instead of `pip` to avoid conflicts with the system-wide installed packages in the NeMo containers. You may also refer to this [doc](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/docker/common/README.md#installing-packages-inside-the-container) on how to correctly install packages in the NeMo containers without breaking existing torch installation.
 
+Also install additional dependencies from the [requirements.txt](./requirements.txt) file.
+
+```bash
+python -m pip install -r requirements.txt
+```
+
 ## Pruning
 
 This section shows how to prune a HuggingFace model using Minitron algorithm in Megatron-Bridge framework. Checkout other available pruning algorithms, supported frameworks and models, and general pruning getting-started in the [pruning README](../pruning/README.md).
diff --git a/examples/megatron_bridge/prune_minitron.py b/examples/megatron_bridge/prune_minitron.py
@@ -161,11 +161,11 @@ def get_args() -> argparse.Namespace:
     parser.add_argument(
         "--prune_score_func",
         type=str,
-        default="mmlu_10pct",
+        default="mmlu_10pct_bs1",
         help=(
             "Score function to use for NAS-based pruning. Only supports MMLU at the moment. "
-            "Format: mmlu_<N>pct where <N> is the percentage of MMLU data to sample per subject "
-            "(e.g. mmlu_10pct for 10%, mmlu_100pct for full eval)."
+            "Format: mmlu_<N>pct_<bs> where <N> is the percentage of MMLU data to sample per subject and <bs> is "
+            "batch size for fast evaluation (default is mmlu_10pct_bs1)."
         ),
     )
     parser.add_argument(
@@ -343,16 +343,17 @@ def main(args: argparse.Namespace):
             "You can change this to be any other metric you want to maximize (e.g. negative validation loss)."
         )
 
-        match = re.fullmatch(r"mmlu_(\d+)pct", args.prune_score_func)
+        match = re.fullmatch(r"mmlu_(\d+)pct_bs(\d+)", args.prune_score_func)
         if not match:
             raise ValueError(
-                f"Invalid score function: {args.prune_score_func}. Expected format: mmlu_<N>pct (e.g. mmlu_10pct)"
+                f"Invalid score function: {args.prune_score_func}. Expected format: mmlu_<N>pct_bs<bs>"
             )
         mmlu_frac = float(match.group(1)) / 100.0
+        batch_size = int(match.group(2))
 
         def score_func(m):
             return megatron_mmlu(
-                m, tokenizer, few_shots=0, fraction=mmlu_frac, batch_size=args.calib_mbs
+                m, tokenizer, few_shots=0, fraction=mmlu_frac, batch_size=batch_size
             )
 
         pruning_config["score_func"] = score_func
diff --git a/examples/megatron_bridge/requirements.txt b/examples/megatron_bridge/requirements.txt
@@ -0,0 +1,2 @@
+# Saving some pruned models (e.g. Nemotron-3-Nano-30B-A3B-BF16) has issues with transformers>=5.0
+transformers<5.0
diff --git a/examples/pruning/README.md b/examples/pruning/README.md
@@ -27,7 +27,7 @@ This section focuses on applying Model Optimizer's state-of-the-art complementar
 
 ## Pre-Requisites
 
-For Minitron pruning for Megatron-Bridge / Megatron-LM models, use the NeMo container (e.g., `nvcr.io/nvidia/nemo:26.02`) which has all the dependencies installed.
+For Minitron pruning for Megatron-Bridge / Megatron-LM models, use the NeMo container (e.g., `nvcr.io/nvidia/nemo:26.04`) which has all the dependencies installed.
 
 For FastNAS pruning for PyTorch Computer Vision models, no additional dependencies are required.
 
diff --git a/modelopt/torch/prune/plugins/mcore_minitron.py b/modelopt/torch/prune/plugins/mcore_minitron.py
@@ -190,7 +190,7 @@ def _rprint(*renderables: Any) -> None:
 
 # Constraint keys that trigger the grid-search path in MCoreMinitronSearcher.
 # Order defines priority: first active key is used as the primary display/sort metric.
-_METRIC_CONSTRAINT_PRIORITY = ("params", "active_params", "memory_mb")
+_METRIC_CONSTRAINT_PRIORITY = ("active_params", "params", "memory_mb")
 _METRIC_CONSTRAINTS = frozenset(_METRIC_CONSTRAINT_PRIORITY)
 
 
@@ -524,15 +524,15 @@ def search_best_arch_by_metrics(self) -> dict:
         _rprint(table)
 
         # 3. Optional Knowledge Distillation (KD) step for all top-k candidates
-        print_rank_0(
-            "\nSkipping optional Knowledge Distillation (KD) step for candidates as it is a manual step. "
+        _rprint(
+            f"[yellow]\nSkipping optional Knowledge Distillation (KD) step for candidates as it is a manual step. "
             "As per the original paper (https://arxiv.org/pdf/2407.14679), ideally we need to perform a short "
             f"Knowledge Distillation on ~2B tokens for all top {top_k} candidates before evaluating the "
             "`score_func`, which will take a lot longer to prune, require splitting the pruning process into multiple "
             "stages and a lot more compute for pruning but can lead to better pruned model selection. If you are "
             f"interested to do this, you can take the top {top_k} candidates' `export_config` from the logs above and "
             "then export all models separately and perform Knowledge Distillation on each of them before evaluating "
-            "the `score_func`.\n"
+            f"the `score_func`.\n[/yellow]"
         )
 
         # 4. Validate top-k candidates using the score_func and return the best subnet
@@ -683,9 +683,6 @@ def _generate_search_space_combos(
     def _compute_candidate_metrics(self, ss_config: dict, max_num_layers: int) -> dict[str, float]:
         """Compute all active metric constraint values for a candidate config analytically.
 
-        Calls ``mcore_param_count`` at most once (covers both ``params`` and ``active_params``)
-        and ``mcore_memory_footprint_mb`` at most once (for ``memory_mb``).
-        Replaces the slow ``_prune → _param_num_dynamic → sample(max)`` loop used during search.
         Handles depth pruning by filtering the hybrid layer pattern to the kept (best) layers.
         """
         model = self.model
diff --git a/tests/examples/megatron_bridge/test_prune_minitron.py b/tests/examples/megatron_bridge/test_prune_minitron.py
@@ -38,7 +38,7 @@ def test_prune_minitron(tmp_path: Path, num_gpus):
         calib_num_samples=16,
         seq_length=32,
         prune_target_params=prune_target_params,
-        prune_score_func="mmlu_1pct",
+        prune_score_func="mmlu_1pct_bs32",
         ss_channel_divisor=4,
         hparams_to_skip="num_attention_heads",
         top_k=1,
diff --git a/tests/gpu_megatron/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py b/tests/gpu_megatron/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py
@@ -321,7 +321,12 @@ def _assert_top_k_candidates(searcher_state, constraint_key, expected_top_k, k=1
     assert len(top_k) == k
     for actual, (ss_config, metrics, score) in zip(top_k, expected_top_k):
         assert actual.ss_config == ss_config, (actual.ss_config, ss_config)
-        assert actual.metrics == metrics, (actual.metrics, metrics)
+        for metric_name, expected_value in metrics.items():
+            actual_value = actual.metrics[metric_name]
+            if isinstance(expected_value, float):
+                assert actual_value == pytest.approx(expected_value), (actual.metrics, metrics)
+            else:
+                assert actual_value == expected_value, (actual.metrics, metrics)
         assert actual.score == score, (actual.score, score)
 
 
@@ -338,7 +343,7 @@ def _test_mcore_mamba_hybrid_pruning_nas_params(rank, size, ckpt_dir):
     assert baseline_params == 14984, baseline_params
     constraints = {
         "params": int(baseline_params * 0.5),
-        "active_params": int(baseline_active * 0.7),
+        "active_params": int(baseline_active * 0.55),
     }
 
     # Capture stdout to assert search space output
@@ -373,8 +378,8 @@ def assert_row(key: str, value: str) -> None:
         model.share_embeddings_and_output_weights,
         hybrid_layer_pattern=_get_hybrid_layer_pattern(model),
     )
-    assert pruned_params == 7154, pruned_params
-    assert pruned_active_params == 7154, pruned_active_params
+    assert pruned_params == 6536, pruned_params
+    assert pruned_active_params == 6536, pruned_active_params
 
     # NOTE: Slight variation in layer ordering for MoE / Attention / MLP depending on PP configuration
     # This affects param counts when num_layers is pruned
@@ -384,26 +389,28 @@ def assert_row(key: str, value: str) -> None:
         # Winner is 3-layer: keeps layers [1,4,3] from "ME*-" → drops 'E' (layer 2) → "M*-"
         assert _get_hybrid_layer_pattern(model) == "M*-", _get_hybrid_layer_pattern(model)
         expected_top_k = [
-            # 4 four-layer models qualifying under params_thresh=7492
-            [{"num_layers": 4, "hidden_size": 12, "mamba_num_heads": 6, "mamba_head_dim": 12, "num_moe_experts": 6, "moe_ffn_hidden_size": 12, "ffn_hidden_size": 20}, {"params": 7418, "active_params": 6266}, 104],  # noqa: E501
+            # position 1: the one qualifying 4-layer model (active=6542 > 3-layer H=12 active),
+            # demonstrating that active_params-first ranking can elevate 4-layer above 3-layer models
             [{"num_layers": 4, "hidden_size": 12, "mamba_num_heads": 6, "mamba_head_dim": 12, "num_moe_experts": 5, "moe_ffn_hidden_size": 12, "ffn_hidden_size": 32}, {"params": 7406, "active_params": 6542}, 115],  # noqa: E501
-            [{"num_layers": 4, "hidden_size": 12, "mamba_num_heads": 6, "mamba_head_dim": 12, "num_moe_experts": 5, "moe_ffn_hidden_size": 12, "ffn_hidden_size": 28}, {"params": 7310, "active_params": 6446}, 111],  # noqa: E501
-            [{"num_layers": 4, "hidden_size": 12, "mamba_num_heads": 6, "mamba_head_dim": 12, "num_moe_experts": 5, "moe_ffn_hidden_size": 12, "ffn_hidden_size": 24}, {"params": 7214, "active_params": 6350}, 107],  # noqa: E501
-            # 6 depth-pruned (num_layers=3) models; params==active_params since MoE layer is dropped
-            [{"num_layers": 3, "hidden_size": 16, "mamba_num_heads": 6, "mamba_head_dim": 12, "num_moe_experts": 5, "moe_ffn_hidden_size": 12, "ffn_hidden_size": 32}, {"params": 7154, "active_params": 7154}, 118],  # noqa: E501
-            [{"num_layers": 3, "hidden_size": 16, "mamba_num_heads": 6, "mamba_head_dim": 12, "num_moe_experts": 5, "moe_ffn_hidden_size": 16, "ffn_hidden_size": 32}, {"params": 7154, "active_params": 7154}, 122],  # noqa: E501
-            [{"num_layers": 3, "hidden_size": 16, "mamba_num_heads": 6, "mamba_head_dim": 12, "num_moe_experts": 6, "moe_ffn_hidden_size": 12, "ffn_hidden_size": 32}, {"params": 7154, "active_params": 7154}, 119],  # noqa: E501
-            [{"num_layers": 3, "hidden_size": 16, "mamba_num_heads": 6, "mamba_head_dim": 12, "num_moe_experts": 6, "moe_ffn_hidden_size": 16, "ffn_hidden_size": 32}, {"params": 7154, "active_params": 7154}, 123],  # noqa: E501
-            [{"num_layers": 3, "hidden_size": 16, "mamba_num_heads": 6, "mamba_head_dim": 12, "num_moe_experts": 7, "moe_ffn_hidden_size": 12, "ffn_hidden_size": 32}, {"params": 7154, "active_params": 7154}, 120],  # noqa: E501
-            [{"num_layers": 3, "hidden_size": 16, "mamba_num_heads": 6, "mamba_head_dim": 12, "num_moe_experts": 7, "moe_ffn_hidden_size": 16, "ffn_hidden_size": 32}, {"params": 7154, "active_params": 7154}, 124],  # noqa: E501
+            # positions 2-9: 3-layer H=12 MNH=8 MHD=12 ffn=32 (active==params=6536, no MoE layer)
+            [{"num_layers": 3, "hidden_size": 12, "mamba_num_heads": 8, "mamba_head_dim": 12, "num_moe_experts": 5, "moe_ffn_hidden_size": 12, "ffn_hidden_size": 32}, {"params": 6536, "active_params": 6536}, 116],  # noqa: E501
+            [{"num_layers": 3, "hidden_size": 12, "mamba_num_heads": 8, "mamba_head_dim": 12, "num_moe_experts": 5, "moe_ffn_hidden_size": 16, "ffn_hidden_size": 32}, {"params": 6536, "active_params": 6536}, 120],  # noqa: E501
+            [{"num_layers": 3, "hidden_size": 12, "mamba_num_heads": 8, "mamba_head_dim": 12, "num_moe_experts": 6, "moe_ffn_hidden_size": 12, "ffn_hidden_size": 32}, {"params": 6536, "active_params": 6536}, 117],  # noqa: E501
+            [{"num_layers": 3, "hidden_size": 12, "mamba_num_heads": 8, "mamba_head_dim": 12, "num_moe_experts": 6, "moe_ffn_hidden_size": 16, "ffn_hidden_size": 32}, {"params": 6536, "active_params": 6536}, 121],  # noqa: E501
+            [{"num_layers": 3, "hidden_size": 12, "mamba_num_heads": 8, "mamba_head_dim": 12, "num_moe_experts": 7, "moe_ffn_hidden_size": 12, "ffn_hidden_size": 32}, {"params": 6536, "active_params": 6536}, 118],  # noqa: E501
+            [{"num_layers": 3, "hidden_size": 12, "mamba_num_heads": 8, "mamba_head_dim": 12, "num_moe_experts": 7, "moe_ffn_hidden_size": 16, "ffn_hidden_size": 32}, {"params": 6536, "active_params": 6536}, 122],  # noqa: E501
+            [{"num_layers": 3, "hidden_size": 12, "mamba_num_heads": 8, "mamba_head_dim": 12, "num_moe_experts": 8, "moe_ffn_hidden_size": 12, "ffn_hidden_size": 32}, {"params": 6536, "active_params": 6536}, 119],  # noqa: E501
+            [{"num_layers": 3, "hidden_size": 12, "mamba_num_heads": 8, "mamba_head_dim": 12, "num_moe_experts": 8, "moe_ffn_hidden_size": 16, "ffn_hidden_size": 32}, {"params": 6536, "active_params": 6536}, 123],  # noqa: E501
+            # position 10: first 3-layer H=12 MNH=6 MHD=16 ffn=32 candidate (active=6506)
+            [{"num_layers": 3, "hidden_size": 12, "mamba_num_heads": 6, "mamba_head_dim": 16, "num_moe_experts": 5, "moe_ffn_hidden_size": 12, "ffn_hidden_size": 32}, {"params": 6506, "active_params": 6506}, 118],  # noqa: E501
         ]
     else:
         raise RuntimeError(f"FIXME: Non deterministic test, assertions may fail: {sorted_layers=}")
     # fmt: on
 
     _assert_top_k_candidates(
         searcher_state,
-        (("params", constraints["params"]), ("active_params", constraints["active_params"])),
+        (("active_params", constraints["active_params"]), ("params", constraints["params"])),
         expected_top_k,
     )
     run_mcore_inference_with_dummy_input(model, _NAS_BATCH_SIZE, model.config.hidden_size)

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# Saving some pruned models (e.g. Nemotron-3-Nano-30B-A3B-BF16) has issues with transformers>=5.0`
	`2`	`+transformers<5.0`