feat(model): thread weight_dtype through HF export for plain-dtype output

Meirtz · claude · Meirtz · commit b577be36dd43 · 2026-06-16T21:03:24.000+08:00
Export has two consumers — online weight sync for RL rollout
(export_hf_weights) and on-disk checkpoints (save_hf_pretrained). Each
gains an optional weight_dtype that flows through WeightConversionTask
into the export stream.

Per review (HollowMan6): the plain-dtype cast is now generic, not
DSv4-only. stream_weights_megatron_to_hf stamps weight_dtype onto each
(frozen) task via dataclasses.replace, rather than in
build_conversion_tasks, which model bridges may override, and the cast
lives in the shared stream path covering both the standard and
grouped-export branches. The DSv4 hook simply skips requantization when
weight_dtype is set and returns the converted weights unchanged, letting
the generic path cast the dtype — keeping plain-dtype export identical
across bridges. Adds --export-weight-dtype to the multi-gpu convert
example.

Validated end-to-end on 32x GB300: bf16 export = 35020 tensors / 0
scales; quantized export = 69187 tensors / 34167 scales.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Lingrui Mei <lmei@nvidia.com>
diff --git a/examples/conversion/convert_checkpoints_multi_gpu.py b/examples/conversion/convert_checkpoints_multi_gpu.py
@@ -329,9 +329,10 @@ def main():
         choices=sorted(DTYPE_MAP),
         default=None,
         help=(
-            "Emit plain weights in this dtype instead of re-creating the source repo's "
-            "quantized weight/scale layout (currently honored by the DeepSeek-V4 bridge). "
-            "Use for SFT products that need exact train/inference numerical parity."
+            "Emit plain weights cast to this dtype. For bridges that recreate a quantized "
+            "source layout on export (e.g. DeepSeek-V4) this also skips the requantization, "
+            "so no *.scale companions are written. Use for SFT products that need exact "
+            "train/inference numerical parity."
         ),
     )
     args = parser.parse_args()
diff --git a/src/megatron/bridge/models/conversion/model_bridge.py b/src/megatron/bridge/models/conversion/model_bridge.py
@@ -120,8 +120,10 @@ class WeightConversionTask(Generic[MappingT]):
             sub-module that owns the parameter (required for loads).
         param_weight (Optional[torch.Tensor]): The actual parameter tensor that will
             receive the converted weight (required for loads).
-        weight_dtype (Optional[torch.dtype]): When set, bridges that re-create a quantized
-            source layout on export emit plain weights in this dtype instead.
+        weight_dtype (Optional[torch.dtype]): Export only. When set, floating-point
+            weights are cast to this dtype; bridges that recreate a quantized source
+            layout (e.g. DeepSeek-V4) additionally skip requantization so no scale
+            companions are emitted.
 
     """
 
@@ -899,6 +901,23 @@ def maybe_modify_converted_hf_weight(
         """
         return converted_weights_dict
 
+    @staticmethod
+    def _cast_export_weight_dtype(
+        weights: Dict[str, torch.Tensor], weight_dtype: Optional[torch.dtype]
+    ) -> Dict[str, torch.Tensor]:
+        """Cast floating-point export weights to ``weight_dtype`` (no-op if None).
+
+        Integer tensors (e.g. packed/index buffers) are left untouched. This is the
+        generic plain-dtype export path; when ``weight_dtype`` is None the input
+        dict is returned as-is, so callers that leave it unset see no dtype change.
+        """
+        if weight_dtype is None:
+            return weights
+        return {
+            name: (weight.to(weight_dtype) if weight.is_floating_point() else weight)
+            for name, weight in weights.items()
+        }
+
     def _accumulate_grouped_export(
         self,
         task: "WeightConversionTask",
@@ -1256,7 +1275,8 @@ def stream_weights_megatron_to_hf(
         if conversion_tasks is None:
             conversion_tasks = self.build_conversion_tasks(hf_pretrained, unwrapped_model_list)
         if weight_dtype is not None:
-            # WeightConversionTask is frozen — rebuild the tasks with the dtype set
+            # Stamp the export dtype on the (frozen) tasks here rather than in
+            # build_conversion_tasks, which model bridges may override.
             conversion_tasks = [replace(task, weight_dtype=weight_dtype) for task in conversion_tasks]
 
         # Collect adapter conversion tasks when merge is requested
@@ -1312,6 +1332,7 @@ def stream_weights_megatron_to_hf(
                     task, converted_weights_dict, model_config, _grouped_buffers, hf_state_dict
                 )
                 if merged_result is not None:
+                    merged_result = self._cast_export_weight_dtype(merged_result, task.weight_dtype)
                     for hf_name, tensor in merged_result.items():
                         yield HFWeightTuple(hf_name, tensor.cpu() if cpu else tensor)
                 continue
@@ -1336,6 +1357,8 @@ def stream_weights_megatron_to_hf(
                     adapter_weights,
                 )
 
+            converted_weights_dict = self._cast_export_weight_dtype(converted_weights_dict, task.weight_dtype)
+
             for hf_name, tensor in converted_weights_dict.items():
                 final_tensor = tensor.cpu() if cpu else tensor
 
diff --git a/src/megatron/bridge/models/deepseek/deepseek_v4_bridge.py b/src/megatron/bridge/models/deepseek/deepseek_v4_bridge.py
@@ -913,14 +913,12 @@ def maybe_modify_converted_hf_weight(
     ) -> Dict[str, torch.Tensor]:
         """Recreate DSv4 quantized weight/scale pairs expected by the source shard index.
 
-        When ``task.weight_dtype`` is set, plain weights are emitted in that dtype
-        instead (no ``*.scale`` companions).
+        When ``task.weight_dtype`` is set the caller wants plain (non-quantized)
+        weights, so skip requantization and let the generic export path cast the
+        dtype — keeping plain-dtype export behavior identical across bridges.
         """
         if task.weight_dtype is not None:
-            return {
-                name: (weight.to(task.weight_dtype) if weight.is_floating_point() else weight)
-                for name, weight in converted_weights_dict.items()
-            }
+            return converted_weights_dict
         return quantization_utils.requantize_hf_weight_scale_pairs(
             converted_weights_dict,
             hf_state_dict,
diff --git a/tests/unit_tests/models/deepseek/test_deepseek_v4_bridge.py b/tests/unit_tests/models/deepseek/test_deepseek_v4_bridge.py
@@ -441,7 +441,7 @@ def test_provider_bridge_preserves_fused_defaults_without_cuda(self):
 
 
 class TestDeepSeekV4ExportWeightDtype:
-    def test_weight_dtype_set_emits_plain_weights(self):
+    def test_weight_dtype_set_skips_requantization(self, monkeypatch):
         from dataclasses import replace
         from unittest.mock import MagicMock
 
@@ -451,18 +451,30 @@ def test_weight_dtype_set_emits_plain_weights(self):
         bridge = DeepSeekV4Bridge.__new__(DeepSeekV4Bridge)
         task = WeightConversionTask(param_name="w", global_param_name="w", mapping=MagicMock())
         task = replace(task, weight_dtype=torch.bfloat16)  # frozen: must be settable via replace
+
+        def fail_requantize(*args, **kwargs):
+            raise AssertionError("requantize must be skipped when weight_dtype is set")
+
+        monkeypatch.setattr(quantization_utils, "requantize_hf_weight_scale_pairs", fail_requantize)
         weight = torch.randn(4, 4, dtype=torch.float32)
-        converted = {
-            "model.layers.0.mlp.weight": weight,
-            "model.layers.0.mlp.bias_idx": torch.ones(2, dtype=torch.int32),
-        }
+        converted = {"model.layers.0.mlp.weight": weight}
         hf_state = {"model.layers.0.mlp.weight": weight, "model.layers.0.mlp.scale": torch.ones(1)}
 
         out = bridge.maybe_modify_converted_hf_weight(task, converted, hf_state)
 
-        assert set(out) == set(converted)
+        assert out is converted  # returned unchanged; generic path casts the dtype
+
+    def test_generic_export_cast_applies_plain_dtype(self):
+        from megatron.bridge.models.conversion.model_bridge import MegatronModelBridge
+
+        weights = {
+            "model.layers.0.mlp.weight": torch.randn(4, 4, dtype=torch.float32),
+            "model.layers.0.mlp.bias_idx": torch.ones(2, dtype=torch.int32),
+        }
+        out = MegatronModelBridge._cast_export_weight_dtype(weights, torch.bfloat16)
         assert out["model.layers.0.mlp.weight"].dtype == torch.bfloat16
-        assert out["model.layers.0.mlp.bias_idx"].dtype == torch.int32
+        assert out["model.layers.0.mlp.bias_idx"].dtype == torch.int32  # int preserved
+        assert MegatronModelBridge._cast_export_weight_dtype(weights, None) is weights
 
     def test_no_weight_dtype_requantizes_by_default(self, monkeypatch):
         from unittest.mock import MagicMock