Store raw hf_quant_config and convert at config.json write time

jenchen13 · claude · jenchen13 · commit 82f0b973bddd · 2026-05-13T11:10:09.000-07:00
Restore `self._hf_quant_config` to hold the raw modelopt-native schema
(matching upstream main) and call `convert_hf_quant_config_format` inline
when writing `config.json["quantization_config"]` for newer vLLM. Drop
the temporary `raw_hf_quant_config` variable and trim the post-write
region to match main more closely.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
@@ -346,19 +346,15 @@ def save_pretrained(
             if gathered_kv_cache_dtype is not None:
                 quantization_config["kv_cache_quant_algo"] = gathered_kv_cache_dtype
 
-            raw_hf_quant_config = {
+            self._hf_quant_config = {
                 "producer": {
                     "name": "modelopt",
                     "version": __version__,
                 },
                 "quantization": quantization_config,
             }
-            # hf_quant_config.json keeps the raw modelopt-native schema for legacy
-            # consumers; config.json["quantization_config"] gets the vLLM-facing
-            # converted schema below.
-            self._hf_quant_config = convert_hf_quant_config_format(raw_hf_quant_config)
             with open(save_directory + "/hf_quant_config.json", "w") as f:
-                json.dump(raw_hf_quant_config, f, indent=4)
+                json.dump(self._hf_quant_config, f, indent=4)
 
         # Add multimodal components to state_dict. Since only support decoder model quantization,
         # no changes will be made to the multimodal components. We copy the multimodal components
@@ -378,7 +374,9 @@ def save_pretrained(
         if self._hf_quant_config and os.path.exists(config_json_file):
             with open(config_json_file) as f:
                 config_dict = json.load(f)
-            config_dict["quantization_config"] = self._hf_quant_config
+            config_dict["quantization_config"] = convert_hf_quant_config_format(
+                self._hf_quant_config
+            )
             with open(config_json_file, "w") as f:
                 json.dump(config_dict, f, indent=4)