
Commit 24e4397

[GGUF] using quant_nontext_module to control whether quant vision model (#1317)
Signed-off-by: n1ck-guo <heng.guo@intel.com>
1 parent cf37d8e commit 24e4397

7 files changed, +105 -15 lines

auto_round/compressors/base.py

Lines changed: 0 additions & 3 deletions
@@ -1198,11 +1198,8 @@ def _immediate_pack(self, name: str):
             model=self.model,
             device=self.device,
             output_dir=self._get_save_folder_name(self.formats[0]),
-            mllm=self.mllm,
             layer_config=self.layer_config,
             tokenizer=self.tokenizer,
-            processor=self.processor if hasattr(self, "processor") else None,
-            image_processor=self.image_processor if hasattr(self, "image_processor") else None,
         )

     @torch.inference_mode()

auto_round/compressors/mllm/compressor.py

Lines changed: 34 additions & 8 deletions
@@ -183,13 +183,18 @@ def __init__(

         self.model = model
         quant_nontext_module = self._check_quant_nontext(layer_config, quant_nontext_module)
-        if quant_nontext_module:
-            from transformers.utils.versions import require_version
-
-            require_version(
-                "pillow",
-                "pillow is required for quantizing non-text modules, please install it with `pip install pillow`",
-            )
+        if quant_nontext_module and iters > 0:
+            import importlib.util
+
+            missing_libs = []
+            for require_lib in ["pillow", "torchvision"]:
+                if importlib.util.find_spec(require_lib) is None:
+                    missing_libs.append(require_lib)
+            if len(missing_libs) > 0:
+                logger.error(
+                    f"{', '.join(missing_libs)} are required for quantizing non-text modules,"
+                    f" please install them with `pip install {' '.join(missing_libs)}`",
+                )
         all_blocks = get_block_names(model, quant_nontext_module)
         self.quant_block_list = find_matching_blocks(model, all_blocks, to_quant_block_names)
         if to_quant_block_names is None:
@@ -453,7 +458,12 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k
         if self.processor is not None and not hasattr(self.processor, "chat_template"):
             self.processor.chat_template = None
         compressed_model = super().save_quantized(
-            output_dir=output_dir, format=format, inplace=inplace, processor=self.processor, **kwargs
+            output_dir=output_dir,
+            format=format,
+            inplace=inplace,
+            processor=self.processor,
+            quant_nontext_module=self.quant_nontext_module if hasattr(self, "quant_nontext_module") else False,
+            **kwargs,
         )
         return compressed_model


@@ -467,3 +477,19 @@ def _check_quant_nontext(self, layer_config, quant_nontext_module):
             if vlm_key in layer_name and check_to_quantized(layer_config[layer_name]):
                 return True
         return quant_nontext_module
+
+    def _immediate_pack(self, name: str):
+        if not self.is_immediate_packing:  # pylint: disable=E1101
+            return
+        self.formats[0].immediate_pack(
+            name=name,
+            model=self.model,
+            device=self.device,
+            output_dir=self._get_save_folder_name(self.formats[0]),
+            mllm=self.mllm,
+            layer_config=self.layer_config,
+            tokenizer=self.tokenizer,
+            processor=self.processor if hasattr(self, "processor") else None,
+            image_processor=self.image_processor if hasattr(self, "image_processor") else None,
+            quant_nontext_module=self.quant_nontext_module if hasattr(self, "quant_nontext_module") else False,
+        )
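
The old hard require_version("pillow", ...) guard is replaced by a softer probe that only runs when non-text modules are quantized with tuning enabled (iters > 0): importlib.util.find_spec checks each optional package without importing it, and all missing ones are reported in a single error message. A minimal standalone sketch of that pattern follows; the function name, logger setup, and the pip-name-to-import-name mapping are illustrative additions, not part of the patch (note that find_spec expects the import name, which for the pillow distribution is PIL).

import importlib.util
import logging

logger = logging.getLogger(__name__)  # stand-in for auto_round's logger

# pip distribution name -> importable module name (assumed mapping for this sketch)
_IMPORT_NAMES = {"pillow": "PIL", "torchvision": "torchvision"}


def check_vision_deps(required=("pillow", "torchvision")):
    """Return the optional packages that are missing for non-text (vision) quantization."""
    missing = [pkg for pkg in required if importlib.util.find_spec(_IMPORT_NAMES[pkg]) is None]
    if missing:
        # Mirror the patch: one error listing every missing package and the pip command.
        logger.error(
            "%s are required for quantizing non-text modules, "
            "please install them with `pip install %s`",
            ", ".join(missing),
            " ".join(missing),
        )
    return missing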

auto_round/export/export_to_gguf/convert.py

Lines changed: 7 additions & 1 deletion
@@ -85,7 +85,9 @@ def download_convert_file(redownload=False):
         f.write(response.text)


-def wrapper_model_instance(model_instance, model, layer_config, low_cpu_mem_usage=False, device=None):
+def wrapper_model_instance(
+    model_instance, model, layer_config, low_cpu_mem_usage=False, device=None, quant_nontext_module=False
+):
     if model_instance.model_arch == gguf.MODEL_ARCH.MMPROJ and model_instance.fname_out.is_dir():
         model_instance.fname_out = model_instance.fname_out / "mmproj-model.gguf"
     model_instance.model = model
@@ -96,6 +98,7 @@ def wrapper_model_instance(model_instance, model, layer_config, low_cpu_mem_usag
     model_instance.prepare_tensors = partial(prepare_tensors, model_instance)

     model_instance.device = device
+    model_instance.quant_nontext_module = quant_nontext_module

     return model_instance

@@ -528,6 +531,9 @@ def prepare_tensors(cls):
             elif data_qtype == gguf.GGMLQuantizationType.Q6_K:
                 data_qtype = gguf.GGMLQuantizationType.Q8_0

+            if cls.model_arch == gguf.MODEL_ARCH.MMPROJ and cls.quant_nontext_module is False:
+                data_qtype = gguf.GGMLQuantizationType.F32
+
             from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES

             if data_qtype.name.lower() in GGML_QUANT_SIZES:
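
The new branch in prepare_tensors is the core of the commit on the export side: when the converter is building the mmproj (vision projector) file and quant_nontext_module is False, the tensor falls back to F32, so the vision tower is written unquantized while the text model keeps the requested GGUF qtype. A simplified, self-contained sketch of that selection gate, using small enums as stand-ins for gguf.MODEL_ARCH and gguf.GGMLQuantizationType:

from enum import Enum, auto


class ModelArch(Enum):  # stand-in for gguf.MODEL_ARCH
    TEXT = auto()
    MMPROJ = auto()


class QType(Enum):  # stand-in for gguf.GGMLQuantizationType
    F32 = auto()
    Q4_0 = auto()
    Q8_0 = auto()


def select_qtype(requested: QType, model_arch: ModelArch, quant_nontext_module: bool) -> QType:
    """Pick the storage type for a tensor, mirroring the patched gate."""
    if model_arch is ModelArch.MMPROJ and not quant_nontext_module:
        # Vision/projector tensors stay in full precision when non-text
        # modules are not being quantized.
        return QType.F32
    return requested


assert select_qtype(QType.Q4_0, ModelArch.MMPROJ, quant_nontext_module=False) is QType.F32
assert select_qtype(QType.Q4_0, ModelArch.MMPROJ, quant_nontext_module=True) is QType.Q4_0
assert select_qtype(QType.Q4_0, ModelArch.TEXT, quant_nontext_module=False) is QType.Q4_0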

auto_round/export/export_to_gguf/export.py

Lines changed: 26 additions & 3 deletions
@@ -74,6 +74,7 @@ def create_model_class(
     low_cpu_mem_usage=False,
     model_type=convert_hf_to_gguf.ModelType.TEXT,
     device="cpu",
+    quant_nontext_module: bool = False,
 ):
     tmp_work_dir = model.name_or_path
     os.makedirs(output_dir, exist_ok=True)
@@ -118,7 +119,12 @@ def create_model_class(
         small_first_shard=False,
     )
     model_instance = wrapper_model_instance(
-        model_instance, model=model, layer_config=layer_config, low_cpu_mem_usage=low_cpu_mem_usage, device=device
+        model_instance,
+        model=model,
+        layer_config=layer_config,
+        low_cpu_mem_usage=low_cpu_mem_usage,
+        device=device,
+        quant_nontext_module=quant_nontext_module,
     )
     model_instance = handle_special_model(model_instance, model_architecture)
     return model_instance
@@ -136,6 +142,7 @@ def pack_gguf_layer(
     image_processor=None,
     model_type=convert_hf_to_gguf.ModelType.TEXT,
     device="cpu",
+    quant_nontext_module=False,
 ):
     """Export the model to gguf format."""
     global gguf_model_instance_global
@@ -153,6 +160,7 @@ def pack_gguf_layer(
                 low_cpu_mem_usage=True,
                 model_type=convert_hf_to_gguf.ModelType.TEXT,
                 device=device,
+                quant_nontext_module=quant_nontext_module,
             )
         ]
         if model_type == convert_hf_to_gguf.ModelType.MMPROJ:
@@ -165,6 +173,7 @@ def pack_gguf_layer(
                     low_cpu_mem_usage=True,
                     model_type=convert_hf_to_gguf.ModelType.MMPROJ,
                     device=device,
+                    quant_nontext_module=quant_nontext_module,
                 )
             )

@@ -215,7 +224,14 @@ def pack_gguf_layer(

 @torch.inference_mode()
 def save_quantized_as_gguf(
-    output_dir, model=None, backend="gguf:q4_0", layer_config=None, mllm=False, device="cpu", **kwargs
+    output_dir,
+    model=None,
+    backend="gguf:q4_0",
+    layer_config=None,
+    mllm=False,
+    device="cpu",
+    quant_nontext_module=False,
+    **kwargs,
 ):
     """Export the model to gguf format."""
     st = time.time()
@@ -224,7 +240,13 @@ def save_quantized_as_gguf(
     if "gguf_model_instance_global" not in globals():
         gguf_model_instance_global = [
             create_model_class(
-                output_dir, model, layer_config, backend, model_type=convert_hf_to_gguf.ModelType.TEXT, device=device
+                output_dir,
+                model,
+                layer_config,
+                backend,
+                model_type=convert_hf_to_gguf.ModelType.TEXT,
+                device=device,
+                quant_nontext_module=quant_nontext_module,
             )
         ]
         if mllm:
@@ -236,6 +258,7 @@ def save_quantized_as_gguf(
                 backend,
                 model_type=convert_hf_to_gguf.ModelType.MMPROJ,
                 device=device,
+                quant_nontext_module=quant_nontext_module,
             )
         )
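
Both entry points keep their converter objects in the module-level gguf_model_instance_global list: a TEXT converter is always created, an MMPROJ converter is appended only when a multimodal projector is needed, and quant_nontext_module is now passed into create_model_class so the flag ends up on each converter instance (via wrapper_model_instance above). A reduced sketch of that lazy, create-once plumbing; the class and helper names here are simplified placeholders rather than the real converter API:

from dataclasses import dataclass
from typing import List


@dataclass
class FakeConverter:
    """Placeholder for the wrapped convert_hf_to_gguf model instance."""
    model_type: str
    quant_nontext_module: bool = False


_converters: List[FakeConverter] = []  # plays the role of gguf_model_instance_global


def get_converters(mllm: bool, quant_nontext_module: bool) -> List[FakeConverter]:
    """Create the converter list once and reuse it on later packing/export calls."""
    if not _converters:
        # The text converter is always built; the flag is attached to the instance.
        _converters.append(FakeConverter("TEXT", quant_nontext_module))
        if mllm:
            # Multimodal models also get an MMPROJ converter for the vision tower.
            _converters.append(FakeConverter("MMPROJ", quant_nontext_module))
    return _converters


# The first call builds the converters; later calls reuse them.
print([c.model_type for c in get_converters(mllm=True, quant_nontext_module=False)])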

auto_round/formats.py

Lines changed: 10 additions & 0 deletions
@@ -43,6 +43,8 @@
     SUPPORTED_FORMATS,
     check_to_quantized,
     copy_python_files_from_model_cache,
+    find_matching_blocks,
+    get_block_names,
     get_module,
     logger,
 )
@@ -647,6 +649,10 @@ def check_and_reset_format(self, ar):
         elif ar.bits >= 8 and ar.iters != 0:
             logger.warning_once("`iters=0` is recommended for bits>=8")

+        if getattr(ar, "quant_nontext_module", False):
+            # for gguf export, leave vl model for gguf itself
+            all_blocks = get_block_names(ar.model, False)
+            ar.quant_block_list = find_matching_blocks(ar.model, all_blocks, None)
         return super().check_and_reset_format(ar)

     def pack_layer(
@@ -661,6 +667,7 @@ def pack_layer(
         image_processor=None,
         model_type=ModelType.TEXT,
         device="cpu",
+        quant_nontext_module=False,
     ):
         from auto_round.export.export_to_gguf.export import pack_gguf_layer

@@ -675,6 +682,7 @@ def pack_layer(
             image_processor,
             model_type,
             device,
+            quant_nontext_module,
         )

     def save_quantized(
@@ -826,6 +834,7 @@ def immediate_pack(
         tokenizer=None,
         processor=None,
         image_processor=None,
+        quant_nontext_module: bool = False,
         **kwargs,
     ):
         m = get_module(model, name)
@@ -843,6 +852,7 @@ def immediate_pack(
             image_processor=image_processor,
             model_type=model_type,
             device=device,
+            quant_nontext_module=quant_nontext_module,
         )

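For GGUF export, check_and_reset_format now narrows the quantization scope whenever quant_nontext_module is set: get_block_names(ar.model, False) gathers only the text blocks (the same call the MLLM compressor makes with quant_nontext_module), and find_matching_blocks resets ar.quant_block_list to them, so AutoRound tunes the text decoder and leaves the vision blocks to the GGUF converter itself, which then decides their precision via the prepare_tensors gate above. A toy sketch of that narrowing step, with a stand-in helper instead of auto_round's get_block_names / find_matching_blocks:

from typing import Dict, List


def text_only_blocks(blocks: Dict[str, List[str]], include_nontext: bool) -> List[List[str]]:
    """Stand-in for get_block_names: optionally drop the non-text (vision) block groups."""
    groups = [blocks["text"]]
    if include_nontext:
        groups.append(blocks["vision"])
    return groups


model_blocks = {
    "text": ["model.layers.0", "model.layers.1"],
    "vision": ["visual.blocks.0", "visual.blocks.1"],
}

# With quant_nontext_module set for a GGUF format, the block list is reset to the
# text-only groups; the vision tower is exported by the GGUF converter instead.
quant_block_list = text_only_blocks(model_blocks, include_nontext=False)
print(quant_block_list)  # [['model.layers.0', 'model.layers.1']]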

test/test_cpu/export/test_gguf_format.py

Lines changed: 27 additions & 0 deletions
@@ -201,6 +201,7 @@ def test_vlm_gguf(self):
             iters=0,
             nsamples=8,
             disable_opt_rtn=True,
+            quant_nontext_module=True,
         )
         quantized_model_path = "./saved"
         autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
@@ -214,6 +215,32 @@ def test_vlm_gguf(self):
         shutil.rmtree("./saved", ignore_errors=True)
         shutil.rmtree(tiny_model_path, ignore_errors=True)

+    def test_vlm_gguf_wo_quant_nontext_module(self):
+        from ...helpers import save_tiny_model
+
+        model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct")
+        tiny_model_path = save_tiny_model(model_name, "./tmp/tiny_qwen_vl_model_path", num_layers=3, is_mllm=True)
+        from auto_round import AutoRoundMLLM
+
+        autoround = AutoRoundMLLM(
+            tiny_model_path,
+            iters=0,
+            nsamples=8,
+            disable_opt_rtn=True,
+            quant_nontext_module=False,
+        )
+        quantized_model_path = "./saved"
+        autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
+        assert "mmproj-model.gguf" in os.listdir("./saved")
+        for file_name in os.listdir(quantized_model_path):
+            file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2
+            if file_name == "mmproj-model.gguf":
+                assert abs(file_size - 361) < 5.0
+            else:
+                assert abs(file_size - 264) < 5.0
+        shutil.rmtree("./saved", ignore_errors=True)
+        shutil.rmtree(tiny_model_path, ignore_errors=True)
+
     def test_qtype_setting(self):
         # Qwen2.5-0.5B-Instruct no output, token_embed q6_k fallbakc to q8_0 336M
         # Qwen3-0.6B output q6_k, token_embed q4_0 448M
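
The new CPU test pins down the user-visible behavior: with quant_nontext_module=False the exporter still writes mmproj-model.gguf, but its tensors stay unquantized (the F32 fallback above), which is why the expected mmproj file is much larger (~361 MB) than the q4_0 text model (~264 MB). A hedged usage sketch of the same flow outside the test harness; the checkpoint path is a placeholder and the keyword arguments are the ones used in the test:

# Sketch only: assumes auto_round with GGUF export support and a local or
# downloadable vision-language checkpoint (placeholder path below).
from auto_round import AutoRoundMLLM

model_path = "Qwen/Qwen2-VL-2B-Instruct"  # placeholder VLM checkpoint

autoround = AutoRoundMLLM(
    model_path,
    iters=0,                     # no tuning iterations, as in the test
    nsamples=8,
    disable_opt_rtn=True,
    quant_nontext_module=False,  # keep the vision tower unquantized in mmproj-model.gguf
)
# Writes the q4_0 text model plus an mmproj-model.gguf holding the vision tower.
autoround.quantize_and_save(output_dir="./saved", format="gguf:q4_0")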

test/test_cuda/export/test_gguf.py

Lines changed: 1 addition & 0 deletions
@@ -175,6 +175,7 @@ def test_vlm_gguf(self):
             nsamples=32,
             iters=0,
             disable_opt_rtn=True,
+            quant_nontext_module=True,
         )
         quantized_model_path = "./saved"
         autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m")
