Skip to content

Commit 79fa1a9

Browse files
authored
fix packing nvfp/mxfp max_workers & extend xpu ut (#1555)
Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
Signed-off-by: chensuyue <suyue.chen@intel.com>
1 parent 62f92d1 commit 79fa1a9

File tree

4 files changed

+138
-5
lines changed

4 files changed

+138
-5
lines changed

.azure-pipelines/template/ut-template.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,11 @@ steps:
4646
- script: |
4747
if [ "${{ parameters.utScriptFileName }}" == "run_ut_xpu" ];then
4848
docker exec ${{ parameters.utContainerName }} bash -c "cd /auto-round \
49-
&& uv pip install torch==2.10.0 --index-url https://download.pytorch.org/whl/xpu \
49+
&& uv pip install torch==2.10.0 torchvision --index-url https://download.pytorch.org/whl/xpu \
50+
&& uv pip install torch==2.10.0 auto-round-lib \
5051
&& uv pip install -r requirements.txt \
5152
&& uv pip install -r test/test_ark/requirements.txt \
53+
&& uv pip install -r test/test_xpu/requirements.txt \
5254
&& cd /auto-round && uv pip install . \
5355
&& uv pip list"
5456
else

auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ def save_quantized_as_fp(
236236
quantization_config["extra_config"] = extra_config
237237
names = list(layer_config.keys())
238238
max_workers = 1
239-
if not torch.cuda.is_available() or not torch.xpu.is_available():
239+
if not torch.cuda.is_available() and not torch.xpu.is_available():
240240
max_workers = 2 ## 2 with cuda packing will cause hang occasionally
241241
with ThreadPoolExecutor(max_workers=max_workers) as executor:
242242
with tqdm(total=len(names), leave=True) as pbar:

test/test_xpu/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
pillow

test/test_xpu/test_autoround.py

Lines changed: 133 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,8 @@ def test_gptq_format(self, dataloader):
5050
autoround.quantize_and_save(output_dir=quantized_model_path)
5151

5252
quantization_config = AutoRoundConfig(backend="auto")
53-
5453
model = AutoModelForCausalLM.from_pretrained(
55-
quantized_model_path, device_map="auto", quantization_config=quantization_config
54+
quantized_model_path, device_map=self.device, quantization_config=quantization_config
5655
)
5756
tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
5857
text = "There is a girl who likes adventure,"
@@ -82,7 +81,6 @@ def test_awq_format(self, dataloader):
8281
autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq")
8382

8483
quantization_config = AutoRoundConfig(backend="auto")
85-
# device_map="auto" doesn't work, must use "xpu"
8684
model = AutoModelForCausalLM.from_pretrained(
8785
quantized_model_path, device_map=self.device, quantization_config=quantization_config
8886
)
@@ -92,3 +90,135 @@ def test_awq_format(self, dataloader):
9290
res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])
9391
print(res)
9492
assert "!!!" not in res
93+
94+
@pytest.mark.parametrize(
95+
"scheme", ["W4A16", "W2A16", "W3A16", "W8A16", "MXFP4", "MXFP8", "NVFP4", "FPW8A16", "FP8_STATIC"]
96+
)
97+
def test_scheme(self, scheme, dataloader):
98+
model_name = get_model_path("facebook/opt-125m")
99+
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
100+
101+
ar = AutoRound(
102+
model=model_name,
103+
tokenizer=tokenizer,
104+
nsamples=32,
105+
seqlen=10,
106+
iters=1,
107+
device_map=self.device,
108+
scheme=scheme,
109+
dataset=dataloader,
110+
)
111+
quantized_model_path = "./saved"
112+
ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")
113+
114+
# test loading
115+
if scheme not in ["FPW8A16"]: # FPW8A16 group_size is 0
116+
model = AutoModelForCausalLM.from_pretrained(
117+
quantized_model_path,
118+
device_map=self.device,
119+
)
120+
121+
shutil.rmtree(quantized_model_path, ignore_errors=True)
122+
123+
def test_vlm_model(self, dataloader):
124+
scheme = "W4A16"
125+
model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct")
126+
from transformers import AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration
127+
128+
fp32_model = Qwen2VLForConditionalGeneration.from_pretrained(model_name)
129+
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
130+
131+
ar = AutoRound(
132+
model=model_name,
133+
nsamples=1,
134+
iters=0,
135+
seqlen=10,
136+
disable_opt_rtn=True,
137+
device_map=self.device,
138+
scheme=scheme,
139+
dataset=dataloader,
140+
)
141+
142+
quantized_model_path = "./saved"
143+
ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")
144+
145+
quantization_config = AutoRoundConfig(backend="auto")
146+
import requests
147+
from PIL import Image
148+
149+
model = Qwen2VLForConditionalGeneration.from_pretrained(
150+
quantized_model_path,
151+
torch_dtype="float16",
152+
device_map=self.device,
153+
quantization_config=quantization_config,
154+
)
155+
processor = AutoProcessor.from_pretrained(quantized_model_path)
156+
image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
157+
messages = [
158+
{
159+
"role": "user",
160+
"content": [
161+
{
162+
"type": "image",
163+
"image": image_url,
164+
},
165+
{"type": "text", "text": "Describe this image."},
166+
],
167+
}
168+
]
169+
170+
# Preparation for inference
171+
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
172+
image_inputs = Image.open(requests.get(image_url, stream=True).raw)
173+
inputs = processor(
174+
text=[text],
175+
images=image_inputs,
176+
padding=True,
177+
return_tensors="pt",
178+
)
179+
inputs = inputs.to(model.device)
180+
181+
generated_ids = model.generate(**inputs, max_new_tokens=128)
182+
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
183+
output_text = processor.batch_decode(
184+
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
185+
)
186+
print(output_text[0])
187+
188+
def test_quant_lm_head(self, dataloader):
189+
bits, sym, group_size = 4, True, 128
190+
# Note that, to save UT tuning time, the local model is intentionally kept lightweight, using only 2 hidden layers.
191+
model_name = get_model_path("Qwen/Qwen3-8B")
192+
layer_config = {
193+
"lm_head": {"bits": 4}, # set lm_head quant
194+
"layer": {"bits": 16},
195+
}
196+
from transformers import AutoModelForCausalLM, AutoTokenizer
197+
198+
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
199+
200+
ar = AutoRound(
201+
model=model_name,
202+
tokenizer=tokenizer,
203+
bits=bits,
204+
group_size=group_size,
205+
sym=sym,
206+
nsamples=2,
207+
iters=0,
208+
seqlen=2,
209+
layer_config=layer_config,
210+
device_map=self.device,
211+
dataset=dataloader,
212+
)
213+
quantized_model_path = "./saved"
214+
ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")
215+
216+
quantization_config = AutoRoundConfig(backend="auto")
217+
model = AutoModelForCausalLM.from_pretrained(
218+
quantized_model_path, device_map=self.device, quantization_config=quantization_config
219+
)
220+
tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
221+
text = "There is a girl who likes adventure,"
222+
inputs = tokenizer(text, return_tensors="pt").to(model.device)
223+
res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])
224+
print(res)

0 commit comments

Comments
 (0)