Skip to content

Commit 79fa1a9

Browse files
authored
fix packing nvfp/mxfp max_workers & extend xpu ut (#1555)
Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
Signed-off-by: chensuyue <suyue.chen@intel.com>
1 parent 62f92d1 commit 79fa1a9

File tree

4 files changed

+138
-5
lines changed

4 files changed

+138
-5
lines changed

.azure-pipelines/template/ut-template.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,11 @@ steps:
4646
- script: |
4747
if [ "${{ parameters.utScriptFileName }}" == "run_ut_xpu" ];then
4848
docker exec ${{ parameters.utContainerName }} bash -c "cd /auto-round \
49-
&& uv pip install torch==2.10.0 --index-url https://download.pytorch.org/whl/xpu \
49+
&& uv pip install torch==2.10.0 torchvision --index-url https://download.pytorch.org/whl/xpu \
50+
&& uv pip install torch==2.10.0 auto-round-lib \
5051
&& uv pip install -r requirements.txt \
5152
&& uv pip install -r test/test_ark/requirements.txt \
53+
&& uv pip install -r test/test_xpu/requirements.txt \
5254
&& cd /auto-round && uv pip install . \
5355
&& uv pip list"
5456
else

auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ def save_quantized_as_fp(
236236
quantization_config["extra_config"] = extra_config
237237
names = list(layer_config.keys())
238238
max_workers = 1
239-
if not torch.cuda.is_available() or not torch.xpu.is_available():
239+
if not torch.cuda.is_available() and not torch.xpu.is_available():
240240
max_workers = 2 ## 2 with cuda packing will cause hang occasionally
241241
with ThreadPoolExecutor(max_workers=max_workers) as executor:
242242
with tqdm(total=len(names), leave=True) as pbar:

test/test_xpu/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
pillow

test/test_xpu/test_autoround.py

Lines changed: 133 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,8 @@ def test_gptq_format(self, dataloader):
5050
autoround.quantize_and_save(output_dir=quantized_model_path)
5151

5252
quantization_config = AutoRoundConfig(backend="auto")
53-
5453
model = AutoModelForCausalLM.from_pretrained(
55-
quantized_model_path, device_map="auto", quantization_config=quantization_config
54+
quantized_model_path, device_map=self.device, quantization_config=quantization_config
5655
)
5756
tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
5857
text = "There is a girl who likes adventure,"
@@ -82,7 +81,6 @@ def test_awq_format(self, dataloader):
8281
autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq")
8382

8483
quantization_config = AutoRoundConfig(backend="auto")
85-
# device_map="auto" doesn't work, must use "xpu"
8684
model = AutoModelForCausalLM.from_pretrained(
8785
quantized_model_path, device_map=self.device, quantization_config=quantization_config
8886
)
@@ -92,3 +90,135 @@ def test_awq_format(self, dataloader):
9290
res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])
9391
print(res)
9492
assert "!!!" not in res
93+
94+
@pytest.mark.parametrize(
95+
"scheme", ["W4A16", "W2A16", "W3A16", "W8A16", "MXFP4", "MXFP8", "NVFP4", "FPW8A16", "FP8_STATIC"]
96+
)
97+
def test_scheme(self, scheme, dataloader):
98+
model_name = get_model_path("facebook/opt-125m")
99+
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
100+
101+
ar = AutoRound(
102+
model=model_name,
103+
tokenizer=tokenizer,
104+
nsamples=32,
105+
seqlen=10,
106+
iters=1,
107+
device_map=self.device,
108+
scheme=scheme,
109+
dataset=dataloader,
110+
)
111+
quantized_model_path = "./saved"
112+
ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")
113+
114+
# test loading
115+
if scheme not in ["FPW8A16"]: # FPW8A16 group_size is 0
116+
model = AutoModelForCausalLM.from_pretrained(
117+
quantized_model_path,
118+
device_map=self.device,
119+
)
120+
121+
shutil.rmtree(quantized_model_path, ignore_errors=True)
122+
123+
def test_vlm_model(self, dataloader):
124+
scheme = "W4A16"
125+
model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct")
126+
from transformers import AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration
127+
128+
fp32_model = Qwen2VLForConditionalGeneration.from_pretrained(model_name)
129+
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
130+
131+
ar = AutoRound(
132+
model=model_name,
133+
nsamples=1,
134+
iters=0,
135+
seqlen=10,
136+
disable_opt_rtn=True,
137+
device_map=self.device,
138+
scheme=scheme,
139+
dataset=dataloader,
140+
)
141+
142+
quantized_model_path = "./saved"
143+
ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")
144+
145+
quantization_config = AutoRoundConfig(backend="auto")
146+
import requests
147+
from PIL import Image
148+
149+
model = Qwen2VLForConditionalGeneration.from_pretrained(
150+
quantized_model_path,
151+
torch_dtype="float16",
152+
device_map=self.device,
153+
quantization_config=quantization_config,
154+
)
155+
processor = AutoProcessor.from_pretrained(quantized_model_path)
156+
image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
157+
messages = [
158+
{
159+
"role": "user",
160+
"content": [
161+
{
162+
"type": "image",
163+
"image": image_url,
164+
},
165+
{"type": "text", "text": "Describe this image."},
166+
],
167+
}
168+
]
169+
170+
# Preparation for inference
171+
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
172+
image_inputs = Image.open(requests.get(image_url, stream=True).raw)
173+
inputs = processor(
174+
text=[text],
175+
images=image_inputs,
176+
padding=True,
177+
return_tensors="pt",
178+
)
179+
inputs = inputs.to(model.device)
180+
181+
generated_ids = model.generate(**inputs, max_new_tokens=128)
182+
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
183+
output_text = processor.batch_decode(
184+
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
185+
)
186+
print(output_text[0])
187+
188+
def test_quant_lm_head(self, dataloader):
189+
bits, sym, group_size = 4, True, 128
190+
# Note that, to save UT tuning time, the local model is intentionally kept lightweight, using only 2 hidden layers.
191+
model_name = get_model_path("Qwen/Qwen3-8B")
192+
layer_config = {
193+
"lm_head": {"bits": 4}, # set lm_head quant
194+
"layer": {"bits": 16},
195+
}
196+
from transformers import AutoModelForCausalLM, AutoTokenizer
197+
198+
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
199+
200+
ar = AutoRound(
201+
model=model_name,
202+
tokenizer=tokenizer,
203+
bits=bits,
204+
group_size=group_size,
205+
sym=sym,
206+
nsamples=2,
207+
iters=0,
208+
seqlen=2,
209+
layer_config=layer_config,
210+
device_map=self.device,
211+
dataset=dataloader,
212+
)
213+
quantized_model_path = "./saved"
214+
ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")
215+
216+
quantization_config = AutoRoundConfig(backend="auto")
217+
model = AutoModelForCausalLM.from_pretrained(
218+
quantized_model_path, device_map=self.device, quantization_config=quantization_config
219+
)
220+
tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
221+
text = "There is a girl who likes adventure,"
222+
inputs = tokenizer(text, return_tensors="pt").to(model.device)
223+
res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])
224+
print(res)

0 commit comments

Comments
 (0)