@@ -50,9 +50,8 @@ def test_gptq_format(self, dataloader):
5050 autoround .quantize_and_save (output_dir = quantized_model_path )
5151
5252 quantization_config = AutoRoundConfig (backend = "auto" )
53-
5453 model = AutoModelForCausalLM .from_pretrained (
55- quantized_model_path , device_map = "auto" , quantization_config = quantization_config
54+ quantized_model_path , device_map = self . device , quantization_config = quantization_config
5655 )
5756 tokenizer = AutoTokenizer .from_pretrained (quantized_model_path )
5857 text = "There is a girl who likes adventure,"
@@ -82,7 +81,6 @@ def test_awq_format(self, dataloader):
8281 autoround .quantize_and_save (output_dir = quantized_model_path , format = "auto_round:auto_awq" )
8382
8483 quantization_config = AutoRoundConfig (backend = "auto" )
85- # device_map="auto" doesn't work, must use "xpu"
8684 model = AutoModelForCausalLM .from_pretrained (
8785 quantized_model_path , device_map = self .device , quantization_config = quantization_config
8886 )
@@ -92,3 +90,135 @@ def test_awq_format(self, dataloader):
9290 res = tokenizer .decode (model .generate (** inputs , max_new_tokens = 50 )[0 ])
9391 print (res )
9492 assert "!!!" not in res
93+
@pytest.mark.parametrize(
    "scheme", ["W4A16", "W2A16", "W3A16", "W8A16", "MXFP4", "MXFP8", "NVFP4", "FPW8A16", "FP8_STATIC"]
)
def test_scheme(self, scheme, dataloader):
    """Quantize opt-125m with ``scheme``, export it as ``auto_round`` and reload it.

    Args:
        scheme: Name of the quantization scheme under test (parametrized).
        dataloader: Calibration dataset fixture forwarded to ``AutoRound``.

    The export directory is removed even when quantization or loading fails,
    so a failing scheme cannot leave stale files behind for the next
    parametrized case.
    """
    model_name = get_model_path("facebook/opt-125m")
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    ar = AutoRound(
        model=model_name,
        tokenizer=tokenizer,
        nsamples=32,
        seqlen=10,
        iters=1,
        device_map=self.device,
        scheme=scheme,
        dataset=dataloader,
    )
    quantized_model_path = "./saved"
    try:
        ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")

        # test loading
        if scheme != "FPW8A16":  # FPW8A16 group_size is 0
            # Only checks that loading does not raise; the model is not used further.
            AutoModelForCausalLM.from_pretrained(
                quantized_model_path,
                device_map=self.device,
            )
    finally:
        shutil.rmtree(quantized_model_path, ignore_errors=True)
122+
def test_vlm_model(self, dataloader):
    """Quantize Qwen2-VL-2B-Instruct to W4A16, reload it and run one image prompt.

    Args:
        dataloader: Calibration dataset fixture forwarded to ``AutoRound``.

    The test downloads a demo image over HTTP, so it needs network access.
    The generated description is printed, not asserted on. The export
    directory is removed whether or not the test succeeds.
    """
    # Optional dependencies are imported locally so only this test needs them.
    import requests
    from PIL import Image
    from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

    scheme = "W4A16"
    model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct")

    ar = AutoRound(
        model=model_name,
        nsamples=1,
        iters=0,
        seqlen=10,
        disable_opt_rtn=True,
        device_map=self.device,
        scheme=scheme,
        dataset=dataloader,
    )

    quantized_model_path = "./saved"
    try:
        ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")

        quantization_config = AutoRoundConfig(backend="auto")
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            quantized_model_path,
            torch_dtype="float16",
            device_map=self.device,
            quantization_config=quantization_config,
        )
        processor = AutoProcessor.from_pretrained(quantized_model_path)
        image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": image_url,
                    },
                    {"type": "text", "text": "Describe this image."},
                ],
            }
        ]

        # Preparation for inference
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # A timeout keeps a stalled download from hanging the test run, and
        # raise_for_status() surfaces HTTP errors instead of handing an error
        # page to PIL.
        response = requests.get(image_url, stream=True, timeout=30)
        response.raise_for_status()
        image_inputs = Image.open(response.raw)
        inputs = processor(
            text=[text],
            images=image_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(model.device)

        generated_ids = model.generate(**inputs, max_new_tokens=128)
        # Drop the prompt tokens so only the newly generated part is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        print(output_text[0])
    finally:
        shutil.rmtree(quantized_model_path, ignore_errors=True)
187+
def test_quant_lm_head(self, dataloader):
    """Quantize a model with a 4-bit ``lm_head``, reload it and generate text.

    Args:
        dataloader: Calibration dataset fixture forwarded to ``AutoRound``.

    The generated text is printed, not asserted on. The export directory is
    removed whether or not the test succeeds.
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer

    bits, sym, group_size = 4, True, 128
    # Note that, to save UT tuning time, the local model is intentionally kept
    # lightweight, using only 2 hidden layers.
    model_name = get_model_path("Qwen/Qwen3-8B")
    layer_config = {
        "lm_head": {"bits": 4},  # set lm_head quant
        # NOTE(review): presumably keeps layers matching "layer" at 16 bits so
        # only lm_head is quantized - confirm against AutoRound's matching rules.
        "layer": {"bits": 16},
    }

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    ar = AutoRound(
        model=model_name,
        tokenizer=tokenizer,
        bits=bits,
        group_size=group_size,
        sym=sym,
        nsamples=2,
        iters=0,
        seqlen=2,
        layer_config=layer_config,
        device_map=self.device,
        dataset=dataloader,
    )
    quantized_model_path = "./saved"
    try:
        ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")

        quantization_config = AutoRoundConfig(backend="auto")
        model = AutoModelForCausalLM.from_pretrained(
            quantized_model_path, device_map=self.device, quantization_config=quantization_config
        )
        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
        text = "There is a girl who likes adventure,"
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])
        print(res)
    finally:
        shutil.rmtree(quantized_model_path, ignore_errors=True)
0 commit comments