diff --git a/test/3x/torch/quantization/test_autoround.py b/test/3x/torch/quantization/test_autoround.py
index 1a40c0442ab..99a6bb909bf 100644
--- a/test/3x/torch/quantization/test_autoround.py
+++ b/test/3x/torch/quantization/test_autoround.py
@@ -88,8 +88,8 @@ def setup_class(self):
         self.gptj = transformers.AutoModelForCausalLM.from_pretrained(
             "hf-internal-testing/tiny-random-GPTJForCausalLM",
             torchscript=True,
-        )
-        self.inp = torch.ones([1, 10], dtype=torch.long)
+        ).to("cpu")
+        self.inp = torch.ones([1, 10], dtype=torch.long, device="cpu")
         self.tokenizer = transformers.AutoTokenizer.from_pretrained(
             "hf-internal-testing/tiny-random-GPTJForCausalLM", trust_remote_code=True
         )
@@ -110,18 +110,18 @@ def test_autoround(self, quant_lm_head):
         # AutoRound does not yet support the actual use of quant_lm_head
         # https://github.com/intel/auto-round/blob/7b8e280f5b789fe861fe95eac971de0805ce4c62/auto_round/compressors/base.py#L438-L442
         fp32_model = copy.deepcopy(self.gptj)
-        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, amp=False ,scale_dtype="fp32")
+        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, amp=False, scale_dtype="fp32", device_map="cpu")
         if quant_lm_head is True:
             layer_config = {"lm_head": {"data_type": "int"}}
             quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, amp=False ,scale_dtype="fp32",
-                                           quant_lm_head=quant_lm_head, layer_config=layer_config)
+                                           quant_lm_head=quant_lm_head, layer_config=layer_config, device_map="cpu")

         logger.info(f"Test AutoRound with config {quant_config}")

         # prepare + convert API
         model = prepare(model=fp32_model, quant_config=quant_config)
         run_fn(model, self.dataloader)
-        q_model = convert(model)
+        q_model = convert(model=model, quant_config=quant_config)
         out = q_model(self.inp)[0]
         assert torch.allclose(out, self.label, atol=1e-1)
         assert "transformer.h.0.attn.k_proj" in q_model.autoround_config.keys()
@@ -133,7 +133,7 @@ def test_autoround(self, quant_lm_head):

     def test_int4_dtype(self):
         fp32_model = copy.deepcopy(self.gptj)
-        quant_config = AutoRoundConfig(dtype="int4", nsamples=32, seqlen=10, iters=10, amp=False ,scale_dtype="fp32")
+        quant_config = AutoRoundConfig(dtype="int4", nsamples=32, seqlen=10, iters=10, amp=False, scale_dtype="fp32", device_map="cpu")
         logger.info(f"Test AutoRound with config {quant_config}")

         # prepare + convert API
@@ -152,7 +152,7 @@ def test_autoround_with_quantize_API(self):
         gpt_j_model = copy.deepcopy(self.gptj)

         quant_config = AutoRoundConfig(dtype="int", bits=4, act_dtype="int", act_bits=32,nsamples=32, seqlen=10,
-                                       iters=10, use_sym=False, group_size=128, amp=False ,scale_dtype="fp32")
+                                       iters=10, use_sym=False, group_size=128, amp=False, scale_dtype="fp32", device_map="cpu")
         quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32"))

         logger.info(f"Test AutoRound with config {quant_config}")
@@ -169,11 +169,11 @@ def test_autoround_with_quantize_API(self):
         assert isinstance(q_model.transformer.h[0].attn.k_proj, WeightOnlyLinear), "packing model failed."

     def test_conv1d(self):
-        model = AutoModelForCausalLM.from_pretrained("MBZUAI/LaMini-GPT-124M", device_map="auto", trust_remote_code=True)
+        model = AutoModelForCausalLM.from_pretrained("MBZUAI/LaMini-GPT-124M", device_map="cpu", trust_remote_code=True)
         tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-GPT-124M", trust_remote_code=True)
         text = "Replace me by any text you'd like."
         encoded_input = tokenizer(text, return_tensors="pt")
-        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=0,tokenizer=tokenizer,export_format="auto_round")
+        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=0, tokenizer=tokenizer, export_format="auto_round", device_map="cpu")
         model = prepare(model=model, quant_config=quant_config)
         q_model = convert(model)
         output = tokenizer.decode(q_model.generate(**encoded_input, max_new_tokens=10)[0])
@@ -191,10 +191,10 @@ def test_utils(self):
         fp32_model = copy.deepcopy(self.gptj)
         to_quant_block_names = get_multimodal_block_names(fp32_model, quant_vision=True)
         quant_config = AutoRoundConfig(
-            nsamples=32, seqlen=10, iters=10, amp=False ,scale_dtype="fp16", to_quant_block_names=to_quant_block_names
+            nsamples=32, seqlen=10, iters=10, amp=False, scale_dtype="fp16", to_quant_block_names=to_quant_block_names, device_map="cpu",
         )
         logger.info(f"Test AutoRound with config {quant_config}")
-        device = detect_device("auto")
+        device = "cpu"
         layers_list = get_layer_names_in_block(fp32_model, to_quant_block_names=to_quant_block_names)
         layers_list = get_layer_names_in_block(fp32_model)
         fp32_model.to(device)
@@ -216,7 +216,7 @@ def test_mllm(self):
         model_name = "Qwen/Qwen2-VL-2B-Instruct"
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
-        model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True, device_map="auto")
+        model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True, device_map="cpu")
         dataloader, template, truncation, batch_size, gradient_accumulate_steps, seqlen, nsamples = get_mllm_dataloader(
             template=None,
             model=model,
@@ -245,6 +245,7 @@ def test_mllm(self):
             quant_nontext_module=True,
             truncation=truncation,
             gradient_accumulate_steps=gradient_accumulate_steps,
+            device_map="cpu",
         )

         model = prepare(model=model, quant_config=quant_config)
@@ -271,15 +272,15 @@ def test_set_local(self):
         fp32_model = AutoModelForCausalLM.from_pretrained(
             "facebook/opt-125m",
             torchscript=True,
-            device_map="auto",
+            device_map="cpu",
         )
-        inp = torch.ones([1, 10], dtype=torch.long)
+        inp = torch.ones([1, 10], dtype=torch.long, device="cpu")
         output_dir = "./saved_inc"
         tokenizer = AutoTokenizer.from_pretrained(
             "facebook/opt-125m", trust_remote_code=True)
         quant_config = AutoRoundConfig(
             tokenizer=tokenizer, output_dir=output_dir,
-            dtype="int4", nsamples=32, seqlen=10, iters=0, amp=False ,scale_dtype="fp32", export_format="auto_round")
+            dtype="int4", nsamples=32, seqlen=10, iters=0, amp=False, scale_dtype="fp32", export_format="auto_round", device_map="cpu")
         logger.info(f"Test AutoRound with config {quant_config}")
         quant_config.set_local("self.attn", AutoRoundConfig(dtype="fp16"))
         # {"self_attn": {"bits": 4, "data_type": "nv_fp", "act_bits": 16, "group_size": 16}}
@@ -290,7 +291,7 @@ def test_set_local(self):
         model = AutoModelForCausalLM.from_pretrained(
             output_dir,
             torch_dtype="auto",
-            device_map="auto",
+            device_map="cpu",
         )
         out = model(self.inp)[0]
         assert isinstance(q_model.model.decoder.layers[0].self_attn.v_proj, torch.nn.Linear), "set_local failed."
@@ -299,22 +300,22 @@ def test_set_local(self):
         fp32_model = transformers.AutoModelForCausalLM.from_pretrained(
             "facebook/opt-125m",
             torchscript=True,
-            device_map="auto",
+            device_map="cpu",
         )
-        inp = torch.ones([1, 10], dtype=torch.long)
+        inp = torch.ones([1, 10], dtype=torch.long, device="cpu")
         tokenizer = transformers.AutoTokenizer.from_pretrained(
             "facebook/opt-125m", trust_remote_code=True)
         from auto_round import AutoRound
         layer_config = {"self.attn":{"data_type":"fp16"}}
         ar = AutoRound(
             tokenizer=tokenizer, model=fp32_model, layer_config=layer_config,
-            data_type="int4", nsamples=32, seqlen=10, iters=0, amp=False ,scale_dtype="fp32", export_format="auto_round")
+            data_type="int4", nsamples=32, seqlen=10, iters=0, amp=False, scale_dtype="fp32", export_format="auto_round", device_map="cpu")
         quantized_model_path = "./saved_ar"
         ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path,
             torch_dtype="auto",
-            device_map="auto",
+            device_map="cpu",
         )
         out_ar = model(inp)[0]
         assert torch.all(out_ar.eq(out))
@@ -328,9 +329,9 @@ def test_scheme(self, scheme):
         fp32_model = AutoModelForCausalLM.from_pretrained(
             "facebook/opt-125m",
             torchscript=True,
-            device_map="auto",
+            device_map="cpu",
         )
-        inp = torch.ones([1, 10], dtype=torch.long)
+        inp = torch.ones([1, 10], dtype=torch.long, device="cpu")
         tokenizer = AutoTokenizer.from_pretrained(
             "facebook/opt-125m", trust_remote_code=True)

@@ -345,6 +346,7 @@ def test_scheme(self, scheme):
             scheme=scheme,
             export_format="auto_round",
             output_dir=output_dir,  # default is "temp_auto_round"
+            device_map="cpu",
         )

         # quantizer execute
@@ -355,7 +357,7 @@ def test_scheme(self, scheme):
         inc_model = AutoModelForCausalLM.from_pretrained(
             output_dir,
             torch_dtype="auto",
-            device_map="auto",
+            device_map="cpu",
         )
         out = inc_model(inp)[0]

@@ -363,9 +365,9 @@ def test_scheme(self, scheme):
         fp32_model = transformers.AutoModelForCausalLM.from_pretrained(
             "facebook/opt-125m",
             torchscript=True,
-            device_map="auto",
+            device_map="cpu",
         )
-        inp = torch.ones([1, 10], dtype=torch.long)
+        inp = torch.ones([1, 10], dtype=torch.long, device="cpu")
         tokenizer = transformers.AutoTokenizer.from_pretrained(
             "facebook/opt-125m", trust_remote_code=True)
         from auto_round import AutoRound
@@ -378,13 +380,14 @@ def test_scheme(self, scheme):
             amp=False,
             scale_dtype="fp16",
             scheme=scheme,
+            device_map="cpu",
         )
         quantized_model_path = "./saved_ar"
         ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path,
             torch_dtype="auto",
-            device_map="auto",
+            device_map="cpu",
         )
         tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         out_ar = model(inp)[0]
@@ -399,7 +402,7 @@ def test_target_bits(self):
         fp32_model = AutoModelForCausalLM.from_pretrained(
             "facebook/opt-125m",
             torchscript=True,
-            device_map="auto",
+            device_map="cpu",
         )
         tokenizer = AutoTokenizer.from_pretrained(
             "facebook/opt-125m", trust_remote_code=True)
@@ -415,6 +418,7 @@ def test_target_bits(self):
             enable_torch_compile=True,
             low_gpu_mem_usage=True,
             export_format="auto_round",
+            device_map="cpu",
         )
         # quantizer execute
         model = prepare(model=fp32_model, quant_config=quant_config)
@@ -441,7 +445,7 @@ def eval_acc_fn(model) -> float:
         fp32_model = AutoModelForCausalLM.from_pretrained(
             "facebook/opt-125m",
             torchscript=True,
-            device_map="auto",
+            device_map="cpu",
         )
         tokenizer = AutoTokenizer.from_pretrained(
             "facebook/opt-125m",
             trust_remote_code=True)
@@ -456,6 +460,7 @@ def eval_acc_fn(model) -> float:
                 low_gpu_mem_usage=True,
                 export_format="auto_round",
                 iters=0,
+                device_map="cpu",
             )
         ]
     )
@@ -470,7 +475,7 @@ def test_static_attention_dtype(self):
         fp32_model = AutoModelForCausalLM.from_pretrained(
             "facebook/opt-125m",
             torchscript=True,
-            device_map="auto",
+            device_map="cpu",
         )
         tokenizer = AutoTokenizer.from_pretrained(
             "facebook/opt-125m", trust_remote_code=True)
@@ -485,6 +490,7 @@ def test_static_attention_dtype(self):
             static_attention_dtype="fp8",
             output_dir=output_dir,
             export_format="auto_round",
+            device_map="cpu",
         )
         # quantizer execute
         model = prepare(model=fp32_model, quant_config=quant_config)
@@ -510,7 +516,7 @@ def test_static_afp8_export(self, static_kv_dtype):
         fp32_model = AutoModelForCausalLM.from_pretrained(
             "facebook/opt-125m",
             torchscript=True,
-            device_map="auto",
+            device_map="cpu",
         )
         tokenizer = AutoTokenizer.from_pretrained(
             "facebook/opt-125m", trust_remote_code=True)
@@ -531,6 +537,7 @@ def test_static_afp8_export(self, static_kv_dtype):
             static_kv_dtype=static_kv_dtype,
             export_format="auto_round",
             output_dir=output_dir,
+            device_map="cpu",
         )

         # quantizer execute
@@ -550,6 +557,7 @@ def test_static_afp8_export(self, static_kv_dtype):
         model = transformers.AutoModelForCausalLM.from_pretrained(
             output_dir,
             torch_dtype="auto",
+            device_map="cpu",
             low_cpu_mem_usage=True,
             trust_remote_code=True,
         )
@@ -605,6 +613,7 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype):
             export_format="auto_round",
             output_dir=output_dir,
             reloading=False,
+            device_map="cpu",
         )

         # quantizer execute
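
Every hunk above makes the same change: replace device_map="auto" (or an implicit default device) with an explicit device_map="cpu", and pin the calibration inputs with device="cpu", so the tests run deterministically on CPU-only runners. A minimal sketch of the resulting prepare/convert flow, using only names that appear in the diff; the import path assumes the INC 3.x PyTorch quantization API that this test file exercises:

    import torch
    import transformers
    from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare

    # Load the tiny test model and keep it, the calibration input, and the
    # quantizer itself on CPU.
    model = transformers.AutoModelForCausalLM.from_pretrained(
        "hf-internal-testing/tiny-random-GPTJForCausalLM",
        torchscript=True,
    ).to("cpu")
    inp = torch.ones([1, 10], dtype=torch.long, device="cpu")

    quant_config = AutoRoundConfig(
        nsamples=32, seqlen=10, iters=10, amp=False, scale_dtype="fp32",
        device_map="cpu",  # the knob this diff threads through every test
    )

    model = prepare(model=model, quant_config=quant_config)
    model(inp)  # calibration; the tests drive this via run_fn(model, dataloader)
    q_model = convert(model=model, quant_config=quant_config)
    out = q_model(inp)[0]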
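The test_set_local and test_scheme hunks additionally exercise the auto_round library directly, passing the same device_map="cpu" to AutoRound itself. A companion sketch of that path, again restricted to calls visible in the diff:

    import transformers
    from auto_round import AutoRound

    fp32_model = transformers.AutoModelForCausalLM.from_pretrained(
        "facebook/opt-125m",
        torchscript=True,
        device_map="cpu",
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        "facebook/opt-125m", trust_remote_code=True)

    # device_map="cpu" mirrors the INC-side config so both quantization
    # paths produce comparable CPU-only results.
    ar = AutoRound(
        tokenizer=tokenizer, model=fp32_model,
        data_type="int4", nsamples=32, seqlen=10, iters=0,
        amp=False, scale_dtype="fp32", export_format="auto_round",
        device_map="cpu",
    )
    ar.quantize_and_save(output_dir="./saved_ar", inplace=True, format="auto_round")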