69 changes: 39 additions & 30 deletions test/3x/torch/quantization/test_autoround.py
@@ -88,8 +88,8 @@ def setup_class(self):
self.gptj = transformers.AutoModelForCausalLM.from_pretrained(
"hf-internal-testing/tiny-random-GPTJForCausalLM",
torchscript=True,
- )
- self.inp = torch.ones([1, 10], dtype=torch.long)
+ ).to("cpu")
+ self.inp = torch.ones([1, 10], dtype=torch.long, device="cpu")
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
"hf-internal-testing/tiny-random-GPTJForCausalLM", trust_remote_code=True
)
@@ -110,18 +110,18 @@ def test_autoround(self, quant_lm_head):
# AutoRound does not yet support the actual use of quant_lm_head
# https://github.com/intel/auto-round/blob/7b8e280f5b789fe861fe95eac971de0805ce4c62/auto_round/compressors/base.py#L438-L442
fp32_model = copy.deepcopy(self.gptj)
- quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, amp=False ,scale_dtype="fp32")
+ quant_config = AutoRoundConfig( nsamples=32, seqlen=10, iters=10, amp=False, scale_dtype="fp32", device_map="cpu")
if quant_lm_head is True:
layer_config = {"lm_head": {"data_type": "int"}}
quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, amp=False ,scale_dtype="fp32",
- quant_lm_head=quant_lm_head, layer_config=layer_config)
+ quant_lm_head=quant_lm_head, layer_config=layer_config, device_map="cpu")
logger.info(f"Test AutoRound with config {quant_config}")

# prepare + convert API
model = prepare(model=fp32_model, quant_config=quant_config)

run_fn(model, self.dataloader)
- q_model = convert(model)
+ q_model = convert(model=model, quant_config=quant_config)
out = q_model(self.inp)[0]
assert torch.allclose(out, self.label, atol=1e-1)
assert "transformer.h.0.attn.k_proj" in q_model.autoround_config.keys()
@@ -133,7 +133,7 @@ def test_autoround(self, quant_lm_head):

def test_int4_dtype(self):
fp32_model = copy.deepcopy(self.gptj)
- quant_config = AutoRoundConfig(dtype="int4", nsamples=32, seqlen=10, iters=10, amp=False ,scale_dtype="fp32")
+ quant_config = AutoRoundConfig( dtype="int4", nsamples=32, seqlen=10, iters=10, amp=False, scale_dtype="fp32", device_map="cpu")
logger.info(f"Test AutoRound with config {quant_config}")

# prepare + convert API
@@ -152,7 +152,7 @@ def test_autoround_with_quantize_API(self):
gpt_j_model = copy.deepcopy(self.gptj)

quant_config = AutoRoundConfig(dtype="int", bits=4, act_dtype="int", act_bits=32,nsamples=32, seqlen=10,
- iters=10, use_sym=False, group_size=128, amp=False ,scale_dtype="fp32")
+ iters=10, use_sym=False, group_size=128, amp=False, scale_dtype="fp32", device_map="cpu")
quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32"))

logger.info(f"Test AutoRound with config {quant_config}")
@@ -169,11 +169,11 @@ def test_autoround_with_quantize_API(self):
assert isinstance(q_model.transformer.h[0].attn.k_proj, WeightOnlyLinear), "packing model failed."

def test_conv1d(self):
- model = AutoModelForCausalLM.from_pretrained("MBZUAI/LaMini-GPT-124M", device_map="auto", trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained("MBZUAI/LaMini-GPT-124M", device_map="cpu", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-GPT-124M", trust_remote_code=True)
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors="pt")
- quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=0,tokenizer=tokenizer,export_format="auto_round")
+ quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=0, tokenizer=tokenizer, export_format="auto_round", device_map="cpu")
model = prepare(model=model, quant_config=quant_config)
q_model = convert(model)
output = tokenizer.decode(q_model.generate(**encoded_input, max_new_tokens=10)[0])
@@ -191,10 +191,10 @@ def test_utils(self):
fp32_model = copy.deepcopy(self.gptj)
to_quant_block_names = get_multimodal_block_names(fp32_model, quant_vision=True)
quant_config = AutoRoundConfig(
- nsamples=32, seqlen=10, iters=10, amp=False ,scale_dtype="fp16", to_quant_block_names=to_quant_block_names
+ nsamples=32, seqlen=10, iters=10, amp=False ,scale_dtype="fp16", to_quant_block_names=to_quant_block_names, device_map="cpu",
)
logger.info(f"Test AutoRound with config {quant_config}")
- device = detect_device("auto")
+ device = "cpu"
layers_list = get_layer_names_in_block(fp32_model, to_quant_block_names=to_quant_block_names)
layers_list = get_layer_names_in_block(fp32_model)
fp32_model.to(device)
@@ -216,7 +216,7 @@ def test_mllm(self):
model_name = "Qwen/Qwen2-VL-2B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
- model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True, device_map="auto")
+ model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True, device_map="cpu")
dataloader, template, truncation, batch_size, gradient_accumulate_steps, seqlen, nsamples = get_mllm_dataloader(
template=None,
model=model,
@@ -245,6 +245,7 @@ def test_mllm(self):
quant_nontext_module=True,
truncation=truncation,
gradient_accumulate_steps=gradient_accumulate_steps,
device_map="cpu",
)

model = prepare(model=model, quant_config=quant_config)
@@ -271,15 +272,15 @@ def test_set_local(self):
fp32_model = AutoModelForCausalLM.from_pretrained(
"facebook/opt-125m",
torchscript=True,
device_map="auto",
device_map="cpu",
)
- inp = torch.ones([1, 10], dtype=torch.long)
+ inp = torch.ones([1, 10], dtype=torch.long, device='cpu')
output_dir = "./saved_inc"
tokenizer = AutoTokenizer.from_pretrained(
"facebook/opt-125m", trust_remote_code=True)
quant_config = AutoRoundConfig(
tokenizer=tokenizer, output_dir=output_dir,
dtype="int4", nsamples=32, seqlen=10, iters=0, amp=False ,scale_dtype="fp32", export_format="auto_round")
dtype="int4", nsamples=32, seqlen=10, iters=0, amp=False ,scale_dtype="fp32", export_format="auto_round", device_map="cpu")
logger.info(f"Test AutoRound with config {quant_config}")
quant_config.set_local("self.attn", AutoRoundConfig(dtype="fp16"))
# {"self_attn": {"bits": 4, "data_type": "nv_fp", "act_bits": 16, "group_size": 16}}
@@ -290,7 +291,7 @@ def test_set_local(self):
model = AutoModelForCausalLM.from_pretrained(
output_dir,
torch_dtype="auto",
device_map="auto",
device_map="cpu",
)
out = model(self.inp)[0]
assert isinstance(q_model.model.decoder.layers[0].self_attn.v_proj, torch.nn.Linear), "set_local failed."
@@ -299,22 +300,22 @@ def test_set_local(self):
fp32_model = transformers.AutoModelForCausalLM.from_pretrained(
"facebook/opt-125m",
torchscript=True,
device_map="auto",
device_map="cpu",
)
- inp = torch.ones([1, 10], dtype=torch.long)
+ inp = torch.ones([1, 10], dtype=torch.long, device='cpu')
tokenizer = transformers.AutoTokenizer.from_pretrained(
"facebook/opt-125m", trust_remote_code=True)
from auto_round import AutoRound
layer_config = {"self.attn":{"data_type":"fp16"}}
ar = AutoRound(
tokenizer=tokenizer, model=fp32_model, layer_config=layer_config,
data_type="int4", nsamples=32, seqlen=10, iters=0, amp=False ,scale_dtype="fp32", export_format="auto_round")
data_type="int4", nsamples=32, seqlen=10, iters=0, amp=False ,scale_dtype="fp32", export_format="auto_round", device_map="cpu")
quantized_model_path = "./saved_ar"
ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")
model = AutoModelForCausalLM.from_pretrained(
quantized_model_path,
torch_dtype="auto",
device_map="auto",
device_map="cpu",
)
out_ar = model(inp)[0]
assert torch.all(out_ar.eq(out))
@@ -328,9 +329,9 @@ def test_scheme(self, scheme):
fp32_model = AutoModelForCausalLM.from_pretrained(
"facebook/opt-125m",
torchscript=True,
device_map="auto",
device_map="cpu",
)
- inp = torch.ones([1, 10], dtype=torch.long)
+ inp = torch.ones([1, 10], dtype=torch.long, device='cpu')
tokenizer = AutoTokenizer.from_pretrained(
"facebook/opt-125m", trust_remote_code=True)

@@ -345,6 +346,7 @@ def test_scheme(self, scheme):
scheme=scheme,
export_format="auto_round",
output_dir=output_dir, # default is "temp_auto_round"
device_map="cpu",
)

# quantizer execute
@@ -355,17 +357,17 @@ def test_scheme(self, scheme):
inc_model = AutoModelForCausalLM.from_pretrained(
output_dir,
torch_dtype="auto",
device_map="auto",
device_map="cpu",
)
out = inc_model(inp)[0]

# AutoRound API
fp32_model = transformers.AutoModelForCausalLM.from_pretrained(
"facebook/opt-125m",
torchscript=True,
device_map="auto",
device_map="cpu",
)
- inp = torch.ones([1, 10], dtype=torch.long)
+ inp = torch.ones([1, 10], dtype=torch.long, device='cpu')
tokenizer = transformers.AutoTokenizer.from_pretrained(
"facebook/opt-125m", trust_remote_code=True)
from auto_round import AutoRound
@@ -378,13 +380,14 @@ def test_scheme(self, scheme):
amp=False,
scale_dtype="fp16",
scheme=scheme,
device_map="cpu",
)
quantized_model_path = "./saved_ar"
ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")
model = AutoModelForCausalLM.from_pretrained(
quantized_model_path,
torch_dtype="auto",
device_map="auto",
device_map="cpu",
)
tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
out_ar = model(inp)[0]
@@ -399,7 +402,7 @@ def test_target_bits(self):
fp32_model = AutoModelForCausalLM.from_pretrained(
"facebook/opt-125m",
torchscript=True,
device_map="auto",
device_map="cpu",
)
tokenizer = AutoTokenizer.from_pretrained(
"facebook/opt-125m", trust_remote_code=True)
@@ -415,6 +418,7 @@ def test_target_bits(self):
enable_torch_compile=True,
low_gpu_mem_usage=True,
export_format="auto_round",
device_map="cpu",
)
# quantizer execute
model = prepare(model=fp32_model, quant_config=quant_config)
@@ -441,7 +445,7 @@ def eval_acc_fn(model) -> float:
fp32_model = AutoModelForCausalLM.from_pretrained(
"facebook/opt-125m",
torchscript=True,
device_map="auto",
device_map="cpu",
)
tokenizer = AutoTokenizer.from_pretrained(
"facebook/opt-125m", trust_remote_code=True)
@@ -456,6 +460,7 @@ def eval_acc_fn(model) -> float:
low_gpu_mem_usage=True,
export_format="auto_round",
iters=0,
device_map="cpu",
)
]
)
@@ -470,7 +475,7 @@ def test_static_attention_dtype(self):
fp32_model = AutoModelForCausalLM.from_pretrained(
"facebook/opt-125m",
torchscript=True,
device_map="auto",
device_map="cpu",
)
tokenizer = AutoTokenizer.from_pretrained(
"facebook/opt-125m", trust_remote_code=True)
@@ -485,6 +490,7 @@ def test_static_attention_dtype(self):
static_attention_dtype="fp8",
output_dir=output_dir,
export_format="auto_round",
device_map="cpu",
)
# quantizer execute
model = prepare(model=fp32_model, quant_config=quant_config)
@@ -510,7 +516,7 @@ def test_static_afp8_export(self, static_kv_dtype):
fp32_model = AutoModelForCausalLM.from_pretrained(
"facebook/opt-125m",
torchscript=True,
device_map="auto",
device_map="cpu",
)
tokenizer = AutoTokenizer.from_pretrained(
"facebook/opt-125m", trust_remote_code=True)
@@ -531,6 +537,7 @@ def test_static_afp8_export(self, static_kv_dtype):
static_kv_dtype=static_kv_dtype,
export_format="auto_round",
output_dir=output_dir,
device_map="cpu",
)

# quantizer execute
@@ -550,6 +557,7 @@ def test_static_afp8_export(self, static_kv_dtype):
model = transformers.AutoModelForCausalLM.from_pretrained(
output_dir,
torch_dtype="auto",
device_map="cpu",
low_cpu_mem_usage=True,
trust_remote_code=True,
)
@@ -605,6 +613,7 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype):
export_format="auto_round",
output_dir=output_dir,
reloading=False,
device_map="cpu",
)

# quantizer execute
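For reference, a minimal sketch of the CPU-pinned prepare/convert flow these tests exercise. The import path (neural_compressor.torch.quantization) and the tiny calibration loop standing in for run_fn are assumptions for illustration, not taken from this diff; the AutoRoundConfig arguments and device_map="cpu" mirror the test code above.

import torch
import transformers
from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare

# Tiny model and CPU-pinned calibration input, mirroring the test setup.
model = transformers.AutoModelForCausalLM.from_pretrained(
    "hf-internal-testing/tiny-random-GPTJForCausalLM",
    torchscript=True,
).to("cpu")
inp = torch.ones([1, 10], dtype=torch.long, device="cpu")

# device_map="cpu" keeps AutoRound tuning on CPU instead of auto-dispatching to an accelerator.
quant_config = AutoRoundConfig(
    nsamples=32, seqlen=10, iters=10, amp=False,
    scale_dtype="fp32", device_map="cpu",
)

model = prepare(model=model, quant_config=quant_config)
for _ in range(2):  # illustrative calibration pass; the tests use run_fn with a dataloader
    model(inp)
q_model = convert(model=model, quant_config=quant_config)
out = q_model(inp)[0]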