69 changes: 39 additions & 30 deletions test/3x/torch/quantization/test_autoround.py
@@ -88,8 +88,8 @@ def setup_class(self):
self.gptj = transformers.AutoModelForCausalLM.from_pretrained(
"hf-internal-testing/tiny-random-GPTJForCausalLM",
torchscript=True,
- )
- self.inp = torch.ones([1, 10], dtype=torch.long)
+ ).to("cpu")
+ self.inp = torch.ones([1, 10], dtype=torch.long, device="cpu")
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
"hf-internal-testing/tiny-random-GPTJForCausalLM", trust_remote_code=True
)
@@ -110,18 +110,18 @@ def test_autoround(self, quant_lm_head):
# AutoRound does not yet support the actual use of quant_lm_head
# https://github.com/intel/auto-round/blob/7b8e280f5b789fe861fe95eac971de0805ce4c62/auto_round/compressors/base.py#L438-L442
fp32_model = copy.deepcopy(self.gptj)
- quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, amp=False ,scale_dtype="fp32")
+ quant_config = AutoRoundConfig( nsamples=32, seqlen=10, iters=10, amp=False, scale_dtype="fp32", device_map="cpu")
if quant_lm_head is True:
layer_config = {"lm_head": {"data_type": "int"}}
quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, amp=False ,scale_dtype="fp32",
- quant_lm_head=quant_lm_head, layer_config=layer_config)
+ quant_lm_head=quant_lm_head, layer_config=layer_config, device_map="cpu")
logger.info(f"Test AutoRound with config {quant_config}")

# prepare + convert API
model = prepare(model=fp32_model, quant_config=quant_config)

run_fn(model, self.dataloader)
- q_model = convert(model)
+ q_model = convert(model=model, quant_config=quant_config)
out = q_model(self.inp)[0]
assert torch.allclose(out, self.label, atol=1e-1)
assert "transformer.h.0.attn.k_proj" in q_model.autoround_config.keys()
@@ -133,7 +133,7 @@ def test_autoround(self, quant_lm_head):

def test_int4_dtype(self):
fp32_model = copy.deepcopy(self.gptj)
- quant_config = AutoRoundConfig(dtype="int4", nsamples=32, seqlen=10, iters=10, amp=False ,scale_dtype="fp32")
+ quant_config = AutoRoundConfig( dtype="int4", nsamples=32, seqlen=10, iters=10, amp=False, scale_dtype="fp32", device_map="cpu")
logger.info(f"Test AutoRound with config {quant_config}")

# prepare + convert API
@@ -152,7 +152,7 @@ def test_autoround_with_quantize_API(self):
gpt_j_model = copy.deepcopy(self.gptj)

quant_config = AutoRoundConfig(dtype="int", bits=4, act_dtype="int", act_bits=32,nsamples=32, seqlen=10,
- iters=10, use_sym=False, group_size=128, amp=False ,scale_dtype="fp32")
+ iters=10, use_sym=False, group_size=128, amp=False, scale_dtype="fp32", device_map="cpu")
quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32"))

logger.info(f"Test AutoRound with config {quant_config}")
@@ -169,11 +169,11 @@ def test_autoround_with_quantize_API(self):
assert isinstance(q_model.transformer.h[0].attn.k_proj, WeightOnlyLinear), "packing model failed."

def test_conv1d(self):
- model = AutoModelForCausalLM.from_pretrained("MBZUAI/LaMini-GPT-124M", device_map="auto", trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained("MBZUAI/LaMini-GPT-124M", device_map="cpu", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-GPT-124M", trust_remote_code=True)
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors="pt")
- quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=0,tokenizer=tokenizer,export_format="auto_round")
+ quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=0, tokenizer=tokenizer, export_format="auto_round", device_map="cpu")
model = prepare(model=model, quant_config=quant_config)
q_model = convert(model)
output = tokenizer.decode(q_model.generate(**encoded_input, max_new_tokens=10)[0])
@@ -191,10 +191,10 @@ def test_utils(self):
fp32_model = copy.deepcopy(self.gptj)
to_quant_block_names = get_multimodal_block_names(fp32_model, quant_vision=True)
quant_config = AutoRoundConfig(
- nsamples=32, seqlen=10, iters=10, amp=False ,scale_dtype="fp16", to_quant_block_names=to_quant_block_names
+ nsamples=32, seqlen=10, iters=10, amp=False ,scale_dtype="fp16", to_quant_block_names=to_quant_block_names, device_map="cpu",
)
logger.info(f"Test AutoRound with config {quant_config}")
- device = detect_device("auto")
+ device = "cpu"
layers_list = get_layer_names_in_block(fp32_model, to_quant_block_names=to_quant_block_names)
layers_list = get_layer_names_in_block(fp32_model)
fp32_model.to(device)
@@ -216,7 +216,7 @@ def test_mllm(self):
model_name = "Qwen/Qwen2-VL-2B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
- model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True, device_map="auto")
+ model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True, device_map="cpu")
dataloader, template, truncation, batch_size, gradient_accumulate_steps, seqlen, nsamples = get_mllm_dataloader(
template=None,
model=model,
@@ -245,6 +245,7 @@ def test_mllm(self):
quant_nontext_module=True,
truncation=truncation,
gradient_accumulate_steps=gradient_accumulate_steps,
device_map="cpu",
)

model = prepare(model=model, quant_config=quant_config)
@@ -271,15 +272,15 @@ def test_set_local(self):
fp32_model = AutoModelForCausalLM.from_pretrained(
"facebook/opt-125m",
torchscript=True,
device_map="auto",
device_map="cpu",
)
- inp = torch.ones([1, 10], dtype=torch.long)
+ inp = torch.ones([1, 10], dtype=torch.long, device='cpu')
output_dir = "./saved_inc"
tokenizer = AutoTokenizer.from_pretrained(
"facebook/opt-125m", trust_remote_code=True)
quant_config = AutoRoundConfig(
tokenizer=tokenizer, output_dir=output_dir,
dtype="int4", nsamples=32, seqlen=10, iters=0, amp=False ,scale_dtype="fp32", export_format="auto_round")
dtype="int4", nsamples=32, seqlen=10, iters=0, amp=False ,scale_dtype="fp32", export_format="auto_round", device_map="cpu")
logger.info(f"Test AutoRound with config {quant_config}")
quant_config.set_local("self.attn", AutoRoundConfig(dtype="fp16"))
# {"self_attn": {"bits": 4, "data_type": "nv_fp", "act_bits": 16, "group_size": 16}}
@@ -290,7 +291,7 @@ def test_set_local(self):
model = AutoModelForCausalLM.from_pretrained(
output_dir,
torch_dtype="auto",
device_map="auto",
device_map="cpu",
)
out = model(self.inp)[0]
assert isinstance(q_model.model.decoder.layers[0].self_attn.v_proj, torch.nn.Linear), "set_local failed."
@@ -299,22 +300,22 @@ def test_set_local(self):
fp32_model = transformers.AutoModelForCausalLM.from_pretrained(
"facebook/opt-125m",
torchscript=True,
device_map="auto",
device_map="cpu",
)
- inp = torch.ones([1, 10], dtype=torch.long)
+ inp = torch.ones([1, 10], dtype=torch.long, device='cpu')
tokenizer = transformers.AutoTokenizer.from_pretrained(
"facebook/opt-125m", trust_remote_code=True)
from auto_round import AutoRound
layer_config = {"self.attn":{"data_type":"fp16"}}
ar = AutoRound(
tokenizer=tokenizer, model=fp32_model, layer_config=layer_config,
data_type="int4", nsamples=32, seqlen=10, iters=0, amp=False ,scale_dtype="fp32", export_format="auto_round")
data_type="int4", nsamples=32, seqlen=10, iters=0, amp=False ,scale_dtype="fp32", export_format="auto_round", device_map="cpu")
quantized_model_path = "./saved_ar"
ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")
model = AutoModelForCausalLM.from_pretrained(
quantized_model_path,
torch_dtype="auto",
device_map="auto",
device_map="cpu",
)
out_ar = model(inp)[0]
assert torch.all(out_ar.eq(out))
@@ -328,9 +329,9 @@ def test_scheme(self, scheme):
fp32_model = AutoModelForCausalLM.from_pretrained(
"facebook/opt-125m",
torchscript=True,
device_map="auto",
device_map="cpu",
)
- inp = torch.ones([1, 10], dtype=torch.long)
+ inp = torch.ones([1, 10], dtype=torch.long, device='cpu')
tokenizer = AutoTokenizer.from_pretrained(
"facebook/opt-125m", trust_remote_code=True)

@@ -345,6 +346,7 @@ def test_scheme(self, scheme):
scheme=scheme,
export_format="auto_round",
output_dir=output_dir, # default is "temp_auto_round"
device_map="cpu",
)

# quantizer execute
@@ -355,17 +357,17 @@ def test_scheme(self, scheme):
inc_model = AutoModelForCausalLM.from_pretrained(
output_dir,
torch_dtype="auto",
device_map="auto",
device_map="cpu",
)
out = inc_model(inp)[0]

# AutoRound API
fp32_model = transformers.AutoModelForCausalLM.from_pretrained(
"facebook/opt-125m",
torchscript=True,
device_map="auto",
device_map="cpu",
)
- inp = torch.ones([1, 10], dtype=torch.long)
+ inp = torch.ones([1, 10], dtype=torch.long, device='cpu')
tokenizer = transformers.AutoTokenizer.from_pretrained(
"facebook/opt-125m", trust_remote_code=True)
from auto_round import AutoRound
@@ -378,13 +380,14 @@ def test_scheme(self, scheme):
amp=False,
scale_dtype="fp16",
scheme=scheme,
device_map="cpu",
)
quantized_model_path = "./saved_ar"
ar.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")
model = AutoModelForCausalLM.from_pretrained(
quantized_model_path,
torch_dtype="auto",
device_map="auto",
device_map="cpu",
)
tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
out_ar = model(inp)[0]
@@ -399,7 +402,7 @@ def test_target_bits(self):
fp32_model = AutoModelForCausalLM.from_pretrained(
"facebook/opt-125m",
torchscript=True,
device_map="auto",
device_map="cpu",
)
tokenizer = AutoTokenizer.from_pretrained(
"facebook/opt-125m", trust_remote_code=True)
@@ -415,6 +418,7 @@ def test_target_bits(self):
enable_torch_compile=True,
low_gpu_mem_usage=True,
export_format="auto_round",
device_map="cpu",
)
# quantizer execute
model = prepare(model=fp32_model, quant_config=quant_config)
@@ -441,7 +445,7 @@ def eval_acc_fn(model) -> float:
fp32_model = AutoModelForCausalLM.from_pretrained(
"facebook/opt-125m",
torchscript=True,
device_map="auto",
device_map="cpu",
)
tokenizer = AutoTokenizer.from_pretrained(
"facebook/opt-125m", trust_remote_code=True)
@@ -456,6 +460,7 @@ def eval_acc_fn(model) -> float:
low_gpu_mem_usage=True,
export_format="auto_round",
iters=0,
device_map="cpu",
)
]
)
@@ -470,7 +475,7 @@ def test_static_attention_dtype(self):
fp32_model = AutoModelForCausalLM.from_pretrained(
"facebook/opt-125m",
torchscript=True,
device_map="auto",
device_map="cpu",
)
tokenizer = AutoTokenizer.from_pretrained(
"facebook/opt-125m", trust_remote_code=True)
@@ -485,6 +490,7 @@ def test_static_attention_dtype(self):
static_attention_dtype="fp8",
output_dir=output_dir,
export_format="auto_round",
device_map="cpu",
)
# quantizer execute
model = prepare(model=fp32_model, quant_config=quant_config)
@@ -510,7 +516,7 @@ def test_static_afp8_export(self, static_kv_dtype):
fp32_model = AutoModelForCausalLM.from_pretrained(
"facebook/opt-125m",
torchscript=True,
device_map="auto",
device_map="cpu",
)
tokenizer = AutoTokenizer.from_pretrained(
"facebook/opt-125m", trust_remote_code=True)
@@ -531,6 +537,7 @@ def test_static_afp8_export(self, static_kv_dtype):
static_kv_dtype=static_kv_dtype,
export_format="auto_round",
output_dir=output_dir,
device_map="cpu",
)

# quantizer execute
@@ -550,6 +557,7 @@ def test_static_afp8_export(self, static_kv_dtype):
model = transformers.AutoModelForCausalLM.from_pretrained(
output_dir,
torch_dtype="auto",
device_map="cpu",
low_cpu_mem_usage=True,
trust_remote_code=True,
)
@@ -605,6 +613,7 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype):
export_format="auto_round",
output_dir=output_dir,
reloading=False,
device_map="cpu",
)

# quantizer execute
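For reference, a minimal sketch of the CPU-pinned prepare/convert flow these tests exercise. The import path (neural_compressor.torch.quantization) and the tiny calibration loop standing in for run_fn are assumptions for illustration, not taken from this diff; the AutoRoundConfig arguments and device_map="cpu" mirror the test code above.

import torch
import transformers
from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare

# Tiny model and CPU-pinned calibration input, mirroring the test setup.
model = transformers.AutoModelForCausalLM.from_pretrained(
    "hf-internal-testing/tiny-random-GPTJForCausalLM",
    torchscript=True,
).to("cpu")
inp = torch.ones([1, 10], dtype=torch.long, device="cpu")

# device_map="cpu" keeps AutoRound tuning on CPU instead of auto-dispatching to an accelerator.
quant_config = AutoRoundConfig(
    nsamples=32, seqlen=10, iters=10, amp=False,
    scale_dtype="fp32", device_map="cpu",
)

model = prepare(model=model, quant_config=quant_config)
for _ in range(2):  # illustrative calibration pass; the tests use run_fn with a dataloader
    model(inp)
q_model = convert(model=model, quant_config=quant_config)
out = q_model(inp)[0]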