examples/autoround/README.md (1 addition, 1 deletion)
@@ -40,7 +40,7 @@ Load the model using `AutoModelForCausalLM` for handling quantized saving and lo
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```
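
Every hunk in this diff applies the same one-line change: the `torch_dtype` keyword passed to `from_pretrained` becomes `dtype`. A minimal before/after sketch, assuming a transformers release recent enough to accept `dtype` (older releases only recognize `torch_dtype`):

```python
from transformers import AutoModelForCausalLM

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

# Before: `torch_dtype` selects the weight precision ("auto" keeps the
# dtype saved in the checkpoint); newer transformers releases treat this
# name as a deprecated alias.
# model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")

# After: the same behavior under the `dtype` keyword, as used throughout this PR.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
```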


examples/autoround/llama3_example.py (1 addition, 1 deletion)
@@ -7,7 +7,7 @@

# Select model and load it.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Select calibration dataset.

examples/awq/README.md (1 addition, 1 deletion)
@@ -18,7 +18,7 @@ recipe = [
To use your own model, start with an existing example change the `model_id` to match your own model stub.
```python
model_id = "path/to/your/model"
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto")
```

## Adding Mappings ##

examples/awq/llama_example.py (1 addition, 1 deletion)
@@ -8,7 +8,7 @@
# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Select calibration dataset.

examples/awq/qwen3-vl-30b-a3b-Instruct-example.py (1 addition, 1 deletion)
@@ -10,7 +10,7 @@

# Load model.
model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
-MODEL_ID, torch_dtype=torch.bfloat16, device_map=None, trust_remote_code=True
+MODEL_ID, dtype=torch.bfloat16, device_map=None, trust_remote_code=True
)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)


examples/awq/qwen3_coder_moe_example.py (1 addition, 1 deletion)
@@ -51,7 +51,7 @@ def preprocess(example):


if __name__ == "__main__":
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

###

examples/awq/qwen3_moe_example.py (1 addition, 1 deletion)
@@ -8,7 +8,7 @@
# Select model and load it.
MODEL_ID = "Qwen/Qwen3-30B-A3B"

-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Select calibration dataset.

examples/big_models_with_sequential_onloading/README.md (1 addition, 1 deletion)
@@ -18,7 +18,7 @@ The Llama 3.3 70b is larger than 80 GB, surpassing the size of 1 A100. However,

```python
model_id = "meta-llama/Llama-3.3-70B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map=None)
+model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto", device_map=None)
```

The model is first loaded onto the `cpu`, as indicated through the use of `None` for the `device_map` argument in the `from_pretrained` method when loading the model.

(file path not shown)
@@ -10,7 +10,7 @@
model_id = "meta-llama/Llama-3.3-70B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
model_id,
-torch_dtype="auto",
+dtype="auto",
device_map=None,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

examples/compressed_inference/fp8_compressed_inference.py (1 addition, 1 deletion)
@@ -21,7 +21,7 @@

compressed_model = AutoModelForCausalLM.from_pretrained(
MODEL_STUB,
-torch_dtype="auto",
+dtype="auto",
device_map="auto",
)


examples/multimodal_audio/README.md (1 addition, 1 deletion)
@@ -21,7 +21,7 @@ This directory contains example scripts for quantizing a variety of audio langua
To use your own multimodal modal, start with an existing example change the `model_id` to match your own model stub.
```python3
model_id = "path/to/your/model"
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto")
```

## Customizing GPTQModifier Parameters ##

examples/multimodal_audio/whisper_example.py (1 addition, 1 deletion)
@@ -13,7 +13,7 @@
# Select model and load it.
MODEL_ID = "openai/whisper-large-v3"

-model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto")
+model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID, dtype="auto")
model.config.forced_decoder_ids = None
processor = WhisperProcessor.from_pretrained(MODEL_ID)


examples/multimodal_vision/README.md (1 addition, 1 deletion)
@@ -25,7 +25,7 @@ This directory contains example scripts for quantizing a variety of vision-langu
To use your own multimodal modal, start with an existing example change the `model_id` to match your own model stub.
```python3
model_id = "path/to/your/model"
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto")
```

## Customizing GPTQModifier Parameters ##

examples/multimodal_vision/README_internvl3.md (1 addition, 1 deletion)
@@ -5,7 +5,7 @@ This file shows the example of quantizing InternVL3-8B-hf.

```python
model_id = "OpenGVLab/InternVL3-8B-hf"
-model = AutoModelForImageTextToText.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+model = AutoModelForImageTextToText.from_pretrained(model_id, dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained(model_id)
```


examples/multimodal_vision/gemma3_example.py (1 addition, 1 deletion)
@@ -8,7 +8,7 @@

# Load model.
model_id = "google/gemma-3-4b-it"
-model = Gemma3ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
+model = Gemma3ForConditionalGeneration.from_pretrained(model_id, dtype="auto")
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# Oneshot arguments

examples/multimodal_vision/idefics3_example.py (1 addition, 1 deletion)
@@ -10,7 +10,7 @@

# Load model.
model_id = "HuggingFaceM4/Idefics3-8B-Llama3" # or "HuggingFaceTB/SmolVLM-Instruct"
-model = Idefics3ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
+model = Idefics3ForConditionalGeneration.from_pretrained(model_id, dtype="auto")
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# Oneshot arguments

examples/multimodal_vision/internvl3_example.py (1 addition, 1 deletion)
@@ -8,7 +8,7 @@
# Load model.
model_id = "OpenGVLab/InternVL3-8B-hf"
model = AutoModelForImageTextToText.from_pretrained(
-model_id, torch_dtype=torch.bfloat16
+model_id, dtype=torch.bfloat16
)
processor = AutoProcessor.from_pretrained(model_id)


examples/multimodal_vision/llama4_example.py (1 addition, 1 deletion)
@@ -7,7 +7,7 @@

# Select model and load it.
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
-model = Llama4ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
+model = Llama4ForConditionalGeneration.from_pretrained(model_id, dtype="auto")
processor = Llama4Processor.from_pretrained(model_id)
# MoE calibration is now handled automatically by the pipeline.
# The `SequentialLlama4TextMoe` modules (from `llmcompressor.modeling.llama4`)

examples/multimodal_vision/llava_example.py (1 addition, 1 deletion)
@@ -8,7 +8,7 @@

# Load model.
model_id = "llava-hf/llava-1.5-7b-hf"
-model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
+model = LlavaForConditionalGeneration.from_pretrained(model_id, dtype="auto")
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# Oneshot arguments

examples/multimodal_vision/mistral3_example.py (1 addition, 1 deletion)
@@ -16,7 +16,7 @@

# Load model.
model_id = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
-model = Mistral3ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
+model = Mistral3ForConditionalGeneration.from_pretrained(model_id, dtype="auto")
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# Use a custom calibration chat template, rather than the overly-verbose default

examples/multimodal_vision/mllama_example.py (1 addition, 1 deletion)
@@ -8,7 +8,7 @@

# Load model.
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
-model = MllamaForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
+model = MllamaForConditionalGeneration.from_pretrained(model_id, dtype="auto")
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# Oneshot arguments

examples/multimodal_vision/phi3_vision_example.py (1 addition, 1 deletion)
@@ -13,7 +13,7 @@
model_id = "microsoft/Phi-3-vision-128k-instruct"
model = AutoModelForCausalLM.from_pretrained(
model_id,
-torch_dtype="auto",
+dtype="auto",
trust_remote_code=True,
_attn_implementation="eager",
)

examples/multimodal_vision/pixtral_example.py (1 addition, 1 deletion)
@@ -13,7 +13,7 @@

# Load model.
model_id = "mgoin/pixtral-12b"
-model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
+model = LlavaForConditionalGeneration.from_pretrained(model_id, dtype="auto")
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# Oneshot arguments

examples/multimodal_vision/qwen2_vl_example.py (1 addition, 1 deletion)
@@ -12,7 +12,7 @@

# Load model.
model_id = "Qwen/Qwen2-VL-2B-Instruct"
-model = Qwen2VLForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
+model = Qwen2VLForConditionalGeneration.from_pretrained(model_id, dtype="auto")
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# Oneshot arguments

examples/multimodal_vision/qwen_2_5_vl_example.py (1 addition, 1 deletion)
@@ -12,7 +12,7 @@

# Load model.
model_id = "Qwen/Qwen2.5-VL-7B-Instruct"
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_id, dtype="auto")
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# Oneshot arguments

examples/quantization_2of4_sparse_w4a16/README.md (1 addition, 1 deletion)
@@ -62,7 +62,7 @@ from llmcompressor import oneshot, train

# load the model in as bfloat16 to save on memory and compute
model_stub = "neuralmagic/Llama-2-7b-ultrachat200k"
-model = AutoModelForCausalLM.from_pretrained(model_stub, torch_dtype=torch.bfloat16)
+model = AutoModelForCausalLM.from_pretrained(model_stub, dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_stub)

# uses LLM Compressor's built-in preprocessing for ultra chat

(file path not shown)
@@ -15,7 +15,7 @@

# load the model in as bfloat16 to save on memory and compute
model_stub = "neuralmagic/Llama-2-7b-ultrachat200k"
-model = AutoModelForCausalLM.from_pretrained(model_stub, torch_dtype=torch.bfloat16)
+model = AutoModelForCausalLM.from_pretrained(model_stub, dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_stub)

# uses LLM Compressor's built-in preprocessing for ultra chat

examples/quantization_kv_cache/README.md (1 addition, 1 deletion)
@@ -39,7 -39,7 @@ Load the model using `AutoModelForCausalLM`:
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```


examples/quantization_kv_cache/gemma2_fp8_kv_example.py (1 addition, 1 deletion)
@@ -6,7 +6,7 @@

# Select model and load it.
MODEL_ID = "google/gemma-2-9b-it"
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.

examples/quantization_kv_cache/llama3_fp8_kv_example.py (1 addition, 1 deletion)
@@ -7,7 -7,7 @@

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.

examples/quantization_kv_cache/phi3.5_fp8_kv_example.py (1 addition, 1 deletion)
@@ -8,7 +8,7 @@
# Phi-3.5 is a special case for KV cache quantization because it has
# fused QKV linear layers.
MODEL_ID = "microsoft/Phi-3.5-mini-instruct"
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.

(file path not shown)
@@ -6,7 +6,7 @@
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load model.
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Configure the quantization algorithm and scheme.

(file path not shown)
@@ -7,7 +7,7 @@
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load model.
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)



(file path not shown)
@@ -22,7 +22,7 @@ def parse_args():

# Select model and load it.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Select calibration dataset.

(file path not shown)
@@ -7,7 +7,7 @@
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load model.
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)



examples/quantization_w4a16/README.md (1 addition, 1 deletion)
@@ -40,7 +40,7 @@ Load the model using `AutoModelForCausalLM` for handling quantized saving and lo
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```


examples/quantization_w4a16/llama3_example.py (1 addition, 1 deletion)
@@ -7,7 +7,7 @@

# Select model and load it.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Select calibration dataset.

examples/quantization_w4a16_fp4/llama3_example.py (1 addition, 1 deletion)
@@ -7,7 +7,7 @@
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

# Load model.
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Configure the quantization algorithm and scheme.

examples/quantization_w4a16_fp4/qwen3_example.py (1 addition, 1 deletion)
@@ -7,7 +7,7 @@
# Load model.
MODEL_ID = "Qwen/Qwen3-32B"
model = AutoModelForCausalLM.from_pretrained(
-MODEL_ID, torch_dtype="auto", trust_remote_code=True
+MODEL_ID, dtype="auto", trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)


examples/quantization_w4a4_fp4/README.md (1 addition, 1 deletion)
@@ -39,7 +39,7 @@ Load the model using `AutoModelForCausalLM` for handling quantized saving and lo
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```


examples/quantization_w4a4_fp4/llama3_example.py (1 addition, 1 deletion)
@@ -8,7 +8,7 @@
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load model.
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)



examples/quantization_w4a4_fp4/llama4_example.py (1 addition, 1 deletion)
@@ -7,7 +7,7 @@

# Select model and load it.
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
-model = Llama4ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
+model = Llama4ForConditionalGeneration.from_pretrained(model_id, dtype="auto")
processor = Llama4Processor.from_pretrained(model_id)
# MoE calibration is now handled automatically by the pipeline.
# The `SequentialLlama4TextMoe` modules (from `llmcompressor.modeling.llama4`)

examples/quantization_w4a4_fp4/qwen3_next_example.py (1 addition, 1 deletion)
@@ -10,7 +10,7 @@
MODEL_ID = "Qwen/Qwen3-Next-80B-A3B-Instruct"

# Load model.
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)



examples/quantization_w4a4_fp4/qwen3_vl_moe_w4a4_fp4.py (1 addition, 1 deletion)
@@ -10,7 +10,7 @@


# Load model.
-model = Qwen3VLMoeForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto")
+model = Qwen3VLMoeForConditionalGeneration.from_pretrained(MODEL_ID, dtype="auto")
processor = AutoProcessor.from_pretrained(MODEL_ID)

DATASET_ID = "neuralmagic/calibration"