Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
adf6175
Test Time Reduction
abukhoy Feb 25, 2026
fdba210
Test Time Improvement
abukhoy Feb 26, 2026
1928f5c
Test Time Improvement I
abukhoy Feb 26, 2026
bb65682
vlm models tests
abukhoy Mar 3, 2026
48a3d19
adding VLM dummy tests
abukhoy Mar 4, 2026
215eded
CI Issue
abukhoy Mar 4, 2026
2f9d163
fixing tests
abukhoy Mar 4, 2026
443dce6
qwen2.5 VL dummy config
abukhoy Mar 17, 2026
0ec05af
Merge branch 'main' into ci-time-reduction
abukhoy Mar 23, 2026
04e87a7
Updated disagg mode
quic-rishinr Mar 23, 2026
f783fda
added ignore unit test
quic-rishinr Mar 23, 2026
4a7a439
removed quick test
quic-rishinr Mar 23, 2026
3b358f5
Merge branch 'main' into ci-time-reduction
abukhoy Mar 24, 2026
a81bad6
Updated whl file name in FT CI
Mar 24, 2026
bb38866
Merge branch 'main' into ci-time-reduction
quic-rishinr Mar 25, 2026
30b290d
Merge branch 'main' into ci-time-reduction
quic-rishinr Mar 25, 2026
3b617c3
Merge branch 'main' into ci-time-reduction
quic-rishinr Mar 26, 2026
6a0cc7a
creating 3 way execution dummy_layers, few_layers, full_layers
abukhoy Mar 26, 2026
0f9e869
Merge remote-tracking branch 'origin/ci-time-reduction' into ci-time-…
abukhoy Mar 26, 2026
d2a2fe1
spd and subfunction testing for full, few, and dummy layers model
abukhoy Mar 30, 2026
725edc3
model tests restructured
abukhoy Mar 31, 2026
c7b02e1
tests configuration
abukhoy Apr 2, 2026
1e51530
tests configuration
abukhoy Apr 3, 2026
68ec928
JenkinsFile Aligning
abukhoy Apr 3, 2026
dc4e38d
JenkinsFile Aligning
abukhoy Apr 3, 2026
b9e1c93
JenkinsFile Aligning
abukhoy Apr 3, 2026
75b6e29
JenkinsFile Aligning
abukhoy Apr 3, 2026
1a7cc0b
JenkinsFile
abukhoy Apr 3, 2026
4364517
JenkinsFile
abukhoy Apr 3, 2026
1265d79
JenkinsFile Aligning
abukhoy Apr 6, 2026
762cdc5
Jenkins Fixing
abukhoy Apr 7, 2026
96cb8a6
adding teardown function in pytest
abukhoy Apr 8, 2026
ee3a41f
resolving issues
abukhoy Apr 9, 2026
d1c4ded
CI fixing
abukhoy Apr 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
240 changes: 240 additions & 0 deletions QEfficient/utils/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,216 @@
#
# -----------------------------------------------------------------------------

import copy
from typing import Dict, Optional

import torch
import torch.nn as nn
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForImageTextToText, AutoTokenizer

from QEfficient import QEFFAutoModelForCausalLM, QEFFAutoModelForImageTextToText


def get_qeff_model(
    model_name: str,
    num_hidden_layers: int = -1,
    continuous_batching: bool = False,
    qaic_config: Optional[Dict] = None,
    config: Optional[AutoConfig] = None,
):
    """
    Load a causal-LM as a QEfficient model.

    Args:
        model_name (str): Hub id or local path of the model.
        num_hidden_layers (int): Number of hidden layers to keep; values <= 0
            keep the checkpoint's full depth (only used when ``config`` is None).
        continuous_batching (bool): Whether to enable continuous batching.
        qaic_config (Optional[Dict]): QAIC configuration forwarded to the wrapper.
        config (Optional[AutoConfig]): When given, the model is instantiated from
            this config (random weights) instead of downloading pretrained weights.

    Returns:
        QEFFAutoModelForCausalLM: The wrapped model.
    """
    kwargs = dict(continuous_batching=continuous_batching, qaic_config=qaic_config)
    if config is None:
        # Only forward num_hidden_layers when the caller asked for truncation.
        if num_hidden_layers > 0:
            kwargs["num_hidden_layers"] = num_hidden_layers
        qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, **kwargs)
    else:
        model_hf = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
        # Half-precision configs are upcast to fp32 before wrapping.
        torch_dtype = getattr(model_hf.config, "torch_dtype", None)
        if torch_dtype in (torch.bfloat16, torch.float16):
            model_hf = model_hf.to(torch.float32)
        qeff_model = QEFFAutoModelForCausalLM(model_hf, **kwargs)

    return qeff_model


def load_vlm_qeff_model(
    model_name,
    num_hidden_layers=-1,
    kv_offload=False,
    model_hf=None,
    continuous_batching=False,
    enable_qnn=None,
    qnn_config=None,
):
    """
    Wrap a Hugging Face vision-language model in its QEfficient counterpart.

    When ``num_hidden_layers != -1`` the (already truncated) config carried by
    ``model_hf`` is reloaded via ``from_pretrained``; otherwise ``model_hf``
    itself is deep-copied and wrapped directly.

    NOTE(review): ``model_hf`` is dereferenced on every path, so the default of
    ``None`` will raise — confirm all callers pass a loaded model.
    NOTE(review): ``enable_qnn`` and ``qnn_config`` are accepted but unused in
    this function body.
    """
    if num_hidden_layers != -1:
        try:
            qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
                model_name,
                low_cpu_mem_usage=False,
                config=model_hf.config,
                kv_offload=kv_offload,
                continuous_batching=continuous_batching,
            )
        except ValueError:
            # Architectures not registered for image-text-to-text fall back to
            # the causal-LM auto class.
            qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
                model_name,
                low_cpu_mem_usage=False,
                config=model_hf.config,
                kv_offload=kv_offload,
                continuous_batching=continuous_batching,
            )
    else:
        # Deep-copy so the in-place QEfficient transform does not mutate the
        # caller's Hugging Face model.
        qeff_model = QEFFAutoModelForImageTextToText(
            copy.deepcopy(model_hf),
            kv_offload=kv_offload,
            continuous_batching=continuous_batching,
        )

    return qeff_model


def load_vlm_hf_config(model_name, num_hidden_layers=-1, additional_params=None):
    """
    Load a Hugging Face config for a vision-language model.

    Args:
        model_name: Hub id or local path of the model.
        num_hidden_layers: Number of layers to keep; -1 keeps the full depth.
        additional_params: Extra keyword arguments forwarded to
            ``AutoConfig.from_pretrained`` (defaults to none).

    Returns:
        The (possibly layer-truncated) config instance.
    """
    # Avoid the shared-mutable-default pitfall: build a fresh dict per call.
    additional_params = {} if additional_params is None else additional_params
    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True, **additional_params)
    if num_hidden_layers != -1:
        config = set_num_layers_vlm(config, num_hidden_layers)
    return config


def load_vlm_hf_model(model_name, num_hidden_layers=-1, config=None):
    """
    Load the Hugging Face (reference) vision-language model in eval mode.

    With ``config=None`` a config is derived from ``model_name`` (optionally
    layer-truncated) and pretrained weights are downloaded; with an explicit
    ``config`` the model is built from it (random weights) via ``from_config``.
    Architectures not registered for image-text-to-text fall back to the
    causal-LM auto class in both paths.
    """
    if config is None:
        config = load_vlm_hf_config(model_name, num_hidden_layers=num_hidden_layers)
        try:
            model_hf = AutoModelForImageTextToText.from_pretrained(
                config._name_or_path,
                low_cpu_mem_usage=False,
                config=config,
            )
        except ValueError:
            model_hf = AutoModelForCausalLM.from_pretrained(
                config._name_or_path,
                low_cpu_mem_usage=False,
                trust_remote_code=True,
                config=config,
            )
    else:
        try:
            model_hf = AutoModelForImageTextToText.from_config(
                config,
                attn_implementation="eager",
                trust_remote_code=True,
            )
        except ValueError:
            model_hf = AutoModelForCausalLM.from_config(
                config,
                attn_implementation="eager",
                trust_remote_code=True,
            )
    # Upcast half-precision configs to fp32 before returning the reference model.
    torch_dtype = getattr(model_hf.config, "torch_dtype", None)
    if torch_dtype == torch.bfloat16 or torch_dtype == torch.float16:
        model_hf = model_hf.to(torch.float32)

    model_hf.eval()
    return model_hf


def set_num_layers_vlm(config, n_layer=-1):
    """
    Truncate a vision-language model config to ``n_layer`` layers.

    ``n_layer == -1`` means "use every layer" and leaves the config untouched.
    The config is modified in place and also returned.
    """
    if n_layer == -1:
        # -1 indicates use all the layers of the model.
        return config

    if "mllama" in getattr(config, "model_type", ""):
        text_cfg = config.text_config
        text_cfg.num_hidden_layers = n_layer
        # Drop cross-attention layer indices that no longer exist after truncation.
        text_cfg.cross_attention_layers = [idx for idx in text_cfg.cross_attention_layers if idx < n_layer]
    elif hasattr(config, "text_config") or hasattr(config, "llm_config"):
        # Truncate both the language tower and the vision tower.
        language_cfg = config.text_config if hasattr(config, "text_config") else config.llm_config
        language_cfg.num_hidden_layers = n_layer
        config.vision_config.num_hidden_layers = n_layer
        if hasattr(config.vision_config, "depth"):
            config.vision_config.depth = n_layer
    else:
        # Plain (text-only style) config.
        config.num_hidden_layers = n_layer
    return config


def get_qeff_model_with_sampler(
    model_name: str,
    is_vlm: bool,
    continuous_batching: bool,
    num_hidden_layers: Optional[int] = -1,
    config: Optional[AutoConfig] = None,
    qaic_config: Optional[dict] = None,
):
    """
    Get a QEfficient model with the sampler transform.

    Args:
        model_name (str): The name of the model to test.
        is_vlm (bool): Whether the model is a vision-language model.
        continuous_batching (bool): Whether to use continuous batching.
        num_hidden_layers (Optional[int]): The number of hidden layers to use.
        config (Optional[AutoConfig]): The configuration to use.
        qaic_config (Optional[dict]): The QAIC configuration to use.

    Returns:
        tuple: ``(qeff_model, processor)``; ``processor`` is ``None`` for
        text-only models.
    """
    processor = None
    if is_vlm:
        # For Intern models only
        additional_configs = {}
        if config is None:
            config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
            config = set_num_layers_vlm(config, num_hidden_layers)
        # The HF model is loaded here only to build the custom InternProcessor;
        # the QEfficient model below is loaded separately via from_pretrained.
        model_hf = AutoModelForCausalLM.from_pretrained(
            model_name,
            config=config,
            trust_remote_code=True,
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)
        processor = InternProcessor(model_hf, tokenizer)
        additional_configs["config"] = config
        additional_configs["kv_offload"] = True
        additional_configs["trust_remote_code"] = True
        qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
            model_name,
            continuous_batching=continuous_batching,
            qaic_config=qaic_config,
            **additional_configs,
        )
    else:
        if config is not None:
            # Build from an explicit config (random weights).
            model_hf = AutoModelForCausalLM.from_config(
                config,
                attn_implementation="eager",
            )
        elif num_hidden_layers != -1:
            # Pretrained weights with a truncated layer stack.
            model_hf = AutoModelForCausalLM.from_pretrained(
                model_name,
                num_hidden_layers=num_hidden_layers,
                attn_implementation="eager",
                low_cpu_mem_usage=False,
            )
        else:
            model_hf = AutoModelForCausalLM.from_pretrained(
                model_name,
                attn_implementation="eager",
                low_cpu_mem_usage=False,
            )
        # Upcast half-precision configs to fp32 before wrapping.
        torch_dtype = getattr(model_hf.config, "torch_dtype", None)
        if torch_dtype == torch.bfloat16 or torch_dtype == torch.float16:
            model_hf = model_hf.to(torch.float32)
        qeff_model = QEFFAutoModelForCausalLM(
            model_hf,
            continuous_batching=continuous_batching,
            qaic_config=qaic_config,
        )

    return qeff_model, processor


# Processor class for InternVL models
Expand Down Expand Up @@ -169,6 +375,36 @@ class ModelConfig:
"TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
}

# Vision-language models exercised via the standard (AutoProcessor) flow.
STANDARD_VLM_MODELS = {
    "llava-hf/llava-1.5-7b-hf",
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "google/gemma-3-4b-it",
    "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
    "Qwen/Qwen2.5-VL-3B-Instruct",
    "meta-llama/Llama-3.2-11B-Vision-Instruct",
}

# InternVL models; these use the custom InternProcessor defined in this file.
INTERNVL_MODELS = {
    "OpenGVLab/InternVL2_5-1B",
    "OpenGVLab/InternVL3_5-1B",
}

# Molmo models, handled separately from the standard VLM set.
MOLMO_MODELS = {
    "allenai/Molmo-7B-D-0924",
}

# Models currently excluded from the VLM test runs
# (reason not visible here — see the test harness).
SKIPPED_MODELS = {
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "allenai/Molmo-7B-D-0924",
    "meta-llama/Llama-3.2-11B-Vision-Instruct",
}

# Models run in dual-QPC mode — presumably compiled with kv_offload so vision
# and language parts produce separate QPCs; confirm against the test harness.
DUAL_QPC_MODELS = {
    "OpenGVLab/InternVL2_5-1B",
    "OpenGVLab/InternVL3_5-1B",
    "Qwen/Qwen2.5-VL-3B-Instruct",
}

EXTERNAL_MODELS = {
"hpcai-tech/grok-1": {
"pytorch_hf_tokens_custom_case": [
Expand Down Expand Up @@ -229,3 +465,7 @@ class ModelConfig:
# SwiftKV model variants under test.
SWIFTKV_MODELS = {
    "Snowflake/Llama-3.1-SwiftKV-8B-Instruct",
}

# Models excluded from the full-layer (untruncated) test variants.
FULL_MODEL_TESTS_TO_SKIP = {
    "hpcai-tech/grok-1",
}
Loading