Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
adf6175
Test Time Reduction
abukhoy Feb 25, 2026
fdba210
Test Time Improvement
abukhoy Feb 26, 2026
1928f5c
Test Time Improvement I
abukhoy Feb 26, 2026
bb65682
vlm models tests
abukhoy Mar 3, 2026
48a3d19
adding VLM dummy tests
abukhoy Mar 4, 2026
215eded
CI Issue
abukhoy Mar 4, 2026
2f9d163
fixing tests
abukhoy Mar 4, 2026
443dce6
qwen2.5 VL dummy config
abukhoy Mar 17, 2026
0ec05af
Merge branch 'main' into ci-time-reduction
abukhoy Mar 23, 2026
04e87a7
Updated disagg mode
quic-rishinr Mar 23, 2026
f783fda
added ignore unit test
quic-rishinr Mar 23, 2026
4a7a439
removed quick test
quic-rishinr Mar 23, 2026
3b358f5
Merge branch 'main' into ci-time-reduction
abukhoy Mar 24, 2026
a81bad6
Updated whl file name in FT CI
Mar 24, 2026
bb38866
Merge branch 'main' into ci-time-reduction
quic-rishinr Mar 25, 2026
30b290d
Merge branch 'main' into ci-time-reduction
quic-rishinr Mar 25, 2026
3b617c3
Merge branch 'main' into ci-time-reduction
quic-rishinr Mar 26, 2026
6a0cc7a
creating 3 way execution dummy_layers, few_layers, full_layers
abukhoy Mar 26, 2026
0f9e869
Merge remote-tracking branch 'origin/ci-time-reduction' into ci-time-…
abukhoy Mar 26, 2026
d2a2fe1
spd and subfunction testing for full, few, and dummy layers model
abukhoy Mar 30, 2026
725edc3
model tests restructured
abukhoy Mar 31, 2026
c7b02e1
tests configuration
abukhoy Apr 2, 2026
1e51530
tests configuration
abukhoy Apr 3, 2026
68ec928
JenkinsFile Aligning
abukhoy Apr 3, 2026
dc4e38d
JenkinsFile Aligning
abukhoy Apr 3, 2026
b9e1c93
JenkinsFile Aligning
abukhoy Apr 3, 2026
75b6e29
JenkinsFile Aligning
abukhoy Apr 3, 2026
1a7cc0b
JenkinsFile
abukhoy Apr 3, 2026
4364517
JenkinsFile
abukhoy Apr 3, 2026
1265d79
JenkinsFile Aligning
abukhoy Apr 6, 2026
762cdc5
Jenkins Fixing
abukhoy Apr 7, 2026
96cb8a6
adding teardown function in pytest
abukhoy Apr 8, 2026
ee3a41f
resolving issues
abukhoy Apr 9, 2026
d1c4ded
CI fixing
abukhoy Apr 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
240 changes: 240 additions & 0 deletions QEfficient/utils/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,216 @@
#
# -----------------------------------------------------------------------------

import copy
from typing import Dict, Optional

import torch
import torch.nn as nn
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForImageTextToText, AutoTokenizer

from QEfficient import QEFFAutoModelForCausalLM, QEFFAutoModelForImageTextToText


def get_qeff_model(
    model_name: str,
    num_hidden_layers: int = -1,
    continuous_batching: bool = False,
    qaic_config: Optional[Dict] = None,
    config: Optional[AutoConfig] = None,
):
    """
    Load a causal-LM as a QEfficient model.

    Args:
        model_name (str): Hub id or local path of the model.
        num_hidden_layers (int): Number of hidden layers to keep; values <= 0
            keep the checkpoint's full depth (only used when ``config`` is None).
        continuous_batching (bool): Whether to enable continuous batching.
        qaic_config (Optional[Dict]): QAIC configuration forwarded to the wrapper.
        config (Optional[AutoConfig]): When given, the model is instantiated from
            this config (random weights) instead of downloading pretrained weights.

    Returns:
        QEFFAutoModelForCausalLM: The wrapped model.
    """
    kwargs = dict(continuous_batching=continuous_batching, qaic_config=qaic_config)
    if config is None:
        # Only forward num_hidden_layers when the caller asked for truncation.
        if num_hidden_layers > 0:
            kwargs["num_hidden_layers"] = num_hidden_layers
        qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, **kwargs)
    else:
        model_hf = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
        # Half-precision configs are upcast to fp32 before wrapping.
        torch_dtype = getattr(model_hf.config, "torch_dtype", None)
        if torch_dtype in (torch.bfloat16, torch.float16):
            model_hf = model_hf.to(torch.float32)
        qeff_model = QEFFAutoModelForCausalLM(model_hf, **kwargs)

    return qeff_model


def load_vlm_qeff_model(
    model_name,
    num_hidden_layers=-1,
    kv_offload=False,
    model_hf=None,
    continuous_batching=False,
    enable_qnn=None,
    qnn_config=None,
):
    """
    Wrap a Hugging Face vision-language model in its QEfficient counterpart.

    When ``num_hidden_layers != -1`` the (already truncated) config carried by
    ``model_hf`` is reloaded via ``from_pretrained``; otherwise ``model_hf``
    itself is deep-copied and wrapped directly.

    NOTE(review): ``model_hf`` is dereferenced on every path, so the default of
    ``None`` will raise — confirm all callers pass a loaded model.
    NOTE(review): ``enable_qnn`` and ``qnn_config`` are accepted but unused in
    this function body.
    """
    if num_hidden_layers != -1:
        try:
            qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
                model_name,
                low_cpu_mem_usage=False,
                config=model_hf.config,
                kv_offload=kv_offload,
                continuous_batching=continuous_batching,
            )
        except ValueError:
            # Architectures not registered for image-text-to-text fall back to
            # the causal-LM auto class.
            qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
                model_name,
                low_cpu_mem_usage=False,
                config=model_hf.config,
                kv_offload=kv_offload,
                continuous_batching=continuous_batching,
            )
    else:
        # Deep-copy so the in-place QEfficient transform does not mutate the
        # caller's Hugging Face model.
        qeff_model = QEFFAutoModelForImageTextToText(
            copy.deepcopy(model_hf),
            kv_offload=kv_offload,
            continuous_batching=continuous_batching,
        )

    return qeff_model


def load_vlm_hf_config(model_name, num_hidden_layers=-1, additional_params=None):
    """
    Load a Hugging Face config for a vision-language model.

    Args:
        model_name: Hub id or local path of the model.
        num_hidden_layers: Number of layers to keep; -1 keeps the full depth.
        additional_params: Extra keyword arguments forwarded to
            ``AutoConfig.from_pretrained`` (defaults to none).

    Returns:
        The (possibly layer-truncated) config instance.
    """
    # Avoid the shared-mutable-default pitfall: build a fresh dict per call.
    additional_params = {} if additional_params is None else additional_params
    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True, **additional_params)
    if num_hidden_layers != -1:
        config = set_num_layers_vlm(config, num_hidden_layers)
    return config


def load_vlm_hf_model(model_name, num_hidden_layers=-1, config=None):
    """
    Load the Hugging Face (reference) vision-language model in eval mode.

    With ``config=None`` a config is derived from ``model_name`` (optionally
    layer-truncated) and pretrained weights are downloaded; with an explicit
    ``config`` the model is built from it (random weights) via ``from_config``.
    Architectures not registered for image-text-to-text fall back to the
    causal-LM auto class in both paths.
    """
    if config is None:
        config = load_vlm_hf_config(model_name, num_hidden_layers=num_hidden_layers)
        try:
            model_hf = AutoModelForImageTextToText.from_pretrained(
                config._name_or_path,
                low_cpu_mem_usage=False,
                config=config,
            )
        except ValueError:
            model_hf = AutoModelForCausalLM.from_pretrained(
                config._name_or_path,
                low_cpu_mem_usage=False,
                trust_remote_code=True,
                config=config,
            )
    else:
        try:
            model_hf = AutoModelForImageTextToText.from_config(
                config,
                attn_implementation="eager",
                trust_remote_code=True,
            )
        except ValueError:
            model_hf = AutoModelForCausalLM.from_config(
                config,
                attn_implementation="eager",
                trust_remote_code=True,
            )
    # Upcast half-precision configs to fp32 before returning the reference model.
    torch_dtype = getattr(model_hf.config, "torch_dtype", None)
    if torch_dtype == torch.bfloat16 or torch_dtype == torch.float16:
        model_hf = model_hf.to(torch.float32)

    model_hf.eval()
    return model_hf


def set_num_layers_vlm(config, n_layer=-1):
    """
    Truncate a vision-language model config to ``n_layer`` layers.

    ``n_layer == -1`` means "use every layer" and leaves the config untouched.
    The config is modified in place and also returned.
    """
    if n_layer == -1:
        # -1 indicates use all the layers of the model.
        return config

    if "mllama" in getattr(config, "model_type", ""):
        text_cfg = config.text_config
        text_cfg.num_hidden_layers = n_layer
        # Drop cross-attention layer indices that no longer exist after truncation.
        text_cfg.cross_attention_layers = [idx for idx in text_cfg.cross_attention_layers if idx < n_layer]
    elif hasattr(config, "text_config") or hasattr(config, "llm_config"):
        # Truncate both the language tower and the vision tower.
        language_cfg = config.text_config if hasattr(config, "text_config") else config.llm_config
        language_cfg.num_hidden_layers = n_layer
        config.vision_config.num_hidden_layers = n_layer
        if hasattr(config.vision_config, "depth"):
            config.vision_config.depth = n_layer
    else:
        # Plain (text-only style) config.
        config.num_hidden_layers = n_layer
    return config


def get_qeff_model_with_sampler(
    model_name: str,
    is_vlm: bool,
    continuous_batching: bool,
    num_hidden_layers: Optional[int] = -1,
    config: Optional[AutoConfig] = None,
    qaic_config: Optional[dict] = None,
):
    """
    Get a QEfficient model with the sampler transform.

    Args:
        model_name (str): The name of the model to test.
        is_vlm (bool): Whether the model is a vision-language model.
        continuous_batching (bool): Whether to use continuous batching.
        num_hidden_layers (Optional[int]): The number of hidden layers to use.
        config (Optional[AutoConfig]): The configuration to use.
        qaic_config (Optional[dict]): The QAIC configuration to use.

    Returns:
        tuple: ``(qeff_model, processor)``; ``processor`` is ``None`` for
        text-only models.
    """
    processor = None
    if is_vlm:
        # For Intern models only
        additional_configs = {}
        if config is None:
            config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
            config = set_num_layers_vlm(config, num_hidden_layers)
        # The HF model is loaded here only to build the custom InternProcessor;
        # the QEfficient model below is loaded separately via from_pretrained.
        model_hf = AutoModelForCausalLM.from_pretrained(
            model_name,
            config=config,
            trust_remote_code=True,
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)
        processor = InternProcessor(model_hf, tokenizer)
        additional_configs["config"] = config
        additional_configs["kv_offload"] = True
        additional_configs["trust_remote_code"] = True
        qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
            model_name,
            continuous_batching=continuous_batching,
            qaic_config=qaic_config,
            **additional_configs,
        )
    else:
        if config is not None:
            # Build from an explicit config (random weights).
            model_hf = AutoModelForCausalLM.from_config(
                config,
                attn_implementation="eager",
            )
        elif num_hidden_layers != -1:
            # Pretrained weights with a truncated layer stack.
            model_hf = AutoModelForCausalLM.from_pretrained(
                model_name,
                num_hidden_layers=num_hidden_layers,
                attn_implementation="eager",
                low_cpu_mem_usage=False,
            )
        else:
            model_hf = AutoModelForCausalLM.from_pretrained(
                model_name,
                attn_implementation="eager",
                low_cpu_mem_usage=False,
            )
        # Upcast half-precision configs to fp32 before wrapping.
        torch_dtype = getattr(model_hf.config, "torch_dtype", None)
        if torch_dtype == torch.bfloat16 or torch_dtype == torch.float16:
            model_hf = model_hf.to(torch.float32)
        qeff_model = QEFFAutoModelForCausalLM(
            model_hf,
            continuous_batching=continuous_batching,
            qaic_config=qaic_config,
        )

    return qeff_model, processor


# Processor class for InternVL models
Expand Down Expand Up @@ -169,6 +375,36 @@ class ModelConfig:
"TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
}

# Vision-language models exercised via the standard (AutoProcessor) flow.
STANDARD_VLM_MODELS = {
    "llava-hf/llava-1.5-7b-hf",
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "google/gemma-3-4b-it",
    "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
    "Qwen/Qwen2.5-VL-3B-Instruct",
    "meta-llama/Llama-3.2-11B-Vision-Instruct",
}

# InternVL models; these use the custom InternProcessor defined in this file.
INTERNVL_MODELS = {
    "OpenGVLab/InternVL2_5-1B",
    "OpenGVLab/InternVL3_5-1B",
}

# Molmo models, handled separately from the standard VLM set.
MOLMO_MODELS = {
    "allenai/Molmo-7B-D-0924",
}

# Models currently excluded from the VLM test runs
# (reason not visible here — see the test harness).
SKIPPED_MODELS = {
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "allenai/Molmo-7B-D-0924",
    "meta-llama/Llama-3.2-11B-Vision-Instruct",
}

# Models run in dual-QPC mode — presumably compiled with kv_offload so vision
# and language parts produce separate QPCs; confirm against the test harness.
DUAL_QPC_MODELS = {
    "OpenGVLab/InternVL2_5-1B",
    "OpenGVLab/InternVL3_5-1B",
    "Qwen/Qwen2.5-VL-3B-Instruct",
}

EXTERNAL_MODELS = {
"hpcai-tech/grok-1": {
"pytorch_hf_tokens_custom_case": [
Expand Down Expand Up @@ -229,3 +465,7 @@ class ModelConfig:
# SwiftKV model variants under test.
SWIFTKV_MODELS = {
    "Snowflake/Llama-3.1-SwiftKV-8B-Instruct",
}

# Models excluded from the full-layer (untruncated) test variants.
FULL_MODEL_TESTS_TO_SKIP = {
    "hpcai-tech/grok-1",
}
Loading