Skip to content
140 changes: 136 additions & 4 deletions ads/aqua/shaperecommend/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,90 @@

NEXT_QUANT suggests the next quantization level based on the current quantization (if applied) or the model weights (if no quantization yet)

EXCLUDED_MODELS contains a set of model identifiers that are known to be unsupported for shape recommendation, such as audio and speech models.
EXCLUDED_MODELS contains a set of model identifiers that are known to be unsupported for shape recommendation.

ARCHITECTURE_TYPE identifies the detected model architecture category for strategy selection.

SUPPORTED_TASKS defines the set of model task types that the recommender can handle.
"""

# ---------------------------------------------------------------------------
# Architecture type identifiers (used by StrategyFactory)
# ---------------------------------------------------------------------------
ARCH_TEXT_GENERATION = "text_generation"
ARCH_MULTIMODAL = "multimodal"
ARCH_EMBEDDING = "embedding"
ARCH_AUDIO = "audio"
ARCH_UNSUPPORTED = "unsupported"

# ---------------------------------------------------------------------------
# Supported task tags (from HF / OCI freeform_tags)
# ---------------------------------------------------------------------------
SUPPORTED_TASKS = {
"text_generation",
"text-generation",
"image_text_to_text",
"image-text-to-text",
"feature_extraction",
"feature-extraction",
"automatic_speech_recognition",
"automatic-speech-recognition",
}

# ---------------------------------------------------------------------------
# Model types that map to specific architecture strategies
# ---------------------------------------------------------------------------
MULTIMODAL_MODEL_TYPES = {
"llava",
"llava_next",
"llava_onevision",
"qwen2_vl",
"internvl",
"phi3_v",
"pixtral",
"idefics2",
"idefics3",
"mllama",
"paligemma",
}

EMBEDDING_MODEL_TYPES = {
"bert",
"roberta",
"xlm-roberta",
"xlm_roberta",
"modernbert",
"nomic_bert",
}

# Architecture class names in HF 'architectures' list that identify embedding-only models
EMBEDDING_ARCHITECTURE_KEYWORDS = {
"embeddingmodel",
"formaskedlm",
"xlmrobertamodel", # Jina embeddings (XLMRobertaModel)
"bertmodel", # bert-base etc.
"robertamodel", # roberta-base etc.
}

AUDIO_MODEL_TYPES = {
"whisper",
}

# Architecture keywords in HF 'architectures' list that indicate multimodal
MULTIMODAL_ARCHITECTURE_KEYWORDS = {
"llava",
"vila",
"nemotron_vl",
"nemotron_nano_vl",
"qwen2vl",
"internvl",
"phi3v",
"pixtral",
"idefics",
"paligemma",
"mllama",
}

LLAMA_REQUIRED_FIELDS = [
"num_hidden_layers",
"hidden_size",
Expand Down Expand Up @@ -101,8 +182,45 @@
"max_model_len": "--max-model-len",
"in_flight_quant": "--quantization bitsandbytes --load-format bitsandbytes",
"trust_remote_code": "--trust-remote-code",
"task_embedding": "--task embedding",
"task_transcribe": "--task transcribe",
"limit_mm_per_prompt_image": '--limit-mm-per-prompt {"image": 1}',
"limit_mm_per_prompt_audio": '--limit-mm-per-prompt {"audio": 1}',
"limit_mm_per_prompt_video": '--limit-mm-per-prompt {"video": 1}',
"enforce_eager": "--enforce-eager",
"dtype": "--dtype",
}

# ---------------------------------------------------------------------------
# Multimodal model characteristics that affect vLLM param selection
# ---------------------------------------------------------------------------

# Models supporting multiple images per prompt (image_grid_pinpoints or tiling)
# These benefit from higher --limit-mm-per-prompt image counts
MULTI_IMAGE_MODEL_TYPES = {
"llava_onevision",
"qwen2_vl",
"idefics3",
"mllama", # Llama 3.2 Vision supports multi-image
}

# Models that require --enforce-eager due to custom CUDA graph limitations
# Typically those with non-standard attention patterns or custom ops
ENFORCE_EAGER_MODEL_TYPES = {
"phi3_v", # Phi-3-Vision needs eager mode
"idefics2", # IDEFICS-2 needs eager mode
"paligemma", # PaliGemma can have issues with CUDA graphs
}

# Large-context embedding models with LLM backbones (hidden_size threshold)
# These use decoder architectures and benefit from context-length tuning
LARGE_EMBEDDING_HIDDEN_SIZE_THRESHOLD = (
1024 # >= this => "large" LLM-backbone embedding
)

# Whisper distilled model threshold: decoder_layers below this => distilled variant
WHISPER_DISTILLED_DECODER_LAYERS_THRESHOLD = 4

DEFAULT_WEIGHT_SIZE = "float32"
DEFAULT_MAX_SEQ_LEN = 4096

Expand Down Expand Up @@ -133,7 +251,21 @@
"ARM": "CPU",
"UNKNOWN_ENUM_VALUE": "N/A",
}
# Models that are truly unsupported (encoder-decoder text gen, no vLLM support)
EXCLUDED_MODELS = {
"t5", "gemma", "bart", "bert", "roberta", "albert",
"whisper", "wav2vec", "speech", "audio"
}
"t5",
"bart",
"albert",
"t5gemma",
}

# Encoder-decoder text models that cannot be served via standard vLLM text generation
ENCODER_DECODER_TEXT_MODELS = {
"t5",
"bart",
"albert",
"t5gemma",
"ul2",
"longt5",
"pegasus",
}
114 changes: 113 additions & 1 deletion ads/aqua/shaperecommend/estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
QUANT_MAPPING,
VLLM_PARAMS,
)
from ads.aqua.shaperecommend.llm_config import LLMConfig
from ads.aqua.shaperecommend.llm_config import EmbeddingConfig, LLMConfig, VisionConfig, WhisperConfig


class MemoryEstimator(BaseModel):
Expand Down Expand Up @@ -377,6 +377,118 @@ def model_memory(self) -> float:
return total_params * llm_config.bytes_per_parameter / 1e9


class VisionMemoryEstimator(BaseModel):
"""
Estimator for Vision Encoder (ViT) models used in multimodal architectures.
Estimates model weight memory and image token overhead.
"""

vision_config: VisionConfig = Field(
..., description="The vision encoder configuration."
)

@property
def model_memory(self) -> float:
"""
Estimates Vision Encoder weight memory in GB.
Uses standard ViT parameter estimation: 12 * L * H^2 for transformer layers.
"""
vc = self.vision_config
layer_params = 12 * vc.num_hidden_layers * (vc.hidden_size ** 2)
total_params = layer_params
return total_params * vc.bytes_per_parameter / 1e9

def image_token_count(self, image_size: Optional[int] = None, patch_size: Optional[int] = None) -> int:
"""
Estimates the number of tokens an image is expanded into.

Formula: (image_size / patch_size)^2 + 1 (for CLS token)
"""
img_size = image_size or getattr(self.vision_config, "image_size", None) or 336
p_size = patch_size or getattr(self.vision_config, "patch_size", None) or 14
if p_size == 0:
return 0
return ((img_size // p_size) ** 2) + 1


class EmbeddingMemoryEstimator(BaseModel):
"""
Estimator for embedding models (BERT, RoBERTa, E5-Mistral, etc.).
Embedding models are typically small; the focus is on throughput estimation.
"""

embedding_config: EmbeddingConfig = Field(
..., description="The embedding model configuration."
)

@property
def model_memory(self) -> float:
"""
Estimates model weight memory in GB.
"""
ec = self.embedding_config
embed_params = ec.vocab_size * ec.hidden_size
layer_params = 12 * ec.num_hidden_layers * (ec.hidden_size ** 2)
total_params = embed_params + layer_params
return total_params * ec.bytes_per_parameter / 1e9

@property
def total_memory(self) -> float:
"""
Embedding models have negligible KV cache during inference.
Total memory is approximately model weight memory + small overhead.
"""
return self.model_memory * 1.1 # 10% overhead for activation memory

def validate_shape(self, allowed_gpu_memory: float, gpu_utilization: float = 0.9) -> bool:
"""Validates if the embedding model fits within GPU memory."""
return (allowed_gpu_memory * gpu_utilization) > self.total_memory


class WhisperMemoryEstimator(BaseModel):
"""
Estimator for Whisper ASR models.
Whisper models have fixed architecture sizes and encoder-decoder structure.
"""

whisper_config: WhisperConfig = Field(
..., description="The Whisper model configuration."
)

@property
def encoder_memory(self) -> float:
"""Estimates encoder weight memory in GB."""
wc = self.whisper_config
layer_params = 12 * wc.encoder_layers * (wc.d_model ** 2)
return layer_params * wc.bytes_per_parameter / 1e9

@property
def decoder_memory(self) -> float:
"""Estimates decoder weight memory in GB."""
wc = self.whisper_config
layer_params = 12 * wc.decoder_layers * (wc.d_model ** 2)
embed_params = wc.vocab_size * wc.d_model
return (layer_params + embed_params) * wc.bytes_per_parameter / 1e9

@property
def model_memory(self) -> float:
"""Total model weight memory (encoder + decoder)."""
return self.encoder_memory + self.decoder_memory

@property
def total_memory(self) -> float:
"""
Total memory including overhead for audio feature buffers.
Whisper pre-processing requires CPU memory for mel-spectrograms.
GPU memory is primarily model weights + small activation overhead.
"""
return self.model_memory * 1.2 # 20% overhead for activations and audio buffers

def validate_shape(self, allowed_gpu_memory: float, gpu_utilization: float = 0.9) -> bool:
"""Validates if the Whisper model fits within GPU memory."""
return (allowed_gpu_memory * gpu_utilization) > self.total_memory


def get_estimator(llm_config, **kwargs) -> MemoryEstimator:
"""
Extracts the correct estimator based on the defined parameters in the config.json
Expand Down
Loading
Loading