oracle · mrDzurb · Feb 28, 2026 · Feb 11, 2026 · Feb 11, 2026 · Feb 11, 2026
@@ -14,9 +14,90 @@
 
 NEXT_QUANT suggests the next quantization level based on the current quantization (if applied) or the model weights (if no quantization yet)
 
-EXCLUDED_MODELS contains a set of model identifiers that are known to be unsupported for shape recommendation, such as audio and speech models.
+EXCLUDED_MODELS contains a set of model identifiers that are known to be unsupported for shape recommendation.
+
+ARCHITECTURE_TYPE identifies the detected model architecture category for strategy selection.
+
+SUPPORTED_TASKS defines the set of model task types that the recommender can handle.
 """
 
+# ---------------------------------------------------------------------------
+# Architecture type identifiers (used by StrategyFactory)
+# ---------------------------------------------------------------------------
+ARCH_TEXT_GENERATION = "text_generation"
+ARCH_MULTIMODAL = "multimodal"
+ARCH_EMBEDDING = "embedding"
+ARCH_AUDIO = "audio"
+ARCH_UNSUPPORTED = "unsupported"
+
+# ---------------------------------------------------------------------------
+# Supported task tags (from HF / OCI freeform_tags)
+# ---------------------------------------------------------------------------
+SUPPORTED_TASKS = {
+    "text_generation",
+    "text-generation",
+    "image_text_to_text",
+    "image-text-to-text",
+    "feature_extraction",
+    "feature-extraction",
+    "automatic_speech_recognition",
+    "automatic-speech-recognition",
+}
+
+# ---------------------------------------------------------------------------
+# Model types that map to specific architecture strategies
+# ---------------------------------------------------------------------------
+MULTIMODAL_MODEL_TYPES = {
+    "llava",
+    "llava_next",
+    "llava_onevision",
+    "qwen2_vl",
+    "internvl",
+    "phi3_v",
+    "pixtral",
+    "idefics2",
+    "idefics3",
+    "mllama",
+    "paligemma",
+}
+
+EMBEDDING_MODEL_TYPES = {
+    "bert",
+    "roberta",
+    "xlm-roberta",
+    "xlm_roberta",
+    "modernbert",
+    "nomic_bert",
+}
+
+# Architecture class names in HF 'architectures' list that identify embedding-only models
+EMBEDDING_ARCHITECTURE_KEYWORDS = {
+    "embeddingmodel",
+    "formaskedlm",
+    "xlmrobertamodel",  # Jina embeddings (XLMRobertaModel)
+    "bertmodel",  # bert-base etc.
+    "robertamodel",  # roberta-base etc.
+}
+
+AUDIO_MODEL_TYPES = {
+    "whisper",
+}
+
+# Architecture keywords in HF 'architectures' list that indicate multimodal
+MULTIMODAL_ARCHITECTURE_KEYWORDS = {
+    "llava",
+    "vila",
+    "nemotron_vl",
+    "nemotron_nano_vl",
+    "qwen2vl",
+    "internvl",
+    "phi3v",
+    "pixtral",
+    "idefics",
+    "paligemma",
+    "mllama",
+}
+
 LLAMA_REQUIRED_FIELDS = [
     "num_hidden_layers",
     "hidden_size",
@@ -101,8 +182,45 @@
     "max_model_len": "--max-model-len",
     "in_flight_quant": "--quantization bitsandbytes --load-format bitsandbytes",
     "trust_remote_code": "--trust-remote-code",
+    "task_embedding": "--task embedding",
+    "task_transcribe": "--task transcribe",
+    "limit_mm_per_prompt_image": '--limit-mm-per-prompt {"image": 1}',
+    "limit_mm_per_prompt_audio": '--limit-mm-per-prompt {"audio": 1}',
+    "limit_mm_per_prompt_video": '--limit-mm-per-prompt {"video": 1}',
+    "enforce_eager": "--enforce-eager",
+    "dtype": "--dtype",
+}
+
+# ---------------------------------------------------------------------------
+# Multimodal model characteristics that affect vLLM param selection
+# ---------------------------------------------------------------------------
+
+# Models supporting multiple images per prompt (image_grid_pinpoints or tiling)
+# These benefit from higher --limit-mm-per-prompt image counts
+MULTI_IMAGE_MODEL_TYPES = {
+    "llava_onevision",
+    "qwen2_vl",
+    "idefics3",
+    "mllama",  # Llama 3.2 Vision supports multi-image
+}
+
+# Models that require --enforce-eager due to custom CUDA graph limitations
+# Typically those with non-standard attention patterns or custom ops
+ENFORCE_EAGER_MODEL_TYPES = {
+    "phi3_v",  # Phi-3-Vision needs eager mode
+    "idefics2",  # IDEFICS-2 needs eager mode
+    "paligemma",  # PaliGemma can have issues with CUDA graphs
 }
 
+# Large-context embedding models with LLM backbones (hidden_size threshold)
+# These use decoder architectures and benefit from context-length tuning
+LARGE_EMBEDDING_HIDDEN_SIZE_THRESHOLD = (
+    1024  # >= this => "large" LLM-backbone embedding
+)
+
+# Whisper distilled model threshold: decoder_layers below this => distilled variant
+WHISPER_DISTILLED_DECODER_LAYERS_THRESHOLD = 4
+
 DEFAULT_WEIGHT_SIZE = "float32"
 DEFAULT_MAX_SEQ_LEN = 4096
 
@@ -133,7 +251,21 @@
     "ARM": "CPU",
     "UNKNOWN_ENUM_VALUE": "N/A",
 }
+# Models that are truly unsupported (encoder-decoder text gen, no vLLM support)
 EXCLUDED_MODELS = {
-            "t5", "gemma", "bart", "bert", "roberta", "albert", 
-            "whisper", "wav2vec", "speech", "audio"
-        }
+    "t5",
+    "bart",
+    "albert",
+    "t5gemma",
+}
+
+# Encoder-decoder text models that cannot be served via standard vLLM text generation
+ENCODER_DECODER_TEXT_MODELS = {
+    "t5",
+    "bart",
+    "albert",
+    "t5gemma",
+    "ul2",
+    "longt5",
+    "pegasus",
+}
@@ -14,7 +14,7 @@
     QUANT_MAPPING,
     VLLM_PARAMS,
 )
-from ads.aqua.shaperecommend.llm_config import LLMConfig
+from ads.aqua.shaperecommend.llm_config import EmbeddingConfig, LLMConfig, VisionConfig, WhisperConfig
 
 
 class MemoryEstimator(BaseModel):
@@ -377,6 +377,118 @@ def model_memory(self) -> float:
         return total_params * llm_config.bytes_per_parameter / 1e9
 
 
+class VisionMemoryEstimator(BaseModel):
+    """
+    Estimator for Vision Encoder (ViT) models used in multimodal architectures.
+    Estimates model weight memory and image token overhead.
+    """
+
+    vision_config: VisionConfig = Field(
+        ..., description="The vision encoder configuration."
+    )
+
+    @property
+    def model_memory(self) -> float:
+        """
+        Estimates Vision Encoder weight memory in GB.
+        Uses standard ViT parameter estimation: 12 * L * H^2 for transformer layers.
+        """
+        vc = self.vision_config
+        layer_params = 12 * vc.num_hidden_layers * (vc.hidden_size ** 2)
+        total_params = layer_params
+        return total_params * vc.bytes_per_parameter / 1e9
+
+    def image_token_count(self, image_size: Optional[int] = None, patch_size: Optional[int] = None) -> int:
+        """
+        Estimates the number of tokens an image is expanded into.
+
+        Formula: (image_size / patch_size)^2 + 1 (for CLS token)
+        """
+        img_size = image_size or getattr(self.vision_config, "image_size", None) or 336
+        p_size = patch_size or getattr(self.vision_config, "patch_size", None) or 14
+        if p_size == 0:
+            return 0
+        return ((img_size // p_size) ** 2) + 1
+
+
+class EmbeddingMemoryEstimator(BaseModel):
+    """
+    Estimator for embedding models (BERT, RoBERTa, E5-Mistral, etc.).
+    Embedding models are typically small; the focus is on throughput estimation.
+    """
+
+    embedding_config: EmbeddingConfig = Field(
+        ..., description="The embedding model configuration."
+    )
+
+    @property
+    def model_memory(self) -> float:
+        """
+        Estimates model weight memory in GB.
+        """
+        ec = self.embedding_config
+        embed_params = ec.vocab_size * ec.hidden_size
+        layer_params = 12 * ec.num_hidden_layers * (ec.hidden_size ** 2)
+        total_params = embed_params + layer_params
+        return total_params * ec.bytes_per_parameter / 1e9
+
+    @property
+    def total_memory(self) -> float:
+        """
+        Embedding models have negligible KV cache during inference.
+        Total memory is approximately model weight memory + small overhead.
+        """
+        return self.model_memory * 1.1  # 10% overhead for activation memory
+
+    def validate_shape(self, allowed_gpu_memory: float, gpu_utilization: float = 0.9) -> bool:
+        """Validates if the embedding model fits within GPU memory."""
+        return (allowed_gpu_memory * gpu_utilization) > self.total_memory
+
+
+class WhisperMemoryEstimator(BaseModel):
+    """
+    Estimator for Whisper ASR models.
+    Whisper models have fixed architecture sizes and encoder-decoder structure.
+    """
+
+    whisper_config: WhisperConfig = Field(
+        ..., description="The Whisper model configuration."
+    )
+
+    @property
+    def encoder_memory(self) -> float:
+        """Estimates encoder weight memory in GB."""
+        wc = self.whisper_config
+        layer_params = 12 * wc.encoder_layers * (wc.d_model ** 2)
+        return layer_params * wc.bytes_per_parameter / 1e9
+
+    @property
+    def decoder_memory(self) -> float:
+        """Estimates decoder weight memory in GB."""
+        wc = self.whisper_config
+        layer_params = 12 * wc.decoder_layers * (wc.d_model ** 2)
+        embed_params = wc.vocab_size * wc.d_model
+        return (layer_params + embed_params) * wc.bytes_per_parameter / 1e9
+
+    @property
+    def model_memory(self) -> float:
+        """Total model weight memory (encoder + decoder)."""
+        return self.encoder_memory + self.decoder_memory
+
+    @property
+    def total_memory(self) -> float:
+        """
+        Total memory including overhead for audio feature buffers.
+        Whisper pre-processing requires CPU memory for mel-spectrograms.
+        GPU memory is primarily model weights + small activation overhead.
+        """
+        return self.model_memory * 1.2  # 20% overhead for activations and audio buffers
+
+    def validate_shape(self, allowed_gpu_memory: float, gpu_utilization: float = 0.9) -> bool:
+        """Validates if the Whisper model fits within GPU memory."""
+        return (allowed_gpu_memory * gpu_utilization) > self.total_memory
+
+
 def get_estimator(llm_config, **kwargs) -> MemoryEstimator:
     """
     Extracts the correct estimator based on the defined parameters in the config.json