From 652c4fce3e0d43e6bb88f27de3f95d4d6ce0f2ff Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 31 Mar 2026 17:52:01 -0400 Subject: [PATCH 1/8] Implement Signed-off-by: Matthew Bonanni --- .../layers/attention/attention.py | 21 +++++++++------ .../attention/chunked_local_attention.py | 9 +++++-- .../layers/attention/cross_attention.py | 9 +++++-- .../attention/encoder_only_attention.py | 9 +++++-- .../layers/attention/static_sink_attention.py | 13 ++++++--- vllm/model_executor/models/AXK1.py | 1 + vllm/model_executor/models/afmoe.py | 8 +++++- vllm/model_executor/models/arctic.py | 15 ++++++++--- vllm/model_executor/models/baichuan.py | 15 +++++++++-- vllm/model_executor/models/bailing_moe.py | 13 +++++++-- vllm/model_executor/models/bamba.py | 1 + vllm/model_executor/models/bert.py | 8 ++++++ vllm/model_executor/models/bert_with_rope.py | 10 +++++-- vllm/model_executor/models/bloom.py | 18 ++++++++++--- vllm/model_executor/models/chameleon.py | 10 ++++++- vllm/model_executor/models/chatglm.py | 27 ++++++++++++++++--- vllm/model_executor/models/cohere_asr.py | 7 +++++ vllm/model_executor/models/commandr.py | 13 +++++++-- vllm/model_executor/models/dbrx.py | 27 ++++++++++++++++--- vllm/model_executor/models/deepseek_v2.py | 2 ++ vllm/model_executor/models/dots1.py | 3 +++ vllm/model_executor/models/ernie45_moe.py | 8 +++++- vllm/model_executor/models/ernie45_vl_moe.py | 8 +++++- vllm/model_executor/models/exaone.py | 10 ++++++- vllm/model_executor/models/exaone4.py | 8 +++++- vllm/model_executor/models/falcon.py | 20 +++++++++++--- vllm/model_executor/models/falcon_h1.py | 3 +++ vllm/model_executor/models/gemma.py | 13 +++++++-- vllm/model_executor/models/gemma2.py | 13 +++++++-- vllm/model_executor/models/gemma3.py | 13 +++++++-- vllm/model_executor/models/gemma3n.py | 13 +++++++-- vllm/model_executor/models/glm4.py | 6 ++++- vllm/model_executor/models/glm4_moe.py | 8 +++++- vllm/model_executor/models/gpt2.py | 20 +++++++++++--- vllm/model_executor/models/gpt_bigcode.py | 18 ++++++++++--- vllm/model_executor/models/gpt_j.py | 20 +++++++++++--- vllm/model_executor/models/gpt_neox.py | 18 ++++++++++--- vllm/model_executor/models/gpt_oss.py | 6 ++++- vllm/model_executor/models/granite.py | 8 +++++- vllm/model_executor/models/granitemoe.py | 6 ++++- .../model_executor/models/granitemoehybrid.py | 1 + vllm/model_executor/models/grok1.py | 13 +++++++-- vllm/model_executor/models/hunyuan_v1.py | 11 +++++++- vllm/model_executor/models/hyperclovax.py | 6 ++++- vllm/model_executor/models/internlm2.py | 13 +++++++-- vllm/model_executor/models/interns1_pro.py | 6 ++++- .../model_executor/models/iquest_loopcoder.py | 9 +++++-- vllm/model_executor/models/jais.py | 13 +++++++-- vllm/model_executor/models/jais2.py | 6 ++++- vllm/model_executor/models/jamba.py | 1 + vllm/model_executor/models/lfm2.py | 3 +++ vllm/model_executor/models/lfm2_moe.py | 3 +++ vllm/model_executor/models/llama.py | 8 ++++-- vllm/model_executor/models/mimo_v2_flash.py | 8 +++++- vllm/model_executor/models/minicpm.py | 19 ++++++++++--- vllm/model_executor/models/minicpm3.py | 12 +++++++-- vllm/model_executor/models/minimax_m2.py | 3 +++ vllm/model_executor/models/minimax_text_01.py | 3 +++ vllm/model_executor/models/mixtral.py | 8 +++++- vllm/model_executor/models/modernbert.py | 19 ++++++++++--- vllm/model_executor/models/molmo.py | 18 ++++++++++--- vllm/model_executor/models/molmo2.py | 12 ++++++--- vllm/model_executor/models/mpt.py | 20 +++++++++++--- vllm/model_executor/models/nemotron.py | 8 +++++- 
vllm/model_executor/models/nemotron_h.py | 1 + vllm/model_executor/models/nemotron_parse.py | 8 +++++- vllm/model_executor/models/olmo.py | 18 ++++++++++--- vllm/model_executor/models/olmo2.py | 18 ++++++++++--- vllm/model_executor/models/olmo_hybrid.py | 12 ++++++++- vllm/model_executor/models/olmoe.py | 15 +++++++++-- vllm/model_executor/models/openpangu.py | 9 ++++++- vllm/model_executor/models/opt.py | 20 +++++++++++--- vllm/model_executor/models/orion.py | 13 +++++++-- vllm/model_executor/models/ouro.py | 8 +++++- vllm/model_executor/models/persimmon.py | 13 +++++++-- vllm/model_executor/models/phi.py | 20 +++++++++++--- vllm/model_executor/models/phimoe.py | 13 +++++++-- vllm/model_executor/models/pixtral.py | 10 ++++++- vllm/model_executor/models/plamo2.py | 1 + vllm/model_executor/models/plamo3.py | 1 + vllm/model_executor/models/qwen.py | 15 +++++++++-- vllm/model_executor/models/qwen2_moe.py | 8 +++++- vllm/model_executor/models/qwen3_dflash.py | 6 ++++- vllm/model_executor/models/qwen3_moe.py | 6 ++++- vllm/model_executor/models/qwen3_next.py | 1 + vllm/model_executor/models/seed_oss.py | 8 +++++- vllm/model_executor/models/solar.py | 8 +++++- vllm/model_executor/models/stablelm.py | 18 ++++++++++--- vllm/model_executor/models/starcoder2.py | 13 +++++++-- vllm/model_executor/models/step1.py | 6 ++++- vllm/model_executor/models/step3_text.py | 6 +++++ vllm/model_executor/models/step3p5.py | 4 +++ vllm/model_executor/models/whisper.py | 8 ++++++ vllm/model_executor/models/whisper_causal.py | 9 +++++-- vllm/model_executor/models/zamba2.py | 5 ++++ vllm/utils/torch_utils.py | 7 +++++ 96 files changed, 842 insertions(+), 147 deletions(-) diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py index 7610030f3ed0..b668cfd63f83 100644 --- a/vllm/model_executor/layers/attention/attention.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -7,7 +7,7 @@ import torch.nn as nn import vllm.envs as envs -from vllm.config import CacheConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config from vllm.config.vllm import VllmConfig from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger @@ -25,6 +25,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.platforms import current_platform from vllm.utils.torch_utils import ( + TORCH_DTYPE_TO_KV_CACHE_STR, direct_register_custom_op, kv_cache_dtype_str_to_dtype, ) @@ -193,6 +194,7 @@ def __init__( alibi_slopes: list[float] | None = None, use_alibi_sqrt: bool | None = None, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, logits_soft_cap: float | None = None, per_layer_sliding_window: int | None = None, @@ -217,12 +219,14 @@ def __init__( else: sliding_window = None - vllm_config = get_current_vllm_config() if cache_config is not None: kv_cache_dtype = cache_config.cache_dtype calculate_kv_scales = cache_config.calculate_kv_scales else: - kv_cache_dtype = "auto" + assert model_config is not None, ( + "model_config is required when cache_config is not provided" + ) + kv_cache_dtype = TORCH_DTYPE_TO_KV_CACHE_STR[model_config.dtype] calculate_kv_scales = False # llm-compressor mdls need to set cache_dtype to "fp8" manually. 
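The behavioral core of this patch is the branch above: when no CacheConfig is supplied, the KV cache dtype is now derived from the model dtype through TORCH_DTYPE_TO_KV_CACHE_STR (added to vllm/utils/torch_utils.py in this series) instead of defaulting to "auto". A minimal sketch of that selection logic, with the mapping contents assumed for illustration since the torch_utils.py hunk is not shown here:

    import torch

    # Illustrative stand-in; the real table lives in vllm/utils/torch_utils.py
    # and its exact entries are not reproduced in this hunk.
    TORCH_DTYPE_TO_KV_CACHE_STR = {
        torch.float16: "float16",
        torch.bfloat16: "bfloat16",
    }

    def resolve_kv_cache_dtype(cache_config, model_config):
        # Prefer an explicitly configured cache dtype; otherwise fall back to
        # the model's own dtype rather than the old "auto" placeholder.
        if cache_config is not None:
            return cache_config.cache_dtype
        assert model_config is not None, (
            "model_config is required when cache_config is not provided"
        )
        return TORCH_DTYPE_TO_KV_CACHE_STR[model_config.dtype]

The per-model diffs below are then purely mechanical: each model reads vllm_config.model_config and threads it through its decoder layers into every Attention constructor so this fallback has the dtype available.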
@@ -256,7 +260,10 @@ def __init__( if str(layer_idx) in cache_config.kv_cache_dtype_skip_layers: skip = True if skip: - kv_cache_dtype = "auto" + assert model_config is not None, ( + "model_config is required for kv_cache_dtype_skip_layers" + ) + kv_cache_dtype = TORCH_DTYPE_TO_KV_CACHE_STR[model_config.dtype] calculate_kv_scales = False logger.info( "Layer %s: kv_cache_dtype=%s, sliding_window=%s", @@ -266,7 +273,7 @@ def __init__( ) self.kv_cache_torch_dtype = kv_cache_dtype_str_to_dtype( - kv_cache_dtype, vllm_config.model_config + kv_cache_dtype, model_config ) self.kv_cache_dtype = kv_cache_dtype self.calculate_kv_scales = calculate_kv_scales @@ -285,8 +292,6 @@ def __init__( self.sliding_window = sliding_window self.has_sink = extra_impl_args.get("sinks") is not None - # NOTE: model_config may be None during certain tests - model_config = vllm_config.model_config self.use_mm_prefix = model_config is not None and model_config.is_mm_prefix_lm # During model initialization, the default dtype is set as the model @@ -357,7 +362,7 @@ def __init__( self.use_direct_call = not current_platform.opaque_attention_op() self.use_output = self.attn_backend.accept_output_buffer - compilation_config = vllm_config.compilation_config + compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: raise ValueError(f"Duplicate layer name: {prefix}") compilation_config.static_forward_context[prefix] = self diff --git a/vllm/model_executor/layers/attention/chunked_local_attention.py b/vllm/model_executor/layers/attention/chunked_local_attention.py index b747304acd0b..82c2ff894d6c 100644 --- a/vllm/model_executor/layers/attention/chunked_local_attention.py +++ b/vllm/model_executor/layers/attention/chunked_local_attention.py @@ -4,10 +4,11 @@ import torch -from vllm.config import CacheConfig +from vllm.config import CacheConfig, ModelConfig from vllm.config.vllm import VllmConfig from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.utils.torch_utils import TORCH_DTYPE_TO_KV_CACHE_STR from vllm.v1.attention.backend import ( AttentionBackend, AttentionCGSupport, @@ -87,6 +88,7 @@ def __init__( num_kv_heads: int | None = None, alibi_slopes: list[float] | None = None, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, kv_sharing_target_layer_name: str | None = None, prefix: str = "", @@ -96,7 +98,10 @@ def __init__( if cache_config is not None: kv_cache_dtype = cache_config.cache_dtype else: - kv_cache_dtype = "auto" + assert model_config is not None, ( + "model_config is required when cache_config is not provided" + ) + kv_cache_dtype = TORCH_DTYPE_TO_KV_CACHE_STR[model_config.dtype] underlying_attn_backend = get_attn_backend(head_size, dtype, kv_cache_dtype) attn_backend = create_chunked_local_attention_backend( diff --git a/vllm/model_executor/layers/attention/cross_attention.py b/vllm/model_executor/layers/attention/cross_attention.py index 5bd8e163f4aa..1c2e4156613b 100644 --- a/vllm/model_executor/layers/attention/cross_attention.py +++ b/vllm/model_executor/layers/attention/cross_attention.py @@ -6,10 +6,11 @@ import numpy as np import torch -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention from vllm.utils.math_utils import 
cdiv +from vllm.utils.torch_utils import TORCH_DTYPE_TO_KV_CACHE_STR from vllm.v1.attention.backend import ( AttentionBackend, AttentionMetadata, @@ -181,6 +182,7 @@ def __init__( head_size: int, scale: float, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, attn_type: str | None = None, **kwargs, ): @@ -189,7 +191,10 @@ def __init__( if cache_config is not None: kv_cache_dtype = cache_config.cache_dtype else: - kv_cache_dtype = "auto" + assert model_config is not None, ( + "model_config is required when cache_config is not provided" + ) + kv_cache_dtype = TORCH_DTYPE_TO_KV_CACHE_STR[model_config.dtype] if attn_type is not None: assert attn_type == AttentionType.ENCODER_DECODER, ( diff --git a/vllm/model_executor/layers/attention/encoder_only_attention.py b/vllm/model_executor/layers/attention/encoder_only_attention.py index 0897ee45b84d..e805fa8759d2 100644 --- a/vllm/model_executor/layers/attention/encoder_only_attention.py +++ b/vllm/model_executor/layers/attention/encoder_only_attention.py @@ -5,9 +5,10 @@ import torch -from vllm.config import CacheConfig +from vllm.config import CacheConfig, ModelConfig from vllm.config.vllm import VllmConfig from vllm.model_executor.layers.attention import Attention +from vllm.utils.torch_utils import TORCH_DTYPE_TO_KV_CACHE_STR from vllm.v1.attention.backend import ( AttentionBackend, AttentionMetadata, @@ -59,6 +60,7 @@ def __init__( head_size: int, scale: float, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, attn_type: str | None = None, **kwargs, ): @@ -67,7 +69,10 @@ def __init__( if cache_config is not None: kv_cache_dtype = cache_config.cache_dtype else: - kv_cache_dtype = "auto" + assert model_config is not None, ( + "model_config is required when cache_config is not provided" + ) + kv_cache_dtype = TORCH_DTYPE_TO_KV_CACHE_STR[model_config.dtype] underlying_attn_backend = get_attn_backend( head_size, diff --git a/vllm/model_executor/layers/attention/static_sink_attention.py b/vllm/model_executor/layers/attention/static_sink_attention.py index 913d73a16c2c..eea6e9c4d069 100644 --- a/vllm/model_executor/layers/attention/static_sink_attention.py +++ b/vllm/model_executor/layers/attention/static_sink_attention.py @@ -4,13 +4,16 @@ import torch -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.attention import Attention from vllm.utils.math_utils import cdiv -from vllm.utils.torch_utils import direct_register_custom_op +from vllm.utils.torch_utils import ( + TORCH_DTYPE_TO_KV_CACHE_STR, + direct_register_custom_op, +) from vllm.v1.attention.backend import ( AttentionBackend, AttentionMetadata, @@ -120,6 +123,7 @@ def __init__( sink_len: int, attn_backend: type[AttentionBackend] | None = None, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, **kwargs, ): dtype = torch.get_default_dtype() @@ -127,7 +131,10 @@ def __init__( if cache_config is not None: kv_cache_dtype = cache_config.cache_dtype else: - kv_cache_dtype = "auto" + assert model_config is not None, ( + "model_config is required when cache_config is not provided" + ) + kv_cache_dtype = TORCH_DTYPE_TO_KV_CACHE_STR[model_config.dtype] if attn_backend is not None: underlying_attn_backend = attn_backend diff --git 
a/vllm/model_executor/models/AXK1.py b/vllm/model_executor/models/AXK1.py index f5ed4400fb65..27eb4c739645 100644 --- a/vllm/model_executor/models/AXK1.py +++ b/vllm/model_executor/models/AXK1.py @@ -365,6 +365,7 @@ def __init__( self.qk_head_dim, self.scaling, num_kv_heads=self.num_local_heads, + model_config=vllm_config.model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py index 22037336411a..073f62cca08f 100644 --- a/vllm/model_executor/models/afmoe.py +++ b/vllm/model_executor/models/afmoe.py @@ -10,7 +10,7 @@ from torch import nn from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( get_ep_group, get_pp_group, @@ -180,6 +180,7 @@ def __init__( max_position_embeddings: int = 131072, head_dim: int | None = None, rms_norm_eps: float = 1e-05, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -259,6 +260,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, per_layer_sliding_window=self.sliding_window, @@ -297,6 +299,7 @@ class AfmoeDecoderLayer(nn.Module): def __init__( self, config, # AfmoeConfig + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -319,6 +322,7 @@ def __init__( max_position_embeddings=max_position_embeddings, head_dim=config.head_dim, rms_norm_eps=config.rms_norm_eps, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", @@ -405,10 +409,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.embed_tokens = PPMissingLayer() + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: AfmoeDecoderLayer( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 031b6534fb69..c73ddfddbc12 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -9,7 +9,7 @@ from torch import nn from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -230,6 +230,7 @@ class ArcticAttention(nn.Module): def __init__( self, config: ArcticConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -285,6 +286,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -307,6 +309,7 @@ class ArcticDecoderLayer(nn.Module): def __init__( self, config: ArcticConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: 
str = "", @@ -318,7 +321,8 @@ def __init__( self.use_residual = config.use_residual and is_moe_layer self.self_attn = ArcticAttention( config, - cache_config, + model_config=model_config, + cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", ) @@ -388,10 +392,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.embed_tokens = VocabParallelEmbedding( self.vocab_size, config.hidden_size, org_num_embeddings=self.vocab_size ) + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: ArcticDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index bc1cd2ed811b..2f0d4d299c3d 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -30,7 +30,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -140,6 +140,7 @@ def __init__( max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -184,6 +185,7 @@ def __init__( scaling, alibi_slopes=alibi_slopes, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) else: @@ -199,6 +201,7 @@ def __init__( self.scaling, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -223,6 +226,7 @@ def __init__( position_embedding: str, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -236,6 +240,7 @@ def __init__( max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.self_attn", ) self.mlp = BaiChuanMLP( @@ -286,6 +291,7 @@ def __init__( config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.vocab_size = config.vocab_size @@ -297,7 +303,12 @@ def __init__( self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: BaiChuanDecoderLayer( - config, position_embedding, cache_config, quant_config, prefix=prefix + config, + position_embedding, + cache_config, + quant_config, + model_config=model_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index 7725dfa2a887..b57288235e81 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -33,7 +33,7 @@ from transformers.configuration_utils import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, 
get_tensor_model_parallel_rank, @@ -73,6 +73,7 @@ class BailingAttention(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, reduce_results: bool = True, @@ -142,6 +143,7 @@ def __init__( self.head_dim, self.scale, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, prefix=f"{prefix}.attn", ) @@ -336,6 +338,7 @@ class BailingMoeBlock(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -348,7 +351,11 @@ def __init__( self.input_layernorm = RMSNorm(hidden_size, eps=config.rms_norm_eps) self.attention = BailingAttention( - config, cache_config, quant_config, prefix=f"{prefix}.attention" + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attention", ) self.post_attention_layernorm = RMSNorm(hidden_size, eps=config.rms_norm_eps) @@ -416,10 +423,12 @@ def __init__( self.embedding_dropout = torch.nn.Dropout(config.embedding_dropout) + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: BailingMoeBlock( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index d220b22ddae7..a00a040ea167 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -215,6 +215,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, prefix=f"{prefix}.attn", + model_config=model_config, ) self.feed_forward = BambaMLP( diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 01854b96d56f..edb88dc086d5 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -128,12 +128,14 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.layer = nn.ModuleList( [ BertLayer( config=config, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.layer.{layer_idx}", ) for layer_idx in range(config.num_hidden_layers) @@ -155,6 +157,7 @@ def __init__( config: BertConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -165,6 +168,7 @@ def __init__( layer_norm_eps=config.layer_norm_eps, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attention", ) @@ -199,6 +203,7 @@ def __init__( layer_norm_eps: float, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -208,6 +213,7 @@ def __init__( num_attention_heads=num_attention_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.output", ) @@ -233,6 +239,7 @@ def __init__( num_attention_heads: int, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | 
None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -269,6 +276,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 22bcdeb453c4..1a0575c33e29 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -7,7 +7,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( divide, get_tensor_model_parallel_rank, @@ -94,6 +94,7 @@ def __init__( num_attention_heads: int, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, bias: bool = True, rotary_kwargs: dict | None = None, prefix: str = "", @@ -136,6 +137,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -347,6 +349,7 @@ def __init__( config: PretrainedConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, moe: bool = False, bias: bool = True, rotary_kwargs: dict | None = None, @@ -358,6 +361,7 @@ def __init__( num_attention_heads=config.num_attention_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, bias=bias, rotary_kwargs=rotary_kwargs, prefix=f"{prefix}.attention", @@ -411,7 +415,8 @@ def __init__( prefix: str = "", ): super().__init__() - config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config + config = model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config every_n = getattr(config, "moe_every_n_layers", 0) @@ -421,6 +426,7 @@ def __init__( config=config, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, bias=bias, moe=every_n > 0 and (layer_idx % every_n == 1), rotary_kwargs=rotary_kwargs, diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 233028a905f6..e9b7e54878aa 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -28,7 +28,7 @@ from transformers import BloomConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -90,6 +90,7 @@ def __init__( config: BloomConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -133,6 +134,7 @@ def __init__( alibi_slopes=alibi_slopes, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -185,6 +187,7 @@ def __init__( config: BloomConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -192,7 +195,11 @@ def __init__( self.input_layernorm = nn.LayerNorm(hidden_size, 
eps=config.layer_norm_epsilon) self.self_attention = BloomAttention( - config, cache_config, quant_config, prefix=f"{prefix}.self_attention" + config, + cache_config, + quant_config, + model_config=model_config, + prefix=f"{prefix}.self_attention", ) self.post_attention_layernorm = nn.LayerNorm( hidden_size, eps=config.layer_norm_epsilon @@ -243,6 +250,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.embed_dim = config.hidden_size @@ -260,7 +268,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, lambda prefix: BloomBlock( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + model_config=model_config, + prefix=prefix, ), prefix=f"{prefix}.h", ) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index a150428baff4..1bfadd4b0e27 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -16,7 +16,7 @@ ChameleonVQVAEConfig, ) -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.inputs import MultiModalDataDict @@ -269,6 +269,7 @@ def __init__( max_position_embeddings: int = 4096, quant_config: QuantizationConfig | None = None, bias: bool = False, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, prefix: str = "", ) -> None: @@ -323,6 +324,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -359,6 +361,7 @@ class ChameleonDecoderLayer(nn.Module): def __init__( self, config: ChameleonConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -377,6 +380,7 @@ def __init__( max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, + model_config=model_config, cache_config=cache_config, prefix=f"{prefix}.self_attn", ) @@ -420,6 +424,7 @@ class ChameleonSwinDecoderLayer(nn.Module): def __init__( self, config: ChameleonConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -438,6 +443,7 @@ def __init__( max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, + model_config=model_config, cache_config=cache_config, prefix=f"{prefix}.self_attn", ) @@ -846,10 +852,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else ChameleonSwinDecoderLayer ) + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: decoder_layer( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index c5d857e7c3df..a944734597e5 100644 --- a/vllm/model_executor/models/chatglm.py +++ 
b/vllm/model_executor/models/chatglm.py @@ -13,7 +13,7 @@ from torch.nn import LayerNorm from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -51,6 +51,7 @@ def __init__( config: ChatGLMConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -120,6 +121,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -195,6 +197,7 @@ def __init__( config: ChatGLMConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -212,7 +215,11 @@ def __init__( # Self attention. self.self_attention = GLMAttention( - config, cache_config, quant_config, prefix=f"{prefix}.self_attention" + config, + cache_config, + quant_config, + model_config=model_config, + prefix=f"{prefix}.self_attention", ) self.hidden_dropout = config.hidden_dropout @@ -268,6 +275,7 @@ def __init__( config: ChatGLMConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -279,7 +287,13 @@ def __init__( # Transformer layers. self.start_layer, self.end_layer, self.layers = make_layers( self.num_layers, - lambda prefix: GLMBlock(config, cache_config, quant_config, prefix=prefix), + lambda prefix: GLMBlock( + config, + cache_config, + quant_config, + model_config=model_config, + prefix=prefix, + ), prefix=f"{prefix}.layers", ) @@ -329,6 +343,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config @@ -343,7 +358,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.multi_query_group_num = config.multi_query_group_num self.kv_channels = config.kv_channels self.encoder = GLMTransformer( - config, cache_config, quant_config, prefix=f"{prefix}.encoder" + config, + cache_config, + quant_config, + model_config=model_config, + prefix=f"{prefix}.encoder", ) self.output_layer = ParallelLMHead( diff --git a/vllm/model_executor/models/cohere_asr.py b/vllm/model_executor/models/cohere_asr.py index 1cebea56a138..418af2693d19 100644 --- a/vllm/model_executor/models/cohere_asr.py +++ b/vllm/model_executor/models/cohere_asr.py @@ -96,6 +96,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() self.embed_dim = embed_dim @@ -148,6 +149,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.attn", attn_type=self.attn_type, + model_config=model_config, ) else: # AttentionType.DECODER (regular decoder self-attention) self.attn = Attention( @@ -159,6 +161,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.attn", attn_type=self.attn_type, + model_config=model_config, ) def _init_qkv( @@ -201,6 
+204,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__( embed_dim=embed_dim, @@ -210,6 +214,7 @@ def __init__( quant_config=quant_config, prefix=prefix, attn_type=AttentionType.ENCODER_DECODER, + model_config=model_config, ) def _init_qkv( @@ -347,6 +352,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.first_sub_layer", + model_config=vllm_config.model_config, ) # cross attn to attend to encoder @@ -357,6 +363,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.second_sub_layer", + model_config=vllm_config.model_config, ) self.layer_norm_3 = nn.LayerNorm(self.hidden_dim) diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index e73dfb1f01e3..2c885140c7a0 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -31,7 +31,7 @@ from transformers import Cohere2Config, CohereConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -131,6 +131,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() tp_size = get_tensor_model_parallel_world_size() @@ -198,6 +199,7 @@ def __init__( quant_config=quant_config, per_layer_sliding_window=self.sliding_window, prefix=f"{prefix}.attn", + model_config=model_config, ) if self.use_qk_norm: self.q_norm = LayerNorm( @@ -240,6 +242,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() self.hidden_size = config.hidden_size @@ -249,6 +252,7 @@ def __init__( cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + model_config=model_config, ) self.mlp = CohereMLP(config, quant_config=quant_config, prefix=f"{prefix}.mlp") @@ -284,6 +288,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.quant_config = quant_config self.config = config @@ -296,7 +301,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: CohereDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index ca6e6a49a98a..8c49efe16786 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -8,7 +8,7 @@ import torch.nn as nn from transformers import DbrxConfig -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, 
ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -187,6 +187,7 @@ class DbrxAttention(nn.Module): def __init__( self, config: DbrxConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -248,6 +249,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -272,6 +274,7 @@ class DbrxFusedNormAttention(nn.Module): def __init__( self, config: DbrxConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -279,7 +282,11 @@ def __init__( super().__init__() self.d_model = config.d_model self.attn = DbrxAttention( - config, cache_config, quant_config, prefix=f"{prefix}.attn" + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", ) self.norm_1 = nn.LayerNorm(self.d_model) self.norm_2 = nn.LayerNorm(self.d_model) @@ -305,13 +312,18 @@ class DbrxBlock(nn.Module): def __init__( self, config: DbrxConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", ): super().__init__() self.norm_attn_norm = DbrxFusedNormAttention( - config, cache_config, quant_config, prefix=f"{prefix}.norm_attn_norm" + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.norm_attn_norm", ) self.ffn = DbrxMoE(config, quant_config, prefix=f"{prefix}.ffn") @@ -342,9 +354,16 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.vocab_size, config.d_model, ) + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.blocks = make_layers( config.n_layers, - lambda prefix: DbrxBlock(config, cache_config, quant_config, prefix=prefix), + lambda prefix: DbrxBlock( + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + ), prefix=f"{prefix}.blocks", ) self.norm_f = nn.LayerNorm(config.d_model, eps=1e-5) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index f1c4a7b21993..267459938fa0 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -167,6 +167,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=vllm_config.model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -529,6 +530,7 @@ def __init__( self.qk_head_dim, self.scaling, num_kv_heads=self.num_local_heads, + model_config=vllm_config.model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index 4e393145462a..05a0acc58e57 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -203,6 +203,7 @@ def __init__( num_kv_heads: int, config: Dots1Config, max_position_embeddings: int = 8192, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -258,6 +259,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + 
model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -301,6 +303,7 @@ def __init__( num_kv_heads=config.num_key_value_heads, config=config, max_position_embeddings=max_position_embeddings, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index f038cfb21f28..23c96a92dc6d 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -33,7 +33,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( get_ep_group, get_pp_group, @@ -239,6 +239,7 @@ def __init__( max_position_embeddings: int = 131072, rms_norm_eps: float = 1e-05, qkv_bias: bool = False, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -298,6 +299,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -324,6 +326,7 @@ class Ernie4_5_MoeDecoderLayer(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -342,6 +345,7 @@ def __init__( max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "use_bias", False), + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", @@ -439,10 +443,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.embed_tokens = PPMissingLayer() + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Ernie4_5_MoeDecoderLayer( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index 418fdcfa072b..12d81dddaea4 100644 --- a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -32,7 +32,7 @@ from transformers import PretrainedConfig # from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention @@ -97,6 +97,7 @@ def __init__( max_position_embeddings: int = 131072, rms_norm_eps: float = 1e-05, qkv_bias: bool = False, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -164,6 +165,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -407,6 +409,7 @@ class 
Ernie4_5_VLMoeDecoderLayer(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -427,6 +430,7 @@ def __init__( max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "use_bias", False), + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", @@ -538,10 +542,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.embed_tokens = PPMissingLayer() + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Ernie4_5_VLMoeDecoderLayer( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index b633fd285082..2cbf7b70752e 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -33,7 +33,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -117,6 +117,7 @@ def __init__( bias: bool = False, cache_config: CacheConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -179,6 +180,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( @@ -206,6 +208,7 @@ def __init__( bias: bool = False, cache_config: CacheConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.attention = ExaoneAttention( @@ -218,6 +221,7 @@ def __init__( bias=bias, cache_config=cache_config, prefix=f"{prefix}.attention", + model_config=model_config, ) def forward( @@ -238,6 +242,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -259,6 +264,7 @@ def __init__( bias=attention_bias, cache_config=cache_config, prefix=f"{prefix}.attn", + model_config=model_config, ) self.mlp = ExaoneGatedMLP( hidden_size=self.hidden_size, @@ -318,6 +324,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) else: self.wte = PPMissingLayer() + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, lambda prefix: ExaoneDecoderLayer( @@ -325,6 +332,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=cache_config, quant_config=quant_config, prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.h", ) diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index 485b145b9cdf..13cbdc8045a2 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -29,7 +29,7 @@ from transformers import Exaone4Config from 
vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -117,6 +117,7 @@ def __init__( bias: bool = False, cache_config: CacheConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -191,6 +192,7 @@ def __init__( quant_config=quant_config, per_layer_sliding_window=self.sliding_window, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( @@ -222,6 +224,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -244,6 +247,7 @@ def __init__( bias=attention_bias, cache_config=cache_config, prefix=f"{prefix}.self_attn", + model_config=model_config, ) self.mlp = Exaone4GatedMLP( hidden_size=self.hidden_size, @@ -313,6 +317,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) else: self.embed_tokens = PPMissingLayer() + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Exaone4DecoderLayer( @@ -320,6 +325,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=cache_config, quant_config=quant_config, prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index efd24b51442a..b76e027f04fd 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -31,7 +31,7 @@ from transformers import FalconConfig as HF_FalconConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -97,6 +97,7 @@ def __init__( config: FalconConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -176,6 +177,7 @@ def __init__( self.inv_norm_factor, num_kv_heads=self.num_kv_heads, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) elif self.use_alibi: @@ -193,6 +195,7 @@ def __init__( num_kv_heads=self.num_kv_heads, alibi_slopes=alibi_slopes, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) else: @@ -203,6 +206,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -270,13 +274,18 @@ def __init__( config: FalconConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() hidden_size = config.hidden_size self.num_heads = config.num_attention_heads self.self_attention = FalconAttention( - config, cache_config, quant_config, prefix=f"{prefix}.self_attention" + config, + cache_config, + quant_config, + model_config=model_config, + 
prefix=f"{prefix}.self_attention", ) self.mlp = FalconMLP(config, quant_config, prefix=f"{prefix}.mlp") self.config = config @@ -370,6 +379,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.embed_dim = config.hidden_size @@ -386,7 +396,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, lambda prefix: FalconDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + model_config=model_config, + prefix=prefix, ), prefix=f"{prefix}.h", ) diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index fba2e216e3fa..97b65c5055d6 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -215,6 +215,7 @@ class FalconH1AttentionDecoderLayer(nn.Module): def __init__( self, config: FalconH1Config, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -280,6 +281,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -340,6 +342,7 @@ def __init__( # Instantiate the attention branch self.self_attn = FalconH1AttentionDecoderLayer( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 6e35020a6eac..237bc4af57e3 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -27,7 +27,7 @@ from transformers import GemmaConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import GeluAndMul @@ -133,6 +133,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -186,6 +187,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( @@ -208,6 +210,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -221,6 +224,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + model_config=model_config, ) self.mlp = GemmaMLP( hidden_size=self.hidden_size, @@ -266,6 +270,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config @@ -276,7 +281,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = 
""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: GemmaDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 425ecc65195a..dcea7d13b6b5 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -24,7 +24,7 @@ from transformers import Gemma2Config from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import GeluAndMul @@ -109,6 +109,7 @@ def __init__( quant_config: QuantizationConfig | None = None, attn_logits_soft_cap: float | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.config = config @@ -169,6 +170,7 @@ def __init__( logits_soft_cap=attn_logits_soft_cap, per_layer_sliding_window=sliding_window, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( @@ -191,6 +193,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -205,6 +208,7 @@ def __init__( quant_config=quant_config, attn_logits_soft_cap=config.attn_logit_softcapping, prefix=f"{prefix}.self_attn", + model_config=model_config, ) self.hidden_size = config.hidden_size self.mlp = Gemma2MLP( @@ -257,6 +261,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.quant_config = quant_config @@ -267,7 +272,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Gemma2DecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index b2352a3c9268..e9963a9d1a7b 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -23,7 +23,7 @@ from transformers import Gemma3TextConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import GeluAndMul @@ -116,6 +116,7 @@ def __init__( quant_config: QuantizationConfig | None = None, attn_logits_soft_cap: float | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.config = config @@ -207,6 +208,7 @@ def __init__( logits_soft_cap=attn_logits_soft_cap, per_layer_sliding_window=sliding_window, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( 
@@ -238,6 +240,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -252,6 +255,7 @@ def __init__( quant_config=quant_config, attn_logits_soft_cap=None, prefix=f"{prefix}.self_attn", + model_config=model_config, ) self.hidden_size = config.hidden_size self.mlp = Gemma3MLP( @@ -306,6 +310,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.quant_config = quant_config @@ -318,7 +323,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Gemma3DecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index 770424ba0fdf..ac4e3c21d542 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -22,7 +22,7 @@ from transformers.models.gemma3n.configuration_gemma3n import Gemma3nTextConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.forward_context import get_forward_context from vllm.logger import init_logger @@ -287,6 +287,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.config = config @@ -399,6 +400,7 @@ def __init__( per_layer_sliding_window=self.sliding_window, kv_sharing_target_layer_name=kv_sharing_target_layer_name, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( @@ -434,6 +436,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() assert isinstance(config, Gemma3nTextConfig) @@ -459,6 +462,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + model_config=model_config, ) self.mlp = Gemma3nMLP( hidden_size=config.hidden_size, @@ -812,10 +816,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) # Allocate config.num_kv_shared_layers layers for self-decoder + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Gemma3nDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index 89447927d5cd..e28699497557 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -30,7 +30,7 @@ from transformers import Glm4Config from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, 
VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm @@ -69,6 +69,7 @@ def __init__( qkv_bias: bool = False, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: @@ -130,6 +131,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", attn_type=attn_type, ) @@ -159,6 +161,7 @@ def __init__( config = config or vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.hidden_size = config.hidden_size @@ -172,6 +175,7 @@ def __init__( head_dim=getattr(config, "head_dim", None), cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.self_attn", attn_type=AttentionType.DECODER, ) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index d0e6cb6ada8b..82d2b1fcfb23 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -33,7 +33,7 @@ from transformers.models.glm4_moe import Glm4MoeConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( get_ep_group, get_pp_group, @@ -241,6 +241,7 @@ def __init__( use_qk_norm: bool = False, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -297,6 +298,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -331,6 +333,7 @@ def __init__( config: Glm4MoeConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", enable_eplb: bool = False, ) -> None: @@ -353,6 +356,7 @@ def __init__( qkv_bias=config.attention_bias, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.self_attn", use_qk_norm=config.use_qk_norm, ) @@ -414,6 +418,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config enable_eplb = vllm_config.parallel_config.enable_eplb self.config = config @@ -432,6 +437,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config=config, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=prefix, enable_eplb=enable_eplb, ), diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 41a4ca174257..d45150290e4c 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -28,7 +28,7 @@ from transformers import GPT2Config from vllm.compilation.decorators import support_torch_compile -from vllm.config import 
CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed.parallel_state import ( get_pp_group, get_tensor_model_parallel_world_size, @@ -66,6 +66,7 @@ def __init__( config: GPT2Config, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -98,6 +99,7 @@ def __init__( scale=self.scale, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -151,6 +153,7 @@ def __init__( config: GPT2Config, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -159,7 +162,11 @@ def __init__( self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) self.attn = GPT2Attention( - config, cache_config, quant_config, prefix=f"{prefix}.attn" + config, + cache_config, + quant_config, + model_config=model_config, + prefix=f"{prefix}.attn", ) self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) self.mlp = GPT2MLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp") @@ -190,6 +197,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config assert not config.add_cross_attention @@ -205,7 +213,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, - lambda prefix: GPT2Block(config, cache_config, quant_config, prefix=prefix), + lambda prefix: GPT2Block( + config, + cache_config, + quant_config, + model_config=model_config, + prefix=prefix, + ), prefix=f"{prefix}.h", ) self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index c6629c937dc6..80aedd5558eb 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -29,7 +29,7 @@ from transformers import GPTBigCodeConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.attention import Attention @@ -63,6 +63,7 @@ def __init__( config: GPTBigCodeConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -106,6 +107,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -166,6 +168,7 @@ def __init__( config: GPTBigCodeConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -174,7 +177,11 @@ def __init__( self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) self.attn = GPTBigCodeAttention( - config, cache_config, 
quant_config, prefix=f"{prefix}.attn" + config, + cache_config, + quant_config, + model_config=model_config, + prefix=f"{prefix}.attn", ) self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) self.mlp = GPTBigMLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp") @@ -207,6 +214,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config assert not config.add_cross_attention @@ -221,7 +229,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, lambda prefix: GPTBigCodeBlock( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + model_config=model_config, + prefix=prefix, ), prefix=f"{prefix}.h", ) diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index c29103c6d52c..fe4a02adef5a 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -27,7 +27,7 @@ from transformers import GPTJConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.attention import Attention @@ -65,6 +65,7 @@ def __init__( config: GPTJConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -110,6 +111,7 @@ def __init__( scaling, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -163,13 +165,18 @@ def __init__( config: GPTJConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() inner_dim = 4 * config.n_embd if config.n_inner is None else config.n_inner self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) self.attn = GPTJAttention( - config, cache_config, quant_config, prefix=f"{prefix}.attn" + config, + cache_config, + quant_config, + model_config=model_config, + prefix=f"{prefix}.attn", ) self.mlp = GPTJMLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp") @@ -197,6 +204,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.quant_config = quant_config @@ -207,7 +215,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.h = make_layers( config.n_layer, - lambda prefix: GPTJBlock(config, cache_config, quant_config, prefix=prefix), + lambda prefix: GPTJBlock( + config, + cache_config, + quant_config, + model_config=model_config, + prefix=prefix, + ), prefix=f"{prefix}.h", ) self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 8d44d12fc212..b7cf3a431b2b 100644 --- 
a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -27,7 +27,7 @@ from transformers import GPTNeoXConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.attention import Attention @@ -62,6 +62,7 @@ def __init__( config: GPTNeoXConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -102,6 +103,7 @@ def __init__( scaling, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -153,6 +155,7 @@ def __init__( config: GPTNeoXConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -164,7 +167,11 @@ def __init__( config.hidden_size, eps=config.layer_norm_eps ) self.attention = GPTNeoXAttention( - config, cache_config, quant_config, prefix=f"{prefix}.attention" + config, + cache_config, + quant_config, + model_config=model_config, + prefix=f"{prefix}.attention", ) self.mlp = GPTNeoXMLP(config, quant_config, prefix=f"{prefix}.mlp") @@ -204,6 +211,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config @@ -214,7 +222,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: GPTNeoXLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + model_config=model_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 482056250a1e..922f44d46ec6 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -9,7 +9,7 @@ from transformers import GptOssConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_dp_group, get_ep_group, @@ -70,6 +70,7 @@ def __init__( config: GptOssConfig, quant_config: QuantizationConfig | None = None, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -137,6 +138,7 @@ def __init__( num_kv_heads=self.num_local_key_value_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, per_layer_sliding_window=sliding_window, attn_type=AttentionType.DECODER, prefix=f"{prefix}.attn", @@ -226,6 +228,7 @@ def __init__( config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config + model_config = vllm_config.model_config self.layer_idx = extract_layer_index(prefix) self.attn = OAIAttention( @@ -233,6 +236,7 @@ def __init__( prefix=f"{prefix}.attn", quant_config=quant_config, cache_config=cache_config, + model_config=model_config, ) self.mlp = MLPBlock(vllm_config, self.layer_idx, 
prefix=f"{prefix}.mlp") self.input_layernorm = RMSNorm(config.hidden_size, eps=1e-5) diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 4b486ede4439..dd18403ad03e 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -32,7 +32,7 @@ from transformers import GraniteConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -113,6 +113,7 @@ def __init__( max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, prefix: str = "", ) -> None: @@ -168,6 +169,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -190,6 +192,7 @@ class GraniteDecoderLayer(nn.Module): def __init__( self, config: GraniteConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -213,6 +216,7 @@ def __init__( max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, + model_config=model_config, cache_config=cache_config, prefix=f"{prefix}.self_attn", ) @@ -273,10 +277,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) else: self.embed_tokens = PPMissingLayer() + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: GraniteDecoderLayer( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 171b2e0ec5a0..3aa333302823 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -32,7 +32,7 @@ from torch import nn from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_world_size, @@ -142,6 +142,7 @@ def __init__( num_kv_heads: int, max_position: int = 4096 * 32, rope_parameters: dict[str, Any] | None = None, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, attention_multiplier: float | None = None, @@ -199,6 +200,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -226,6 +228,7 @@ def __init__( super().__init__() config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config parallel_config = vllm_config.parallel_config @@ -237,6 +240,7 @@ def __init__( max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, rope_parameters=config.rope_parameters, + 
model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 1ab069e3ba38..5c38d0bceec3 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -288,6 +288,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 0bd6a8f3d606..6e3aa634ff72 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -33,7 +33,7 @@ from torch import nn from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import GeluAndMul @@ -240,6 +240,7 @@ def __init__( num_kv_heads: int, max_position: int = 4096 * 32, rope_parameters: dict[str, Any] | None = None, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -306,6 +307,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, logits_soft_cap=attn_logits_soft_cap, @@ -333,6 +335,7 @@ class Grok1DecoderLayer(nn.Module): def __init__( self, config, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -352,6 +355,7 @@ def __init__( max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, rope_parameters=_get_rope_parameters(config), + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -470,10 +474,15 @@ def __init__( quant_config=quant_config, ) + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Grok1DecoderLayer( - config, cache_config, quant_config=quant_config, prefix=prefix + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index a0130402c66f..2925b686df16 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -34,7 +34,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( get_ep_group, get_pp_group, @@ -150,6 +150,7 @@ def __init__( num_heads: int, num_kv_heads: int, max_position_embeddings: int = 8192, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, bias: bool = False, cache_config: CacheConfig | None = None, @@ -215,6 +216,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, 
quant_config=quant_config, prefix=f"{prefix}.attn", @@ -257,6 +259,7 @@ def __init__( num_heads: int, num_kv_heads: int, max_position_embeddings: int = 8192, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, bias: bool = False, cache_config: CacheConfig | None = None, @@ -320,6 +323,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -477,6 +481,7 @@ class HunYuanDecoderLayer(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -511,6 +516,7 @@ def __init__( config, "num_key_value_heads", config.num_attention_heads ), max_position_embeddings=max_position_embeddings, + model_config=model_config, quant_config=quant_config, bias=attention_bias, cache_config=cache_config, @@ -526,6 +532,7 @@ def __init__( config, "num_key_value_heads", config.num_attention_heads ), max_position_embeddings=max_position_embeddings, + model_config=model_config, quant_config=quant_config, bias=attention_bias, cache_config=cache_config, @@ -620,10 +627,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) else: self.embed_tokens = PPMissingLayer() + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: HunYuanDecoderLayer( config=config, + model_config=model_config, layer_id=int(prefix.split(".")[-1]), cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/hyperclovax.py b/vllm/model_executor/models/hyperclovax.py index 3176c4284139..9769be81f082 100644 --- a/vllm/model_executor/models/hyperclovax.py +++ b/vllm/model_executor/models/hyperclovax.py @@ -33,7 +33,7 @@ from torch import nn from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -119,6 +119,7 @@ def __init__( num_heads: int, num_kv_heads: int, max_position_embeddings: int = 8192, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, bias: bool = False, cache_config: CacheConfig | None = None, @@ -179,6 +180,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -206,6 +208,7 @@ def __init__( super().__init__() config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config @@ -231,6 +234,7 @@ def __init__( config, "num_key_value_heads", config.num_attention_heads ), max_position_embeddings=max_position_embeddings, + model_config=model_config, quant_config=quant_config, bias=attention_bias, cache_config=cache_config, diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index c00b9a0ee671..f4593ac55a6b 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -11,7 +11,7 @@ from transformers import 
PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -95,6 +95,7 @@ def __init__( num_kv_heads: int, rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -150,6 +151,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -196,6 +198,7 @@ class InternLMDecoderLayer(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -209,6 +212,7 @@ def __init__( num_kv_heads=config.num_key_value_heads, rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attention", @@ -258,6 +262,7 @@ def __init__( super().__init__() config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config @@ -270,7 +275,11 @@ def __init__( self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: layer_type( - config, cache_config, quant_config, prefix=prefix + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/interns1_pro.py b/vllm/model_executor/models/interns1_pro.py index 28331b8ef3e8..54e9d277d344 100644 --- a/vllm/model_executor/models/interns1_pro.py +++ b/vllm/model_executor/models/interns1_pro.py @@ -32,7 +32,7 @@ from torch import nn from transformers import AutoProcessor, PretrainedConfig -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_ep_group, get_tensor_model_parallel_world_size, @@ -272,6 +272,7 @@ def __init__( head_dim: int | None = None, rms_norm_eps: float = 1e-06, qkv_bias: bool = False, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -331,6 +332,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -371,6 +373,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() config = vllm_config.model_config.hf_text_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config @@ -403,6 +406,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), head_dim=getattr(config, "head_dim", None), + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/iquest_loopcoder.py 
b/vllm/model_executor/models/iquest_loopcoder.py index 24c004ff4c20..54b3e9dc3ecd 100644 --- a/vllm/model_executor/models/iquest_loopcoder.py +++ b/vllm/model_executor/models/iquest_loopcoder.py @@ -25,7 +25,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm @@ -47,6 +47,7 @@ ) from vllm.model_executor.models.llama import LlamaMLP from vllm.sequence import IntermediateTensors +from vllm.utils.torch_utils import TORCH_DTYPE_TO_KV_CACHE_STR from vllm.v1.attention.backend import AttentionType from .utils import ( @@ -66,6 +67,7 @@ def __init__( num_kv_heads: int, max_position: int = 4096 * 32, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, @@ -147,9 +149,12 @@ def __init__( sliding_window=self.loop_window_size, ) else: + assert model_config is not None, ( + "model_config is required when cache_config is not provided" + ) loop_cache_config = CacheConfig( sliding_window=self.loop_window_size, - cache_dtype="auto", + cache_dtype=TORCH_DTYPE_TO_KV_CACHE_STR[model_config.dtype], ) self.attn.append( diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index cc0c1aa01baf..1c8c9e847b7a 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -29,7 +29,7 @@ from torch import nn from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -88,6 +88,7 @@ def __init__( config: JAISConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -133,6 +134,7 @@ def __init__( alibi_slopes=alibi_slopes, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -205,6 +207,7 @@ def __init__( config: JAISConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -213,7 +216,11 @@ def __init__( self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) self.attn = JAISAttention( - config, cache_config, quant_config, prefix=f"{prefix}.attn" + config, + cache_config, + quant_config, + model_config=model_config, + prefix=f"{prefix}.attn", ) self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) self.mlp = JAISMLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp") @@ -246,6 +253,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config assert not config.scale_attn_by_inverse_layer_idx @@ -268,6 +276,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config=config, cache_config=cache_config, quant_config=quant_config, + 
model_config=model_config, prefix=prefix, ), prefix=f"{prefix}.h", diff --git a/vllm/model_executor/models/jais2.py b/vllm/model_executor/models/jais2.py index 4e03eb12ee44..3fc207225eba 100644 --- a/vllm/model_executor/models/jais2.py +++ b/vllm/model_executor/models/jais2.py @@ -32,7 +32,7 @@ from transformers import Jais2Config from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_world_size, @@ -114,6 +114,7 @@ def __init__( quant_config: QuantizationConfig | None = None, bias: bool = False, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -192,6 +193,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, per_layer_sliding_window=sliding_window, prefix=f"{prefix}.attn", ) @@ -221,6 +223,7 @@ def __init__( config = config or vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = self.get_quant_config(vllm_config) + model_config = vllm_config.model_config self.hidden_size = config.hidden_size max_position_embeddings = getattr(config, "max_position_embeddings", 8192) @@ -240,6 +243,7 @@ def __init__( quant_config=quant_config, bias=attention_bias, cache_config=cache_config, + model_config=model_config, prefix=f"{prefix}.self_attn", ) self.mlp = Jais2MLP( diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 980bcffb5f9b..c1e088b026a8 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -239,6 +239,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, prefix=f"{prefix}.attn", + model_config=model_config, ) num_experts = config.layers_num_experts[layer_idx] diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py index 95a8cdb8711d..40e1b70923f2 100644 --- a/vllm/model_executor/models/lfm2.py +++ b/vllm/model_executor/models/lfm2.py @@ -101,6 +101,7 @@ def __init__( num_heads: int, num_kv_heads: int, max_position_embeddings: int = 8192, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -157,6 +158,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, prefix=f"{prefix}.attn", ) @@ -207,6 +209,7 @@ def __init__( num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, max_position_embeddings=max_position_embeddings, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py index d955b7127adc..562acb564708 100644 --- a/vllm/model_executor/models/lfm2_moe.py +++ b/vllm/model_executor/models/lfm2_moe.py @@ -192,6 +192,7 @@ def __init__( num_heads: int, num_kv_heads: int, max_position_embeddings: int = 8192, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -248,6 +249,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, prefix=f"{prefix}.attn", ) @@ 
-299,6 +301,7 @@ def __init__( num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, max_position_embeddings=max_position_embeddings, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2ecced3df8ba..3cc29cf1e9dc 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -32,7 +32,7 @@ from transformers import LlamaConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import ( @@ -133,6 +133,7 @@ def __init__( bias: bool = False, bias_o_proj: bool = False, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: @@ -214,6 +215,7 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, + model_config=model_config, quant_config=quant_config, per_layer_sliding_window=sliding_window, attn_type=attn_type, @@ -260,7 +262,8 @@ def __init__( ) -> None: super().__init__() - config = config or vllm_config.model_config.hf_config + model_config = vllm_config.model_config + config = config or model_config.hf_config cache_config = vllm_config.cache_config quant_config = self.get_quant_config(vllm_config) @@ -297,6 +300,7 @@ def __init__( bias=attention_bias, bias_o_proj=bias_o_proj, cache_config=cache_config, + model_config=model_config, prefix=f"{prefix}.self_attn", attn_type=attn_type, ) diff --git a/vllm/model_executor/models/mimo_v2_flash.py b/vllm/model_executor/models/mimo_v2_flash.py index 43475ed690c9..49a825335881 100644 --- a/vllm/model_executor/models/mimo_v2_flash.py +++ b/vllm/model_executor/models/mimo_v2_flash.py @@ -8,6 +8,7 @@ from vllm.config import ( CacheConfig, + ModelConfig, VllmConfig, get_current_vllm_config, str_dtype_to_torch_dtype, @@ -221,6 +222,7 @@ def __init__( max_position_embeddings: int = 32768, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, partial_rotary_factor: float = 1.0, prefix: str = "", ) -> None: @@ -292,6 +294,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, per_layer_sliding_window=sliding_window, attn_type=AttentionType.DECODER, prefix=f"{prefix}.attn", @@ -328,7 +331,8 @@ def forward( class MiMoV2FlashDecoderLayer(nn.Module): def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() - config = vllm_config.model_config.hf_text_config + model_config = vllm_config.model_config + config = model_config.hf_text_config quant_config = vllm_config.quant_config layer_id = extract_layer_index(prefix) @@ -358,6 +362,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: rope_theta=getattr(config, "swa_rope_theta", rope_theta), max_position_embeddings=max_position_embeddings, quant_config=quant_config, + model_config=model_config, partial_rotary_factor=getattr(config, "partial_rotary_factor", 1.0), prefix=f"{prefix}.self_attn", ) @@ -375,6 +380,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") 
-> None: rope_theta=rope_theta, max_position_embeddings=max_position_embeddings, quant_config=quant_config, + model_config=model_config, partial_rotary_factor=getattr(config, "partial_rotary_factor", 1.0), prefix=f"{prefix}.self_attn", ) diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 54870eb2ede4..a9fda85b9b52 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -34,7 +34,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -243,6 +243,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -297,6 +298,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( @@ -319,11 +321,13 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.config = config self.cache_config = cache_config self.quant_config = quant_config + self.model_config = model_config self.hidden_size = config.hidden_size self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.prefix = prefix @@ -343,6 +347,7 @@ def _init_attn_block(self): cache_config=self.cache_config, quant_config=self.quant_config, prefix=f"{self.prefix}.self_attn", + model_config=self.model_config, ) def _init_ffn_block(self): @@ -416,7 +421,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.hidden_size, ) self.num_experts = getattr(self.config, "num_experts", 0) - self._init_layers(prefix, config, cache_config, quant_config) + model_config = vllm_config.model_config + self._init_layers( + prefix, config, cache_config, quant_config, model_config=model_config + ) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( @@ -429,11 +437,16 @@ def _init_layers( config: PretrainedConfig, cache_config: CacheConfig | None, quant_config: QuantizationConfig | None, + model_config: ModelConfig | None = None, ): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: MiniCPMDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index e61e9d06103d..4a9068bb0fd7 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -29,7 +29,7 @@ from torch import nn from transformers import PretrainedConfig -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm @@ -64,6 +64,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: 
QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -131,6 +132,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( @@ -201,6 +203,7 @@ def _init_attn_block(self): cache_config=self.cache_config, quant_config=self.quant_config, prefix=f"{self.prefix}.self_attn", + model_config=self.model_config, ) @@ -211,11 +214,16 @@ def _init_layers( config: PretrainedConfig, cache_config: CacheConfig | None, quant_config: QuantizationConfig | None, + model_config: ModelConfig | None = None, ): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: MiniCPM3DecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py index 0f43bc0cdcec..5d5d13b516ae 100644 --- a/vllm/model_executor/models/minimax_m2.py +++ b/vllm/model_executor/models/minimax_m2.py @@ -153,6 +153,7 @@ def __init__( head_dim: int | None = None, rms_norm_eps: float = 1e-06, qkv_bias: bool = False, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -213,6 +214,7 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, per_layer_sliding_window=attn_window_size, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -272,6 +274,7 @@ def __init__( rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), head_dim=getattr(config, "head_dim", None), + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 21d74d8b0580..505e4d39aabd 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -197,6 +197,7 @@ def __init__( max_position: int = 4096 * 32, rope_parameters: dict | None = None, sliding_window: int | None = None, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, layer_idx: int = None, cache_config: CacheConfig | None = None, @@ -245,6 +246,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -332,6 +334,7 @@ def __init__( max_position=max_position_embeddings, rope_parameters=config.rope_parameters, sliding_window=config.sliding_window, + model_config=model_config, quant_config=quant_config, layer_idx=self._ilayer, cache_config=cache_config, diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 376fd7a1709d..3b3b92345675 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -33,7 +33,7 @@ from transformers import MixtralConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( get_ep_group, get_pp_group, @@ -163,6 
+163,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -218,6 +219,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( @@ -241,6 +243,7 @@ def __init__( quant_config: QuantizationConfig | None = None, prefix: str = "", enable_eplb: bool = False, + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -253,6 +256,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + model_config=model_config, ) self.block_sparse_moe = MixtralMoE( num_experts=config.num_local_experts, @@ -316,6 +320,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.enable_eplb = parallel_config.enable_eplb self.num_redundant_experts = parallel_config.eplb_config.num_redundant_experts + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: MixtralDecoderLayer( @@ -324,6 +329,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config=quant_config, prefix=prefix, enable_eplb=self.enable_eplb, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index a29b1a9fbfbb..f82c655c8d9b 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -63,7 +63,11 @@ def forward( class ModernBertAttention(nn.Module): def __init__( - self, config: ModernBertConfig, layer_id: int | None = None, prefix: str = "" + self, + config: ModernBertConfig, + layer_id: int | None = None, + prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() self.config = config @@ -115,6 +119,7 @@ def __init__( self.num_heads, self.head_dim, self.scaling, + model_config=model_config, prefix=f"{layer_id}.attn", per_layer_sliding_window=sliding_window, ) @@ -161,7 +166,11 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class ModernBertLayer(nn.Module): def __init__( - self, config: ModernBertConfig, prefix: str = "", layer_id: int | None = None + self, + config: ModernBertConfig, + prefix: str = "", + layer_id: int | None = None, + model_config: ModelConfig | None = None, ): super().__init__() self.config = config @@ -172,7 +181,10 @@ def __init__( config.hidden_size, eps=config.norm_eps, bias=config.norm_bias ) self.attn = ModernBertAttention( - config=config, layer_id=layer_id, prefix=f"{prefix}.attn" + config=config, + layer_id=layer_id, + prefix=f"{prefix}.attn", + model_config=model_config, ) self.mlp_norm = nn.LayerNorm( config.hidden_size, eps=config.norm_eps, bias=config.norm_bias @@ -203,6 +215,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = ""): config=config, layer_id=layer_id, prefix=f"{prefix}.layers.{layer_id}", + model_config=vllm_config.model_config, ) for layer_id in range(config.num_hidden_layers) ] diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 1d756a2addeb..e4b9891ff8fe 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -20,7 +20,7 @@ ) from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig 
+from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import ( get_pp_group, @@ -410,6 +410,7 @@ class MolmoAttention(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -468,6 +469,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -600,6 +602,7 @@ class MolmoDecoderLayer(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -607,7 +610,11 @@ def __init__( super().__init__() # Attention block. self.self_attn = MolmoAttention( - config, cache_config, quant_config, prefix=f"{prefix}.self_attn" + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", ) # MLP block. @@ -853,13 +860,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config=quant_config, ) + model_config = vllm_config.model_config decoder_layer = ( MolmoDecoderNormAfterLayer if config.norm_after else MolmoDecoderLayer ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: decoder_layer( - config, cache_config, quant_config, prefix=prefix + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/molmo2.py b/vllm/model_executor/models/molmo2.py index aa58fa6d1583..d500e98a0432 100644 --- a/vllm/model_executor/models/molmo2.py +++ b/vllm/model_executor/models/molmo2.py @@ -24,7 +24,7 @@ from transformers.video_utils import VideoMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions from vllm.distributed import ( get_pp_group, @@ -885,6 +885,7 @@ def __init__( self, config: TextConfig, rope_parameters: dict[str, Any], + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -959,6 +960,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -1068,6 +1070,7 @@ def __init__( self, config: TextConfig, rope_parameters: dict[str, Any], + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -1077,8 +1080,9 @@ def __init__( self.self_attn = Molmo2Attention( config, rope_parameters, - cache_config, - quant_config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, prefix=f"{prefix}.self_attn", ) @@ -1182,11 +1186,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if text_config.norm_after else Molmo2DecoderLayer ) + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( text_config.num_hidden_layers, lambda prefix: 
decoder_layer( text_config, hf_text_config.rope_parameters, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 85933626cd30..b2b72672c88e 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -11,7 +11,7 @@ from transformers import MptConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -59,6 +59,7 @@ def __init__( config: MptConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -128,6 +129,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -189,13 +191,18 @@ def __init__( config: MptConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() hidden_size = config.d_model self.norm_1 = nn.LayerNorm(hidden_size) self.attn = MPTAttention( - config, cache_config, quant_config, prefix=f"{prefix}.attn" + config, + cache_config, + quant_config, + model_config=model_config, + prefix=f"{prefix}.attn", ) self.norm_2 = nn.LayerNorm(hidden_size) self.ffn = MPTMLP(config, quant_config, prefix=f"{prefix}.ffn") @@ -225,6 +232,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config assert config.embedding_fraction == 1.0 assert config.norm_type == "low_precision_layernorm" @@ -235,7 +243,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.blocks = make_layers( config.n_layers, - lambda prefix: MPTBlock(config, cache_config, quant_config, prefix=prefix), + lambda prefix: MPTBlock( + config, + cache_config, + quant_config, + model_config=model_config, + prefix=prefix, + ), prefix=f"{prefix}.blocks", ) self.norm_f = nn.LayerNorm(config.d_model) diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 15d43a9ddf98..b66086c98fde 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -31,7 +31,7 @@ from torch import nn from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.attention import Attention @@ -152,6 +152,7 @@ def __init__( max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, prefix: str = "", ) -> None: @@ -207,6 +208,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -229,6 +231,7 @@ class 
NemotronDecoderLayer(nn.Module): def __init__( self, config: NemotronConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -251,6 +254,7 @@ def __init__( max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, + model_config=model_config, cache_config=cache_config, prefix=f"{prefix}.self_attn", ) @@ -315,10 +319,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) else: self.embed_tokens = PPMissingLayer() + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: NemotronDecoderLayer( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index 4ec794eccf72..3b6a3d4cae3c 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -490,6 +490,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.attn", per_layer_sliding_window=sliding_window, + model_config=model_config, ) def forward( diff --git a/vllm/model_executor/models/nemotron_parse.py b/vllm/model_executor/models/nemotron_parse.py index ae417f095eb4..cc5349438b98 100644 --- a/vllm/model_executor/models/nemotron_parse.py +++ b/vllm/model_executor/models/nemotron_parse.py @@ -20,7 +20,7 @@ PretrainedConfig, ) -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.config.lora import LoRAConfig from vllm.config.multimodal import BaseDummyOptions from vllm.inputs import MultiModalDataDict @@ -102,6 +102,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() self.embed_dim = config.d_model @@ -113,6 +114,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + model_config=model_config, ) self.activation_fn = get_act_fn(config.activation_function) @@ -128,6 +130,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.encoder_attn", + model_config=model_config, ) self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) @@ -252,6 +255,7 @@ def __init__( lora_config: LoRAConfig | None = None, embed_tokens: nn.Embedding | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() self.cache_config = cache_config @@ -273,6 +277,7 @@ def __init__( cache_config, quant_config, prefix=f"{prefix}.layers.{layer_idx}", + model_config=model_config, ) for layer_idx in range(config.decoder_layers) ] @@ -596,6 +601,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.decoder", + model_config=vllm_config.model_config, ) self.vocab_size = config.decoder.vocab_size diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 4491a6a3ea1b..09a2fb770728 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -32,7 +32,7 @@ from transformers import OlmoConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from 
vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -74,6 +74,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() self.config = config @@ -113,6 +114,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) # Attention output projection. @@ -201,11 +203,16 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() # Attention block. self.self_attn = OlmoAttention( - config, cache_config, quant_config, prefix=f"{prefix}.self_attn" + config, + cache_config, + quant_config, + prefix=f"{prefix}.self_attn", + model_config=model_config, ) # MLP block. @@ -246,6 +253,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config @@ -255,7 +263,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: OlmoDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 212140fe15ea..19c9c4f5d055 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -33,7 +33,7 @@ from transformers import Olmo2Config, Olmo3Config from vllm.compilation.decorators import support_torch_compile -from vllm.config import VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.distributed.communication_op import tensor_model_parallel_all_gather from vllm.distributed.parallel_state import get_tensor_model_parallel_rank @@ -72,10 +72,18 @@ class Olmo2Attention(nn.Module): (plus another skip connection). """ - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + def __init__( + self, + *, + vllm_config: VllmConfig, + model_config: ModelConfig | None = None, + prefix: str = "", + ): super().__init__() self.config = vllm_config.model_config.hf_config assert isinstance(self.config, (Olmo2Config, Olmo3Config)) + if model_config is None: + model_config = vllm_config.model_config hidden_size = self.config.hidden_size self.tp_size = get_tensor_model_parallel_world_size() @@ -135,6 +143,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config=vllm_config.quant_config, per_layer_sliding_window=sliding_window, prefix=f"{prefix}.attn", + model_config=model_config, ) # Rotary embeddings. Rope scaling is only applied on full attention layers. @@ -242,9 +251,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config assert isinstance(config, (Olmo2Config, Olmo3Config)) + model_config = vllm_config.model_config # Attention block. 
self.self_attn = Olmo2Attention( - vllm_config=vllm_config, prefix=f"{prefix}.self_attn" + vllm_config=vllm_config, + model_config=model_config, + prefix=f"{prefix}.self_attn", ) # MLP block. diff --git a/vllm/model_executor/models/olmo_hybrid.py b/vllm/model_executor/models/olmo_hybrid.py index 97e56b3ff6f9..594f9cb2502b 100644 --- a/vllm/model_executor/models/olmo_hybrid.py +++ b/vllm/model_executor/models/olmo_hybrid.py @@ -596,9 +596,17 @@ def _forward_core( class OlmoHybridAttention(nn.Module): - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + def __init__( + self, + *, + vllm_config: VllmConfig, + model_config: ModelConfig | None = None, + prefix: str = "", + ): super().__init__() self.config = vllm_config.model_config.hf_config + if model_config is None: + model_config = vllm_config.model_config hidden_size = self.config.hidden_size self.tp_size = get_tensor_model_parallel_world_size() @@ -653,6 +661,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=vllm_config.cache_config, quant_config=vllm_config.quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) rope_parameters = getattr(self.config, "rope_parameters", None) @@ -771,6 +780,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: else: self.self_attn = OlmoHybridAttention( vllm_config=vllm_config, + model_config=model_config, prefix=f"{prefix}.self_attn", ) # Attention layers use these norm names diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index f0afe0e997cc..6c78734fc96b 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -22,7 +22,7 @@ from torch import nn from vllm.compilation.decorators import support_torch_compile -from vllm.config import VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -119,12 +119,20 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class OlmoeAttention(nn.Module): - def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + def __init__( + self, + *, + vllm_config: VllmConfig, + model_config: ModelConfig | None = None, + prefix: str = "", + ) -> None: super().__init__() config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + if model_config is None: + model_config = vllm_config.model_config self.hidden_size = config.hidden_size max_position_embeddings = getattr(config, "max_position_embeddings", 4096) @@ -187,6 +195,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) def _apply_qk_norm( @@ -225,8 +234,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.hidden_size = config.hidden_size + model_config = vllm_config.model_config self.self_attn = OlmoeAttention( vllm_config=vllm_config, + model_config=model_config, prefix=f"{prefix}.self_attn", ) diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index 994ae82529ab..6f728d9e4ecf 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -30,7 +30,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, ParallelConfig, VllmConfig +from vllm.config 
import CacheConfig, ModelConfig, ParallelConfig, VllmConfig from vllm.distributed import ( get_ep_group, get_pp_group, @@ -422,6 +422,7 @@ def __init__( num_heads: int, num_kv_heads: int, max_position_embeddings: int = 8192, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, bias: bool = False, bias_o_proj: bool = False, @@ -508,6 +509,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, per_layer_sliding_window=sliding_window, @@ -560,6 +562,7 @@ def __init__( num_kv_heads: int, rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, bias: bool = False, bias_o_proj: bool = False, @@ -662,6 +665,7 @@ def __init__( self.scaling, sink_len=self.param_sink_number, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, per_layer_sliding_window=sliding_window, @@ -820,6 +824,7 @@ def __init__( if config is None: config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config parallel_config = vllm_config.parallel_config @@ -885,6 +890,7 @@ def __init__( ), rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, + model_config=model_config, quant_config=quant_config, bias=attention_bias, bias_o_proj=bias_o_proj, @@ -915,6 +921,7 @@ def __init__( config, "num_key_value_heads", config.num_attention_heads ), max_position_embeddings=max_position_embeddings, + model_config=model_config, quant_config=quant_config, bias=attention_bias, bias_o_proj=bias_o_proj, diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 81653b9516ac..e5d8cf22f566 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -28,7 +28,7 @@ from transformers import OPTConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.attention import Attention @@ -78,6 +78,7 @@ def __init__( bias: bool = True, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -110,6 +111,7 @@ def __init__( scale=self.scaling, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -130,6 +132,7 @@ def __init__( config: OPTConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -141,6 +144,7 @@ def __init__( bias=config.enable_bias, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.self_attn", ) self.do_layer_norm_before = config.do_layer_norm_before @@ -203,6 +207,7 @@ def __init__( config: OPTConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -257,7 +262,11 @@ def 
__init__( self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: OPTDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + model_config=model_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) @@ -303,9 +312,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.decoder = OPTDecoder( - config, cache_config, quant_config, prefix=f"{prefix}.decoder" + config, + cache_config, + quant_config, + model_config=model_config, + prefix=f"{prefix}.decoder", ) self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( ["hidden_states"], config.hidden_size diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 3cacb9d61cd5..268be2a77032 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -16,7 +16,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -93,6 +93,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -146,6 +147,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( @@ -168,6 +170,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -181,6 +184,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + model_config=model_config, ) self.mlp = OrionMLP( hidden_size=self.hidden_size, @@ -226,6 +230,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.vocab_size = config.vocab_size @@ -236,7 +241,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: OrionDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py index 56505ec7be20..26c638d0f047 100644 --- a/vllm/model_executor/models/ouro.py +++ b/vllm/model_executor/models/ouro.py @@ -34,7 +34,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import 
get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -113,6 +113,7 @@ def __init__( num_heads: int, num_kv_heads: int, max_position: int = 4096 * 32, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -185,6 +186,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, attn_type=attn_type, @@ -216,6 +218,7 @@ class OuroDecoderLayer(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -237,6 +240,7 @@ def __init__( num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", @@ -332,11 +336,13 @@ def __init__( ) # Use the provided decoder layer type or default to OuroDecoderLayer + model_config = vllm_config.model_config decoder_layer_type = decoder_layer_type or OuroDecoderLayer self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: decoder_layer_type( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index a03a785577ee..3d87b9a986b5 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -31,7 +31,7 @@ from transformers import PersimmonConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.attention import Attention @@ -95,6 +95,7 @@ def __init__( config: PersimmonConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -144,6 +145,7 @@ def __init__( scale=self.scaling, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -189,6 +191,7 @@ def __init__( config: PersimmonConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -197,6 +200,7 @@ def __init__( config=config, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.self_attn", ) self.mlp = PersimmonMLP( @@ -246,6 +250,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.vocab_size = config.vocab_size self.config = config @@ -255,7 +260,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: 
PersimmonDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + model_config=model_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 75c42c0d3930..79340e9346e7 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -46,7 +46,7 @@ from transformers import PhiConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.attention import Attention @@ -81,6 +81,7 @@ def __init__( config: PhiConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -121,6 +122,7 @@ def __init__( scaling, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -176,6 +178,7 @@ def __init__( config: PhiConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -183,7 +186,11 @@ def __init__( config.hidden_size, eps=config.layer_norm_eps ) self.self_attn = PhiAttention( - config, cache_config, quant_config, prefix=f"{prefix}.self_attn" + config, + cache_config, + quant_config, + model_config=model_config, + prefix=f"{prefix}.self_attn", ) self.mlp = PhiMLP(config, quant_config, prefix=f"{prefix}.mlp") @@ -211,6 +218,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.quant_config = quant_config @@ -219,7 +227,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: PhiLayer(config, cache_config, quant_config, prefix=prefix), + lambda prefix: PhiLayer( + config, + cache_config, + quant_config, + model_config=model_config, + prefix=prefix, + ), prefix=f"{prefix}.layers", ) self.final_layernorm = nn.LayerNorm( diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 0b55b7ec8392..bbc7bcddd2b3 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -32,7 +32,7 @@ from transformers.configuration_utils import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE @@ -310,6 +310,7 @@ def __init__( max_position: int = 4096 * 32, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -364,6 +365,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + 
model_config=model_config, prefix=f"{prefix}.attn", ) @@ -386,6 +388,7 @@ def __init__( config: PhiMoEConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -401,6 +404,7 @@ def __init__( ), cache_config=cache_config, quant_config=quant_config, + model_config=model_config, rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", ) @@ -453,6 +457,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.vocab_size = config.vocab_size @@ -466,7 +471,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: PhiMoEDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + model_config=model_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index e179638a869b..48ab77eb05b4 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -21,7 +21,7 @@ position_ids_in_meshgrid, ) -from vllm.config import VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import MultiModalDataDict @@ -345,6 +345,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): with self._mark_tower_model(vllm_config, "image"): self.vision_encoder = VisionTransformer( self.vision_args, + model_config=vllm_config.model_config, prefix=maybe_prefix(prefix, "vision_encoder"), ) self.pre_mm_projector_norm = ( @@ -696,6 +697,7 @@ def __init__( self, args: VisionEncoderArgs, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", disable_tp: bool = False, ): @@ -760,6 +762,7 @@ def __init__( self, args: VisionEncoderArgs, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", disable_tp: bool = False, ): @@ -767,6 +770,7 @@ def __init__( self.attention = Attention( args, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attention", disable_tp=disable_tp, ) @@ -800,6 +804,7 @@ def __init__( self, args: VisionEncoderArgs, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", disable_tp: bool = False, ): @@ -810,6 +815,7 @@ def __init__( TransformerBlock( args, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.layers.{idx}", disable_tp=disable_tp, ) @@ -850,6 +856,7 @@ def __init__( self, args: VisionEncoderArgs, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -866,6 +873,7 @@ def __init__( self.transformer = Transformer( args, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.transformer", disable_tp=disable_tp, ) diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 44b1207745ec..43bfb2188148 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -615,6 
+615,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> No self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=vllm_config.model_config, cache_config=cache_config, prefix=f"{prefix}.attn", ) diff --git a/vllm/model_executor/models/plamo3.py b/vllm/model_executor/models/plamo3.py index 2ba38a7b1f8f..920aaeacad86 100644 --- a/vllm/model_executor/models/plamo3.py +++ b/vllm/model_executor/models/plamo3.py @@ -196,6 +196,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> No self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, per_layer_sliding_window=config.interleaved_sliding_window[layer_idx], prefix=f"{prefix}.attn", diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index b4526beac637..d2e2a805ec87 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -17,7 +17,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -96,6 +96,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() self.hidden_size = hidden_size @@ -133,6 +134,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( @@ -155,6 +157,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) @@ -167,6 +170,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) self.ln_2 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) @@ -209,6 +213,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.vocab_size = config.vocab_size @@ -219,7 +224,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, - lambda prefix: QWenBlock(config, cache_config, quant_config, prefix=prefix), + lambda prefix: QWenBlock( + config, + cache_config, + quant_config, + prefix=prefix, + model_config=model_config, + ), prefix=f"{prefix}.h", ) self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 4b0c756165a5..e52f7c1015cb 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -35,7 +35,7 @@ from transformers import Qwen2MoeConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, 
VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul @@ -206,6 +206,7 @@ def __init__( quant_config: QuantizationConfig | None = None, prefix: str = "", dual_chunk_attention_config: dict[str, Any] | None = None, + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -262,6 +263,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, **{ "layer_idx": extract_layer_index(prefix), "dual_chunk_attention_config": dual_chunk_attention_config, @@ -290,6 +292,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -307,6 +310,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.self_attn", dual_chunk_attention_config=dual_chunk_attention_config, + model_config=model_config, ) # Note: Qwen/Qwen2-57B-A14B-Instruct does not have @@ -365,6 +369,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.vocab_size = config.vocab_size self.config = config @@ -382,6 +387,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=cache_config, quant_config=quant_config, prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/qwen3_dflash.py b/vllm/model_executor/models/qwen3_dflash.py index ce45136d7c0b..4c5abacc85fd 100644 --- a/vllm/model_executor/models/qwen3_dflash.py +++ b/vllm/model_executor/models/qwen3_dflash.py @@ -10,7 +10,7 @@ from vllm import _custom_ops as ops from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention @@ -68,6 +68,7 @@ def __init__( quant_config: QuantizationConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.layer_name = prefix @@ -118,6 +119,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.attn", attn_type=attn_type, + model_config=model_config, ) self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) @@ -162,6 +164,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size + model_config = vllm_config.model_config set_default_rope_theta(config, default_theta=1000000) attn_type = AttentionType.DECODER @@ -178,6 +181,7 @@ def __init__( rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, + model_config=model_config, ) self.mlp = Qwen3MLP( hidden_size=self.hidden_size, diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index f2ce070be8b4..a889d110487a 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -33,7 +33,7 @@ 
from torch import nn from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( get_ep_group, get_pp_group, @@ -270,6 +270,7 @@ def __init__( quant_config: QuantizationConfig | None = None, prefix: str = "", dual_chunk_attention_config: dict[str, Any] | None = None, + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -326,6 +327,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, **{ "layer_idx": extract_layer_index(prefix), "dual_chunk_attention_config": dual_chunk_attention_config, @@ -365,6 +367,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: config = vllm_config.model_config.hf_text_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.hidden_size = config.hidden_size max_position_embeddings = getattr(config, "max_position_embeddings", 8192) @@ -384,6 +387,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: quant_config=quant_config, prefix=f"{prefix}.self_attn", dual_chunk_attention_config=dual_chunk_attention_config, + model_config=model_config, ) # `mlp_only_layers` in the config. diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 6cf386cc8ba2..89305ceab26f 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -266,6 +266,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, **{ "layer_idx": extract_layer_index(prefix), "dual_chunk_attention_config": self.dual_chunk_attention_config, diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index d90174911fb6..4fa1e36108b5 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -31,7 +31,7 @@ from transformers import PretrainedConfig as SeedOssConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul @@ -116,6 +116,7 @@ def __init__( head_dim: int, rope_parameters: dict, max_position: int = 4096 * 32, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -169,6 +170,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, attn_type=attn_type, @@ -192,6 +194,7 @@ class SeedOssDecoderLayer(nn.Module): def __init__( self, config: SeedOssConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -215,6 +218,7 @@ def __init__( max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, head_dim=config.head_dim, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, 
rope_parameters=config.rope_parameters, @@ -309,11 +313,13 @@ def __init__( self.embed_tokens = PPMissingLayer() # Use the provided decoder layer type or default to SeedDecoderLayer + model_config = vllm_config.model_config decoder_layer_type = decoder_layer_type or SeedOssDecoderLayer self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: decoder_layer_type( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index bff866d0d0c2..10a1d541e6a8 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -31,7 +31,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -113,6 +113,7 @@ def __init__( max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, prefix: str = "", ) -> None: @@ -168,6 +169,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -190,6 +192,7 @@ class SolarDecoderLayer(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -212,6 +215,7 @@ def __init__( max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, + model_config=model_config, cache_config=cache_config, prefix=f"{prefix}.self_attn", ) @@ -274,10 +278,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) else: self.embed_tokens = PPMissingLayer() + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: SolarDecoderLayer( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 034c9c18ff7b..3d39b6de6849 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -29,7 +29,7 @@ from torch import nn from transformers import StableLmConfig -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -98,6 +98,7 @@ def __init__( config: StableLmConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -158,6 +159,7 @@ def __init__( num_kv_heads=self.num_key_value_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -180,11 +182,16 @@ 
def __init__( config: StableLmConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() self.self_attn = StablelmAttention( - config, cache_config, quant_config, prefix=f"{prefix}.self_attn" + config, + cache_config, + quant_config, + model_config=model_config, + prefix=f"{prefix}.self_attn", ) self.mlp = StablelmMLP(config, quant_config, prefix=f"{prefix}.mlp") norm_eps = getattr(config, "norm_eps", getattr(config, "layer_norm_eps", 1e-05)) @@ -221,6 +228,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.embed_tokens = VocabParallelEmbedding( config.vocab_size, @@ -231,7 +239,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: StablelmDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + model_config=model_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 5f08a59e2364..0b3df4a183bc 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -29,7 +29,7 @@ from transformers import Starcoder2Config from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.attention import Attention @@ -67,6 +67,7 @@ def __init__( config: Starcoder2Config, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -123,6 +124,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -176,6 +178,7 @@ def __init__( config: Starcoder2Config, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -184,6 +187,7 @@ def __init__( config, cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.self_attn", ) self.mlp = Starcoder2MLP( @@ -225,6 +229,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.vocab_size = config.vocab_size @@ -238,7 +243,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Starcoder2DecoderLayer( - config, cache_config, quant_config=quant_config, prefix=prefix + config, + cache_config, + quant_config=quant_config, + model_config=model_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/step1.py b/vllm/model_executor/models/step1.py 
index 07653fa6b377..b99cda87735d 100644 --- a/vllm/model_executor/models/step1.py +++ b/vllm/model_executor/models/step1.py @@ -10,7 +10,7 @@ import torch from torch import nn -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -88,6 +88,7 @@ class StepAttention(nn.Module): def __init__( self, config, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -144,6 +145,7 @@ def __init__( self.head_dim, self.scale, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, alibi_slopes=alibi_slopes, @@ -200,12 +202,14 @@ class StepDecoderLayer(nn.Module): def __init__(self, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.hidden_size = config.hidden_size self.self_attn = StepAttention( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index 18b689166a5f..0a01c5b2e604 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -149,6 +149,7 @@ def __init__( share_q_dim: int | None = None, max_position_embedding: int = 8192, head_dim: int = 256, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -205,6 +206,7 @@ def __init__( self.head_dim, scaling, self.num_kv_heads, + model_config=model_config, cache_config=cache_config, prefix=f"{prefix}.attn", ) @@ -226,6 +228,7 @@ class Step3TextDecoderLayer(nn.Module): def __init__( self, config: Step3TextConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -237,6 +240,7 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=1, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, norm_eps=config.rms_norm_eps, @@ -315,6 +319,7 @@ class Step3TextModel(nn.Module): def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.vocab_size = config.vocab_size @@ -334,6 +339,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: config.num_hidden_layers, lambda prefix: Step3TextDecoderLayer( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/step3p5.py b/vllm/model_executor/models/step3p5.py index bb4bf14a9632..810838773216 100644 --- a/vllm/model_executor/models/step3p5.py +++ b/vllm/model_executor/models/step3p5.py @@ -138,6 +138,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, rope_scaling: dict[str, Any] | None = None, + model_config: ModelConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, # 
Step3p5 specific args @@ -245,6 +246,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -432,6 +434,7 @@ def __init__( ) -> None: super().__init__() config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config self.hidden_size = config.hidden_size layer_idx = extract_layer_index(prefix) self.layer_idx = layer_idx @@ -470,6 +473,7 @@ def __init__( rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), head_dim=head_dim if head_dim else getattr(config, "head_dim", None), + model_config=model_config, cache_config=cache_config, quant_config=quant_config, rope_scaling=getattr(config, "rope_scaling", None), diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index f0f6f619b022..3e0fee8c6b40 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -153,6 +153,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() self.embed_dim = embed_dim @@ -206,6 +207,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.attn", attn_type=self.attn_type, + model_config=model_config, ) else: # AttentionType.DECODER (regular decoder self-attention) self.attn = Attention( @@ -218,6 +220,7 @@ def __init__( prefix=f"{prefix}.attn", attn_type=self.attn_type, per_layer_sliding_window=per_layer_sliding_window, + model_config=model_config, ) def _init_qkv( @@ -260,6 +263,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__( embed_dim=embed_dim, @@ -269,6 +273,7 @@ def __init__( quant_config=quant_config, prefix=prefix, attn_type=AttentionType.ENCODER_DECODER, + model_config=model_config, ) def _init_qkv( @@ -367,6 +372,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + model_config=vllm_config.model_config, ) self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.mlp = WhisperMLP( @@ -410,6 +416,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + model_config=vllm_config.model_config, ) self.self_attn_layer_norm = nn.LayerNorm(config.d_model) self.encoder_attn = WhisperCrossAttention( @@ -418,6 +425,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.encoder_attn", + model_config=vllm_config.model_config, ) self.encoder_attn_layer_norm = nn.LayerNorm(config.d_model) self.mlp = WhisperMLP( diff --git a/vllm/model_executor/models/whisper_causal.py b/vllm/model_executor/models/whisper_causal.py index 8e4322ea335d..f1c00aa84563 100644 --- a/vllm/model_executor/models/whisper_causal.py +++ b/vllm/model_executor/models/whisper_causal.py @@ -11,7 +11,7 @@ import torch.nn.functional as F from torch import nn -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.attention import Attention from 
vllm.model_executor.layers.layernorm import RMSNorm @@ -23,6 +23,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.models.mistral import MistralMLP from vllm.model_executor.models.whisper import WhisperPosEmbedType +from vllm.utils.torch_utils import TORCH_DTYPE_TO_KV_CACHE_STR from vllm.v1.attention.backend import ( AttentionBackend, AttentionMetadata, @@ -277,6 +278,7 @@ def __init__( num_kv_heads: int | None = None, alibi_slopes: list[float] | None = None, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, logits_soft_cap: float | None = None, per_layer_sliding_window: int | None = None, @@ -293,7 +295,10 @@ def __init__( if cache_config is not None: kv_cache_dtype = cache_config.cache_dtype else: - kv_cache_dtype = "auto" + assert model_config is not None, ( + "model_config is required when cache_config is not provided" + ) + kv_cache_dtype = TORCH_DTYPE_TO_KV_CACHE_STR[model_config.dtype] underlying_attn_backend = get_attn_backend( head_size, diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index b4d844ba6d76..45071d44d996 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -118,6 +118,7 @@ def __init__( config: Zamba2Config, bare_block_idx: int, num_hybrid_layers: int, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -189,6 +190,7 @@ def __init__( self.num_attention_heads, self.attention_head_dim, self.scale, + model_config=model_config, cache_config=cache_config, prefix=f"{prefix}.attn.{j}", ) @@ -403,6 +405,7 @@ def __init__( config: Zamba2Config, bare_block_idx: int, num_hybrid_layers: int, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -424,6 +427,7 @@ def __init__( config, bare_block_idx=bare_block_idx, num_hybrid_layers=num_hybrid_layers, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, @@ -715,6 +719,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: config, bare_block_idx=idx, num_hybrid_layers=len(layer2block_map), + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}", diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py index bd9741024f2a..a78825f7c6ba 100644 --- a/vllm/utils/torch_utils.py +++ b/vllm/utils/torch_utils.py @@ -19,6 +19,7 @@ if TYPE_CHECKING: from vllm.config import ModelConfig + from vllm.config.cache import CacheDType from vllm.sequence import IntermediateTensors else: ModelConfig = object @@ -58,6 +59,12 @@ "fp8": "fp8_e4m3", } +TORCH_DTYPE_TO_KV_CACHE_STR: dict[torch.dtype, "CacheDType"] = { + torch.float32: "float32", + torch.float16: "float16", + torch.bfloat16: "bfloat16", +} + T = TypeVar("T") From 4cb3e008cf3b30b529b8727765c558932a930c25 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 31 Mar 2026 18:02:43 -0400 Subject: [PATCH 2/8] Clean up unintended changes Signed-off-by: Matthew Bonanni --- vllm/model_executor/layers/attention/attention.py | 11 ++--------- .../layers/attention/chunked_local_attention.py | 7 ++----- .../layers/attention/cross_attention.py | 7 ++----- .../layers/attention/encoder_only_attention.py | 7 ++----- .../layers/attention/static_sink_attention.py | 11 +++-------- 
vllm/model_executor/models/iquest_loopcoder.py | 6 +----- vllm/model_executor/models/whisper_causal.py | 14 +++++++------- vllm/utils/torch_utils.py | 7 ------- 8 files changed, 19 insertions(+), 51 deletions(-) diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py index b668cfd63f83..6d249ed7fac6 100644 --- a/vllm/model_executor/layers/attention/attention.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -25,7 +25,6 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.platforms import current_platform from vllm.utils.torch_utils import ( - TORCH_DTYPE_TO_KV_CACHE_STR, direct_register_custom_op, kv_cache_dtype_str_to_dtype, ) @@ -223,10 +222,7 @@ def __init__( kv_cache_dtype = cache_config.cache_dtype calculate_kv_scales = cache_config.calculate_kv_scales else: - assert model_config is not None, ( - "model_config is required when cache_config is not provided" - ) - kv_cache_dtype = TORCH_DTYPE_TO_KV_CACHE_STR[model_config.dtype] + kv_cache_dtype = "auto" calculate_kv_scales = False # llm-compressor mdls need to set cache_dtype to "fp8" manually. @@ -260,10 +256,7 @@ def __init__( if str(layer_idx) in cache_config.kv_cache_dtype_skip_layers: skip = True if skip: - assert model_config is not None, ( - "model_config is required for kv_cache_dtype_skip_layers" - ) - kv_cache_dtype = TORCH_DTYPE_TO_KV_CACHE_STR[model_config.dtype] + kv_cache_dtype = "auto" calculate_kv_scales = False logger.info( "Layer %s: kv_cache_dtype=%s, sliding_window=%s", diff --git a/vllm/model_executor/layers/attention/chunked_local_attention.py b/vllm/model_executor/layers/attention/chunked_local_attention.py index 82c2ff894d6c..f8caa0b6f0c0 100644 --- a/vllm/model_executor/layers/attention/chunked_local_attention.py +++ b/vllm/model_executor/layers/attention/chunked_local_attention.py @@ -8,7 +8,6 @@ from vllm.config.vllm import VllmConfig from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.utils.torch_utils import TORCH_DTYPE_TO_KV_CACHE_STR from vllm.v1.attention.backend import ( AttentionBackend, AttentionCGSupport, @@ -98,10 +97,7 @@ def __init__( if cache_config is not None: kv_cache_dtype = cache_config.cache_dtype else: - assert model_config is not None, ( - "model_config is required when cache_config is not provided" - ) - kv_cache_dtype = TORCH_DTYPE_TO_KV_CACHE_STR[model_config.dtype] + kv_cache_dtype = "auto" underlying_attn_backend = get_attn_backend(head_size, dtype, kv_cache_dtype) attn_backend = create_chunked_local_attention_backend( @@ -115,6 +111,7 @@ def __init__( num_kv_heads=num_kv_heads, alibi_slopes=alibi_slopes, cache_config=cache_config, + model_config=model_config, quant_config=quant_config, prefix=prefix, kv_sharing_target_layer_name=kv_sharing_target_layer_name, diff --git a/vllm/model_executor/layers/attention/cross_attention.py b/vllm/model_executor/layers/attention/cross_attention.py index 1c2e4156613b..686c94e9ed6f 100644 --- a/vllm/model_executor/layers/attention/cross_attention.py +++ b/vllm/model_executor/layers/attention/cross_attention.py @@ -10,7 +10,6 @@ from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention from vllm.utils.math_utils import cdiv -from vllm.utils.torch_utils import TORCH_DTYPE_TO_KV_CACHE_STR from vllm.v1.attention.backend import ( AttentionBackend, AttentionMetadata, @@ -191,10 +190,7 @@ def __init__( if cache_config is 
not None: kv_cache_dtype = cache_config.cache_dtype else: - assert model_config is not None, ( - "model_config is required when cache_config is not provided" - ) - kv_cache_dtype = TORCH_DTYPE_TO_KV_CACHE_STR[model_config.dtype] + kv_cache_dtype = "auto" if attn_type is not None: assert attn_type == AttentionType.ENCODER_DECODER, ( @@ -214,6 +210,7 @@ def __init__( head_size=head_size, scale=scale, cache_config=cache_config, + model_config=model_config, attn_backend=attn_backend, attn_type=AttentionType.ENCODER_DECODER, **kwargs, diff --git a/vllm/model_executor/layers/attention/encoder_only_attention.py b/vllm/model_executor/layers/attention/encoder_only_attention.py index e805fa8759d2..d40d483919da 100644 --- a/vllm/model_executor/layers/attention/encoder_only_attention.py +++ b/vllm/model_executor/layers/attention/encoder_only_attention.py @@ -8,7 +8,6 @@ from vllm.config import CacheConfig, ModelConfig from vllm.config.vllm import VllmConfig from vllm.model_executor.layers.attention import Attention -from vllm.utils.torch_utils import TORCH_DTYPE_TO_KV_CACHE_STR from vllm.v1.attention.backend import ( AttentionBackend, AttentionMetadata, @@ -69,10 +68,7 @@ def __init__( if cache_config is not None: kv_cache_dtype = cache_config.cache_dtype else: - assert model_config is not None, ( - "model_config is required when cache_config is not provided" - ) - kv_cache_dtype = TORCH_DTYPE_TO_KV_CACHE_STR[model_config.dtype] + kv_cache_dtype = "auto" underlying_attn_backend = get_attn_backend( head_size, @@ -93,6 +89,7 @@ def __init__( head_size=head_size, scale=scale, cache_config=cache_config, + model_config=model_config, attn_backend=attn_backend, attn_type=AttentionType.ENCODER_ONLY, **kwargs, diff --git a/vllm/model_executor/layers/attention/static_sink_attention.py b/vllm/model_executor/layers/attention/static_sink_attention.py index eea6e9c4d069..3b7b43113afe 100644 --- a/vllm/model_executor/layers/attention/static_sink_attention.py +++ b/vllm/model_executor/layers/attention/static_sink_attention.py @@ -10,10 +10,7 @@ from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.attention import Attention from vllm.utils.math_utils import cdiv -from vllm.utils.torch_utils import ( - TORCH_DTYPE_TO_KV_CACHE_STR, - direct_register_custom_op, -) +from vllm.utils.torch_utils import direct_register_custom_op from vllm.v1.attention.backend import ( AttentionBackend, AttentionMetadata, @@ -131,10 +128,7 @@ def __init__( if cache_config is not None: kv_cache_dtype = cache_config.cache_dtype else: - assert model_config is not None, ( - "model_config is required when cache_config is not provided" - ) - kv_cache_dtype = TORCH_DTYPE_TO_KV_CACHE_STR[model_config.dtype] + kv_cache_dtype = "auto" if attn_backend is not None: underlying_attn_backend = attn_backend @@ -150,6 +144,7 @@ def __init__( head_size=head_size, scale=scale, cache_config=cache_config, + model_config=model_config, attn_backend=attn_backend, **kwargs, ) diff --git a/vllm/model_executor/models/iquest_loopcoder.py b/vllm/model_executor/models/iquest_loopcoder.py index 54b3e9dc3ecd..51d8a67f615f 100644 --- a/vllm/model_executor/models/iquest_loopcoder.py +++ b/vllm/model_executor/models/iquest_loopcoder.py @@ -47,7 +47,6 @@ ) from vllm.model_executor.models.llama import LlamaMLP from vllm.sequence import IntermediateTensors -from vllm.utils.torch_utils import TORCH_DTYPE_TO_KV_CACHE_STR from vllm.v1.attention.backend import AttentionType from .utils import ( @@ -149,12 +148,9 @@ def __init__( 
                 sliding_window=self.loop_window_size,
             )
         else:
-            assert model_config is not None, (
-                "model_config is required when cache_config is not provided"
-            )
             loop_cache_config = CacheConfig(
                 sliding_window=self.loop_window_size,
-                cache_dtype=TORCH_DTYPE_TO_KV_CACHE_STR[model_config.dtype],
+                cache_dtype="auto",
             )
 
         self.attn.append(
diff --git a/vllm/model_executor/models/whisper_causal.py b/vllm/model_executor/models/whisper_causal.py
index f1c00aa84563..8b4447bd5632 100644
--- a/vllm/model_executor/models/whisper_causal.py
+++ b/vllm/model_executor/models/whisper_causal.py
@@ -23,7 +23,6 @@
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.models.mistral import MistralMLP
 from vllm.model_executor.models.whisper import WhisperPosEmbedType
-from vllm.utils.torch_utils import TORCH_DTYPE_TO_KV_CACHE_STR
 from vllm.v1.attention.backend import (
     AttentionBackend,
     AttentionMetadata,
@@ -295,10 +294,7 @@ def __init__(
         if cache_config is not None:
             kv_cache_dtype = cache_config.cache_dtype
         else:
-            assert model_config is not None, (
-                "model_config is required when cache_config is not provided"
-            )
-            kv_cache_dtype = TORCH_DTYPE_TO_KV_CACHE_STR[model_config.dtype]
+            kv_cache_dtype = "auto"
 
         underlying_attn_backend = get_attn_backend(
             head_size,
@@ -317,6 +313,7 @@ def __init__(
             num_kv_heads=num_kv_heads,
             alibi_slopes=alibi_slopes,
             cache_config=cache_config,
+            model_config=model_config,
             quant_config=quant_config,
             logits_soft_cap=logits_soft_cap,
             per_layer_sliding_window=per_layer_sliding_window,
@@ -349,6 +346,7 @@ def __init__(
         per_layer_sliding_window: int | None = None,
         block_pool_size: int = 1,
         cache_config: CacheConfig | None = None,
+        model_config: ModelConfig | None = None,
         quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
@@ -391,6 +389,7 @@ def __init__(
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             cache_config=cache_config,
+            model_config=model_config,
             quant_config=quant_config,
             prefix=f"{prefix}.attn",
             attn_type=AttentionType.DECODER,
@@ -449,11 +448,11 @@ def forward(
 
 class WhisperCausalEncoderLayer(nn.Module):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
-        config = vllm_config.model_config.hf_config
+        model_config = vllm_config.model_config
+        config = model_config.hf_config
         sliding_window = getattr(config, "sliding_window", None)
         block_pool_size = config.block_pool_size
         assert block_pool_size > 1
-
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
@@ -467,6 +466,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             block_pool_size=block_pool_size,
             per_layer_sliding_window=sliding_window,
             cache_config=cache_config,
+            model_config=model_config,
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
         )
diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py
index a78825f7c6ba..bd9741024f2a 100644
--- a/vllm/utils/torch_utils.py
+++ b/vllm/utils/torch_utils.py
@@ -19,7 +19,6 @@
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
-    from vllm.config.cache import CacheDType
     from vllm.sequence import IntermediateTensors
 else:
     ModelConfig = object
@@ -59,12 +58,6 @@
     "fp8": "fp8_e4m3",
 }
 
-TORCH_DTYPE_TO_KV_CACHE_STR: dict[torch.dtype, "CacheDType"] = {
-    torch.float32: "float32",
-    torch.float16: "float16",
-    torch.bfloat16: "bfloat16",
-}
-
 T = TypeVar("T")

From 2dcaef0859d32c63037d0dd6f62ae1be857c2c78 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni
Date: Tue, 31 Mar 2026 18:07:38 -0400
Subject: [PATCH 3/8] Clean up

Signed-off-by: Matthew Bonanni
---
 vllm/model_executor/models/bloom.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py
index e9b7e54878aa..9eb8821ec516 100644
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@@ -198,7 +198,7 @@ def __init__(
             config,
             cache_config,
             quant_config,
-            model_config=model_config,
+            model_config,
             prefix=f"{prefix}.self_attention",
         )
         self.post_attention_layernorm = nn.LayerNorm(
@@ -271,7 +271,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 config,
                 cache_config,
                 quant_config,
-                model_config=model_config,
+                model_config,
                 prefix=prefix,
             ),
             prefix=f"{prefix}.h",

From 6eb4e4390ba3fd4b7077a723295d6e3206ee67f4 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni
Date: Tue, 31 Mar 2026 18:12:55 -0400
Subject: [PATCH 4/8] Clean up

Signed-off-by: Matthew Bonanni
---
 vllm/model_executor/models/chatglm.py | 6 +++---
 vllm/model_executor/models/gpt2.py    | 4 ++--
 vllm/model_executor/models/gpt_j.py   | 4 ++--
 vllm/model_executor/models/mpt.py     | 4 ++--
 vllm/model_executor/models/phi.py     | 4 ++--
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index a944734597e5..32aa9fdb1972 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -218,7 +218,7 @@ def __init__(
             config,
             cache_config,
             quant_config,
-            model_config=model_config,
+            model_config,
             prefix=f"{prefix}.self_attention",
         )
         self.hidden_dropout = config.hidden_dropout
@@ -291,7 +291,7 @@ def __init__(
                 config,
                 cache_config,
                 quant_config,
-                model_config=model_config,
+                model_config,
                 prefix=prefix,
             ),
             prefix=f"{prefix}.layers",
@@ -361,7 +361,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             config,
             cache_config,
             quant_config,
-            model_config=model_config,
+            model_config,
             prefix=f"{prefix}.encoder",
         )
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index d45150290e4c..0c568b0d5792 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -165,7 +165,7 @@ def __init__(
             config,
             cache_config,
             quant_config,
-            model_config=model_config,
+            model_config,
             prefix=f"{prefix}.attn",
         )
         self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
@@ -217,7 +217,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 config,
                 cache_config,
                 quant_config,
-                model_config=model_config,
+                model_config,
                 prefix=prefix,
             ),
             prefix=f"{prefix}.h",
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index fe4a02adef5a..3eea5a56c4a3 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -175,7 +175,7 @@ def __init__(
             config,
             cache_config,
             quant_config,
-            model_config=model_config,
+            model_config,
             prefix=f"{prefix}.attn",
         )
         self.mlp = GPTJMLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp")
@@ -219,7 +219,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 config,
                 cache_config,
                 quant_config,
-                model_config=model_config,
+                model_config,
                 prefix=prefix,
             ),
             prefix=f"{prefix}.h",
diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py
index b2b72672c88e..e7da483b3506 100644
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@@ -201,7 +201,7 @@ def __init__(
             config,
             cache_config,
             quant_config,
-            model_config=model_config,
+            model_config,
prefix=f"{prefix}.attn", ) self.norm_2 = nn.LayerNorm(hidden_size) @@ -247,7 +247,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config, cache_config, quant_config, - model_config=model_config, + model_config, prefix=prefix, ), prefix=f"{prefix}.blocks", diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 79340e9346e7..9495be016af5 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -189,7 +189,7 @@ def __init__( config, cache_config, quant_config, - model_config=model_config, + model_config, prefix=f"{prefix}.self_attn", ) self.mlp = PhiMLP(config, quant_config, prefix=f"{prefix}.mlp") @@ -231,7 +231,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config, cache_config, quant_config, - model_config=model_config, + model_config, prefix=prefix, ), prefix=f"{prefix}.layers", From 04a8496c2f673e62f69da8b62939010d424e7099 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 31 Mar 2026 19:56:37 -0400 Subject: [PATCH 5/8] Clean up Signed-off-by: Matthew Bonanni --- vllm/model_executor/models/baichuan.py | 2 +- vllm/model_executor/models/falcon.py | 4 ++-- vllm/model_executor/models/gpt_bigcode.py | 4 ++-- vllm/model_executor/models/gpt_neox.py | 4 ++-- vllm/model_executor/models/jais.py | 2 +- vllm/model_executor/models/opt.py | 4 ++-- vllm/model_executor/models/persimmon.py | 2 +- vllm/model_executor/models/phimoe.py | 2 +- vllm/model_executor/models/stablelm.py | 4 ++-- 9 files changed, 14 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 2f0d4d299c3d..4ce894ae9399 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -307,7 +307,7 @@ def __init__( position_embedding, cache_config, quant_config, - model_config=model_config, + model_config, prefix=prefix, ), prefix=f"{prefix}.layers", diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index b76e027f04fd..19a6a899e1d0 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -284,7 +284,7 @@ def __init__( config, cache_config, quant_config, - model_config=model_config, + model_config, prefix=f"{prefix}.self_attention", ) self.mlp = FalconMLP(config, quant_config, prefix=f"{prefix}.mlp") @@ -399,7 +399,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config, cache_config, quant_config, - model_config=model_config, + model_config, prefix=prefix, ), prefix=f"{prefix}.h", diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 80aedd5558eb..5b23100fe1cd 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -180,7 +180,7 @@ def __init__( config, cache_config, quant_config, - model_config=model_config, + model_config, prefix=f"{prefix}.attn", ) self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) @@ -232,7 +232,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config, cache_config, quant_config, - model_config=model_config, + model_config, prefix=prefix, ), prefix=f"{prefix}.h", diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index b7cf3a431b2b..781ff3350763 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -170,7 +170,7 @@ def __init__( config, cache_config, quant_config, - 
model_config=model_config, + model_config, prefix=f"{prefix}.attention", ) self.mlp = GPTNeoXMLP(config, quant_config, prefix=f"{prefix}.mlp") @@ -225,7 +225,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config, cache_config, quant_config, - model_config=model_config, + model_config, prefix=prefix, ), prefix=f"{prefix}.layers", diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index 1c8c9e847b7a..ef723222c56e 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -219,7 +219,7 @@ def __init__( config, cache_config, quant_config, - model_config=model_config, + model_config, prefix=f"{prefix}.attn", ) self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index e5d8cf22f566..c68327950073 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -265,7 +265,7 @@ def __init__( config, cache_config, quant_config, - model_config=model_config, + model_config, prefix=prefix, ), prefix=f"{prefix}.layers", @@ -318,7 +318,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config, cache_config, quant_config, - model_config=model_config, + model_config, prefix=f"{prefix}.decoder", ) self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 3d87b9a986b5..2130e3d982f0 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -263,7 +263,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config, cache_config, quant_config, - model_config=model_config, + model_config, prefix=prefix, ), prefix=f"{prefix}.layers", diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index bbc7bcddd2b3..29a589ed0ef3 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -474,7 +474,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config, cache_config, quant_config, - model_config=model_config, + model_config, prefix=prefix, ), prefix=f"{prefix}.layers", diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 3d39b6de6849..1af5cb2a57ea 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -190,7 +190,7 @@ def __init__( config, cache_config, quant_config, - model_config=model_config, + model_config, prefix=f"{prefix}.self_attn", ) self.mlp = StablelmMLP(config, quant_config, prefix=f"{prefix}.mlp") @@ -242,7 +242,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config, cache_config, quant_config, - model_config=model_config, + model_config, prefix=prefix, ), prefix=f"{prefix}.layers", From 59aa775e881154b52e256f6aa7a771ca8a89fd7c Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 1 Apr 2026 15:21:15 -0400 Subject: [PATCH 6/8] Add missing pass locations Signed-off-by: Matthew Bonanni --- vllm/model_executor/models/apertus.py | 8 +++++++- vllm/model_executor/models/clip.py | 12 +++++++++++- vllm/model_executor/models/iquest_loopcoder.py | 1 + vllm/model_executor/models/llama4.py | 6 +++++- vllm/model_executor/models/qwen2.py | 8 +++++++- vllm/model_executor/models/qwen3.py | 6 +++++- vllm/model_executor/models/siglip.py | 12 +++++++++++- vllm/utils/torch_utils.py | 2 +- 8 files changed, 48 
insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index 5905a198b289..f31be7f12553 100644 --- a/vllm/model_executor/models/apertus.py +++ b/vllm/model_executor/models/apertus.py @@ -33,7 +33,7 @@ from transformers import ApertusConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import XIELU from vllm.model_executor.layers.attention import ( @@ -131,6 +131,7 @@ def __init__( bias: bool = False, bias_o_proj: bool = False, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: @@ -200,6 +201,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, per_layer_sliding_window=sliding_window, attn_type=attn_type, prefix=f"{prefix}.attn", @@ -246,6 +248,7 @@ def __init__( config: ApertusConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -282,6 +285,7 @@ def __init__( bias=attention_bias, bias_o_proj=bias_o_proj, cache_config=cache_config, + model_config=model_config, prefix=f"{prefix}.self_attn", attn_type=attn_type, ) @@ -332,6 +336,7 @@ def __init__( config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.quant_config = quant_config @@ -354,6 +359,7 @@ def __init__( config=config, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=prefix, ), prefix=f"{prefix}.layers", diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 05a494683a85..a119b06b4a56 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -14,7 +14,7 @@ CLIPVisionConfig, ) -from vllm.config import VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import MultiModalDataDict, MultiModalInput @@ -356,6 +356,7 @@ def __init__( self, config: CLIPTextConfig | CLIPVisionConfig, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, *, prefix: str = "", attn_cls: type[Attention] | type[MMEncoderAttention], @@ -409,6 +410,7 @@ def __init__( self.num_heads_per_partition, self.head_dim, self.scale, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -469,6 +471,7 @@ def __init__( self, config: CLIPTextConfig | CLIPVisionConfig, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, *, prefix: str = "", attn_cls: type[Attention] | type[MMEncoderAttention], @@ -478,6 +481,7 @@ def __init__( self.self_attn = CLIPAttention( config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.self_attn", attn_cls=attn_cls, ) @@ -517,6 +521,7 @@ def __init__( self, config: CLIPTextConfig | CLIPVisionConfig, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, num_hidden_layers_override: int | None = 
None, *, prefix: str = "", @@ -536,6 +541,7 @@ def __init__( CLIPEncoderLayer( config=config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.layers.{layer_idx}", attn_cls=attn_cls, ) @@ -567,6 +573,7 @@ def __init__( self, config: CLIPTextConfig, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, *, prefix: str = "", ) -> None: @@ -580,6 +587,7 @@ def __init__( self.encoder = CLIPEncoder( config=config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.encoder", attn_cls=Attention, ) @@ -824,6 +832,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config: CLIPConfig = vllm_config.model_config.hf_config + model_config = vllm_config.model_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config self.config = config @@ -840,6 +849,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.text_model = CLIPTextTransformer( text_config, quant_config=quant_config, + model_config=model_config, prefix=maybe_prefix(prefix, "text_model"), ) self.text_projection = nn.Linear( diff --git a/vllm/model_executor/models/iquest_loopcoder.py b/vllm/model_executor/models/iquest_loopcoder.py index 51d8a67f615f..ab86db91321c 100644 --- a/vllm/model_executor/models/iquest_loopcoder.py +++ b/vllm/model_executor/models/iquest_loopcoder.py @@ -160,6 +160,7 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=loop_cache_config, + model_config=model_config, quant_config=quant_config, attn_type=attn_type, prefix=f"{unique_prefix}.attn", diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index b84b4e2ae512..22232f2309d2 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -25,7 +25,7 @@ from transformers import Llama4TextConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_ep_group, get_tensor_model_parallel_world_size, @@ -180,6 +180,7 @@ def __init__( bias: bool = False, bias_o_proj: bool = False, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -263,6 +264,7 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, + model_config=model_config, quant_config=quant_config, prefix=f"{prefix}.attn", **( @@ -325,6 +327,7 @@ def __init__( config = config or vllm_config.model_config.hf_config cache_config = vllm_config.cache_config + model_config = vllm_config.model_config quant_config = vllm_config.quant_config self.layer_idx = extract_layer_index(prefix) @@ -342,6 +345,7 @@ def __init__( bias=False, bias_o_proj=False, cache_config=cache_config, + model_config=model_config, prefix=f"{prefix}.self_attn", ) is_moe_layer = ( diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 27aa6175b9bc..38024a4c738c 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -34,7 +34,7 @@ from transformers import Qwen2Config from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from 
vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import ( @@ -126,6 +126,7 @@ def __init__( rope_parameters: dict[str, Any], max_position: int = 4096 * 32, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, @@ -195,6 +196,7 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, + model_config=model_config, quant_config=quant_config, attn_type=attn_type, prefix=f"{prefix}.attn", @@ -241,6 +243,7 @@ def __init__( self, config: Qwen2Config, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", ) -> None: @@ -269,6 +272,7 @@ def __init__( max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, cache_config=cache_config, + model_config=model_config, quant_config=quant_config, rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", @@ -366,6 +370,7 @@ def __init__( super().__init__() config = vllm_config.model_config.hf_config.get_text_config() + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config @@ -402,6 +407,7 @@ def __init__( lambda prefix: decoder_layer_type( config=config, cache_config=cache_config, + model_config=model_config, quant_config=quant_config, prefix=prefix, ), diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 6dec60232b1d..825474a4ed79 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -31,7 +31,7 @@ from transformers import Qwen3Config from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.attention.encoder_only_attention import ( @@ -68,6 +68,7 @@ def __init__( rms_norm_eps: float = 1e-06, qkv_bias: bool = False, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, @@ -129,6 +130,7 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, + model_config=model_config, quant_config=quant_config, prefix=f"{prefix}.attn", attn_type=attn_type, @@ -167,6 +169,7 @@ def __init__( self, config: Qwen3Config, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", ) -> None: @@ -195,6 +198,7 @@ def __init__( qkv_bias=getattr(config, "attention_bias", False), head_dim=getattr(config, "head_dim", None), cache_config=cache_config, + model_config=model_config, quant_config=quant_config, rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index ce3a260d0ef6..db48eb6bf812 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -15,7 +15,7 @@ SiglipVisionConfig, ) -from vllm.config import VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed 
import divide, get_tensor_model_parallel_world_size from vllm.inputs import MultiModalDataDict, MultiModalInput @@ -360,6 +360,7 @@ def __init__( self, config: SiglipVisionConfig | SiglipTextConfig, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, *, prefix: str = "", attn_cls: type[EncoderOnlyAttention] | type[MMEncoderAttention], @@ -414,6 +415,7 @@ def __init__( self.num_heads_per_partition, self.head_dim, self.scale, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -480,6 +482,7 @@ def __init__( self, config: SiglipVisionConfig | SiglipTextConfig, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, *, prefix: str = "", attn_cls: type[EncoderOnlyAttention] | type[MMEncoderAttention], @@ -491,6 +494,7 @@ def __init__( self.self_attn = SiglipAttention( config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.self_attn", attn_cls=attn_cls, ) @@ -525,6 +529,7 @@ def __init__( self, config: SiglipVisionConfig | SiglipTextConfig, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, num_hidden_layers_override: int | None = None, *, prefix: str = "", @@ -544,6 +549,7 @@ def __init__( SiglipEncoderLayer( config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.layers.{layer_idx}", attn_cls=attn_cls, ) @@ -575,6 +581,7 @@ def __init__( self, config: SiglipTextConfig, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, *, prefix: str = "", ) -> None: @@ -588,6 +595,7 @@ def __init__( self.encoder = SiglipEncoder( config=config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.encoder", attn_cls=EncoderOnlyAttention, ) @@ -1042,6 +1050,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config: SiglipConfig = vllm_config.model_config.hf_config + model_config = vllm_config.model_config quant_config = vllm_config.quant_config self.config = config @@ -1059,6 +1068,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.text_model = SiglipTextTransformer( text_config, quant_config=quant_config, + model_config=model_config, prefix=maybe_prefix(prefix, "text_model"), ) diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py index 59c19a56e4f2..9c7c7f044379 100644 --- a/vllm/utils/torch_utils.py +++ b/vllm/utils/torch_utils.py @@ -345,7 +345,7 @@ def resolve_kv_cache_dtype_string( def kv_cache_dtype_str_to_dtype( - kv_cache_dtype: str, model_config: ModelConfig + kv_cache_dtype: str, model_config: ModelConfig | None ) -> torch.dtype: if kv_cache_dtype == "auto": # Model config may not be specified for unit tests, default to float16 From b26293fa1dd8b540cf559d9ab4bb00aadc035efd Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 2 Apr 2026 09:30:54 -0400 Subject: [PATCH 7/8] Add more missing locations Signed-off-by: Matthew Bonanni --- vllm/model_executor/models/arcee.py | 4 ++++ .../models/bailing_moe_linear.py | 2 ++ vllm/model_executor/models/blip2.py | 14 +++++++++++++- vllm/model_executor/models/exaone_moe.py | 8 ++++++-- .../models/extract_hidden_states.py | 8 ++++++-- vllm/model_executor/models/glm4_moe_mtp.py | 5 ++++- vllm/model_executor/models/granitemoeshared.py | 11 +++++++++-- vllm/model_executor/models/internlm2_ve.py | 4 +++- vllm/model_executor/models/kimi_linear.py | 1 + vllm/model_executor/models/minicpm_eagle.py | 18 ++++++++++++++++-- vllm/model_executor/models/mistral.py | 
4 +++- vllm/model_executor/models/nemotron_nas.py | 8 +++++++- 12 files changed, 74 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py index bc4f85bf7ddb..c0f4e21a93e9 100644 --- a/vllm/model_executor/models/arcee.py +++ b/vllm/model_executor/models/arcee.py @@ -105,6 +105,7 @@ def __init__( config: LlamaConfig, cache_config: Any | None = None, quant_config: Any | None = None, + model_config: Any | None = None, prefix: str = "", ) -> None: super().__init__() @@ -135,6 +136,7 @@ def __init__( bias=attention_bias, bias_o_proj=bias_o_proj, cache_config=cache_config, + model_config=model_config, prefix=f"{prefix}.self_attn", attn_type=getattr( config, "attn_type", "decoder" @@ -191,6 +193,7 @@ def __init__( config: LlamaConfig = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.quant_config = quant_config self.config = config self.vocab_size = config.vocab_size @@ -214,6 +217,7 @@ def __init__( config=config, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=prefix, ), prefix=f"{prefix}.layers", diff --git a/vllm/model_executor/models/bailing_moe_linear.py b/vllm/model_executor/models/bailing_moe_linear.py index ecc5d63ced75..09aaa291e43a 100644 --- a/vllm/model_executor/models/bailing_moe_linear.py +++ b/vllm/model_executor/models/bailing_moe_linear.py @@ -113,6 +113,7 @@ def __init__( layer_id: int = 0, prefix: str = "attention", cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -819,6 +820,7 @@ def __init__( layer_id=layer_id, prefix=f"{prefix}.self_attn", cache_config=cache_config, + model_config=model_config, ) # MLP/MoE diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 8b5fd452e8ff..68f706a6de79 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -13,7 +13,7 @@ apply_chunking_to_forward, ) -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.inputs import MultiModalDataDict from vllm.model_executor.layers.activation import get_act_fn @@ -83,6 +83,7 @@ def __init__( *, quant_config: QuantizationConfig | None, cache_config: CacheConfig | None, + model_config: ModelConfig | None = None, is_cross_attention: bool = False, prefix: str = "", ) -> None: @@ -184,6 +185,7 @@ def __init__( *, quant_config: QuantizationConfig | None, cache_config: CacheConfig | None, + model_config: ModelConfig | None = None, is_cross_attention: bool = False, prefix: str = "", ) -> None: @@ -193,6 +195,7 @@ def __init__( config, quant_config=quant_config, cache_config=cache_config, + model_config=model_config, is_cross_attention=is_cross_attention, prefix=f"{prefix}.attention", ) @@ -252,6 +255,7 @@ def __init__( *, quant_config: QuantizationConfig | None, cache_config: CacheConfig | None, + model_config: ModelConfig | None = None, layer_idx: int, prefix: str = "", ) -> None: @@ -263,6 +267,7 @@ def __init__( config, quant_config=quant_config, cache_config=cache_config, + model_config=model_config, prefix=f"{prefix}.attention", ) @@ -273,6 +278,7 @@ def __init__( config, quant_config=quant_config, cache_config=cache_config, + model_config=model_config, is_cross_attention=True, 
prefix=f"{prefix}.crossattention", ) @@ -345,6 +351,7 @@ def __init__( *, quant_config: QuantizationConfig | None, cache_config: CacheConfig | None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -357,6 +364,7 @@ def __init__( config, quant_config=quant_config, cache_config=cache_config, + model_config=model_config, layer_idx=layer_idx, prefix=f"{prefix}.layer.{layer_idx}", ) @@ -390,6 +398,7 @@ def __init__( *, quant_config: QuantizationConfig | None, cache_config: CacheConfig | None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -403,6 +412,7 @@ def __init__( config, quant_config=quant_config, cache_config=cache_config, + model_config=model_config, prefix=f"{prefix}.encoder", ) @@ -538,6 +548,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config @@ -561,6 +572,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.qformer_config, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.qformer", ) self.language_projection = nn.Linear( diff --git a/vllm/model_executor/models/exaone_moe.py b/vllm/model_executor/models/exaone_moe.py index d7282edcf4f6..41d40ea41ca6 100644 --- a/vllm/model_executor/models/exaone_moe.py +++ b/vllm/model_executor/models/exaone_moe.py @@ -24,7 +24,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( get_ep_group, get_pp_group, @@ -179,6 +179,7 @@ def __init__( config: PretrainedConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, mtp_layer: bool = None, prefix: str = "", ) -> None: @@ -204,6 +205,7 @@ def __init__( bias=attention_bias, cache_config=cache_config, prefix=f"{prefix}.self_attn", + model_config=model_config, ) if config.is_moe_layer[layer_idx] and not mtp_layer: @@ -254,7 +256,8 @@ class ExaoneMoeModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config + config = model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config @@ -287,6 +290,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config=config, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=prefix, ), prefix=f"{prefix}.layers", diff --git a/vllm/model_executor/models/extract_hidden_states.py b/vllm/model_executor/models/extract_hidden_states.py index 608e93d6a930..c3b88186c3c0 100644 --- a/vllm/model_executor/models/extract_hidden_states.py +++ b/vllm/model_executor/models/extract_hidden_states.py @@ -14,7 +14,7 @@ import torch import torch.nn as nn -from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from 
vllm.config.cache import CacheDType from vllm.forward_context import get_forward_context from vllm.model_executor.layers.attention.attention import set_default_quant_scales @@ -240,6 +240,7 @@ def __init__( num_heads: int, head_size: int, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ): @@ -250,6 +251,7 @@ def __init__( self.layer_name = prefix vllm_config = get_current_vllm_config() + model_config = model_config or vllm_config.model_config # KV cache configuration cache_config = cache_config or vllm_config.cache_config @@ -265,7 +267,7 @@ def __init__( f"kv cache dtype was set to {kv_cache_dtype}" ) self.kv_cache_torch_dtype = kv_cache_dtype_str_to_dtype( - kv_cache_dtype, vllm_config.model_config + kv_cache_dtype, model_config ) # Initialize KV cache quantization attributes @@ -357,12 +359,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # and head_size <- hidden_size so that we can insert # the hidden states directly into the cache without # reshaping + model_config = vllm_config.model_config self.cache_only_layers = nn.ModuleDict( { str(self.target_num_hidden_layers): CacheOnlyAttentionLayer( num_heads=self.num_hidden_states, head_size=self.hidden_size, cache_config=cache_config, + model_config=model_config, prefix=maybe_prefix( prefix, f"cache_only_layers.{self.target_num_hidden_layers}" ), diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py index cde94673e53a..b31bea4a32e1 100644 --- a/vllm/model_executor/models/glm4_moe_mtp.py +++ b/vllm/model_executor/models/glm4_moe_mtp.py @@ -30,7 +30,7 @@ import torch.nn as nn from transformers import PretrainedConfig -from vllm.config import CacheConfig, ParallelConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -79,6 +79,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, parallel_config: ParallelConfig | None = None, + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -92,6 +93,7 @@ def __init__( config=config, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=prefix, enable_eplb=self.enable_eplb, ) @@ -136,6 +138,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=vllm_config.cache_config, quant_config=vllm_config.quant_config, parallel_config=vllm_config.parallel_config, + model_config=vllm_config.model_config, ) for idx in range( self.mtp_start_layer_idx, diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py index 7abc682c58e5..8ce243f364b4 100644 --- a/vllm/model_executor/models/granitemoeshared.py +++ b/vllm/model_executor/models/granitemoeshared.py @@ -14,7 +14,7 @@ from transformers.models.granitemoeshared import GraniteMoeSharedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm @@ -80,6 
+80,7 @@ def __init__( config: GraniteMoeSharedConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -90,6 +91,7 @@ def __init__( max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, rope_parameters=config.rope_parameters, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", @@ -152,6 +154,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config @@ -170,7 +173,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: GraniteMoeSharedDecoderLayer( - config, cache_config, quant_config=quant_config, prefix=prefix + config, + cache_config=cache_config, + quant_config=quant_config, + model_config=model_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index da0dfe73e6f7..7a6744beab6c 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -7,7 +7,7 @@ from torch import nn from transformers import PretrainedConfig -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization import QuantizationConfig @@ -24,6 +24,7 @@ class InternLM2VEDecoderLayer(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -37,6 +38,7 @@ def __init__( num_kv_heads=config.num_key_value_heads, rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attention", diff --git a/vllm/model_executor/models/kimi_linear.py b/vllm/model_executor/models/kimi_linear.py index 4cd7b63c1472..0f3223ba09d8 100644 --- a/vllm/model_executor/models/kimi_linear.py +++ b/vllm/model_executor/models/kimi_linear.py @@ -195,6 +195,7 @@ def __init__( use_nope: bool = False, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", **kwargs, ) -> None: diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py index e9f1a91bfc4a..9a446b0af1b4 100644 --- a/vllm/model_executor/models/minicpm_eagle.py +++ b/vllm/model_executor/models/minicpm_eagle.py @@ -32,7 +32,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig @@ -62,12 +62,14 @@ def __init__( config: PretrainedConfig, cache_config: 
CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() self.config = config self.cache_config = cache_config self.quant_config = quant_config + self.model_config = model_config self.hidden_size = config.hidden_size self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.prefix = prefix @@ -86,6 +88,7 @@ def _init_attn_block(self): max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, + model_config=self.model_config, prefix=f"{self.prefix}.self_attn", ) @@ -149,10 +152,12 @@ def __init__( config = vllm_config.speculative_config.draft_model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.cache_config = cache_config self.quant_config = quant_config + self.model_config = model_config self.vocab_size = config.vocab_size @@ -166,7 +171,14 @@ def __init__( config.hidden_size, ) self.num_experts = getattr(self.config, "num_experts", 0) - self._init_layers(prefix, config, cache_config, quant_config, start_layer) + self._init_layers( + prefix, + config, + cache_config, + quant_config, + model_config, + start_layer, + ) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], self.config.hidden_size @@ -178,6 +190,7 @@ def _init_layers( config: PretrainedConfig, cache_config: CacheConfig | None, quant_config: QuantizationConfig | None, + model_config: ModelConfig | None, start_layer: int, ): self.eagle_layers = nn.ModuleList( @@ -186,6 +199,7 @@ def _init_layers( config, cache_config, quant_config, + model_config, f"{prefix}.eagle_layers.{i + start_layer}", ) for i in range(self.config.num_hidden_layers) diff --git a/vllm/model_executor/models/mistral.py b/vllm/model_executor/models/mistral.py index ce1332d0c9d1..b174428cf2f1 100644 --- a/vllm/model_executor/models/mistral.py +++ b/vllm/model_executor/models/mistral.py @@ -9,7 +9,7 @@ from transformers import LlamaConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -86,6 +86,7 @@ def __init__( bias: bool = False, bias_o_proj: bool = False, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: @@ -99,6 +100,7 @@ def __init__( bias=bias, bias_o_proj=bias_o_proj, cache_config=cache_config, + model_config=model_config, prefix=prefix, attn_type=attn_type, ) diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index f2f3811c0644..356fc0d0bc6d 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -32,7 +32,7 @@ from transformers import LlamaConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import 
LogitsProcessor @@ -86,6 +86,7 @@ def __init__( bias: bool = False, bias_o_proj: bool = False, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: @@ -99,6 +100,7 @@ def __init__( bias, bias_o_proj, cache_config, + model_config, prefix, attn_type, ) @@ -131,6 +133,7 @@ def __init__( layer_idx: int, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -164,6 +167,7 @@ def __init__( bias=attention_bias, bias_o_proj=bias_o_proj, cache_config=cache_config, + model_config=model_config, prefix=f"{prefix}.self_attn", ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -236,6 +240,7 @@ def __init__( super().__init__() config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config @@ -262,6 +267,7 @@ def get_layer(prefix: str): layer_idx, cache_config, quant_config=quant_config, + model_config=model_config, prefix=prefix, ) From 355a83cdb38d2bb1c3b41a7ac1ff529cd09550fd Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 2 Apr 2026 10:45:23 -0400 Subject: [PATCH 8/8] More missing passes Signed-off-by: Matthew Bonanni --- tests/compile/passes/test_fusion_attn.py | 1 + tests/compile/passes/test_qk_norm_rope_fusion.py | 1 + tests/compile/passes/test_rope_kvcache_fusion.py | 1 + tests/v1/worker/test_gpu_model_runner.py | 4 ++-- vllm/model_executor/models/transformers/base.py | 1 + 5 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/compile/passes/test_fusion_attn.py b/tests/compile/passes/test_fusion_attn.py index 2c5ac7b0b614..5d945b1bcba5 100644 --- a/tests/compile/passes/test_fusion_attn.py +++ b/tests/compile/passes/test_fusion_attn.py @@ -73,6 +73,7 @@ def __init__( scale=1.0 / (self.head_size**0.5), num_kv_heads=self.num_kv_heads, cache_config=vllm_config.cache_config, + model_config=vllm_config.model_config, prefix="model.layers.0.self_attn.attn", ) self.attn._k_scale = self.attn._k_scale.to(device) diff --git a/tests/compile/passes/test_qk_norm_rope_fusion.py b/tests/compile/passes/test_qk_norm_rope_fusion.py index 25b8ea56fe25..a4c0291705a1 100644 --- a/tests/compile/passes/test_qk_norm_rope_fusion.py +++ b/tests/compile/passes/test_qk_norm_rope_fusion.py @@ -66,6 +66,7 @@ def __init__( scale=1.0 / self.head_dim**0.5, num_kv_heads=self.num_kv_heads, cache_config=vllm_config.cache_config, + model_config=vllm_config.model_config, prefix=prefix, attn_type=AttentionType.DECODER, ) diff --git a/tests/compile/passes/test_rope_kvcache_fusion.py b/tests/compile/passes/test_rope_kvcache_fusion.py index eea21c9179bd..be0792c5bc98 100644 --- a/tests/compile/passes/test_rope_kvcache_fusion.py +++ b/tests/compile/passes/test_rope_kvcache_fusion.py @@ -84,6 +84,7 @@ def __init__( scale=1.0 / head_size**0.5, num_kv_heads=num_kv_heads, cache_config=vllm_config.cache_config, + model_config=vllm_config.model_config, quant_config=vllm_config.quant_config, prefix=prefix, attn_backend=attn_backend.get_class(), diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index d7695027a284..6b1362046d25 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -119,7 +119,7 @@ def model_runner(): num_heads = 
model_config.get_num_kv_heads(vllm_config.parallel_config) head_size = model_config.get_head_size() vllm_config.compilation_config.static_forward_context["layer.0"] = Attention( - num_heads, head_size, 0.1 + num_heads, head_size, 0.1, model_config=model_config ) runner = GPUModelRunner(vllm_config, DEVICE) initialize_kv_cache(runner) @@ -1080,7 +1080,7 @@ def test_hybrid_cache_integration(default_vllm_config, dist_init): num_heads = model_config.get_num_kv_heads(vllm_config.parallel_config) head_size = model_config.get_head_size() vllm_config.compilation_config.static_forward_context["layer.0"] = Attention( - num_heads, head_size, 0.1 + num_heads, head_size, 0.1, model_config=model_config ) runner = GPUModelRunner(vllm_config, DEVICE) diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py index d32bfe6cabbd..090dff4058ce 100644 --- a/vllm/model_executor/models/transformers/base.py +++ b/vllm/model_executor/models/transformers/base.py @@ -499,6 +499,7 @@ def create_attention_instances(self) -> dict[int, Attention]: scale=head_size**-0.5, num_kv_heads=num_kv_heads, cache_config=self.cache_config, + model_config=self.model_config, quant_config=self.quant_config, logits_soft_cap=logits_soft_cap, per_layer_sliding_window=per_layer_sliding_window,