diff --git a/tests/compile/passes/test_fusion_attn.py b/tests/compile/passes/test_fusion_attn.py index 2c5ac7b0b614..5d945b1bcba5 100644 --- a/tests/compile/passes/test_fusion_attn.py +++ b/tests/compile/passes/test_fusion_attn.py @@ -73,6 +73,7 @@ def __init__( scale=1.0 / (self.head_size**0.5), num_kv_heads=self.num_kv_heads, cache_config=vllm_config.cache_config, + model_config=vllm_config.model_config, prefix="model.layers.0.self_attn.attn", ) self.attn._k_scale = self.attn._k_scale.to(device) diff --git a/tests/compile/passes/test_qk_norm_rope_fusion.py b/tests/compile/passes/test_qk_norm_rope_fusion.py index 25b8ea56fe25..a4c0291705a1 100644 --- a/tests/compile/passes/test_qk_norm_rope_fusion.py +++ b/tests/compile/passes/test_qk_norm_rope_fusion.py @@ -66,6 +66,7 @@ def __init__( scale=1.0 / self.head_dim**0.5, num_kv_heads=self.num_kv_heads, cache_config=vllm_config.cache_config, + model_config=vllm_config.model_config, prefix=prefix, attn_type=AttentionType.DECODER, ) diff --git a/tests/compile/passes/test_rope_kvcache_fusion.py b/tests/compile/passes/test_rope_kvcache_fusion.py index eea21c9179bd..be0792c5bc98 100644 --- a/tests/compile/passes/test_rope_kvcache_fusion.py +++ b/tests/compile/passes/test_rope_kvcache_fusion.py @@ -84,6 +84,7 @@ def __init__( scale=1.0 / head_size**0.5, num_kv_heads=num_kv_heads, cache_config=vllm_config.cache_config, + model_config=vllm_config.model_config, quant_config=vllm_config.quant_config, prefix=prefix, attn_backend=attn_backend.get_class(), diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index d7695027a284..6b1362046d25 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -119,7 +119,7 @@ def model_runner(): num_heads = model_config.get_num_kv_heads(vllm_config.parallel_config) head_size = model_config.get_head_size() vllm_config.compilation_config.static_forward_context["layer.0"] = Attention( - num_heads, head_size, 0.1 + num_heads, head_size, 0.1, model_config=model_config ) runner = GPUModelRunner(vllm_config, DEVICE) initialize_kv_cache(runner) @@ -1080,7 +1080,7 @@ def test_hybrid_cache_integration(default_vllm_config, dist_init): num_heads = model_config.get_num_kv_heads(vllm_config.parallel_config) head_size = model_config.get_head_size() vllm_config.compilation_config.static_forward_context["layer.0"] = Attention( - num_heads, head_size, 0.1 + num_heads, head_size, 0.1, model_config=model_config ) runner = GPUModelRunner(vllm_config, DEVICE) diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py index 3ff4ec62a6b5..e50d8743b6fb 100644 --- a/vllm/model_executor/layers/attention/attention.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -7,7 +7,7 @@ import torch.nn as nn import vllm.envs as envs -from vllm.config import CacheConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config from vllm.config.vllm import VllmConfig from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger @@ -194,6 +194,7 @@ def __init__( alibi_slopes: list[float] | None = None, use_alibi_sqrt: bool | None = None, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, logits_soft_cap: float | None = None, per_layer_sliding_window: int | None = None, @@ -218,7 +219,6 @@ def __init__( else: sliding_window = None - vllm_config = get_current_vllm_config() if cache_config is not None: kv_cache_dtype = cache_config.cache_dtype calculate_kv_scales = cache_config.calculate_kv_scales @@ -267,7 +267,7 @@ def __init__( ) self.kv_cache_torch_dtype = kv_cache_dtype_str_to_dtype( - kv_cache_dtype, vllm_config.model_config + kv_cache_dtype, model_config ) self.kv_cache_dtype = kv_cache_dtype self.calculate_kv_scales = calculate_kv_scales @@ -286,8 +286,6 @@ def __init__( self.sliding_window = sliding_window self.has_sink = extra_impl_args.get("sinks") is not None - # NOTE: model_config may be None during certain tests - model_config = vllm_config.model_config self.use_mm_prefix = model_config is not None and model_config.is_mm_prefix_lm # During model initialization, the default dtype is set as the model @@ -358,7 +356,7 @@ def __init__( self.use_direct_call = not current_platform.opaque_attention_op() self.use_output = self.attn_backend.accept_output_buffer - compilation_config = vllm_config.compilation_config + compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: raise ValueError(f"Duplicate layer name: {prefix}") compilation_config.static_forward_context[prefix] = self diff --git a/vllm/model_executor/layers/attention/chunked_local_attention.py b/vllm/model_executor/layers/attention/chunked_local_attention.py index 136574d97529..f41bdbb4af11 100644 --- a/vllm/model_executor/layers/attention/chunked_local_attention.py +++ b/vllm/model_executor/layers/attention/chunked_local_attention.py @@ -4,7 +4,7 @@ import torch -from vllm.config import CacheConfig +from vllm.config import CacheConfig, ModelConfig from vllm.config.vllm import VllmConfig from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.quantization import QuantizationConfig @@ -88,6 +88,7 @@ def __init__( num_kv_heads: int | None = None, alibi_slopes: list[float] | None = None, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, kv_sharing_target_layer_name: str | None = None, prefix: str = "", @@ -111,6 +112,7 @@ def __init__( num_kv_heads=num_kv_heads, alibi_slopes=alibi_slopes, cache_config=cache_config, + model_config=model_config, quant_config=quant_config, prefix=prefix, kv_sharing_target_layer_name=kv_sharing_target_layer_name, diff --git a/vllm/model_executor/layers/attention/cross_attention.py b/vllm/model_executor/layers/attention/cross_attention.py index 31ac7fa1bd5a..9e6c191e690f 100644 --- a/vllm/model_executor/layers/attention/cross_attention.py +++ b/vllm/model_executor/layers/attention/cross_attention.py @@ -6,7 +6,7 @@ import numpy as np import torch -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention from vllm.utils.math_utils import cdiv @@ -185,6 +185,7 @@ def __init__( head_size: int, scale: float, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, attn_type: str | None = None, **kwargs, ): @@ -213,6 +214,7 @@ def __init__( head_size=head_size, scale=scale, cache_config=cache_config, + model_config=model_config, attn_backend=attn_backend, attn_type=AttentionType.ENCODER_DECODER, **kwargs, diff --git a/vllm/model_executor/layers/attention/encoder_only_attention.py b/vllm/model_executor/layers/attention/encoder_only_attention.py index 0897ee45b84d..d40d483919da 100644 --- a/vllm/model_executor/layers/attention/encoder_only_attention.py +++ b/vllm/model_executor/layers/attention/encoder_only_attention.py @@ -5,7 +5,7 @@ import torch -from vllm.config import CacheConfig +from vllm.config import CacheConfig, ModelConfig from vllm.config.vllm import VllmConfig from vllm.model_executor.layers.attention import Attention from vllm.v1.attention.backend import ( @@ -59,6 +59,7 @@ def __init__( head_size: int, scale: float, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, attn_type: str | None = None, **kwargs, ): @@ -88,6 +89,7 @@ def __init__( head_size=head_size, scale=scale, cache_config=cache_config, + model_config=model_config, attn_backend=attn_backend, attn_type=AttentionType.ENCODER_ONLY, **kwargs, diff --git a/vllm/model_executor/layers/attention/static_sink_attention.py b/vllm/model_executor/layers/attention/static_sink_attention.py index 263d873218fc..2328ba8e4d7c 100644 --- a/vllm/model_executor/layers/attention/static_sink_attention.py +++ b/vllm/model_executor/layers/attention/static_sink_attention.py @@ -4,7 +4,7 @@ import torch -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp @@ -121,6 +121,7 @@ def __init__( sink_len: int, attn_backend: type[AttentionBackend] | None = None, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, **kwargs, ): dtype = torch.get_default_dtype() @@ -144,6 +145,7 @@ def __init__( head_size=head_size, scale=scale, cache_config=cache_config, + model_config=model_config, attn_backend=attn_backend, **kwargs, ) diff --git a/vllm/model_executor/models/AXK1.py b/vllm/model_executor/models/AXK1.py index f5ed4400fb65..27eb4c739645 100644 --- a/vllm/model_executor/models/AXK1.py +++ b/vllm/model_executor/models/AXK1.py @@ -365,6 +365,7 @@ def __init__( self.qk_head_dim, self.scaling, num_kv_heads=self.num_local_heads, + model_config=vllm_config.model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py index 22037336411a..073f62cca08f 100644 --- a/vllm/model_executor/models/afmoe.py +++ b/vllm/model_executor/models/afmoe.py @@ -10,7 +10,7 @@ from torch import nn from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( get_ep_group, get_pp_group, @@ -180,6 +180,7 @@ def __init__( max_position_embeddings: int = 131072, head_dim: int | None = None, rms_norm_eps: float = 1e-05, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -259,6 +260,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, per_layer_sliding_window=self.sliding_window, @@ -297,6 +299,7 @@ class AfmoeDecoderLayer(nn.Module): def __init__( self, config, # AfmoeConfig + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -319,6 +322,7 @@ def __init__( max_position_embeddings=max_position_embeddings, head_dim=config.head_dim, rms_norm_eps=config.rms_norm_eps, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", @@ -405,10 +409,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.embed_tokens = PPMissingLayer() + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: AfmoeDecoderLayer( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index 5905a198b289..f31be7f12553 100644 --- a/vllm/model_executor/models/apertus.py +++ b/vllm/model_executor/models/apertus.py @@ -33,7 +33,7 @@ from transformers import ApertusConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import XIELU from vllm.model_executor.layers.attention import ( @@ -131,6 +131,7 @@ def __init__( bias: bool = False, bias_o_proj: bool = False, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: @@ -200,6 +201,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, per_layer_sliding_window=sliding_window, attn_type=attn_type, prefix=f"{prefix}.attn", @@ -246,6 +248,7 @@ def __init__( config: ApertusConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -282,6 +285,7 @@ def __init__( bias=attention_bias, bias_o_proj=bias_o_proj, cache_config=cache_config, + model_config=model_config, prefix=f"{prefix}.self_attn", attn_type=attn_type, ) @@ -332,6 +336,7 @@ def __init__( config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.quant_config = quant_config @@ -354,6 +359,7 @@ def __init__( config=config, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=prefix, ), prefix=f"{prefix}.layers", diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py index bc4f85bf7ddb..c0f4e21a93e9 100644 --- a/vllm/model_executor/models/arcee.py +++ b/vllm/model_executor/models/arcee.py @@ -105,6 +105,7 @@ def __init__( config: LlamaConfig, cache_config: Any | None = None, quant_config: Any | None = None, + model_config: Any | None = None, prefix: str = "", ) -> None: super().__init__() @@ -135,6 +136,7 @@ def __init__( bias=attention_bias, bias_o_proj=bias_o_proj, cache_config=cache_config, + model_config=model_config, prefix=f"{prefix}.self_attn", attn_type=getattr( config, "attn_type", "decoder" @@ -191,6 +193,7 @@ def __init__( config: LlamaConfig = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.quant_config = quant_config self.config = config self.vocab_size = config.vocab_size @@ -214,6 +217,7 @@ def __init__( config=config, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=prefix, ), prefix=f"{prefix}.layers", diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 031b6534fb69..c73ddfddbc12 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -9,7 +9,7 @@ from torch import nn from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -230,6 +230,7 @@ class ArcticAttention(nn.Module): def __init__( self, config: ArcticConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -285,6 +286,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -307,6 +309,7 @@ class ArcticDecoderLayer(nn.Module): def __init__( self, config: ArcticConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -318,7 +321,8 @@ def __init__( self.use_residual = config.use_residual and is_moe_layer self.self_attn = ArcticAttention( config, - cache_config, + model_config=model_config, + cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", ) @@ -388,10 +392,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.embed_tokens = VocabParallelEmbedding( self.vocab_size, config.hidden_size, org_num_embeddings=self.vocab_size ) + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: ArcticDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index bc1cd2ed811b..4ce894ae9399 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -30,7 +30,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -140,6 +140,7 @@ def __init__( max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -184,6 +185,7 @@ def __init__( scaling, alibi_slopes=alibi_slopes, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) else: @@ -199,6 +201,7 @@ def __init__( self.scaling, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -223,6 +226,7 @@ def __init__( position_embedding: str, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -236,6 +240,7 @@ def __init__( max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.self_attn", ) self.mlp = BaiChuanMLP( @@ -286,6 +291,7 @@ def __init__( config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.vocab_size = config.vocab_size @@ -297,7 +303,12 @@ def __init__( self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: BaiChuanDecoderLayer( - config, position_embedding, cache_config, quant_config, prefix=prefix + config, + position_embedding, + cache_config, + quant_config, + model_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index 7725dfa2a887..b57288235e81 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -33,7 +33,7 @@ from transformers.configuration_utils import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -73,6 +73,7 @@ class BailingAttention(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, reduce_results: bool = True, @@ -142,6 +143,7 @@ def __init__( self.head_dim, self.scale, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, prefix=f"{prefix}.attn", ) @@ -336,6 +338,7 @@ class BailingMoeBlock(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -348,7 +351,11 @@ def __init__( self.input_layernorm = RMSNorm(hidden_size, eps=config.rms_norm_eps) self.attention = BailingAttention( - config, cache_config, quant_config, prefix=f"{prefix}.attention" + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attention", ) self.post_attention_layernorm = RMSNorm(hidden_size, eps=config.rms_norm_eps) @@ -416,10 +423,12 @@ def __init__( self.embedding_dropout = torch.nn.Dropout(config.embedding_dropout) + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: BailingMoeBlock( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/bailing_moe_linear.py b/vllm/model_executor/models/bailing_moe_linear.py index ecc5d63ced75..09aaa291e43a 100644 --- a/vllm/model_executor/models/bailing_moe_linear.py +++ b/vllm/model_executor/models/bailing_moe_linear.py @@ -113,6 +113,7 @@ def __init__( layer_id: int = 0, prefix: str = "attention", cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -819,6 +820,7 @@ def __init__( layer_id=layer_id, prefix=f"{prefix}.self_attn", cache_config=cache_config, + model_config=model_config, ) # MLP/MoE diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index d220b22ddae7..a00a040ea167 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -215,6 +215,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, prefix=f"{prefix}.attn", + model_config=model_config, ) self.feed_forward = BambaMLP( diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 01854b96d56f..edb88dc086d5 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -128,12 +128,14 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.layer = nn.ModuleList( [ BertLayer( config=config, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.layer.{layer_idx}", ) for layer_idx in range(config.num_hidden_layers) @@ -155,6 +157,7 @@ def __init__( config: BertConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -165,6 +168,7 @@ def __init__( layer_norm_eps=config.layer_norm_eps, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attention", ) @@ -199,6 +203,7 @@ def __init__( layer_norm_eps: float, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -208,6 +213,7 @@ def __init__( num_attention_heads=num_attention_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.output", ) @@ -233,6 +239,7 @@ def __init__( num_attention_heads: int, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -269,6 +276,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 22bcdeb453c4..1a0575c33e29 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -7,7 +7,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( divide, get_tensor_model_parallel_rank, @@ -94,6 +94,7 @@ def __init__( num_attention_heads: int, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, bias: bool = True, rotary_kwargs: dict | None = None, prefix: str = "", @@ -136,6 +137,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -347,6 +349,7 @@ def __init__( config: PretrainedConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, moe: bool = False, bias: bool = True, rotary_kwargs: dict | None = None, @@ -358,6 +361,7 @@ def __init__( num_attention_heads=config.num_attention_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, bias=bias, rotary_kwargs=rotary_kwargs, prefix=f"{prefix}.attention", @@ -411,7 +415,8 @@ def __init__( prefix: str = "", ): super().__init__() - config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config + config = model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config every_n = getattr(config, "moe_every_n_layers", 0) @@ -421,6 +426,7 @@ def __init__( config=config, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, bias=bias, moe=every_n > 0 and (layer_idx % every_n == 1), rotary_kwargs=rotary_kwargs, diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 8b5fd452e8ff..68f706a6de79 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -13,7 +13,7 @@ apply_chunking_to_forward, ) -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.inputs import MultiModalDataDict from vllm.model_executor.layers.activation import get_act_fn @@ -83,6 +83,7 @@ def __init__( *, quant_config: QuantizationConfig | None, cache_config: CacheConfig | None, + model_config: ModelConfig | None = None, is_cross_attention: bool = False, prefix: str = "", ) -> None: @@ -184,6 +185,7 @@ def __init__( *, quant_config: QuantizationConfig | None, cache_config: CacheConfig | None, + model_config: ModelConfig | None = None, is_cross_attention: bool = False, prefix: str = "", ) -> None: @@ -193,6 +195,7 @@ def __init__( config, quant_config=quant_config, cache_config=cache_config, + model_config=model_config, is_cross_attention=is_cross_attention, prefix=f"{prefix}.attention", ) @@ -252,6 +255,7 @@ def __init__( *, quant_config: QuantizationConfig | None, cache_config: CacheConfig | None, + model_config: ModelConfig | None = None, layer_idx: int, prefix: str = "", ) -> None: @@ -263,6 +267,7 @@ def __init__( config, quant_config=quant_config, cache_config=cache_config, + model_config=model_config, prefix=f"{prefix}.attention", ) @@ -273,6 +278,7 @@ def __init__( config, quant_config=quant_config, cache_config=cache_config, + model_config=model_config, is_cross_attention=True, prefix=f"{prefix}.crossattention", ) @@ -345,6 +351,7 @@ def __init__( *, quant_config: QuantizationConfig | None, cache_config: CacheConfig | None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -357,6 +364,7 @@ def __init__( config, quant_config=quant_config, cache_config=cache_config, + model_config=model_config, layer_idx=layer_idx, prefix=f"{prefix}.layer.{layer_idx}", ) @@ -390,6 +398,7 @@ def __init__( *, quant_config: QuantizationConfig | None, cache_config: CacheConfig | None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -403,6 +412,7 @@ def __init__( config, quant_config=quant_config, cache_config=cache_config, + model_config=model_config, prefix=f"{prefix}.encoder", ) @@ -538,6 +548,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config @@ -561,6 +572,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.qformer_config, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.qformer", ) self.language_projection = nn.Linear( diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 233028a905f6..9eb8821ec516 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -28,7 +28,7 @@ from transformers import BloomConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -90,6 +90,7 @@ def __init__( config: BloomConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -133,6 +134,7 @@ def __init__( alibi_slopes=alibi_slopes, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -185,6 +187,7 @@ def __init__( config: BloomConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -192,7 +195,11 @@ def __init__( self.input_layernorm = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) self.self_attention = BloomAttention( - config, cache_config, quant_config, prefix=f"{prefix}.self_attention" + config, + cache_config, + quant_config, + model_config, + prefix=f"{prefix}.self_attention", ) self.post_attention_layernorm = nn.LayerNorm( hidden_size, eps=config.layer_norm_epsilon @@ -243,6 +250,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.embed_dim = config.hidden_size @@ -260,7 +268,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, lambda prefix: BloomBlock( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + model_config, + prefix=prefix, ), prefix=f"{prefix}.h", ) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index a150428baff4..1bfadd4b0e27 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -16,7 +16,7 @@ ChameleonVQVAEConfig, ) -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.inputs import MultiModalDataDict @@ -269,6 +269,7 @@ def __init__( max_position_embeddings: int = 4096, quant_config: QuantizationConfig | None = None, bias: bool = False, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, prefix: str = "", ) -> None: @@ -323,6 +324,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -359,6 +361,7 @@ class ChameleonDecoderLayer(nn.Module): def __init__( self, config: ChameleonConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -377,6 +380,7 @@ def __init__( max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, + model_config=model_config, cache_config=cache_config, prefix=f"{prefix}.self_attn", ) @@ -420,6 +424,7 @@ class ChameleonSwinDecoderLayer(nn.Module): def __init__( self, config: ChameleonConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -438,6 +443,7 @@ def __init__( max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, + model_config=model_config, cache_config=cache_config, prefix=f"{prefix}.self_attn", ) @@ -846,10 +852,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else ChameleonSwinDecoderLayer ) + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: decoder_layer( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index c5d857e7c3df..32aa9fdb1972 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -13,7 +13,7 @@ from torch.nn import LayerNorm from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -51,6 +51,7 @@ def __init__( config: ChatGLMConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -120,6 +121,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -195,6 +197,7 @@ def __init__( config: ChatGLMConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -212,7 +215,11 @@ def __init__( # Self attention. self.self_attention = GLMAttention( - config, cache_config, quant_config, prefix=f"{prefix}.self_attention" + config, + cache_config, + quant_config, + model_config, + prefix=f"{prefix}.self_attention", ) self.hidden_dropout = config.hidden_dropout @@ -268,6 +275,7 @@ def __init__( config: ChatGLMConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -279,7 +287,13 @@ def __init__( # Transformer layers. self.start_layer, self.end_layer, self.layers = make_layers( self.num_layers, - lambda prefix: GLMBlock(config, cache_config, quant_config, prefix=prefix), + lambda prefix: GLMBlock( + config, + cache_config, + quant_config, + model_config, + prefix=prefix, + ), prefix=f"{prefix}.layers", ) @@ -329,6 +343,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config @@ -343,7 +358,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.multi_query_group_num = config.multi_query_group_num self.kv_channels = config.kv_channels self.encoder = GLMTransformer( - config, cache_config, quant_config, prefix=f"{prefix}.encoder" + config, + cache_config, + quant_config, + model_config, + prefix=f"{prefix}.encoder", ) self.output_layer = ParallelLMHead( diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 05a494683a85..a119b06b4a56 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -14,7 +14,7 @@ CLIPVisionConfig, ) -from vllm.config import VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import MultiModalDataDict, MultiModalInput @@ -356,6 +356,7 @@ def __init__( self, config: CLIPTextConfig | CLIPVisionConfig, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, *, prefix: str = "", attn_cls: type[Attention] | type[MMEncoderAttention], @@ -409,6 +410,7 @@ def __init__( self.num_heads_per_partition, self.head_dim, self.scale, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -469,6 +471,7 @@ def __init__( self, config: CLIPTextConfig | CLIPVisionConfig, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, *, prefix: str = "", attn_cls: type[Attention] | type[MMEncoderAttention], @@ -478,6 +481,7 @@ def __init__( self.self_attn = CLIPAttention( config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.self_attn", attn_cls=attn_cls, ) @@ -517,6 +521,7 @@ def __init__( self, config: CLIPTextConfig | CLIPVisionConfig, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, num_hidden_layers_override: int | None = None, *, prefix: str = "", @@ -536,6 +541,7 @@ def __init__( CLIPEncoderLayer( config=config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.layers.{layer_idx}", attn_cls=attn_cls, ) @@ -567,6 +573,7 @@ def __init__( self, config: CLIPTextConfig, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, *, prefix: str = "", ) -> None: @@ -580,6 +587,7 @@ def __init__( self.encoder = CLIPEncoder( config=config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.encoder", attn_cls=Attention, ) @@ -824,6 +832,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config: CLIPConfig = vllm_config.model_config.hf_config + model_config = vllm_config.model_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config self.config = config @@ -840,6 +849,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.text_model = CLIPTextTransformer( text_config, quant_config=quant_config, + model_config=model_config, prefix=maybe_prefix(prefix, "text_model"), ) self.text_projection = nn.Linear( diff --git a/vllm/model_executor/models/cohere_asr.py b/vllm/model_executor/models/cohere_asr.py index 1cebea56a138..418af2693d19 100644 --- a/vllm/model_executor/models/cohere_asr.py +++ b/vllm/model_executor/models/cohere_asr.py @@ -96,6 +96,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() self.embed_dim = embed_dim @@ -148,6 +149,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.attn", attn_type=self.attn_type, + model_config=model_config, ) else: # AttentionType.DECODER (regular decoder self-attention) self.attn = Attention( @@ -159,6 +161,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.attn", attn_type=self.attn_type, + model_config=model_config, ) def _init_qkv( @@ -201,6 +204,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__( embed_dim=embed_dim, @@ -210,6 +214,7 @@ def __init__( quant_config=quant_config, prefix=prefix, attn_type=AttentionType.ENCODER_DECODER, + model_config=model_config, ) def _init_qkv( @@ -347,6 +352,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.first_sub_layer", + model_config=vllm_config.model_config, ) # cross attn to attend to encoder @@ -357,6 +363,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.second_sub_layer", + model_config=vllm_config.model_config, ) self.layer_norm_3 = nn.LayerNorm(self.hidden_dim) diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index e73dfb1f01e3..2c885140c7a0 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -31,7 +31,7 @@ from transformers import Cohere2Config, CohereConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -131,6 +131,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() tp_size = get_tensor_model_parallel_world_size() @@ -198,6 +199,7 @@ def __init__( quant_config=quant_config, per_layer_sliding_window=self.sliding_window, prefix=f"{prefix}.attn", + model_config=model_config, ) if self.use_qk_norm: self.q_norm = LayerNorm( @@ -240,6 +242,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() self.hidden_size = config.hidden_size @@ -249,6 +252,7 @@ def __init__( cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + model_config=model_config, ) self.mlp = CohereMLP(config, quant_config=quant_config, prefix=f"{prefix}.mlp") @@ -284,6 +288,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.quant_config = quant_config self.config = config @@ -296,7 +301,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: CohereDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index ca6e6a49a98a..8c49efe16786 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -8,7 +8,7 @@ import torch.nn as nn from transformers import DbrxConfig -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -187,6 +187,7 @@ class DbrxAttention(nn.Module): def __init__( self, config: DbrxConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -248,6 +249,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -272,6 +274,7 @@ class DbrxFusedNormAttention(nn.Module): def __init__( self, config: DbrxConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -279,7 +282,11 @@ def __init__( super().__init__() self.d_model = config.d_model self.attn = DbrxAttention( - config, cache_config, quant_config, prefix=f"{prefix}.attn" + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", ) self.norm_1 = nn.LayerNorm(self.d_model) self.norm_2 = nn.LayerNorm(self.d_model) @@ -305,13 +312,18 @@ class DbrxBlock(nn.Module): def __init__( self, config: DbrxConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", ): super().__init__() self.norm_attn_norm = DbrxFusedNormAttention( - config, cache_config, quant_config, prefix=f"{prefix}.norm_attn_norm" + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.norm_attn_norm", ) self.ffn = DbrxMoE(config, quant_config, prefix=f"{prefix}.ffn") @@ -342,9 +354,16 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.vocab_size, config.d_model, ) + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.blocks = make_layers( config.n_layers, - lambda prefix: DbrxBlock(config, cache_config, quant_config, prefix=prefix), + lambda prefix: DbrxBlock( + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + ), prefix=f"{prefix}.blocks", ) self.norm_f = nn.LayerNorm(config.d_model, eps=1e-5) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index f50e38b60f8e..3e73b1c4d5ff 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -167,6 +167,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=vllm_config.model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -529,6 +530,7 @@ def __init__( self.qk_head_dim, self.scaling, num_kv_heads=self.num_local_heads, + model_config=vllm_config.model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index 4e393145462a..05a0acc58e57 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -203,6 +203,7 @@ def __init__( num_kv_heads: int, config: Dots1Config, max_position_embeddings: int = 8192, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -258,6 +259,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -301,6 +303,7 @@ def __init__( num_kv_heads=config.num_key_value_heads, config=config, max_position_embeddings=max_position_embeddings, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index f038cfb21f28..23c96a92dc6d 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -33,7 +33,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( get_ep_group, get_pp_group, @@ -239,6 +239,7 @@ def __init__( max_position_embeddings: int = 131072, rms_norm_eps: float = 1e-05, qkv_bias: bool = False, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -298,6 +299,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -324,6 +326,7 @@ class Ernie4_5_MoeDecoderLayer(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -342,6 +345,7 @@ def __init__( max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "use_bias", False), + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", @@ -439,10 +443,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.embed_tokens = PPMissingLayer() + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Ernie4_5_MoeDecoderLayer( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index 418fdcfa072b..12d81dddaea4 100644 --- a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -32,7 +32,7 @@ from transformers import PretrainedConfig # from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention @@ -97,6 +97,7 @@ def __init__( max_position_embeddings: int = 131072, rms_norm_eps: float = 1e-05, qkv_bias: bool = False, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -164,6 +165,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -407,6 +409,7 @@ class Ernie4_5_VLMoeDecoderLayer(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -427,6 +430,7 @@ def __init__( max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "use_bias", False), + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", @@ -538,10 +542,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.embed_tokens = PPMissingLayer() + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Ernie4_5_VLMoeDecoderLayer( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index b633fd285082..2cbf7b70752e 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -33,7 +33,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -117,6 +117,7 @@ def __init__( bias: bool = False, cache_config: CacheConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -179,6 +180,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( @@ -206,6 +208,7 @@ def __init__( bias: bool = False, cache_config: CacheConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.attention = ExaoneAttention( @@ -218,6 +221,7 @@ def __init__( bias=bias, cache_config=cache_config, prefix=f"{prefix}.attention", + model_config=model_config, ) def forward( @@ -238,6 +242,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -259,6 +264,7 @@ def __init__( bias=attention_bias, cache_config=cache_config, prefix=f"{prefix}.attn", + model_config=model_config, ) self.mlp = ExaoneGatedMLP( hidden_size=self.hidden_size, @@ -318,6 +324,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) else: self.wte = PPMissingLayer() + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, lambda prefix: ExaoneDecoderLayer( @@ -325,6 +332,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=cache_config, quant_config=quant_config, prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.h", ) diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index 485b145b9cdf..13cbdc8045a2 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -29,7 +29,7 @@ from transformers import Exaone4Config from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -117,6 +117,7 @@ def __init__( bias: bool = False, cache_config: CacheConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -191,6 +192,7 @@ def __init__( quant_config=quant_config, per_layer_sliding_window=self.sliding_window, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( @@ -222,6 +224,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -244,6 +247,7 @@ def __init__( bias=attention_bias, cache_config=cache_config, prefix=f"{prefix}.self_attn", + model_config=model_config, ) self.mlp = Exaone4GatedMLP( hidden_size=self.hidden_size, @@ -313,6 +317,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) else: self.embed_tokens = PPMissingLayer() + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Exaone4DecoderLayer( @@ -320,6 +325,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=cache_config, quant_config=quant_config, prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/exaone_moe.py b/vllm/model_executor/models/exaone_moe.py index d7282edcf4f6..41d40ea41ca6 100644 --- a/vllm/model_executor/models/exaone_moe.py +++ b/vllm/model_executor/models/exaone_moe.py @@ -24,7 +24,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( get_ep_group, get_pp_group, @@ -179,6 +179,7 @@ def __init__( config: PretrainedConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, mtp_layer: bool = None, prefix: str = "", ) -> None: @@ -204,6 +205,7 @@ def __init__( bias=attention_bias, cache_config=cache_config, prefix=f"{prefix}.self_attn", + model_config=model_config, ) if config.is_moe_layer[layer_idx] and not mtp_layer: @@ -254,7 +256,8 @@ class ExaoneMoeModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config + config = model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config @@ -287,6 +290,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config=config, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=prefix, ), prefix=f"{prefix}.layers", diff --git a/vllm/model_executor/models/extract_hidden_states.py b/vllm/model_executor/models/extract_hidden_states.py index 608e93d6a930..c3b88186c3c0 100644 --- a/vllm/model_executor/models/extract_hidden_states.py +++ b/vllm/model_executor/models/extract_hidden_states.py @@ -14,7 +14,7 @@ import torch import torch.nn as nn -from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from vllm.config.cache import CacheDType from vllm.forward_context import get_forward_context from vllm.model_executor.layers.attention.attention import set_default_quant_scales @@ -240,6 +240,7 @@ def __init__( num_heads: int, head_size: int, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ): @@ -250,6 +251,7 @@ def __init__( self.layer_name = prefix vllm_config = get_current_vllm_config() + model_config = model_config or vllm_config.model_config # KV cache configuration cache_config = cache_config or vllm_config.cache_config @@ -265,7 +267,7 @@ def __init__( f"kv cache dtype was set to {kv_cache_dtype}" ) self.kv_cache_torch_dtype = kv_cache_dtype_str_to_dtype( - kv_cache_dtype, vllm_config.model_config + kv_cache_dtype, model_config ) # Initialize KV cache quantization attributes @@ -357,12 +359,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # and head_size <- hidden_size so that we can insert # the hidden states directly into the cache without # reshaping + model_config = vllm_config.model_config self.cache_only_layers = nn.ModuleDict( { str(self.target_num_hidden_layers): CacheOnlyAttentionLayer( num_heads=self.num_hidden_states, head_size=self.hidden_size, cache_config=cache_config, + model_config=model_config, prefix=maybe_prefix( prefix, f"cache_only_layers.{self.target_num_hidden_layers}" ), diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index efd24b51442a..19a6a899e1d0 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -31,7 +31,7 @@ from transformers import FalconConfig as HF_FalconConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -97,6 +97,7 @@ def __init__( config: FalconConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -176,6 +177,7 @@ def __init__( self.inv_norm_factor, num_kv_heads=self.num_kv_heads, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) elif self.use_alibi: @@ -193,6 +195,7 @@ def __init__( num_kv_heads=self.num_kv_heads, alibi_slopes=alibi_slopes, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) else: @@ -203,6 +206,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -270,13 +274,18 @@ def __init__( config: FalconConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() hidden_size = config.hidden_size self.num_heads = config.num_attention_heads self.self_attention = FalconAttention( - config, cache_config, quant_config, prefix=f"{prefix}.self_attention" + config, + cache_config, + quant_config, + model_config, + prefix=f"{prefix}.self_attention", ) self.mlp = FalconMLP(config, quant_config, prefix=f"{prefix}.mlp") self.config = config @@ -370,6 +379,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.embed_dim = config.hidden_size @@ -386,7 +396,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, lambda prefix: FalconDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + model_config, + prefix=prefix, ), prefix=f"{prefix}.h", ) diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index fba2e216e3fa..97b65c5055d6 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -215,6 +215,7 @@ class FalconH1AttentionDecoderLayer(nn.Module): def __init__( self, config: FalconH1Config, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -280,6 +281,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -340,6 +342,7 @@ def __init__( # Instantiate the attention branch self.self_attn = FalconH1AttentionDecoderLayer( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 6e35020a6eac..237bc4af57e3 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -27,7 +27,7 @@ from transformers import GemmaConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import GeluAndMul @@ -133,6 +133,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -186,6 +187,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( @@ -208,6 +210,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -221,6 +224,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + model_config=model_config, ) self.mlp = GemmaMLP( hidden_size=self.hidden_size, @@ -266,6 +270,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config @@ -276,7 +281,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: GemmaDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 425ecc65195a..dcea7d13b6b5 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -24,7 +24,7 @@ from transformers import Gemma2Config from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import GeluAndMul @@ -109,6 +109,7 @@ def __init__( quant_config: QuantizationConfig | None = None, attn_logits_soft_cap: float | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.config = config @@ -169,6 +170,7 @@ def __init__( logits_soft_cap=attn_logits_soft_cap, per_layer_sliding_window=sliding_window, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( @@ -191,6 +193,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -205,6 +208,7 @@ def __init__( quant_config=quant_config, attn_logits_soft_cap=config.attn_logit_softcapping, prefix=f"{prefix}.self_attn", + model_config=model_config, ) self.hidden_size = config.hidden_size self.mlp = Gemma2MLP( @@ -257,6 +261,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.quant_config = quant_config @@ -267,7 +272,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Gemma2DecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index b2352a3c9268..e9963a9d1a7b 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -23,7 +23,7 @@ from transformers import Gemma3TextConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import GeluAndMul @@ -116,6 +116,7 @@ def __init__( quant_config: QuantizationConfig | None = None, attn_logits_soft_cap: float | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.config = config @@ -207,6 +208,7 @@ def __init__( logits_soft_cap=attn_logits_soft_cap, per_layer_sliding_window=sliding_window, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( @@ -238,6 +240,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -252,6 +255,7 @@ def __init__( quant_config=quant_config, attn_logits_soft_cap=None, prefix=f"{prefix}.self_attn", + model_config=model_config, ) self.hidden_size = config.hidden_size self.mlp = Gemma3MLP( @@ -306,6 +310,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.quant_config = quant_config @@ -318,7 +323,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Gemma3DecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index 770424ba0fdf..ac4e3c21d542 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -22,7 +22,7 @@ from transformers.models.gemma3n.configuration_gemma3n import Gemma3nTextConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.forward_context import get_forward_context from vllm.logger import init_logger @@ -287,6 +287,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.config = config @@ -399,6 +400,7 @@ def __init__( per_layer_sliding_window=self.sliding_window, kv_sharing_target_layer_name=kv_sharing_target_layer_name, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( @@ -434,6 +436,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() assert isinstance(config, Gemma3nTextConfig) @@ -459,6 +462,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + model_config=model_config, ) self.mlp = Gemma3nMLP( hidden_size=config.hidden_size, @@ -812,10 +816,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) # Allocate config.num_kv_shared_layers layers for self-decoder + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Gemma3nDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index 89447927d5cd..e28699497557 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -30,7 +30,7 @@ from transformers import Glm4Config from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm @@ -69,6 +69,7 @@ def __init__( qkv_bias: bool = False, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: @@ -130,6 +131,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", attn_type=attn_type, ) @@ -159,6 +161,7 @@ def __init__( config = config or vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.hidden_size = config.hidden_size @@ -172,6 +175,7 @@ def __init__( head_dim=getattr(config, "head_dim", None), cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.self_attn", attn_type=AttentionType.DECODER, ) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index d0e6cb6ada8b..82d2b1fcfb23 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -33,7 +33,7 @@ from transformers.models.glm4_moe import Glm4MoeConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( get_ep_group, get_pp_group, @@ -241,6 +241,7 @@ def __init__( use_qk_norm: bool = False, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -297,6 +298,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -331,6 +333,7 @@ def __init__( config: Glm4MoeConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", enable_eplb: bool = False, ) -> None: @@ -353,6 +356,7 @@ def __init__( qkv_bias=config.attention_bias, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.self_attn", use_qk_norm=config.use_qk_norm, ) @@ -414,6 +418,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config enable_eplb = vllm_config.parallel_config.enable_eplb self.config = config @@ -432,6 +437,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config=config, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=prefix, enable_eplb=enable_eplb, ), diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py index cde94673e53a..b31bea4a32e1 100644 --- a/vllm/model_executor/models/glm4_moe_mtp.py +++ b/vllm/model_executor/models/glm4_moe_mtp.py @@ -30,7 +30,7 @@ import torch.nn as nn from transformers import PretrainedConfig -from vllm.config import CacheConfig, ParallelConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -79,6 +79,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, parallel_config: ParallelConfig | None = None, + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -92,6 +93,7 @@ def __init__( config=config, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=prefix, enable_eplb=self.enable_eplb, ) @@ -136,6 +138,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=vllm_config.cache_config, quant_config=vllm_config.quant_config, parallel_config=vllm_config.parallel_config, + model_config=vllm_config.model_config, ) for idx in range( self.mtp_start_layer_idx, diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 41a4ca174257..0c568b0d5792 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -28,7 +28,7 @@ from transformers import GPT2Config from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed.parallel_state import ( get_pp_group, get_tensor_model_parallel_world_size, @@ -66,6 +66,7 @@ def __init__( config: GPT2Config, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -98,6 +99,7 @@ def __init__( scale=self.scale, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -151,6 +153,7 @@ def __init__( config: GPT2Config, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -159,7 +162,11 @@ def __init__( self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) self.attn = GPT2Attention( - config, cache_config, quant_config, prefix=f"{prefix}.attn" + config, + cache_config, + quant_config, + model_config, + prefix=f"{prefix}.attn", ) self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) self.mlp = GPT2MLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp") @@ -190,6 +197,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config assert not config.add_cross_attention @@ -205,7 +213,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, - lambda prefix: GPT2Block(config, cache_config, quant_config, prefix=prefix), + lambda prefix: GPT2Block( + config, + cache_config, + quant_config, + model_config, + prefix=prefix, + ), prefix=f"{prefix}.h", ) self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index c6629c937dc6..5b23100fe1cd 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -29,7 +29,7 @@ from transformers import GPTBigCodeConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.attention import Attention @@ -63,6 +63,7 @@ def __init__( config: GPTBigCodeConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -106,6 +107,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -166,6 +168,7 @@ def __init__( config: GPTBigCodeConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -174,7 +177,11 @@ def __init__( self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) self.attn = GPTBigCodeAttention( - config, cache_config, quant_config, prefix=f"{prefix}.attn" + config, + cache_config, + quant_config, + model_config, + prefix=f"{prefix}.attn", ) self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) self.mlp = GPTBigMLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp") @@ -207,6 +214,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config assert not config.add_cross_attention @@ -221,7 +229,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, lambda prefix: GPTBigCodeBlock( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + model_config, + prefix=prefix, ), prefix=f"{prefix}.h", ) diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index c29103c6d52c..3eea5a56c4a3 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -27,7 +27,7 @@ from transformers import GPTJConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.attention import Attention @@ -65,6 +65,7 @@ def __init__( config: GPTJConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -110,6 +111,7 @@ def __init__( scaling, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -163,13 +165,18 @@ def __init__( config: GPTJConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() inner_dim = 4 * config.n_embd if config.n_inner is None else config.n_inner self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) self.attn = GPTJAttention( - config, cache_config, quant_config, prefix=f"{prefix}.attn" + config, + cache_config, + quant_config, + model_config, + prefix=f"{prefix}.attn", ) self.mlp = GPTJMLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp") @@ -197,6 +204,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.quant_config = quant_config @@ -207,7 +215,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.h = make_layers( config.n_layer, - lambda prefix: GPTJBlock(config, cache_config, quant_config, prefix=prefix), + lambda prefix: GPTJBlock( + config, + cache_config, + quant_config, + model_config, + prefix=prefix, + ), prefix=f"{prefix}.h", ) self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 8d44d12fc212..781ff3350763 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -27,7 +27,7 @@ from transformers import GPTNeoXConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.attention import Attention @@ -62,6 +62,7 @@ def __init__( config: GPTNeoXConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -102,6 +103,7 @@ def __init__( scaling, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -153,6 +155,7 @@ def __init__( config: GPTNeoXConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -164,7 +167,11 @@ def __init__( config.hidden_size, eps=config.layer_norm_eps ) self.attention = GPTNeoXAttention( - config, cache_config, quant_config, prefix=f"{prefix}.attention" + config, + cache_config, + quant_config, + model_config, + prefix=f"{prefix}.attention", ) self.mlp = GPTNeoXMLP(config, quant_config, prefix=f"{prefix}.mlp") @@ -204,6 +211,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config @@ -214,7 +222,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: GPTNeoXLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + model_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index a9ec82974227..0ea82f3c1ca7 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -9,7 +9,7 @@ from transformers import GptOssConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_dp_group, get_ep_group, @@ -71,6 +71,7 @@ def __init__( config: GptOssConfig, quant_config: QuantizationConfig | None = None, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -138,6 +139,7 @@ def __init__( num_kv_heads=self.num_local_key_value_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, per_layer_sliding_window=sliding_window, attn_type=AttentionType.DECODER, prefix=f"{prefix}.attn", @@ -229,6 +231,7 @@ def __init__( config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config + model_config = vllm_config.model_config self.layer_idx = extract_layer_index(prefix) self.attn = OAIAttention( @@ -236,6 +239,7 @@ def __init__( prefix=f"{prefix}.attn", quant_config=quant_config, cache_config=cache_config, + model_config=model_config, ) self.mlp = MLPBlock(vllm_config, self.layer_idx, prefix=f"{prefix}.mlp") self.input_layernorm = RMSNorm(config.hidden_size, eps=1e-5) diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 4b486ede4439..dd18403ad03e 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -32,7 +32,7 @@ from transformers import GraniteConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -113,6 +113,7 @@ def __init__( max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, prefix: str = "", ) -> None: @@ -168,6 +169,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -190,6 +192,7 @@ class GraniteDecoderLayer(nn.Module): def __init__( self, config: GraniteConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -213,6 +216,7 @@ def __init__( max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, + model_config=model_config, cache_config=cache_config, prefix=f"{prefix}.self_attn", ) @@ -273,10 +277,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) else: self.embed_tokens = PPMissingLayer() + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: GraniteDecoderLayer( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 171b2e0ec5a0..3aa333302823 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -32,7 +32,7 @@ from torch import nn from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_world_size, @@ -142,6 +142,7 @@ def __init__( num_kv_heads: int, max_position: int = 4096 * 32, rope_parameters: dict[str, Any] | None = None, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, attention_multiplier: float | None = None, @@ -199,6 +200,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -226,6 +228,7 @@ def __init__( super().__init__() config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config parallel_config = vllm_config.parallel_config @@ -237,6 +240,7 @@ def __init__( max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, rope_parameters=config.rope_parameters, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 1ab069e3ba38..5c38d0bceec3 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -288,6 +288,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py index 7abc682c58e5..8ce243f364b4 100644 --- a/vllm/model_executor/models/granitemoeshared.py +++ b/vllm/model_executor/models/granitemoeshared.py @@ -14,7 +14,7 @@ from transformers.models.granitemoeshared import GraniteMoeSharedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm @@ -80,6 +80,7 @@ def __init__( config: GraniteMoeSharedConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -90,6 +91,7 @@ def __init__( max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, rope_parameters=config.rope_parameters, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", @@ -152,6 +154,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config @@ -170,7 +173,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: GraniteMoeSharedDecoderLayer( - config, cache_config, quant_config=quant_config, prefix=prefix + config, + cache_config=cache_config, + quant_config=quant_config, + model_config=model_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 0bd6a8f3d606..6e3aa634ff72 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -33,7 +33,7 @@ from torch import nn from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import GeluAndMul @@ -240,6 +240,7 @@ def __init__( num_kv_heads: int, max_position: int = 4096 * 32, rope_parameters: dict[str, Any] | None = None, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -306,6 +307,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, logits_soft_cap=attn_logits_soft_cap, @@ -333,6 +335,7 @@ class Grok1DecoderLayer(nn.Module): def __init__( self, config, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -352,6 +355,7 @@ def __init__( max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, rope_parameters=_get_rope_parameters(config), + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -470,10 +474,15 @@ def __init__( quant_config=quant_config, ) + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Grok1DecoderLayer( - config, cache_config, quant_config=quant_config, prefix=prefix + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index a0130402c66f..2925b686df16 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -34,7 +34,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( get_ep_group, get_pp_group, @@ -150,6 +150,7 @@ def __init__( num_heads: int, num_kv_heads: int, max_position_embeddings: int = 8192, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, bias: bool = False, cache_config: CacheConfig | None = None, @@ -215,6 +216,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -257,6 +259,7 @@ def __init__( num_heads: int, num_kv_heads: int, max_position_embeddings: int = 8192, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, bias: bool = False, cache_config: CacheConfig | None = None, @@ -320,6 +323,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -477,6 +481,7 @@ class HunYuanDecoderLayer(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -511,6 +516,7 @@ def __init__( config, "num_key_value_heads", config.num_attention_heads ), max_position_embeddings=max_position_embeddings, + model_config=model_config, quant_config=quant_config, bias=attention_bias, cache_config=cache_config, @@ -526,6 +532,7 @@ def __init__( config, "num_key_value_heads", config.num_attention_heads ), max_position_embeddings=max_position_embeddings, + model_config=model_config, quant_config=quant_config, bias=attention_bias, cache_config=cache_config, @@ -620,10 +627,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) else: self.embed_tokens = PPMissingLayer() + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: HunYuanDecoderLayer( config=config, + model_config=model_config, layer_id=int(prefix.split(".")[-1]), cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/hyperclovax.py b/vllm/model_executor/models/hyperclovax.py index 3176c4284139..9769be81f082 100644 --- a/vllm/model_executor/models/hyperclovax.py +++ b/vllm/model_executor/models/hyperclovax.py @@ -33,7 +33,7 @@ from torch import nn from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -119,6 +119,7 @@ def __init__( num_heads: int, num_kv_heads: int, max_position_embeddings: int = 8192, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, bias: bool = False, cache_config: CacheConfig | None = None, @@ -179,6 +180,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -206,6 +208,7 @@ def __init__( super().__init__() config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config @@ -231,6 +234,7 @@ def __init__( config, "num_key_value_heads", config.num_attention_heads ), max_position_embeddings=max_position_embeddings, + model_config=model_config, quant_config=quant_config, bias=attention_bias, cache_config=cache_config, diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index c00b9a0ee671..f4593ac55a6b 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -11,7 +11,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -95,6 +95,7 @@ def __init__( num_kv_heads: int, rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -150,6 +151,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -196,6 +198,7 @@ class InternLMDecoderLayer(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -209,6 +212,7 @@ def __init__( num_kv_heads=config.num_key_value_heads, rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attention", @@ -258,6 +262,7 @@ def __init__( super().__init__() config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config @@ -270,7 +275,11 @@ def __init__( self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: layer_type( - config, cache_config, quant_config, prefix=prefix + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index da0dfe73e6f7..7a6744beab6c 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -7,7 +7,7 @@ from torch import nn from transformers import PretrainedConfig -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization import QuantizationConfig @@ -24,6 +24,7 @@ class InternLM2VEDecoderLayer(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -37,6 +38,7 @@ def __init__( num_kv_heads=config.num_key_value_heads, rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attention", diff --git a/vllm/model_executor/models/interns1_pro.py b/vllm/model_executor/models/interns1_pro.py index 28331b8ef3e8..54e9d277d344 100644 --- a/vllm/model_executor/models/interns1_pro.py +++ b/vllm/model_executor/models/interns1_pro.py @@ -32,7 +32,7 @@ from torch import nn from transformers import AutoProcessor, PretrainedConfig -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_ep_group, get_tensor_model_parallel_world_size, @@ -272,6 +272,7 @@ def __init__( head_dim: int | None = None, rms_norm_eps: float = 1e-06, qkv_bias: bool = False, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -331,6 +332,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -371,6 +373,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() config = vllm_config.model_config.hf_text_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config @@ -403,6 +406,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), head_dim=getattr(config, "head_dim", None), + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/iquest_loopcoder.py b/vllm/model_executor/models/iquest_loopcoder.py index 24c004ff4c20..ab86db91321c 100644 --- a/vllm/model_executor/models/iquest_loopcoder.py +++ b/vllm/model_executor/models/iquest_loopcoder.py @@ -25,7 +25,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm @@ -66,6 +66,7 @@ def __init__( num_kv_heads: int, max_position: int = 4096 * 32, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, @@ -159,6 +160,7 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=loop_cache_config, + model_config=model_config, quant_config=quant_config, attn_type=attn_type, prefix=f"{unique_prefix}.attn", diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index cc0c1aa01baf..ef723222c56e 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -29,7 +29,7 @@ from torch import nn from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -88,6 +88,7 @@ def __init__( config: JAISConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -133,6 +134,7 @@ def __init__( alibi_slopes=alibi_slopes, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -205,6 +207,7 @@ def __init__( config: JAISConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -213,7 +216,11 @@ def __init__( self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) self.attn = JAISAttention( - config, cache_config, quant_config, prefix=f"{prefix}.attn" + config, + cache_config, + quant_config, + model_config, + prefix=f"{prefix}.attn", ) self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) self.mlp = JAISMLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp") @@ -246,6 +253,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config assert not config.scale_attn_by_inverse_layer_idx @@ -268,6 +276,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config=config, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=prefix, ), prefix=f"{prefix}.h", diff --git a/vllm/model_executor/models/jais2.py b/vllm/model_executor/models/jais2.py index 4e03eb12ee44..3fc207225eba 100644 --- a/vllm/model_executor/models/jais2.py +++ b/vllm/model_executor/models/jais2.py @@ -32,7 +32,7 @@ from transformers import Jais2Config from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_world_size, @@ -114,6 +114,7 @@ def __init__( quant_config: QuantizationConfig | None = None, bias: bool = False, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -192,6 +193,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, per_layer_sliding_window=sliding_window, prefix=f"{prefix}.attn", ) @@ -221,6 +223,7 @@ def __init__( config = config or vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = self.get_quant_config(vllm_config) + model_config = vllm_config.model_config self.hidden_size = config.hidden_size max_position_embeddings = getattr(config, "max_position_embeddings", 8192) @@ -240,6 +243,7 @@ def __init__( quant_config=quant_config, bias=attention_bias, cache_config=cache_config, + model_config=model_config, prefix=f"{prefix}.self_attn", ) self.mlp = Jais2MLP( diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 980bcffb5f9b..c1e088b026a8 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -239,6 +239,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, prefix=f"{prefix}.attn", + model_config=model_config, ) num_experts = config.layers_num_experts[layer_idx] diff --git a/vllm/model_executor/models/kimi_linear.py b/vllm/model_executor/models/kimi_linear.py index 4cd7b63c1472..0f3223ba09d8 100644 --- a/vllm/model_executor/models/kimi_linear.py +++ b/vllm/model_executor/models/kimi_linear.py @@ -195,6 +195,7 @@ def __init__( use_nope: bool = False, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", **kwargs, ) -> None: diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py index 95a8cdb8711d..40e1b70923f2 100644 --- a/vllm/model_executor/models/lfm2.py +++ b/vllm/model_executor/models/lfm2.py @@ -101,6 +101,7 @@ def __init__( num_heads: int, num_kv_heads: int, max_position_embeddings: int = 8192, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -157,6 +158,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, prefix=f"{prefix}.attn", ) @@ -207,6 +209,7 @@ def __init__( num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, max_position_embeddings=max_position_embeddings, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py index d955b7127adc..562acb564708 100644 --- a/vllm/model_executor/models/lfm2_moe.py +++ b/vllm/model_executor/models/lfm2_moe.py @@ -192,6 +192,7 @@ def __init__( num_heads: int, num_kv_heads: int, max_position_embeddings: int = 8192, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -248,6 +249,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, prefix=f"{prefix}.attn", ) @@ -299,6 +301,7 @@ def __init__( num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, max_position_embeddings=max_position_embeddings, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2ecced3df8ba..3cc29cf1e9dc 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -32,7 +32,7 @@ from transformers import LlamaConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import ( @@ -133,6 +133,7 @@ def __init__( bias: bool = False, bias_o_proj: bool = False, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: @@ -214,6 +215,7 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, + model_config=model_config, quant_config=quant_config, per_layer_sliding_window=sliding_window, attn_type=attn_type, @@ -260,7 +262,8 @@ def __init__( ) -> None: super().__init__() - config = config or vllm_config.model_config.hf_config + model_config = vllm_config.model_config + config = config or model_config.hf_config cache_config = vllm_config.cache_config quant_config = self.get_quant_config(vllm_config) @@ -297,6 +300,7 @@ def __init__( bias=attention_bias, bias_o_proj=bias_o_proj, cache_config=cache_config, + model_config=model_config, prefix=f"{prefix}.self_attn", attn_type=attn_type, ) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index b84b4e2ae512..22232f2309d2 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -25,7 +25,7 @@ from transformers import Llama4TextConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_ep_group, get_tensor_model_parallel_world_size, @@ -180,6 +180,7 @@ def __init__( bias: bool = False, bias_o_proj: bool = False, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -263,6 +264,7 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, + model_config=model_config, quant_config=quant_config, prefix=f"{prefix}.attn", **( @@ -325,6 +327,7 @@ def __init__( config = config or vllm_config.model_config.hf_config cache_config = vllm_config.cache_config + model_config = vllm_config.model_config quant_config = vllm_config.quant_config self.layer_idx = extract_layer_index(prefix) @@ -342,6 +345,7 @@ def __init__( bias=False, bias_o_proj=False, cache_config=cache_config, + model_config=model_config, prefix=f"{prefix}.self_attn", ) is_moe_layer = ( diff --git a/vllm/model_executor/models/mimo_v2_flash.py b/vllm/model_executor/models/mimo_v2_flash.py index 43475ed690c9..49a825335881 100644 --- a/vllm/model_executor/models/mimo_v2_flash.py +++ b/vllm/model_executor/models/mimo_v2_flash.py @@ -8,6 +8,7 @@ from vllm.config import ( CacheConfig, + ModelConfig, VllmConfig, get_current_vllm_config, str_dtype_to_torch_dtype, @@ -221,6 +222,7 @@ def __init__( max_position_embeddings: int = 32768, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, partial_rotary_factor: float = 1.0, prefix: str = "", ) -> None: @@ -292,6 +294,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, per_layer_sliding_window=sliding_window, attn_type=AttentionType.DECODER, prefix=f"{prefix}.attn", @@ -328,7 +331,8 @@ def forward( class MiMoV2FlashDecoderLayer(nn.Module): def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() - config = vllm_config.model_config.hf_text_config + model_config = vllm_config.model_config + config = model_config.hf_text_config quant_config = vllm_config.quant_config layer_id = extract_layer_index(prefix) @@ -358,6 +362,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: rope_theta=getattr(config, "swa_rope_theta", rope_theta), max_position_embeddings=max_position_embeddings, quant_config=quant_config, + model_config=model_config, partial_rotary_factor=getattr(config, "partial_rotary_factor", 1.0), prefix=f"{prefix}.self_attn", ) @@ -375,6 +380,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: rope_theta=rope_theta, max_position_embeddings=max_position_embeddings, quant_config=quant_config, + model_config=model_config, partial_rotary_factor=getattr(config, "partial_rotary_factor", 1.0), prefix=f"{prefix}.self_attn", ) diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 54870eb2ede4..a9fda85b9b52 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -34,7 +34,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -243,6 +243,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -297,6 +298,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( @@ -319,11 +321,13 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.config = config self.cache_config = cache_config self.quant_config = quant_config + self.model_config = model_config self.hidden_size = config.hidden_size self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.prefix = prefix @@ -343,6 +347,7 @@ def _init_attn_block(self): cache_config=self.cache_config, quant_config=self.quant_config, prefix=f"{self.prefix}.self_attn", + model_config=self.model_config, ) def _init_ffn_block(self): @@ -416,7 +421,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.hidden_size, ) self.num_experts = getattr(self.config, "num_experts", 0) - self._init_layers(prefix, config, cache_config, quant_config) + model_config = vllm_config.model_config + self._init_layers( + prefix, config, cache_config, quant_config, model_config=model_config + ) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( @@ -429,11 +437,16 @@ def _init_layers( config: PretrainedConfig, cache_config: CacheConfig | None, quant_config: QuantizationConfig | None, + model_config: ModelConfig | None = None, ): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: MiniCPMDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index e61e9d06103d..4a9068bb0fd7 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -29,7 +29,7 @@ from torch import nn from transformers import PretrainedConfig -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm @@ -64,6 +64,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -131,6 +132,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( @@ -201,6 +203,7 @@ def _init_attn_block(self): cache_config=self.cache_config, quant_config=self.quant_config, prefix=f"{self.prefix}.self_attn", + model_config=self.model_config, ) @@ -211,11 +214,16 @@ def _init_layers( config: PretrainedConfig, cache_config: CacheConfig | None, quant_config: QuantizationConfig | None, + model_config: ModelConfig | None = None, ): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: MiniCPM3DecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py index e9f1a91bfc4a..9a446b0af1b4 100644 --- a/vllm/model_executor/models/minicpm_eagle.py +++ b/vllm/model_executor/models/minicpm_eagle.py @@ -32,7 +32,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig @@ -62,12 +62,14 @@ def __init__( config: PretrainedConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() self.config = config self.cache_config = cache_config self.quant_config = quant_config + self.model_config = model_config self.hidden_size = config.hidden_size self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.prefix = prefix @@ -86,6 +88,7 @@ def _init_attn_block(self): max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, + model_config=self.model_config, prefix=f"{self.prefix}.self_attn", ) @@ -149,10 +152,12 @@ def __init__( config = vllm_config.speculative_config.draft_model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.cache_config = cache_config self.quant_config = quant_config + self.model_config = model_config self.vocab_size = config.vocab_size @@ -166,7 +171,14 @@ def __init__( config.hidden_size, ) self.num_experts = getattr(self.config, "num_experts", 0) - self._init_layers(prefix, config, cache_config, quant_config, start_layer) + self._init_layers( + prefix, + config, + cache_config, + quant_config, + model_config, + start_layer, + ) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], self.config.hidden_size @@ -178,6 +190,7 @@ def _init_layers( config: PretrainedConfig, cache_config: CacheConfig | None, quant_config: QuantizationConfig | None, + model_config: ModelConfig | None, start_layer: int, ): self.eagle_layers = nn.ModuleList( @@ -186,6 +199,7 @@ def _init_layers( config, cache_config, quant_config, + model_config, f"{prefix}.eagle_layers.{i + start_layer}", ) for i in range(self.config.num_hidden_layers) diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py index 0f43bc0cdcec..5d5d13b516ae 100644 --- a/vllm/model_executor/models/minimax_m2.py +++ b/vllm/model_executor/models/minimax_m2.py @@ -153,6 +153,7 @@ def __init__( head_dim: int | None = None, rms_norm_eps: float = 1e-06, qkv_bias: bool = False, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -213,6 +214,7 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, per_layer_sliding_window=attn_window_size, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -272,6 +274,7 @@ def __init__( rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), head_dim=getattr(config, "head_dim", None), + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 21d74d8b0580..505e4d39aabd 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -197,6 +197,7 @@ def __init__( max_position: int = 4096 * 32, rope_parameters: dict | None = None, sliding_window: int | None = None, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, layer_idx: int = None, cache_config: CacheConfig | None = None, @@ -245,6 +246,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -332,6 +334,7 @@ def __init__( max_position=max_position_embeddings, rope_parameters=config.rope_parameters, sliding_window=config.sliding_window, + model_config=model_config, quant_config=quant_config, layer_idx=self._ilayer, cache_config=cache_config, diff --git a/vllm/model_executor/models/mistral.py b/vllm/model_executor/models/mistral.py index ce1332d0c9d1..b174428cf2f1 100644 --- a/vllm/model_executor/models/mistral.py +++ b/vllm/model_executor/models/mistral.py @@ -9,7 +9,7 @@ from transformers import LlamaConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -86,6 +86,7 @@ def __init__( bias: bool = False, bias_o_proj: bool = False, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: @@ -99,6 +100,7 @@ def __init__( bias=bias, bias_o_proj=bias_o_proj, cache_config=cache_config, + model_config=model_config, prefix=prefix, attn_type=attn_type, ) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 376fd7a1709d..3b3b92345675 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -33,7 +33,7 @@ from transformers import MixtralConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( get_ep_group, get_pp_group, @@ -163,6 +163,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -218,6 +219,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( @@ -241,6 +243,7 @@ def __init__( quant_config: QuantizationConfig | None = None, prefix: str = "", enable_eplb: bool = False, + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -253,6 +256,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + model_config=model_config, ) self.block_sparse_moe = MixtralMoE( num_experts=config.num_local_experts, @@ -316,6 +320,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.enable_eplb = parallel_config.enable_eplb self.num_redundant_experts = parallel_config.eplb_config.num_redundant_experts + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: MixtralDecoderLayer( @@ -324,6 +329,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config=quant_config, prefix=prefix, enable_eplb=self.enable_eplb, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index a29b1a9fbfbb..f82c655c8d9b 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -63,7 +63,11 @@ def forward( class ModernBertAttention(nn.Module): def __init__( - self, config: ModernBertConfig, layer_id: int | None = None, prefix: str = "" + self, + config: ModernBertConfig, + layer_id: int | None = None, + prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() self.config = config @@ -115,6 +119,7 @@ def __init__( self.num_heads, self.head_dim, self.scaling, + model_config=model_config, prefix=f"{layer_id}.attn", per_layer_sliding_window=sliding_window, ) @@ -161,7 +166,11 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class ModernBertLayer(nn.Module): def __init__( - self, config: ModernBertConfig, prefix: str = "", layer_id: int | None = None + self, + config: ModernBertConfig, + prefix: str = "", + layer_id: int | None = None, + model_config: ModelConfig | None = None, ): super().__init__() self.config = config @@ -172,7 +181,10 @@ def __init__( config.hidden_size, eps=config.norm_eps, bias=config.norm_bias ) self.attn = ModernBertAttention( - config=config, layer_id=layer_id, prefix=f"{prefix}.attn" + config=config, + layer_id=layer_id, + prefix=f"{prefix}.attn", + model_config=model_config, ) self.mlp_norm = nn.LayerNorm( config.hidden_size, eps=config.norm_eps, bias=config.norm_bias @@ -203,6 +215,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = ""): config=config, layer_id=layer_id, prefix=f"{prefix}.layers.{layer_id}", + model_config=vllm_config.model_config, ) for layer_id in range(config.num_hidden_layers) ] diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 1d756a2addeb..e4b9891ff8fe 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -20,7 +20,7 @@ ) from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import ( get_pp_group, @@ -410,6 +410,7 @@ class MolmoAttention(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -468,6 +469,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -600,6 +602,7 @@ class MolmoDecoderLayer(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -607,7 +610,11 @@ def __init__( super().__init__() # Attention block. self.self_attn = MolmoAttention( - config, cache_config, quant_config, prefix=f"{prefix}.self_attn" + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", ) # MLP block. @@ -853,13 +860,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config=quant_config, ) + model_config = vllm_config.model_config decoder_layer = ( MolmoDecoderNormAfterLayer if config.norm_after else MolmoDecoderLayer ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: decoder_layer( - config, cache_config, quant_config, prefix=prefix + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/molmo2.py b/vllm/model_executor/models/molmo2.py index aa58fa6d1583..d500e98a0432 100644 --- a/vllm/model_executor/models/molmo2.py +++ b/vllm/model_executor/models/molmo2.py @@ -24,7 +24,7 @@ from transformers.video_utils import VideoMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions from vllm.distributed import ( get_pp_group, @@ -885,6 +885,7 @@ def __init__( self, config: TextConfig, rope_parameters: dict[str, Any], + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -959,6 +960,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -1068,6 +1070,7 @@ def __init__( self, config: TextConfig, rope_parameters: dict[str, Any], + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -1077,8 +1080,9 @@ def __init__( self.self_attn = Molmo2Attention( config, rope_parameters, - cache_config, - quant_config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, prefix=f"{prefix}.self_attn", ) @@ -1182,11 +1186,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if text_config.norm_after else Molmo2DecoderLayer ) + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( text_config.num_hidden_layers, lambda prefix: decoder_layer( text_config, hf_text_config.rope_parameters, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 85933626cd30..e7da483b3506 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -11,7 +11,7 @@ from transformers import MptConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -59,6 +59,7 @@ def __init__( config: MptConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -128,6 +129,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -189,13 +191,18 @@ def __init__( config: MptConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() hidden_size = config.d_model self.norm_1 = nn.LayerNorm(hidden_size) self.attn = MPTAttention( - config, cache_config, quant_config, prefix=f"{prefix}.attn" + config, + cache_config, + quant_config, + model_config, + prefix=f"{prefix}.attn", ) self.norm_2 = nn.LayerNorm(hidden_size) self.ffn = MPTMLP(config, quant_config, prefix=f"{prefix}.ffn") @@ -225,6 +232,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config assert config.embedding_fraction == 1.0 assert config.norm_type == "low_precision_layernorm" @@ -235,7 +243,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.blocks = make_layers( config.n_layers, - lambda prefix: MPTBlock(config, cache_config, quant_config, prefix=prefix), + lambda prefix: MPTBlock( + config, + cache_config, + quant_config, + model_config, + prefix=prefix, + ), prefix=f"{prefix}.blocks", ) self.norm_f = nn.LayerNorm(config.d_model) diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 15d43a9ddf98..b66086c98fde 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -31,7 +31,7 @@ from torch import nn from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.attention import Attention @@ -152,6 +152,7 @@ def __init__( max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, prefix: str = "", ) -> None: @@ -207,6 +208,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -229,6 +231,7 @@ class NemotronDecoderLayer(nn.Module): def __init__( self, config: NemotronConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -251,6 +254,7 @@ def __init__( max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, + model_config=model_config, cache_config=cache_config, prefix=f"{prefix}.self_attn", ) @@ -315,10 +319,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) else: self.embed_tokens = PPMissingLayer() + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: NemotronDecoderLayer( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index 4ec794eccf72..3b6a3d4cae3c 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -490,6 +490,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.attn", per_layer_sliding_window=sliding_window, + model_config=model_config, ) def forward( diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index f2f3811c0644..356fc0d0bc6d 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -32,7 +32,7 @@ from transformers import LlamaConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -86,6 +86,7 @@ def __init__( bias: bool = False, bias_o_proj: bool = False, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: @@ -99,6 +100,7 @@ def __init__( bias, bias_o_proj, cache_config, + model_config, prefix, attn_type, ) @@ -131,6 +133,7 @@ def __init__( layer_idx: int, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -164,6 +167,7 @@ def __init__( bias=attention_bias, bias_o_proj=bias_o_proj, cache_config=cache_config, + model_config=model_config, prefix=f"{prefix}.self_attn", ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -236,6 +240,7 @@ def __init__( super().__init__() config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config @@ -262,6 +267,7 @@ def get_layer(prefix: str): layer_idx, cache_config, quant_config=quant_config, + model_config=model_config, prefix=prefix, ) diff --git a/vllm/model_executor/models/nemotron_parse.py b/vllm/model_executor/models/nemotron_parse.py index ae417f095eb4..cc5349438b98 100644 --- a/vllm/model_executor/models/nemotron_parse.py +++ b/vllm/model_executor/models/nemotron_parse.py @@ -20,7 +20,7 @@ PretrainedConfig, ) -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.config.lora import LoRAConfig from vllm.config.multimodal import BaseDummyOptions from vllm.inputs import MultiModalDataDict @@ -102,6 +102,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() self.embed_dim = config.d_model @@ -113,6 +114,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + model_config=model_config, ) self.activation_fn = get_act_fn(config.activation_function) @@ -128,6 +130,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.encoder_attn", + model_config=model_config, ) self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) @@ -252,6 +255,7 @@ def __init__( lora_config: LoRAConfig | None = None, embed_tokens: nn.Embedding | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() self.cache_config = cache_config @@ -273,6 +277,7 @@ def __init__( cache_config, quant_config, prefix=f"{prefix}.layers.{layer_idx}", + model_config=model_config, ) for layer_idx in range(config.decoder_layers) ] @@ -596,6 +601,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.decoder", + model_config=vllm_config.model_config, ) self.vocab_size = config.decoder.vocab_size diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 4491a6a3ea1b..09a2fb770728 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -32,7 +32,7 @@ from transformers import OlmoConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -74,6 +74,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() self.config = config @@ -113,6 +114,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) # Attention output projection. @@ -201,11 +203,16 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() # Attention block. self.self_attn = OlmoAttention( - config, cache_config, quant_config, prefix=f"{prefix}.self_attn" + config, + cache_config, + quant_config, + prefix=f"{prefix}.self_attn", + model_config=model_config, ) # MLP block. @@ -246,6 +253,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config @@ -255,7 +263,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: OlmoDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 212140fe15ea..19c9c4f5d055 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -33,7 +33,7 @@ from transformers import Olmo2Config, Olmo3Config from vllm.compilation.decorators import support_torch_compile -from vllm.config import VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.distributed.communication_op import tensor_model_parallel_all_gather from vllm.distributed.parallel_state import get_tensor_model_parallel_rank @@ -72,10 +72,18 @@ class Olmo2Attention(nn.Module): (plus another skip connection). """ - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + def __init__( + self, + *, + vllm_config: VllmConfig, + model_config: ModelConfig | None = None, + prefix: str = "", + ): super().__init__() self.config = vllm_config.model_config.hf_config assert isinstance(self.config, (Olmo2Config, Olmo3Config)) + if model_config is None: + model_config = vllm_config.model_config hidden_size = self.config.hidden_size self.tp_size = get_tensor_model_parallel_world_size() @@ -135,6 +143,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config=vllm_config.quant_config, per_layer_sliding_window=sliding_window, prefix=f"{prefix}.attn", + model_config=model_config, ) # Rotary embeddings. Rope scaling is only applied on full attention layers. @@ -242,9 +251,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config assert isinstance(config, (Olmo2Config, Olmo3Config)) + model_config = vllm_config.model_config # Attention block. self.self_attn = Olmo2Attention( - vllm_config=vllm_config, prefix=f"{prefix}.self_attn" + vllm_config=vllm_config, + model_config=model_config, + prefix=f"{prefix}.self_attn", ) # MLP block. diff --git a/vllm/model_executor/models/olmo_hybrid.py b/vllm/model_executor/models/olmo_hybrid.py index 97e56b3ff6f9..594f9cb2502b 100644 --- a/vllm/model_executor/models/olmo_hybrid.py +++ b/vllm/model_executor/models/olmo_hybrid.py @@ -596,9 +596,17 @@ def _forward_core( class OlmoHybridAttention(nn.Module): - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + def __init__( + self, + *, + vllm_config: VllmConfig, + model_config: ModelConfig | None = None, + prefix: str = "", + ): super().__init__() self.config = vllm_config.model_config.hf_config + if model_config is None: + model_config = vllm_config.model_config hidden_size = self.config.hidden_size self.tp_size = get_tensor_model_parallel_world_size() @@ -653,6 +661,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=vllm_config.cache_config, quant_config=vllm_config.quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) rope_parameters = getattr(self.config, "rope_parameters", None) @@ -771,6 +780,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: else: self.self_attn = OlmoHybridAttention( vllm_config=vllm_config, + model_config=model_config, prefix=f"{prefix}.self_attn", ) # Attention layers use these norm names diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index f0afe0e997cc..6c78734fc96b 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -22,7 +22,7 @@ from torch import nn from vllm.compilation.decorators import support_torch_compile -from vllm.config import VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -119,12 +119,20 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class OlmoeAttention(nn.Module): - def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + def __init__( + self, + *, + vllm_config: VllmConfig, + model_config: ModelConfig | None = None, + prefix: str = "", + ) -> None: super().__init__() config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + if model_config is None: + model_config = vllm_config.model_config self.hidden_size = config.hidden_size max_position_embeddings = getattr(config, "max_position_embeddings", 4096) @@ -187,6 +195,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) def _apply_qk_norm( @@ -225,8 +234,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.hidden_size = config.hidden_size + model_config = vllm_config.model_config self.self_attn = OlmoeAttention( vllm_config=vllm_config, + model_config=model_config, prefix=f"{prefix}.self_attn", ) diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index 994ae82529ab..6f728d9e4ecf 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -30,7 +30,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, ParallelConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig from vllm.distributed import ( get_ep_group, get_pp_group, @@ -422,6 +422,7 @@ def __init__( num_heads: int, num_kv_heads: int, max_position_embeddings: int = 8192, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, bias: bool = False, bias_o_proj: bool = False, @@ -508,6 +509,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, per_layer_sliding_window=sliding_window, @@ -560,6 +562,7 @@ def __init__( num_kv_heads: int, rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, bias: bool = False, bias_o_proj: bool = False, @@ -662,6 +665,7 @@ def __init__( self.scaling, sink_len=self.param_sink_number, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, per_layer_sliding_window=sliding_window, @@ -820,6 +824,7 @@ def __init__( if config is None: config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config parallel_config = vllm_config.parallel_config @@ -885,6 +890,7 @@ def __init__( ), rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, + model_config=model_config, quant_config=quant_config, bias=attention_bias, bias_o_proj=bias_o_proj, @@ -915,6 +921,7 @@ def __init__( config, "num_key_value_heads", config.num_attention_heads ), max_position_embeddings=max_position_embeddings, + model_config=model_config, quant_config=quant_config, bias=attention_bias, bias_o_proj=bias_o_proj, diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 81653b9516ac..c68327950073 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -28,7 +28,7 @@ from transformers import OPTConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.attention import Attention @@ -78,6 +78,7 @@ def __init__( bias: bool = True, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -110,6 +111,7 @@ def __init__( scale=self.scaling, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -130,6 +132,7 @@ def __init__( config: OPTConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -141,6 +144,7 @@ def __init__( bias=config.enable_bias, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.self_attn", ) self.do_layer_norm_before = config.do_layer_norm_before @@ -203,6 +207,7 @@ def __init__( config: OPTConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -257,7 +262,11 @@ def __init__( self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: OPTDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + model_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) @@ -303,9 +312,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.decoder = OPTDecoder( - config, cache_config, quant_config, prefix=f"{prefix}.decoder" + config, + cache_config, + quant_config, + model_config, + prefix=f"{prefix}.decoder", ) self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( ["hidden_states"], config.hidden_size diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 3cacb9d61cd5..268be2a77032 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -16,7 +16,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -93,6 +93,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -146,6 +147,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( @@ -168,6 +170,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -181,6 +184,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + model_config=model_config, ) self.mlp = OrionMLP( hidden_size=self.hidden_size, @@ -226,6 +230,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.vocab_size = config.vocab_size @@ -236,7 +241,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: OrionDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py index 56505ec7be20..26c638d0f047 100644 --- a/vllm/model_executor/models/ouro.py +++ b/vllm/model_executor/models/ouro.py @@ -34,7 +34,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -113,6 +113,7 @@ def __init__( num_heads: int, num_kv_heads: int, max_position: int = 4096 * 32, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -185,6 +186,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, attn_type=attn_type, @@ -216,6 +218,7 @@ class OuroDecoderLayer(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -237,6 +240,7 @@ def __init__( num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", @@ -332,11 +336,13 @@ def __init__( ) # Use the provided decoder layer type or default to OuroDecoderLayer + model_config = vllm_config.model_config decoder_layer_type = decoder_layer_type or OuroDecoderLayer self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: decoder_layer_type( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index a03a785577ee..2130e3d982f0 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -31,7 +31,7 @@ from transformers import PersimmonConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.attention import Attention @@ -95,6 +95,7 @@ def __init__( config: PersimmonConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -144,6 +145,7 @@ def __init__( scale=self.scaling, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -189,6 +191,7 @@ def __init__( config: PersimmonConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -197,6 +200,7 @@ def __init__( config=config, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.self_attn", ) self.mlp = PersimmonMLP( @@ -246,6 +250,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.vocab_size = config.vocab_size self.config = config @@ -255,7 +260,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: PersimmonDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + model_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 75c42c0d3930..9495be016af5 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -46,7 +46,7 @@ from transformers import PhiConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.attention import Attention @@ -81,6 +81,7 @@ def __init__( config: PhiConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -121,6 +122,7 @@ def __init__( scaling, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -176,6 +178,7 @@ def __init__( config: PhiConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -183,7 +186,11 @@ def __init__( config.hidden_size, eps=config.layer_norm_eps ) self.self_attn = PhiAttention( - config, cache_config, quant_config, prefix=f"{prefix}.self_attn" + config, + cache_config, + quant_config, + model_config, + prefix=f"{prefix}.self_attn", ) self.mlp = PhiMLP(config, quant_config, prefix=f"{prefix}.mlp") @@ -211,6 +218,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.quant_config = quant_config @@ -219,7 +227,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: PhiLayer(config, cache_config, quant_config, prefix=prefix), + lambda prefix: PhiLayer( + config, + cache_config, + quant_config, + model_config, + prefix=prefix, + ), prefix=f"{prefix}.layers", ) self.final_layernorm = nn.LayerNorm( diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 0b55b7ec8392..29a589ed0ef3 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -32,7 +32,7 @@ from transformers.configuration_utils import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE @@ -310,6 +310,7 @@ def __init__( max_position: int = 4096 * 32, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -364,6 +365,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -386,6 +388,7 @@ def __init__( config: PhiMoEConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -401,6 +404,7 @@ def __init__( ), cache_config=cache_config, quant_config=quant_config, + model_config=model_config, rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", ) @@ -453,6 +457,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.vocab_size = config.vocab_size @@ -466,7 +471,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: PhiMoEDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + model_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index e179638a869b..48ab77eb05b4 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -21,7 +21,7 @@ position_ids_in_meshgrid, ) -from vllm.config import VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import MultiModalDataDict @@ -345,6 +345,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): with self._mark_tower_model(vllm_config, "image"): self.vision_encoder = VisionTransformer( self.vision_args, + model_config=vllm_config.model_config, prefix=maybe_prefix(prefix, "vision_encoder"), ) self.pre_mm_projector_norm = ( @@ -696,6 +697,7 @@ def __init__( self, args: VisionEncoderArgs, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", disable_tp: bool = False, ): @@ -760,6 +762,7 @@ def __init__( self, args: VisionEncoderArgs, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", disable_tp: bool = False, ): @@ -767,6 +770,7 @@ def __init__( self.attention = Attention( args, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attention", disable_tp=disable_tp, ) @@ -800,6 +804,7 @@ def __init__( self, args: VisionEncoderArgs, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", disable_tp: bool = False, ): @@ -810,6 +815,7 @@ def __init__( TransformerBlock( args, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.layers.{idx}", disable_tp=disable_tp, ) @@ -850,6 +856,7 @@ def __init__( self, args: VisionEncoderArgs, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -866,6 +873,7 @@ def __init__( self.transformer = Transformer( args, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.transformer", disable_tp=disable_tp, ) diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 44b1207745ec..43bfb2188148 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -615,6 +615,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> No self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=vllm_config.model_config, cache_config=cache_config, prefix=f"{prefix}.attn", ) diff --git a/vllm/model_executor/models/plamo3.py b/vllm/model_executor/models/plamo3.py index 2ba38a7b1f8f..920aaeacad86 100644 --- a/vllm/model_executor/models/plamo3.py +++ b/vllm/model_executor/models/plamo3.py @@ -196,6 +196,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> No self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, per_layer_sliding_window=config.interleaved_sliding_window[layer_idx], prefix=f"{prefix}.attn", diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index b4526beac637..d2e2a805ec87 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -17,7 +17,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -96,6 +96,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() self.hidden_size = hidden_size @@ -133,6 +134,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) def forward( @@ -155,6 +157,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) @@ -167,6 +170,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, ) self.ln_2 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) @@ -209,6 +213,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.vocab_size = config.vocab_size @@ -219,7 +224,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, - lambda prefix: QWenBlock(config, cache_config, quant_config, prefix=prefix), + lambda prefix: QWenBlock( + config, + cache_config, + quant_config, + prefix=prefix, + model_config=model_config, + ), prefix=f"{prefix}.h", ) self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 27aa6175b9bc..38024a4c738c 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -34,7 +34,7 @@ from transformers import Qwen2Config from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import ( @@ -126,6 +126,7 @@ def __init__( rope_parameters: dict[str, Any], max_position: int = 4096 * 32, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, @@ -195,6 +196,7 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, + model_config=model_config, quant_config=quant_config, attn_type=attn_type, prefix=f"{prefix}.attn", @@ -241,6 +243,7 @@ def __init__( self, config: Qwen2Config, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", ) -> None: @@ -269,6 +272,7 @@ def __init__( max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, cache_config=cache_config, + model_config=model_config, quant_config=quant_config, rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", @@ -366,6 +370,7 @@ def __init__( super().__init__() config = vllm_config.model_config.hf_config.get_text_config() + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config @@ -402,6 +407,7 @@ def __init__( lambda prefix: decoder_layer_type( config=config, cache_config=cache_config, + model_config=model_config, quant_config=quant_config, prefix=prefix, ), diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 4b0c756165a5..e52f7c1015cb 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -35,7 +35,7 @@ from transformers import Qwen2MoeConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul @@ -206,6 +206,7 @@ def __init__( quant_config: QuantizationConfig | None = None, prefix: str = "", dual_chunk_attention_config: dict[str, Any] | None = None, + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -262,6 +263,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, **{ "layer_idx": extract_layer_index(prefix), "dual_chunk_attention_config": dual_chunk_attention_config, @@ -290,6 +292,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -307,6 +310,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.self_attn", dual_chunk_attention_config=dual_chunk_attention_config, + model_config=model_config, ) # Note: Qwen/Qwen2-57B-A14B-Instruct does not have @@ -365,6 +369,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.vocab_size = config.vocab_size self.config = config @@ -382,6 +387,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=cache_config, quant_config=quant_config, prefix=prefix, + model_config=model_config, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 6dec60232b1d..825474a4ed79 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -31,7 +31,7 @@ from transformers import Qwen3Config from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.attention.encoder_only_attention import ( @@ -68,6 +68,7 @@ def __init__( rms_norm_eps: float = 1e-06, qkv_bias: bool = False, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, @@ -129,6 +130,7 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, + model_config=model_config, quant_config=quant_config, prefix=f"{prefix}.attn", attn_type=attn_type, @@ -167,6 +169,7 @@ def __init__( self, config: Qwen3Config, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", ) -> None: @@ -195,6 +198,7 @@ def __init__( qkv_bias=getattr(config, "attention_bias", False), head_dim=getattr(config, "head_dim", None), cache_config=cache_config, + model_config=model_config, quant_config=quant_config, rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/qwen3_dflash.py b/vllm/model_executor/models/qwen3_dflash.py index ce45136d7c0b..4c5abacc85fd 100644 --- a/vllm/model_executor/models/qwen3_dflash.py +++ b/vllm/model_executor/models/qwen3_dflash.py @@ -10,7 +10,7 @@ from vllm import _custom_ops as ops from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention @@ -68,6 +68,7 @@ def __init__( quant_config: QuantizationConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.layer_name = prefix @@ -118,6 +119,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.attn", attn_type=attn_type, + model_config=model_config, ) self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) @@ -162,6 +164,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size + model_config = vllm_config.model_config set_default_rope_theta(config, default_theta=1000000) attn_type = AttentionType.DECODER @@ -178,6 +181,7 @@ def __init__( rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, + model_config=model_config, ) self.mlp = Qwen3MLP( hidden_size=self.hidden_size, diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index f2ce070be8b4..a889d110487a 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -33,7 +33,7 @@ from torch import nn from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( get_ep_group, get_pp_group, @@ -270,6 +270,7 @@ def __init__( quant_config: QuantizationConfig | None = None, prefix: str = "", dual_chunk_attention_config: dict[str, Any] | None = None, + model_config: ModelConfig | None = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -326,6 +327,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, **{ "layer_idx": extract_layer_index(prefix), "dual_chunk_attention_config": dual_chunk_attention_config, @@ -365,6 +367,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: config = vllm_config.model_config.hf_text_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.hidden_size = config.hidden_size max_position_embeddings = getattr(config, "max_position_embeddings", 8192) @@ -384,6 +387,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: quant_config=quant_config, prefix=f"{prefix}.self_attn", dual_chunk_attention_config=dual_chunk_attention_config, + model_config=model_config, ) # `mlp_only_layers` in the config. diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 6cf386cc8ba2..89305ceab26f 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -266,6 +266,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", + model_config=model_config, **{ "layer_idx": extract_layer_index(prefix), "dual_chunk_attention_config": self.dual_chunk_attention_config, diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index d90174911fb6..4fa1e36108b5 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -31,7 +31,7 @@ from transformers import PretrainedConfig as SeedOssConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul @@ -116,6 +116,7 @@ def __init__( head_dim: int, rope_parameters: dict, max_position: int = 4096 * 32, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -169,6 +170,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, attn_type=attn_type, @@ -192,6 +194,7 @@ class SeedOssDecoderLayer(nn.Module): def __init__( self, config: SeedOssConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -215,6 +218,7 @@ def __init__( max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, head_dim=config.head_dim, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, rope_parameters=config.rope_parameters, @@ -309,11 +313,13 @@ def __init__( self.embed_tokens = PPMissingLayer() # Use the provided decoder layer type or default to SeedDecoderLayer + model_config = vllm_config.model_config decoder_layer_type = decoder_layer_type or SeedOssDecoderLayer self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: decoder_layer_type( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index ce3a260d0ef6..db48eb6bf812 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -15,7 +15,7 @@ SiglipVisionConfig, ) -from vllm.config import VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import MultiModalDataDict, MultiModalInput @@ -360,6 +360,7 @@ def __init__( self, config: SiglipVisionConfig | SiglipTextConfig, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, *, prefix: str = "", attn_cls: type[EncoderOnlyAttention] | type[MMEncoderAttention], @@ -414,6 +415,7 @@ def __init__( self.num_heads_per_partition, self.head_dim, self.scale, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -480,6 +482,7 @@ def __init__( self, config: SiglipVisionConfig | SiglipTextConfig, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, *, prefix: str = "", attn_cls: type[EncoderOnlyAttention] | type[MMEncoderAttention], @@ -491,6 +494,7 @@ def __init__( self.self_attn = SiglipAttention( config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.self_attn", attn_cls=attn_cls, ) @@ -525,6 +529,7 @@ def __init__( self, config: SiglipVisionConfig | SiglipTextConfig, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, num_hidden_layers_override: int | None = None, *, prefix: str = "", @@ -544,6 +549,7 @@ def __init__( SiglipEncoderLayer( config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.layers.{layer_idx}", attn_cls=attn_cls, ) @@ -575,6 +581,7 @@ def __init__( self, config: SiglipTextConfig, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, *, prefix: str = "", ) -> None: @@ -588,6 +595,7 @@ def __init__( self.encoder = SiglipEncoder( config=config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.encoder", attn_cls=EncoderOnlyAttention, ) @@ -1042,6 +1050,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config: SiglipConfig = vllm_config.model_config.hf_config + model_config = vllm_config.model_config quant_config = vllm_config.quant_config self.config = config @@ -1059,6 +1068,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.text_model = SiglipTextTransformer( text_config, quant_config=quant_config, + model_config=model_config, prefix=maybe_prefix(prefix, "text_model"), ) diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index bff866d0d0c2..10a1d541e6a8 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -31,7 +31,7 @@ from transformers import PretrainedConfig from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -113,6 +113,7 @@ def __init__( max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, prefix: str = "", ) -> None: @@ -168,6 +169,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -190,6 +192,7 @@ class SolarDecoderLayer(nn.Module): def __init__( self, config: PretrainedConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -212,6 +215,7 @@ def __init__( max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, + model_config=model_config, cache_config=cache_config, prefix=f"{prefix}.self_attn", ) @@ -274,10 +278,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) else: self.embed_tokens = PPMissingLayer() + model_config = vllm_config.model_config self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: SolarDecoderLayer( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 034c9c18ff7b..1af5cb2a57ea 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -29,7 +29,7 @@ from torch import nn from transformers import StableLmConfig -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -98,6 +98,7 @@ def __init__( config: StableLmConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() @@ -158,6 +159,7 @@ def __init__( num_kv_heads=self.num_key_value_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -180,11 +182,16 @@ def __init__( config: StableLmConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ) -> None: super().__init__() self.self_attn = StablelmAttention( - config, cache_config, quant_config, prefix=f"{prefix}.self_attn" + config, + cache_config, + quant_config, + model_config, + prefix=f"{prefix}.self_attn", ) self.mlp = StablelmMLP(config, quant_config, prefix=f"{prefix}.mlp") norm_eps = getattr(config, "norm_eps", getattr(config, "layer_norm_eps", 1e-05)) @@ -221,6 +228,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.embed_tokens = VocabParallelEmbedding( config.vocab_size, @@ -231,7 +239,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: StablelmDecoderLayer( - config, cache_config, quant_config, prefix=prefix + config, + cache_config, + quant_config, + model_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 5f08a59e2364..0b3df4a183bc 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -29,7 +29,7 @@ from transformers import Starcoder2Config from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.attention import Attention @@ -67,6 +67,7 @@ def __init__( config: Starcoder2Config, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -123,6 +124,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.attn", ) @@ -176,6 +178,7 @@ def __init__( config: Starcoder2Config, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + model_config: ModelConfig | None = None, prefix: str = "", ): super().__init__() @@ -184,6 +187,7 @@ def __init__( config, cache_config, quant_config=quant_config, + model_config=model_config, prefix=f"{prefix}.self_attn", ) self.mlp = Starcoder2MLP( @@ -225,6 +229,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + model_config = vllm_config.model_config self.config = config self.vocab_size = config.vocab_size @@ -238,7 +243,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Starcoder2DecoderLayer( - config, cache_config, quant_config=quant_config, prefix=prefix + config, + cache_config, + quant_config=quant_config, + model_config=model_config, + prefix=prefix, ), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/step1.py b/vllm/model_executor/models/step1.py index 07653fa6b377..b99cda87735d 100644 --- a/vllm/model_executor/models/step1.py +++ b/vllm/model_executor/models/step1.py @@ -10,7 +10,7 @@ import torch from torch import nn -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, @@ -88,6 +88,7 @@ class StepAttention(nn.Module): def __init__( self, config, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -144,6 +145,7 @@ def __init__( self.head_dim, self.scale, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, alibi_slopes=alibi_slopes, @@ -200,12 +202,14 @@ class StepDecoderLayer(nn.Module): def __init__(self, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.hidden_size = config.hidden_size self.self_attn = StepAttention( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index 18b689166a5f..0a01c5b2e604 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -149,6 +149,7 @@ def __init__( share_q_dim: int | None = None, max_position_embedding: int = 8192, head_dim: int = 256, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -205,6 +206,7 @@ def __init__( self.head_dim, scaling, self.num_kv_heads, + model_config=model_config, cache_config=cache_config, prefix=f"{prefix}.attn", ) @@ -226,6 +228,7 @@ class Step3TextDecoderLayer(nn.Module): def __init__( self, config: Step3TextConfig, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -237,6 +240,7 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=1, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, norm_eps=config.rms_norm_eps, @@ -315,6 +319,7 @@ class Step3TextModel(nn.Module): def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.vocab_size = config.vocab_size @@ -334,6 +339,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: config.num_hidden_layers, lambda prefix: Step3TextDecoderLayer( config=config, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/step3p5.py b/vllm/model_executor/models/step3p5.py index bb4bf14a9632..810838773216 100644 --- a/vllm/model_executor/models/step3p5.py +++ b/vllm/model_executor/models/step3p5.py @@ -138,6 +138,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, rope_scaling: dict[str, Any] | None = None, + model_config: ModelConfig | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, # Step3p5 specific args @@ -245,6 +246,7 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -432,6 +434,7 @@ def __init__( ) -> None: super().__init__() config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config self.hidden_size = config.hidden_size layer_idx = extract_layer_index(prefix) self.layer_idx = layer_idx @@ -470,6 +473,7 @@ def __init__( rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), head_dim=head_dim if head_dim else getattr(config, "head_dim", None), + model_config=model_config, cache_config=cache_config, quant_config=quant_config, rope_scaling=getattr(config, "rope_scaling", None), diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py index 8b3ef56c80a9..b4491a23c1c9 100644 --- a/vllm/model_executor/models/transformers/base.py +++ b/vllm/model_executor/models/transformers/base.py @@ -581,6 +581,7 @@ def create_attention_instances(self) -> dict[int, Attention]: scale=head_size**-0.5, num_kv_heads=num_kv_heads, cache_config=self.cache_config, + model_config=self.model_config, quant_config=self.quant_config, logits_soft_cap=logits_soft_cap, per_layer_sliding_window=per_layer_sliding_window, diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index f0f6f619b022..3e0fee8c6b40 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -153,6 +153,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__() self.embed_dim = embed_dim @@ -206,6 +207,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.attn", attn_type=self.attn_type, + model_config=model_config, ) else: # AttentionType.DECODER (regular decoder self-attention) self.attn = Attention( @@ -218,6 +220,7 @@ def __init__( prefix=f"{prefix}.attn", attn_type=self.attn_type, per_layer_sliding_window=per_layer_sliding_window, + model_config=model_config, ) def _init_qkv( @@ -260,6 +263,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + model_config: ModelConfig | None = None, ): super().__init__( embed_dim=embed_dim, @@ -269,6 +273,7 @@ def __init__( quant_config=quant_config, prefix=prefix, attn_type=AttentionType.ENCODER_DECODER, + model_config=model_config, ) def _init_qkv( @@ -367,6 +372,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + model_config=vllm_config.model_config, ) self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.mlp = WhisperMLP( @@ -410,6 +416,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + model_config=vllm_config.model_config, ) self.self_attn_layer_norm = nn.LayerNorm(config.d_model) self.encoder_attn = WhisperCrossAttention( @@ -418,6 +425,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.encoder_attn", + model_config=vllm_config.model_config, ) self.encoder_attn_layer_norm = nn.LayerNorm(config.d_model) self.mlp = WhisperMLP( diff --git a/vllm/model_executor/models/whisper_causal.py b/vllm/model_executor/models/whisper_causal.py index 8e4322ea335d..8b4447bd5632 100644 --- a/vllm/model_executor/models/whisper_causal.py +++ b/vllm/model_executor/models/whisper_causal.py @@ -11,7 +11,7 @@ import torch.nn.functional as F from torch import nn -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm @@ -277,6 +277,7 @@ def __init__( num_kv_heads: int | None = None, alibi_slopes: list[float] | None = None, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, logits_soft_cap: float | None = None, per_layer_sliding_window: int | None = None, @@ -312,6 +313,7 @@ def __init__( num_kv_heads=num_kv_heads, alibi_slopes=alibi_slopes, cache_config=cache_config, + model_config=model_config, quant_config=quant_config, logits_soft_cap=logits_soft_cap, per_layer_sliding_window=per_layer_sliding_window, @@ -344,6 +346,7 @@ def __init__( per_layer_sliding_window: int | None = None, block_pool_size: int = 1, cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", ): @@ -386,6 +389,7 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, + model_config=model_config, quant_config=quant_config, prefix=f"{prefix}.attn", attn_type=AttentionType.DECODER, @@ -444,11 +448,11 @@ def forward( class WhisperCausalEncoderLayer(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config + config = model_config.hf_config sliding_window = getattr(config, "sliding_window", None) block_pool_size = config.block_pool_size assert block_pool_size > 1 - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config @@ -462,6 +466,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): block_pool_size=block_pool_size, per_layer_sliding_window=sliding_window, cache_config=cache_config, + model_config=model_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", ) diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index b4d844ba6d76..45071d44d996 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -118,6 +118,7 @@ def __init__( config: Zamba2Config, bare_block_idx: int, num_hybrid_layers: int, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -189,6 +190,7 @@ def __init__( self.num_attention_heads, self.attention_head_dim, self.scale, + model_config=model_config, cache_config=cache_config, prefix=f"{prefix}.attn.{j}", ) @@ -403,6 +405,7 @@ def __init__( config: Zamba2Config, bare_block_idx: int, num_hybrid_layers: int, + model_config: ModelConfig | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -424,6 +427,7 @@ def __init__( config, bare_block_idx=bare_block_idx, num_hybrid_layers=num_hybrid_layers, + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, @@ -715,6 +719,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: config, bare_block_idx=idx, num_hybrid_layers=len(layer2block_map), + model_config=model_config, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}", diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py index 94f8c096e313..e69096716036 100644 --- a/vllm/utils/torch_utils.py +++ b/vllm/utils/torch_utils.py @@ -352,7 +352,7 @@ def resolve_kv_cache_dtype_string( def kv_cache_dtype_str_to_dtype( - kv_cache_dtype: str, model_config: ModelConfig + kv_cache_dtype: str, model_config: ModelConfig | None ) -> torch.dtype: if kv_cache_dtype == "auto": # Model config may not be specified for unit tests, default to float16