
Commit 344a1a1

✨ support granite 4 dense (#635)
# Description

Adds support for configuring the unreleased granite 4 dense models the same way as the granite 3 models, so they work out of the box.

---------

Signed-off-by: Joe Runde <joe@joerun.de>
1 parent 566b786 commit 344a1a1

File tree

6 files changed (+142, -20 lines)
tests/fixtures/model_configs/ibm-granite/granite-4-8b-dense/config.json

Lines changed: 90 additions & 0 deletions

```diff
@@ -0,0 +1,90 @@
+{
+  "architectures": [
+    "GraniteMoeHybridForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attention_multiplier": 0.0078125,
+  "bos_token_id": 100257,
+  "embedding_multiplier": 12,
+  "eos_token_id": 100257,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "init_method": "mup",
+  "initializer_range": 0.1,
+  "intermediate_size": 12800,
+  "layer_types": [
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention"
+  ],
+  "logits_scaling": 16,
+  "mamba_chunk_size": 256,
+  "mamba_conv_bias": true,
+  "mamba_d_conv": 4,
+  "mamba_d_head": 64,
+  "mamba_d_state": 256,
+  "mamba_expand": 2,
+  "mamba_n_groups": 1,
+  "mamba_n_heads": 128,
+  "mamba_proj_bias": false,
+  "max_position_embeddings": 131072,
+  "model_type": "granitemoehybrid",
+  "normalization_function": "rmsnorm",
+  "num_attention_heads": 32,
+  "num_experts_per_tok": 0,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "num_local_experts": 0,
+  "output_router_logits": false,
+  "pad_token_id": 100256,
+  "position_embedding_type": "rope",
+  "residual_multiplier": 0.22,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000,
+  "router_aux_loss_coef": 0.01,
+  "shared_intermediate_size": 12800,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.56.0",
+  "use_cache": true,
+  "vocab_size": 100352
+}
```
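The fixture pins exactly the fields the new detection check reads. As a quick sanity sketch (the fixture path is assumed from the tests in this commit; run from the repo root), the key invariants of a dense granite 4 config can be verified like this:

```python
import json

# Path assumed from the test fixtures added in this commit.
with open("tests/fixtures/model_configs/ibm-granite/granite-4-8b-dense/config.json") as f:
    cfg = json.load(f)

# Dense granite 4: one "attention" entry per hidden layer, and no MoE routing
# (num_experts_per_tok and num_local_experts are both 0).
assert len(cfg["layer_types"]) == cfg["num_hidden_layers"] == 40
assert all(layer == "attention" for layer in cfg["layer_types"])
assert cfg["num_experts_per_tok"] == 0 and cfg["num_local_experts"] == 0
```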

tests/fixtures/model_configs/ibm-granite/granite4/config.json

Lines changed: 0 additions & 6 deletions
This file was deleted.

tests/models/test_granite.py

Lines changed: 23 additions & 9 deletions
```diff
@@ -38,19 +38,35 @@ def test_granite_3_8b_detection():
 
     assert not SpyrePlatform.is_granite_3_8b(granite_micro_config.model_config)
 
+    assert not SpyrePlatform.is_granite_4_8b_dense(granite_3_8b_config.model_config)
+
+
+@pytest.mark.cpu
+def test_granite_4_dense_detection():
+    """Check that we can detect the model config for granite 4 8b (dense)"""
+
+    granite_4_dense_config = VllmConfig(
+        model_config=ModelConfig(model=str(FIXTURES_PATH / "ibm-granite" / "granite-4-8b-dense")),
+        cache_config=NO_SWAP_CONFIG(),
+    )
+
+    assert SpyrePlatform.is_granite_4_8b_dense(granite_4_dense_config.model_config)
+    assert not SpyrePlatform.is_granite_3_8b(granite_4_dense_config.model_config)
+
 
 @pytest.mark.cpu
 @pytest.mark.parametrize(
-    "sendnn_configured, sendnn_version, expected_blocks",
+    "model_name, sendnn_configured, sendnn_version, expected_blocks",
     [
-        (True, (0, 0, 0), 8192),
-        (True, (1, 0, 2), 2080),
-        (True, (1, 1, 0), 8192),
-        (False, (1, 0, 2), 8192),
+        ("granite-3.3-8b-instruct", True, (0, 0, 0), 8192),
+        ("granite-3.3-8b-instruct", True, (1, 0, 2), 2080),
+        ("granite-3.3-8b-instruct", True, (1, 1, 0), 8192),
+        ("granite-3.3-8b-instruct", False, (1, 0, 2), 8192),
+        ("granite-4-8b-dense", True, (1, 1, 0), 8192),
     ],
     ids=lambda vals: f"{vals}",
 )
-def test_granite_3_8b_overrides(sendnn_configured, sendnn_version, expected_blocks):
+def test_granite_overrides(model_name, sendnn_configured, sendnn_version, expected_blocks):
     """Check that the correct values are overridden for g3.3 8b"""
 
     # Must ensure no env vars have been overridden before testing
@@ -64,9 +80,7 @@ def test_granite_3_8b_overrides(sendnn_configured, sendnn_version, expected_blocks):
     tp4_config = ParallelConfig(tensor_parallel_size=4)
 
     granite_3_8b_config = VllmConfig(
-        model_config=ModelConfig(
-            model=str(FIXTURES_PATH / "ibm-granite" / "granite-3.3-8b-instruct")
-        ),
+        model_config=ModelConfig(model=str(FIXTURES_PATH / "ibm-granite" / model_name)),
         parallel_config=tp4_config,
         cache_config=NO_SWAP_CONFIG(),
     )
```
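The new assertions boil down to: each fixture is recognized by exactly one detector. A standalone sketch of that check, assuming vllm and vllm-spyre are importable and that `FIXTURES_PATH` mirrors the test module's fixtures directory:

```python
from pathlib import Path

from vllm.config import ModelConfig
from vllm_spyre.platform import SpyrePlatform

FIXTURES_PATH = Path("tests/fixtures/model_configs")  # assumed, as in the test module

for name in ("granite-3.3-8b-instruct", "granite-4-8b-dense"):
    mc = ModelConfig(model=str(FIXTURES_PATH / "ibm-granite" / name))
    # Exactly one of the two detectors should fire for each fixture.
    print(name, SpyrePlatform.is_granite_3_8b(mc), SpyrePlatform.is_granite_4_8b_dense(mc))
```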

vllm_spyre/config/known_model_configs.json

Lines changed: 4 additions & 2 deletions
```diff
@@ -23,8 +23,10 @@
       "format": "float-quantized"
     }
   },
-  "ibm-granite/granite4": {
-    "model_type": "granitemoehybrid"
+  "ibm-granite/granite-4-8b-dense": {
+    "model_type": "granitemoehybrid",
+    "vocab_size": 100352,
+    "num_experts_per_tok": 0
   },
   "ibm-granite/granite-embedding-125m-english": {
     "model_type": "roberta",
```

vllm_spyre/config/supported_configs.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -23,7 +23,7 @@
     { cb: True, tp_size: 4, max_model_len: 16384, max_num_seqs: 4 },
     { cb: True, tp_size: 4, max_model_len: 32768, max_num_seqs: 32 },
   ]
-- model: "ibm-granite/granite4"
+- model: "ibm-granite/granite-4-8b-dense"
   configs: [
     { cb: False, tp_size: 1, warmup_shapes: [[2048, 1024, 16]] },
     { cb: False, tp_size: 4, warmup_shapes: [[6144, 2048, 1]] },
```
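Renaming the entry keeps the supported-configuration list keyed by the published model id. A sketch of reading the renamed entry back, assuming PyYAML is available and that the file's top level is the list of `model`/`configs` mappings the diff context suggests:

```python
import yaml  # PyYAML, assumed available

with open("vllm_spyre/config/supported_configs.yaml") as f:
    supported = yaml.safe_load(f)

entry = next(m for m in supported if m["model"] == "ibm-granite/granite-4-8b-dense")
for cfg in entry["configs"]:
    # e.g. {'cb': False, 'tp_size': 1, 'warmup_shapes': [[2048, 1024, 16]]}
    print(cfg)
```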

vllm_spyre/platform.py

Lines changed: 24 additions & 2 deletions
```diff
@@ -1,5 +1,7 @@
 import sys
 
+from transformers import GraniteMoeHybridConfig
+
 # When running this plugin on a Mac, we assume it's for local development
 # purposes. However, due to a compatibility issue with vLLM, which overrides
 # the Triton module with a placeholder, vLLM may fail to load on macOS. To
@@ -206,7 +208,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         )
 
         # Hardcode some things for granite-3.3-8b-instruct
-        if cls.is_granite_3_8b(vllm_config.model_config):
+        if cls.is_granite_3_8b(vllm_config.model_config) or cls.is_granite_4_8b_dense(
+            vllm_config.model_config
+        ):
             cls.configure_granite_3_8b(vllm_config)
 
         # To disable any paged attention ops in the base scheduler, we:
@@ -731,7 +735,7 @@ def is_granite_3_8b(cls, model_config: ModelConfig):
         """Returns true if we have a model that looks like
         ibm-granite/granite-3.3-8b-instruct"""
         if not isinstance(model_config.hf_config, GraniteConfig):
-            # Not granite at all
+            # Not granite 3 at all
             return False
 
         return (
@@ -743,6 +747,24 @@ def is_granite_3_8b(cls, model_config: ModelConfig):
             and model_config.hf_config.num_attention_heads == 32
         )
 
+    @classmethod
+    def is_granite_4_8b_dense(cls, model_config: ModelConfig):
+        """Returns true if we have a dense granite 4 model with the same
+        architecture as granite 3.3 8b"""
+        if not isinstance(model_config.hf_config, GraniteMoeHybridConfig):
+            # Not granite 4 at all
+            return False
+
+        return (
+            model_config.hf_config.num_hidden_layers == 40
+            and model_config.hf_config.num_experts_per_tok == 0  # dense model
+            and model_config.hf_config.max_position_embeddings == 131072
+            and model_config.hf_config.hidden_size == 4096
+            and model_config.hf_config.vocab_size == 100352
+            and model_config.hf_config.num_key_value_heads == 8
+            and model_config.hf_config.num_attention_heads == 32
+        )
+
     @classmethod
     def sendnn_configured(cls) -> bool:
         if envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND == "sendnn":
```
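The detector gates on the `hf_config` class before reading any fields, so anything that does not parse as a `GraniteMoeHybridConfig` short-circuits to False. The same shape test can be reproduced with transformers alone (a minimal sketch; assumes a transformers version new enough to ship `GraniteMoeHybridConfig`, and reuses the fixture path from the tests):

```python
from transformers import AutoConfig, GraniteMoeHybridConfig

cfg = AutoConfig.from_pretrained("tests/fixtures/model_configs/ibm-granite/granite-4-8b-dense")

# Mirrors is_granite_4_8b_dense: class gate first, then the distinguishing fields.
looks_like_g4_dense = isinstance(cfg, GraniteMoeHybridConfig) and (
    cfg.num_hidden_layers == 40
    and cfg.num_experts_per_tok == 0  # dense: no MoE routing
    and cfg.hidden_size == 4096
    and cfg.vocab_size == 100352
)
print(looks_like_g4_dense)
```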
