[model] support gemma4_unified (#108)

Jintao-Huang · web-flow · commit 5b27c58376e1 · 2026-06-04T16:01:03.000+08:00
diff --git a/README.md b/README.md
@@ -145,7 +145,7 @@ The following is the list of models supported by MCore-Bridge:
 | Series     | model_type                                            |
 | -------- | ------------------------------------------------------------ |
 | Qwen     | qwen2_vl, qwen2_5_vl, qwen2_5_omni<br />qwen3_vl, qwen3_vl_moe, qwen3_omni_moe, qwen3_asr<br />qwen3_5, qwen3_5_moe |
-| Gemma     | gemma4                                   |
+| Gemma     | gemma4, gemma4_unified                                   |
 | GLM      | glm4v, glm4v_moe |
 | Kimi     | kimi_vl                                   |
 | InternVL | internvl_chat, internvl                           |
diff --git a/README_zh.md b/README_zh.md
@@ -142,7 +142,7 @@ uv pip install -e . --torch-backend=auto
 | 系列     | model_type                                                   |
 | -------- | ------------------------------------------------------------ |
 | Qwen     | qwen2_vl, qwen2_5_vl, qwen2_5_omni<br />qwen3_vl, qwen3_vl_moe, qwen3_omni_moe, qwen3_asr<br />qwen3_5, qwen3_5_moe |
-| Gemma     | gemma4                                   |
+| Gemma     | gemma4, gemma4_unified                                   |
 | GLM      | glm4v, glm4v_moe |
 | Kimi     | kimi_vl                                   |
 | InternVL | internvl_chat, internvl                           |
diff --git a/src/mcore_bridge/config/parser.py b/src/mcore_bridge/config/parser.py
@@ -190,7 +190,7 @@ def hf_to_mcore_config(hf_config: PretrainedConfig) -> Dict[str, Any]:
         n_shared_experts = res.pop('n_shared_experts')
     elif llm_model_type in {'ernie4_5', 'ernie4_5_moe', 'glm4'}:
         res['rotary_interleaved'] = True
-    elif hf_model_type in {'gemma4'}:
+    elif hf_model_type in {'gemma4', 'gemma4_unified'}:
         res['qk_layernorm'] = True
         res['window_size'] = f'{window_size - 1},0'
         window_attn_skip_freq = ','.join(['1' if lt == 'sliding_attention' else '0' for lt in layer_types])
diff --git a/src/mcore_bridge/model/constant.py b/src/mcore_bridge/model/constant.py
@@ -32,6 +32,7 @@ class MLLMModelType:
     kimi_vl = 'kimi_vl'
     llama4 = 'llama4'
     gemma4 = 'gemma4'
+    gemma4_unified = 'gemma4_unified'
 
     kimi_k25 = 'kimi_k25'
 
diff --git a/src/mcore_bridge/model/gpts/deepseek_v4.py b/src/mcore_bridge/model/gpts/deepseek_v4.py
@@ -391,7 +391,7 @@ def _convert_hf_state_dict(self, hf_state_dict, to_mcore):
                     k = k[:-len('.scale')] + '.weight_scale_inv'
                 new_res[k] = v
             res = new_res
-        elif not to_mcore:
+        else:
             res = self._remove_prefix(res, 'model.')
             new_res = {}
             for k, v in res.items():
diff --git a/src/mcore_bridge/model/gpts/qwen3_emb.py b/src/mcore_bridge/model/gpts/qwen3_emb.py
@@ -11,7 +11,7 @@ def _convert_hf_state_dict(self, hf_state_dict, to_mcore):
         res = super()._convert_hf_state_dict(hf_state_dict, to_mcore)
         if to_mcore:
             res = self._add_prefix(res, 'model.')
-        elif not to_mcore:
+        else:
             res = self._remove_prefix(res, 'model.')
         return res
 
diff --git a/src/mcore_bridge/model/mm_gpts/gemma4.py b/src/mcore_bridge/model/mm_gpts/gemma4.py
@@ -166,7 +166,7 @@ def __init__(
         orig_k_layernorm = submodules.k_layernorm
         config.kv_channels = self.head_dim
         config.num_query_groups = self.num_key_value_heads
-        if self.is_sliding and config.window_size is None:
+        if text_config.use_bidirectional_attention == 'vision':
             kwargs['attn_mask_type'] = AttnMaskType.arbitrary
         if self.is_kv_shared_layer:
             submodules.k_layernorm = IdentityOp
@@ -291,10 +291,10 @@ def forward(self, hidden_states: Tensor, attention_mask: Tensor, **kwargs) -> Tu
         packed_seq_params = kwargs.get('packed_seq_params')
         attention_bias = kwargs.get('attention_bias')
         mixed_qkv, _ = self.linear_qkv(hidden_states)
-        if getattr(self, 'world_size', None) is not None and self.config.num_query_groups < self.world_size:
+        if getattr(self, 'world_size', None) is not None and self.num_key_value_heads < self.world_size:
             mixed_qkv = all_gather_last_dim_from_tensor_parallel_region(mixed_qkv)
-            idx = get_tensor_model_parallel_rank() // (self.world_size // self.config.num_query_groups)
-            size = mixed_qkv.size()[-1] // self.config.num_query_groups
+            idx = get_tensor_model_parallel_rank() // (self.world_size // self.num_key_value_heads)
+            size = mixed_qkv.size()[-1] // self.num_key_value_heads
             mixed_qkv = mixed_qkv[:, :, idx * size:(idx + 1) * size]
 
         thd_format = packed_seq_params is not None and packed_seq_params.qkv_format == 'thd'
@@ -327,9 +327,9 @@ def forward(self, hidden_states: Tensor, attention_mask: Tensor, **kwargs) -> Tu
                 value = value.squeeze(1)
         # Query [sq, b, ng, np/ng * hn] -> [sq, b, np, hn]
         query = query.reshape(query.size(0), query.size(1), -1, self.hidden_size_per_attention_head)
-        if getattr(self, 'world_size', None) is not None and self.config.num_query_groups < self.world_size:
-            idx = get_tensor_model_parallel_rank() % (self.world_size // self.config.num_query_groups)
-            size = query.shape[2] // (self.world_size // self.config.num_query_groups)
+        if getattr(self, 'world_size', None) is not None and self.num_key_value_heads < self.world_size:
+            idx = get_tensor_model_parallel_rank() % (self.world_size // self.num_key_value_heads)
+            size = query.shape[2] // (self.world_size // self.num_key_value_heads)
             query = query[:, :, idx * size:(idx + 1) * size, :]
         query = self.q_layernorm(query)
         if isinstance(rotary_pos_emb, torch.Tensor):
@@ -609,23 +609,24 @@ def forward(self, *args, **kwargs):
         extra_block_kwargs['shared_kv_states'] = shared_kv_states
         kwargs['extra_block_kwargs'] = extra_block_kwargs
         attention_mask = kwargs.get('attention_mask')
-        kwargs['attention_mask'] = {'sliding_attention': attention_mask, 'full_attention': attention_mask}
+        attention_mask = {'sliding_attention': attention_mask, 'full_attention': attention_mask}
         if self.text_config.use_bidirectional_attention == 'vision':
-            kwargs['attention_mask']['sliding_attention'] = self._create_sliding_attention_mask(
-                attention_mask, mm_token_type_ids)
+            self._update_attention_mask(attention_mask, mm_token_type_ids)
+        kwargs['attention_mask'] = attention_mask
         hidden_states = super().forward(*args, **kwargs)
         if self.hidden_size_per_layer_input and not self.post_process:
             hidden_states = self._pack_pp_output(hidden_states, per_layer_inputs, shared_kv_states)
         return hidden_states
 
-    def _create_sliding_attention_mask(self, attention_mask, mm_token_type_ids):
+    def _update_attention_mask(self, attention_mask, mm_token_type_ids):
+        sliding_attention = attention_mask['sliding_attention']
+        full_attention = attention_mask['full_attention']
+        # sliding
         window_size = self.text_config.sliding_window - 1
-        seq_len = attention_mask.shape[-1]
-
-        window_mask = torch.ones(seq_len, seq_len, dtype=torch.bool, device=attention_mask.device)
+        seq_len = sliding_attention.shape[-1]
+        window_mask = torch.ones(seq_len, seq_len, dtype=torch.bool, device=sliding_attention.device)
         window_mask = ~torch.triu(window_mask, diagonal=-window_size)
-
-        attention_mask = attention_mask | window_mask
+        sliding_attention = sliding_attention | window_mask
         if mm_token_type_ids is not None:
             is_vision = mm_token_type_ids > 0
             is_prev_vision = torch.roll(is_vision, shifts=1, dims=-1)
@@ -635,8 +636,10 @@ def _create_sliding_attention_mask(self, attention_mask, mm_token_type_ids):
             q_group = vision_group_ids.unsqueeze(1).unsqueeze(-1)
             k_group = vision_group_ids.unsqueeze(1).unsqueeze(-2)
             same_vision_group = (q_group == k_group) & (q_group >= 0) & (k_group >= 0)
-            attention_mask = attention_mask & ~same_vision_group
-        return attention_mask
+            sliding_attention = sliding_attention & ~same_vision_group
+            full_attention = full_attention & ~same_vision_group
+        attention_mask['sliding_attention'] = sliding_attention
+        attention_mask['full_attention'] = full_attention
 
     def _pack_pp_output(self, hidden_states, per_layer_inputs, shared_kv_states):
         per_layer_inputs = per_layer_inputs.view(*hidden_states.shape[:2], -1)
@@ -854,3 +857,62 @@ def _replace_router(self, transformer_layer_spec, mlp_key='experts_mlp'):
         visual_cls=Gemma4Vit,
         loader=Gemma4Loader,
     ))
+
+
+class Gemma4UnifiedVit(Gemma4Vit):
+    module_mapping = {
+        'model.embed_vision': 'embed_vision',
+        'model.embed_audio': 'embed_audio',
+    }
+    _vision_tower = []
+
+    def prepare_model(self, hf_config: PretrainedConfig):
+        from transformers.models.gemma4_unified.modeling_gemma4_unified import (Gemma4UnifiedModel,
+                                                                                Gemma4UnifiedMultimodalEmbedder,
+                                                                                Gemma4UnifiedVisionEmbedder)
+        dtype = hf_config.torch_dtype
+        self.embed_vision = (
+            Gemma4UnifiedVisionEmbedder(hf_config.vision_config, hf_config.text_config).to(dtype)
+            if hf_config.vision_config is not None else None)
+
+        self.embed_audio = (
+            Gemma4UnifiedMultimodalEmbedder(hf_config.audio_config, hf_config.text_config).to(dtype)
+            if hf_config.audio_config is not None else None)
+        self.register_buffer('embed_scale', torch.tensor(hf_config.hidden_size**0.5).to(dtype), persistent=False)
+        self.model_cls = Gemma4UnifiedModel
+
+
+class Gemma4UnifiedBridge(Gemma4Bridge):
+
+    def _convert_hf_state_dict(self, hf_state_dict, to_mcore):
+        res = super()._convert_hf_state_dict(hf_state_dict, to_mcore)
+        new_state_dict = {}
+        if to_mcore:
+            for k, v in res.items():
+                if k.startswith('model.embed_vision.embedding_projection.'):
+                    new_state_dict['model.embed_vision.multimodal_embedder.embedding_projection.'
+                                   + k[len('model.embed_vision.embedding_projection.'):]] = v
+                elif k.startswith('model.vision_embedder.'):
+                    new_state_dict['model.embed_vision.' + k[len('model.vision_embedder.'):]] = v
+                else:
+                    new_state_dict[k] = v
+        else:
+            for k, v in res.items():
+                if k.startswith('model.embed_vision.multimodal_embedder.'):
+                    new_state_dict['model.embed_vision.' + k[len('model.embed_vision.multimodal_embedder.'):]] = v
+                elif k.startswith('model.embed_vision.'):
+                    new_state_dict['model.vision_embedder.' + k[len('model.embed_vision.'):]] = v
+                else:
+                    new_state_dict[k] = v
+        res = new_state_dict
+        return res
+
+
+register_model(
+    ModelMeta(
+        ModelType.gemma4_unified,
+        ['gemma4_unified'],
+        bridge_cls=Gemma4UnifiedBridge,
+        visual_cls=Gemma4UnifiedVit,
+        loader=Gemma4Loader,
+    ))