21 | 21 | logger = logging.get_logger(__name__) |
22 | 22 |
23 | 23 |
| 24 | +@add_start_docstrings_to_model_forward(PALIGEMMA_INPUTS_DOCSTRING) |
| 25 | +@replace_return_docstrings(output_type=PaliGemmaCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) |
| 26 | +def lce_forward_deprecated( |
| 27 | + self, |
| 28 | + input_ids: torch.LongTensor = None, |
| 29 | + pixel_values: torch.FloatTensor = None, |
| 30 | + attention_mask: Optional[torch.Tensor] = None, |
| 31 | + position_ids: Optional[torch.LongTensor] = None, |
| 32 | + past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None, |
| 33 | + token_type_ids: Optional[torch.LongTensor] = None, |
| 34 | + cache_position: Optional[torch.LongTensor] = None, |
| 35 | + inputs_embeds: Optional[torch.FloatTensor] = None, |
| 36 | + labels: Optional[torch.LongTensor] = None, |
| 37 | + use_cache: Optional[bool] = None, |
| 38 | + output_attentions: Optional[bool] = None, |
| 39 | + output_hidden_states: Optional[bool] = None, |
| 40 | + return_dict: Optional[bool] = None, |
| 41 | +) -> Union[Tuple, PaliGemmaCausalLMOutputWithPast]: |
| 42 | + r""" |
| 43 | + Args: |
| 44 | + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): |
| 45 | + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., |
| 46 | + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored |
| 47 | + (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. |
| 48 | +
| 49 | + Returns: |
| 50 | +
| 51 | + Example: |
| 52 | +
| 53 | + ```python |
| 54 | + >>> from PIL import Image |
| 55 | + >>> import requests |
| 56 | + >>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration |
| 57 | +
| 58 | + >>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/PaliGemma-test-224px-hf") |
| 59 | + >>> processor = AutoProcessor.from_pretrained("google/PaliGemma-test-224px-hf") |
| 60 | +
| 61 | + >>> prompt = "answer en Where is the cow standing?" |
| 62 | + >>> url = "https://huggingface.co/gv-hf/PaliGemma-test-224px-hf/resolve/main/cow_beach_1.png" |
| 63 | + >>> image = Image.open(requests.get(url, stream=True).raw) |
| 64 | +
| 65 | + >>> inputs = processor(text=prompt, images=image, return_tensors="pt") |
| 66 | +
| 67 | + >>> # Generate |
| 68 | + >>> generate_ids = model.generate(**inputs, max_length=30) |
| 69 | + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] |
| 70 | + "answer en Where is the cow standing?\nbeach" |
| 71 | + ```""" |
| 72 | + |
| 73 | + if (input_ids is None) ^ (inputs_embeds is not None): |
| 74 | + raise ValueError( |
| 75 | + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" |
| 76 | + ) |
| 77 | + |
| 78 | + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions |
| 79 | + output_hidden_states = ( |
| 80 | + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states |
| 81 | + ) |
| 82 | + return_dict = return_dict if return_dict is not None else self.config.use_return_dict |
| 83 | + |
| 84 | + # the attention mask is turned 4d afterwards; we keep track of the original one |
| 85 | + input_attention_mask = attention_mask |
| 86 | + |
| 87 | + if inputs_embeds is None: |
| 88 | + # 1. Extract the input embeddings |
| 89 | + inputs_embeds = self.get_input_embeddings()(input_ids) |
| 90 | + |
| 91 | + # 2. Merge text and images |
| 92 | + if pixel_values is not None and input_ids.shape[1] != 1: |
| 93 | + image_outputs = self.vision_tower(pixel_values.to(inputs_embeds.dtype)) |
| 94 | + selected_image_feature = image_outputs.last_hidden_state |
| 95 | + image_features = self.multi_modal_projector(selected_image_feature) |
| 96 | + |
| 97 | + if cache_position is None: |
| 98 | + cache_position = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) |
| 99 | + inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features( |
| 100 | + image_features, inputs_embeds, input_ids, attention_mask, labels, token_type_ids, cache_position |
| 101 | + ) |
| 102 | + |
| 103 | + else: |
| 104 | + # In case input_ids.shape[1] == 1 & pixel_values != None & past_key_values != None, we are in the case of |
| 105 | + # generation with cache |
| 106 | + if past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1: |
| 107 | + # Retrieve the first layer to inspect the logits and mask out the hidden states |
| 108 | + # that are set to 0 |
| 109 | + # TODO @molbap this will only work for dynamic cache. |
| 110 | + first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] |
| 111 | + |
| 112 | + # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 |
| 113 | + batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) |
| 114 | + |
| 115 | + # Get the target length |
| 116 | + target_seqlen = cache_position[-1] + 1 |
| 117 | + extended_attention_mask = torch.ones( |
| 118 | + (attention_mask.shape[0], target_seqlen - attention_mask.shape[1] + 1), |
| 119 | + dtype=attention_mask.dtype, |
| 120 | + device=attention_mask.device, |
| 121 | + ) |
| 122 | + # Filter out only the tokens that can be un-attended; this can happen |
| 123 | + # if one uses PaliGemma + Fused modules where the cache on the |
| 124 | + # first iteration is already big enough, or if one passes a custom cache |
| 125 | + valid_indices = non_attended_tokens < extended_attention_mask.size(-1) |
| 126 | + new_batch_index = batch_index[valid_indices] |
| 127 | + new_non_attended_tokens = non_attended_tokens[valid_indices] |
| 128 | + |
| 129 | + # Zero-out the places where we don't need to attend |
| 130 | + extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 |
| 131 | + |
| 132 | + attention_mask = torch.cat((attention_mask, extended_attention_mask), dim=1) |
| 133 | + position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 |
| 134 | + |
| 135 | + attention_mask = attention_mask.to(inputs_embeds.dtype) |
| 136 | + outputs = self.language_model.model( |
| 137 | + attention_mask=attention_mask, |
| 138 | + position_ids=position_ids, |
| 139 | + past_key_values=past_key_values, |
| 140 | + inputs_embeds=inputs_embeds, |
| 141 | + use_cache=use_cache, |
| 142 | + output_attentions=output_attentions, |
| 143 | + output_hidden_states=output_hidden_states, |
| 144 | + return_dict=return_dict, |
| 145 | + cache_position=cache_position, |
| 146 | + ) |
| 147 | + |
| 148 | + hidden_states = outputs[0] |
| 149 | + |
| 150 | + loss = None |
| 151 | + logits = None |
| 152 | + |
| 153 | + if self.training and (labels is not None): |
| 154 | + shift_hidden_states = hidden_states[..., :-1, :] |
| 155 | + shift_labels = labels[..., 1:] |
| 156 | + |
| 157 | + hidden_device = shift_hidden_states.device |
| 158 | + |
| 159 | + if attention_mask is not None: |
| 160 | + # we use the input attention mask to shift the hidden_states and labels, because it is 2D. |
| 161 | + # we also crop the attn mask in case it is longer, which happens in PrefixTuning with PEFT |
| 162 | + shift_attention_mask = attention_mask[:, -shift_hidden_states.shape[1] :].to(hidden_device) |
| 163 | + shift_hidden_states = shift_hidden_states[shift_attention_mask.to(hidden_device) != 0].contiguous() |
| 164 | + shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous() |
| 165 | + else: |
| 166 | + shift_hidden_states = shift_hidden_states.contiguous() |
| 167 | + shift_labels = shift_labels.contiguous() |
| 168 | + |
| 169 | + # Flatten hidden state |
| 170 | + shift_hidden_states = shift_hidden_states.view(-1, self.config.text_config.hidden_size) |
| 171 | + shift_labels = shift_labels.view(-1).to(hidden_device) |
| 172 | + |
| 173 | + lce = LigerFusedLinearCrossEntropyLoss() |
| 174 | + loss = lce(self.language_model.lm_head.weight, shift_hidden_states, shift_labels) |
| 175 | + |
| 176 | + else: |
| 177 | + logits = self.language_model.lm_head(hidden_states) |
| 178 | + if labels is not None: |
| 179 | + shift_logits = logits[..., :-1, :] |
| 180 | + shift_labels = labels[..., 1:] |
| 181 | + if input_attention_mask is not None: |
| 182 | + # we use the input attention mask to shift the logits and labels, because it is 2D. |
| 183 | + shift_attention_mask = input_attention_mask[..., 1:] |
| 184 | + shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous() |
| 185 | + shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous() |
| 186 | + else: |
| 187 | + shift_logits = shift_logits.contiguous() |
| 188 | + shift_labels = shift_labels.contiguous() |
| 189 | + # Flatten the tokens |
| 190 | + loss_fct = CrossEntropyLoss() |
| 191 | + |
| 192 | + flat_logits = shift_logits.view(-1, self.config.vocab_size) |
| 193 | + flat_labels = shift_labels.view(-1).to(shift_logits.device) |
| 194 | + loss = loss_fct(flat_logits, flat_labels) |
| 195 | + if not return_dict: |
| 196 | + output = (logits,) + outputs[1:] |
| 197 | + return (loss,) + output if loss is not None else output |
| 198 | + |
| 199 | + return PaliGemmaCausalLMOutputWithPast( |
| 200 | + loss=loss, |
| 201 | + logits=logits, |
| 202 | + past_key_values=outputs.past_key_values, |
| 203 | + hidden_states=outputs.hidden_states, |
| 204 | + attentions=outputs.attentions, |
| 205 | + ) |
| 206 | + |
| 207 | + |
24 | 208 | @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") |
25 | 209 | @add_start_docstrings_to_model_forward(PALIGEMMA_INPUTS_DOCSTRING) |
26 | 210 | @replace_return_docstrings(output_type=PaliGemmaCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) |
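For readers skimming the hunk: the core of this deprecated forward is the `self.training and (labels is not None)` branch, where `LigerFusedLinearCrossEntropyLoss` consumes `lm_head.weight` plus the shifted hidden states and computes the loss without ever materializing the `(num_tokens, vocab_size)` logits tensor, which is why `logits` stays `None` on that path. The sketch below contrasts the two paths on dummy tensors; the import path, tensor sizes, and device are illustrative assumptions rather than details taken from this commit, and the fused kernel needs a CUDA device.

```python
# Minimal sketch, not part of the patch: fused vs. unfused loss on dummy tensors.
# Sizes, device, and the import path below are assumptions for illustration only.
import torch
from torch.nn import CrossEntropyLoss

from liger_kernel.transformers import LigerFusedLinearCrossEntropyLoss  # assumed import path

hidden_size, vocab_size, num_tokens = 2048, 32000, 512  # placeholder sizes
lm_head_weight = torch.randn(vocab_size, hidden_size, device="cuda", dtype=torch.bfloat16)
hidden_states = torch.randn(num_tokens, hidden_size, device="cuda", dtype=torch.bfloat16)
labels = torch.randint(0, vocab_size, (num_tokens,), device="cuda")

# Unfused reference path (the `else` branch above): materializes the full logits tensor first.
ref_loss = CrossEntropyLoss()(hidden_states @ lm_head_weight.T, labels)

# Fused path (the training branch above): weight and hidden states go straight into the loss.
fused_loss = LigerFusedLinearCrossEntropyLoss()(lm_head_weight, hidden_states, labels)
```

Because the fused branch never builds logits, both the `return_dict=False` tuple and the returned `PaliGemmaCausalLMOutputWithPast` carry `logits=None` when training with labels.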