Align MLlama code with Transformers 4.55 (#2319)

pbielak · web-flow · commit 8c36e02ef911 · 2025-11-06T16:22:02.000+01:00
diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py
@@ -18,6 +18,7 @@
 import json
 import logging
 import os
+from contextlib import nullcontext
 from pathlib import Path
 
 import PIL.Image
@@ -353,51 +354,53 @@ def main():
 
         htcore.hpu_set_env()
 
+    if model_type == "mllama" and args.use_flash_attention:
+        config._attn_implementation = "gaudi_fused_sdpa"
+        if args.flash_attention_recompute:
+            os.environ["FLASH_ATTENTION_RECOMPUTE"] = "1"
+
     if args.world_size > 1:
         import deepspeed
 
-        with deepspeed.OnDevice(dtype=model_dtype, device="cpu"):
-            model = AutoModelForVision2Seq.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype)
+        context = deepspeed.OnDevice(dtype=model_dtype, device="cpu")
+    else:
+        context = nullcontext()
+
+    with context:
+        model = AutoModelForVision2Seq.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, config=config)
+
+    if args.world_size > 1:
         if model_type == "mllama":
             model.language_model = initialize_distributed_model(args, model.language_model, logger, model_dtype)
             model.to("hpu")
         else:
             model = initialize_distributed_model(args, model, logger, model_dtype)
-        generator = pipeline(
-            "image-to-text",
-            model=model,
-            config=args.model_name_or_path,
-            tokenizer=args.model_name_or_path,
-            image_processor=args.model_name_or_path,
-            torch_dtype=model_dtype,
-            device="hpu",
-        )
-    else:
-        generator = pipeline(
-            "image-to-text",
-            model=args.model_name_or_path,
-            config=args.model_name_or_path,
-            tokenizer=args.model_name_or_path,
-            image_processor=None if model_type == "chatglm" else args.model_name_or_path,
-            torch_dtype=model_dtype,
-            device="hpu",
-        )
-        if args.use_hpu_graphs:
-            from habana_frameworks.torch.hpu import wrap_in_hpu_graph
 
-            generator.model = wrap_in_hpu_graph(generator.model)
+    generator = pipeline(
+        "image-to-text",
+        model=model,
+        config=config,
+        tokenizer=args.model_name_or_path,
+        image_processor=None if model_type == "chatglm" else args.model_name_or_path,
+        torch_dtype=model_dtype,
+        device="hpu",
+    )
+
+    if args.world_size < 2 and args.use_hpu_graphs:
+        from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+
+        generator.model = wrap_in_hpu_graph(generator.model)
 
     if "falcon-11B-vlm" in args.model_name_or_path:
         # WA falcon vlm issue that image_token_id == embed size.
         generator.model.resize_token_embeddings(generator.tokenizer.vocab_size + 1)
         processor.patch_size = config.vision_config.patch_size
+
     generate_kwargs = {
         "lazy_mode": use_lazy_mode,
         "hpu_graphs": args.use_hpu_graphs,
         "max_new_tokens": args.max_new_tokens,
         "ignore_eos": args.ignore_eos,
-        "use_flash_attention": args.use_flash_attention,
-        "flash_attention_recompute": args.flash_attention_recompute,
         "bucket_internal": args.bucket_internal,
         "bucket_size": args.bucket_size,
         "limit_hpu_graphs": args.limit_hpu_graphs,
@@ -406,6 +409,14 @@ def main():
         "logits_bf16": args.logits_bf16,
     }
 
+    if model_type != "mllama":
+        generate_kwargs.update(
+            {
+                "use_flash_attention": args.use_flash_attention,
+                "flash_attention_recompute": args.flash_attention_recompute,
+            }
+        )
+
     if args.sdp_on_bf16:
         torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)
 
diff --git a/optimum/habana/transformers/integrations/gaudi_fused_sdpa_attention.py b/optimum/habana/transformers/integrations/gaudi_fused_sdpa_attention.py
@@ -18,13 +18,13 @@ def gaudi_fused_sdpa_attention_forward(
 ) -> tuple[torch.Tensor, None]:
     bsz, num_heads, tgt_len, head_dim = query.shape
 
+    softmax_mode = "fast" if os.getenv("FLASH_ATTENTION_FAST_SOFTMAX") == "1" else "None"
+
     if tgt_len == 1:
         # next token
-        softmax_mode = True if os.getenv("QUANT_CONFIG", "") else False
-        recompute_mode = False
+        recompute_mode = True if os.getenv("QUANT_CONFIG", "") else False
     else:
         # first token
-        softmax_mode = "fast" if os.getenv("FLASH_ATTENTION_FAST_SOFTMAX") == "1" else "None"
         recompute_mode = True if os.getenv("FLASH_ATTENTION_RECOMPUTE") == "1" else False
 
     attn_output = FusedSDPA.apply(
diff --git a/optimum/habana/transformers/modeling_utils.py b/optimum/habana/transformers/modeling_utils.py
@@ -138,9 +138,7 @@
     GaudiMllamaTextModel,
     GaudiMllamaTextSelfAttention,
     GaudiMllamaVisionEncoder,
-    GaudiMllamaVisionEncoderLayer,
     GaudiMllamaVisionModel,
-    GaudiMllamaVisionSdpaAttention,
     GaudiMptAttention,
     GaudiMptBlock,
     GaudiMptForCausalLM,
@@ -847,19 +845,18 @@ def adapt_transformers_to_gaudi():
     transformers.models.whisper.modeling_whisper.WhisperForConditionalGeneration = GaudiWhisperForConditionalGeneration
 
     # Optimization for mllama on Gaudi
-    transformers.models.mllama.modeling_mllama.MllamaSelfAttentionDecoderLayer = GaudiMllamaSelfAttentionDecoderLayer
     transformers.models.mllama.modeling_mllama.MllamaCrossAttentionDecoderLayer = GaudiMllamaCrossAttentionDecoderLayer
     transformers.models.mllama.modeling_mllama.MllamaForCausalLM = GaudiMllamaForCausalLM
-    transformers.models.mllama.modeling_mllama.MllamaTextSelfAttention = GaudiMllamaTextSelfAttention
-    transformers.models.mllama.modeling_mllama.MllamaTextCrossAttention = GaudiMllamaTextCrossAttention
     transformers.models.mllama.modeling_mllama.MllamaForConditionalGeneration = GaudiMllamaForConditionalGeneration
+    transformers.models.mllama.modeling_mllama.MllamaModel = GaudiMllamaModel
+    transformers.models.mllama.modeling_mllama.MllamaSelfAttentionDecoderLayer = GaudiMllamaSelfAttentionDecoderLayer
+    transformers.models.mllama.modeling_mllama.MllamaTextCrossAttention = GaudiMllamaTextCrossAttention
     transformers.models.mllama.modeling_mllama.MllamaTextModel = GaudiMllamaTextModel
-    transformers.models.mllama.modeling_mllama.MllamaVisionModel = GaudiMllamaVisionModel
+    transformers.models.mllama.modeling_mllama.MllamaTextSelfAttention = GaudiMllamaTextSelfAttention
     transformers.models.mllama.modeling_mllama.MllamaVisionEncoder = GaudiMllamaVisionEncoder
-    transformers.models.mllama.modeling_mllama.MllamaVisionEncoderLayer = GaudiMllamaVisionEncoderLayer
-    transformers.models.mllama.modeling_mllama.MllamaVisionSdpaAttention = GaudiMllamaVisionSdpaAttention
-    transformers.models.mllama.modeling_mllama.MllamaModel = GaudiMllamaModel
+    transformers.models.mllama.modeling_mllama.MllamaVisionModel = GaudiMllamaVisionModel
 
+    # Optimization for deciLM on Gaudi
     transformers.AutoConfig.register("deci", DeciLMConfig)
     transformers.AutoModelForCausalLM.register(DeciLMConfig, DeciLMForCausalLM)
 
diff --git a/optimum/habana/transformers/models/__init__.py b/optimum/habana/transformers/models/__init__.py
@@ -226,9 +226,7 @@
     GaudiMllamaTextModel,
     GaudiMllamaTextSelfAttention,
     GaudiMllamaVisionEncoder,
-    GaudiMllamaVisionEncoderLayer,
     GaudiMllamaVisionModel,
-    GaudiMllamaVisionSdpaAttention,
 )
 from .modeling_all_models import (
     KVCache,
diff --git a/optimum/habana/transformers/models/mllama/__init__.py b/optimum/habana/transformers/models/mllama/__init__.py
@@ -8,7 +8,5 @@
     GaudiMllamaTextModel,
     GaudiMllamaTextSelfAttention,
     GaudiMllamaVisionEncoder,
-    GaudiMllamaVisionEncoderLayer,
     GaudiMllamaVisionModel,
-    GaudiMllamaVisionSdpaAttention,
 )
diff --git a/optimum/habana/transformers/models/mllama/modeling_mllama.py b/optimum/habana/transformers/models/mllama/modeling_mllama.py

Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,5 @@`
`8`	`8`	`GaudiMllamaTextModel,`
`9`	`9`	`GaudiMllamaTextSelfAttention,`
`10`	`10`	`GaudiMllamaVisionEncoder,`
`11`		`- GaudiMllamaVisionEncoderLayer,`
`12`	`11`	`GaudiMllamaVisionModel,`
`13`		`- GaudiMllamaVisionSdpaAttention,`
`14`	`12`	`)`