Commit 3e99519

Set attn_implementation="eager" for ViT, BERT and BGE-M3
1 parent 24c5cdd commit 3e99519

File tree

3 files changed: +42 -1 lines changed

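For context: attn_implementation is a standard from_pretrained keyword argument in recent transformers releases, and "eager" selects the reference PyTorch attention path instead of SDPA or FlashAttention kernels. The helper added in this commit injects that keyword as a default; a minimal sketch of the equivalent explicit call (the checkpoint name below is illustrative, not the one the benchmark loads):

from transformers import BertModel

# "eager" bypasses SDPA/FlashAttention and uses the plain attention implementation
model = BertModel.from_pretrained("bert-base-uncased", attn_implementation="eager")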

benchmark/tt-xla/encoders.py

Lines changed: 12 additions & 1 deletion

@@ -9,7 +9,7 @@

 from benchmark.utils import aggregate_ttnn_perf_metrics, sanitize_filename
 from encoder_benchmark import benchmark_encoder_torch_xla
-from utils import apply_mean_pooling, apply_last_token_pooling
+from utils import apply_mean_pooling, apply_last_token_pooling, patch_transformers_for_eager_attn


 DTYPE_MAP = {

@@ -156,6 +156,11 @@ def test_encoder(
 def test_bert(output_file):
     from third_party.tt_forge_models.bert.sentence_embedding_generation.pytorch.loader import ModelLoader

+    # TODO(vkovacevic): Issue #804
+    from transformers import BertModel
+
+    patch_transformers_for_eager_attn(BertModel)
+
     # Configuration
     data_format = "bfloat16"
     input_sequence_length = 384

@@ -307,9 +312,15 @@ def test_bge_m3(output_file):
     import torch
     import numpy as np
     from collections import defaultdict
+
     from third_party.tt_forge_models.bge_m3.encode.pytorch.loader import ModelLoader
     from FlagEmbedding import BGEM3FlagModel

+    # TODO(vkovacevic): Issue #804
+    from transformers import XLMRobertaModel
+
+    patch_transformers_for_eager_attn(XLMRobertaModel)
+
     # Configuration
     data_format = "float32"
     input_sequence_length = 512
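Note that patch_transformers_for_eager_attn has to run before the third-party loader constructs the model, and because it uses kwargs.setdefault it only supplies a default: an explicitly passed attn_implementation still wins. A hedged sketch of both points (checkpoint name illustrative; the utils import assumes benchmark/tt-xla is on sys.path, as in the test files above):

from transformers import BertModel
from utils import patch_transformers_for_eager_attn

patch_transformers_for_eager_attn(BertModel)

eager_model = BertModel.from_pretrained("bert-base-uncased")  # default injected -> eager
sdpa_model = BertModel.from_pretrained("bert-base-uncased", attn_implementation="sdpa")  # explicit value kept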

benchmark/tt-xla/utils.py

Lines changed: 24 additions & 0 deletions

@@ -377,3 +377,27 @@ def move_to_cpu(data):
         moved = [move_to_cpu(item) for item in data]
         return type(data)(moved)
     return data
+
+
+# TODO(vkovacevic): Issue #804
+def patch_transformers_for_eager_attn(cls):
+    """Monkey patch a transformers model class to use eager attention implementation.
+
+    This patches the from_pretrained method to inject attn_implementation="eager",
+    which disables SDPA and other optimized attention implementations that may not
+    be compatible with certain backends.
+
+    Args:
+        cls: A transformers model class (e.g., ViTForImageClassification, BertModel)
+    """
+    from functools import wraps
+
+    original_from_pretrained = cls.from_pretrained.__func__
+
+    @classmethod
+    @wraps(original_from_pretrained)
+    def patched_from_pretrained(cls, *args, **kwargs):
+        kwargs.setdefault("attn_implementation", "eager")
+        return original_from_pretrained(cls, *args, **kwargs)
+
+    cls.from_pretrained = patched_from_pretrained
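The helper above relies on three details: accessing a classmethod through the class yields a bound method whose .__func__ attribute is the underlying function, wrapping the replacement in @classmethod restores the original calling convention, and kwargs.setdefault injects "eager" without overriding an explicit value. A self-contained sketch of the same mechanism using a stand-in class (FakeModel and patch_for_eager are illustrative names; it runs without transformers installed):

from functools import wraps


class FakeModel:
    @classmethod
    def from_pretrained(cls, name, **kwargs):
        return f"{cls.__name__}({name}, attn={kwargs.get('attn_implementation', 'sdpa')})"


def patch_for_eager(cls):
    # .__func__ unwraps the bound classmethod so it can be re-wrapped cleanly
    original = cls.from_pretrained.__func__

    @classmethod
    @wraps(original)
    def patched(cls, *args, **kwargs):
        kwargs.setdefault("attn_implementation", "eager")  # inject default only
        return original(cls, *args, **kwargs)

    cls.from_pretrained = patched


patch_for_eager(FakeModel)
print(FakeModel.from_pretrained("dummy"))                              # attn=eager
print(FakeModel.from_pretrained("dummy", attn_implementation="sdpa"))  # explicit value wins

Running the sketch prints attn=eager for the first call and attn=sdpa for the second, which is the behaviour the benchmark relies on when the third-party loaders call from_pretrained without the keyword.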

benchmark/tt-xla/vision_models.py

Lines changed: 6 additions & 0 deletions

@@ -6,6 +6,7 @@
 import os

 from benchmark.utils import aggregate_ttnn_perf_metrics, sanitize_filename
+from utils import patch_transformers_for_eager_attn
 from vision_benchmark import benchmark_vision_torch_xla

 # Defaults for all vision models

@@ -229,6 +230,11 @@ def test_unet(output_file):
 def test_vit(output_file):
     from third_party.tt_forge_models.vit.pytorch.loader import ModelLoader, ModelVariant

+    # TODO(vkovacevic): Issue #804
+    from transformers import ViTForImageClassification
+
+    patch_transformers_for_eager_attn(ViTForImageClassification)
+
     variant = ModelVariant.BASE
     read_logits_fn = lambda output: output.logits
     test_vision(
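One way to sanity-check the patch after loading, assuming a recent transformers release that records the resolved implementation on the internal config._attn_implementation attribute (the checkpoint name is illustrative, not necessarily the one ModelLoader uses):

from transformers import ViTForImageClassification
from utils import patch_transformers_for_eager_attn

patch_transformers_for_eager_attn(ViTForImageClassification)
model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")

# _attn_implementation is an internal attribute of recent transformers configs
assert model.config._attn_implementation == "eager"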
