Select flash attention automatically when available

palonso · palonso · commit d7188dd88292 · 2025-07-15T17:28:31.000+02:00
diff --git a/src/nets/common_former.py b/src/nets/common_former.py
@@ -37,7 +37,11 @@ def forward(self, x):
 
         use_dropout = 0.0 if not self.training else self.dropout
 
-        with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
+        # Prefer flash attention; PyTorch falls back to the other listed
+        # kernels if flash cannot run (SDPBackend has no DEFAULT/is_supported).
+        fallbacks = [SDPBackend.EFFICIENT_ATTENTION, SDPBackend.MATH]
+        sdp_backends = [SDPBackend.FLASH_ATTENTION, *fallbacks]
+        with sdpa_kernel(sdp_backends):
             context_vec = nn.functional.scaled_dot_product_attention(
                 queries,
                 keys,
diff --git a/src/nets/conformer.py b/src/nets/conformer.py
@@ -19,7 +19,6 @@ def __init__(
         qkv_bias=False,
         use_rope=False,
         max_len=10000,
-        use_flash_attention=True,
     ):
         super().__init__()
 
@@ -31,7 +30,6 @@ def __init__(
         self.d_in = d_in
         self.use_rope = use_rope
         self.rope_dim = self.head_dim
-        self.use_flash_attention = use_flash_attention
 
         self.qkv = nn.Linear(d_in, 3 * d_out, bias=qkv_bias)
         self.proj = nn.Linear(d_in, d_out)
@@ -74,17 +72,11 @@ def forward(self, x):
 
         use_dropout = 0.0 if not self.training else self.dropout
 
-        if self.use_flash_attention:
-            with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
-                context_vec = nn.functional.scaled_dot_product_attention(
-                    queries,
-                    keys,
-                    values,
-                    attn_mask=None,
-                    dropout_p=use_dropout,
-                    is_causal=True,
-                )
-        else:
+        # Prefer flash attention; PyTorch falls back to the other listed
+        # kernels if flash cannot run (SDPBackend has no DEFAULT/is_supported).
+        fallbacks = [SDPBackend.EFFICIENT_ATTENTION, SDPBackend.MATH]
+        sdp_backends = [SDPBackend.FLASH_ATTENTION, *fallbacks]
+        with sdpa_kernel(sdp_backends):
             context_vec = nn.functional.scaled_dot_product_attention(
                 queries,
                 keys,
@@ -290,15 +282,13 @@ def __init__(
         alpha=0.1,
         beta=0.1,
         use_rope=False,
-        use_flash_attention=True,
     ):
         super(ConformerBlock, self).__init__()
         self.feed_forward_residual_factor = feed_forward_residual_factor
         self.use_deepnorm = use_deepnorm
         self.alpha = alpha
         self.beta = beta
         self.use_rope = use_rope
-        self.use_flash_attention = use_flash_attention
 
         self.ff1 = FeedForwardBlock(embed_dim, feed_forward_expansion_factor, dropout)
         self.attention = MHAPyTorchScaledDotProduct(
@@ -307,7 +297,6 @@ def __init__(
             num_heads=num_heads,
             dropout=dropout,
             use_rope=use_rope,
-            use_flash_attention=self.use_flash_attention,
         )
         self.conv_block = ConvBlock(embed_dim, conv_kernel_size, dropout)
         self.ff2 = FeedForwardBlock(embed_dim, feed_forward_expansion_factor, dropout)
@@ -399,7 +388,6 @@ def __init__(
         use_rope: bool,
         num_patches: int,
         patch_size: Tuple[int, int] | None = None,
-        use_flash_attention: bool = True,
     ):
         super(Conformer, self).__init__()
         self.embed_dim = embed_dim
@@ -414,7 +402,6 @@ def __init__(
         self.use_deepnorm = use_deepnorm
         self.use_rope = use_rope
         self.num_patches = num_patches
-        self.use_flash_attention = use_flash_attention
 
         self.input_dropout = nn.Dropout(input_dropout)
 
@@ -437,7 +424,6 @@ def __init__(
                     alpha=self.alpha_deepnorm,
                     beta=self.beta_deepnorm,
                     use_rope=self.use_rope,
-                    use_flash_attention=self.use_flash_attention,
                 )
                 for _ in range(depth)
             ]