
Commit 3eb2f32

Fix inference speed. (#928)
* Fix inference speed.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  For more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent cddfd4d · commit 3eb2f32

File tree: 1 file changed (+12, -13 lines)


fish_speech/models/text2semantic/inference.py (+12, -13)
@@ -13,7 +13,6 @@
 import torch._dynamo.config
 import torch._inductor.config
 from loguru import logger
-from torch.nn.attention import SDPBackend, sdpa_kernel
 from tqdm import tqdm
 from transformers import AutoTokenizer
 
@@ -24,12 +23,7 @@
     TextPart,
     VQPart,
 )
-from fish_speech.models.text2semantic.llama import (
-    BaseModelArgs,
-    BaseTransformer,
-    DualARTransformer,
-    NaiveTransformer,
-)
+from fish_speech.models.text2semantic.llama import BaseModelArgs
 from fish_speech.text import clean_text, split_text
 from fish_speech.tokenizer import IM_END_TOKEN, FishTokenizer
 
@@ -42,6 +36,15 @@
 torch._inductor.config.fx_graph_cache = True
 
 
+from torch.nn.attention import SDPBackend, sdpa_kernel
+
+from fish_speech.models.text2semantic.llama import (
+    BaseTransformer,
+    DualARTransformer,
+    NaiveTransformer,
+)
+
+
 def multinomial_sample_one_no_sync(
     probs_sort,
 ):  # Does multinomial sampling without a cuda synchronization
@@ -369,12 +372,8 @@ def decode_n_tokens(
             window = previous_tokens[:, i - win_size : i]
 
         with (
-            sdpa_kernel(
-                [
-                    SDPBackend.FLASH_ATTENTION,
-                    SDPBackend.EFFICIENT_ATTENTION,
-                    SDPBackend.MATH,
-                ]
+            torch.backends.cuda.sdp_kernel(
+                enable_flash=False, enable_mem_efficient=False, enable_math=True
            )
             if torch.cuda.is_available()
             else nullcontext()
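
For context: besides deferring the heavier llama imports until after the `torch._dynamo`/`torch._inductor` config flags are set, the functional change in `decode_n_tokens` swaps the newer `torch.nn.attention.sdpa_kernel` context manager (which takes a list of allowed backends) for the legacy boolean-flag `torch.backends.cuda.sdp_kernel`, pinning scaled-dot-product attention to the math kernel. Below is a minimal, self-contained sketch of the two APIs as they appear in the diff; the tensor shapes are illustrative only and are not taken from fish-speech.

```python
import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

# Illustrative shapes only: (batch, heads, seq_len, head_dim).
q = k = v = torch.randn(1, 4, 16, 8)

# Newer API (what the diff removes from decode_n_tokens): pass a list of
# allowed backends and let PyTorch pick among them at dispatch time.
with sdpa_kernel(
    [SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION, SDPBackend.MATH]
):
    out = F.scaled_dot_product_attention(q, k, v)

# Legacy API (what the diff switches to), guarded the same way as the diff's
# `if torch.cuda.is_available() else nullcontext()` expression: per-backend
# boolean flags, here disabling everything except the math kernel.
if torch.cuda.is_available():
    q, k, v = q.cuda(), k.cuda(), v.cuda()
    with torch.backends.cuda.sdp_kernel(
        enable_flash=False, enable_mem_efficient=False, enable_math=True
    ):
        out = F.scaled_dot_product_attention(q, k, v)
```

Note that `torch.backends.cuda.sdp_kernel` is deprecated in recent PyTorch releases in favor of `torch.nn.attention.sdpa_kernel`; the two context managers express the same backend choice in different vocabularies.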
