fix(sla-core): support Ampere GPUs (sm80/sm86/sm87) for sagesla (#44)

Franc1sCai · web-flow · commit c96037364958 · 2025-12-26T12:06:37.000+08:00
diff --git a/turbodiffusion/SLA/core.py b/turbodiffusion/SLA/core.py
@@ -206,30 +206,37 @@ def forward(self, q, k, v, return_sparsity=False):
 
         assert headdim in [64, 128], "headdim should be in [64, 128]. For other headdim, you can use padding and specify the softmax scale."
 
-        ## quant v
-        b, h_kv, kv_len, head_dim = v.shape
-        padded_len = (kv_len + 127) // 128 * 128
-        v_transposed_permutted = torch.empty((b, h_kv, head_dim, padded_len), dtype=v.dtype, device=v.device)
-        fused.transpose_pad_permute_cuda(v, v_transposed_permutted, 1)
-        v_fp8 = torch.empty(v_transposed_permutted.shape, dtype=torch.float8_e4m3fn, device=v.device)
-        v_scale = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device)
-        fused.scale_fuse_quant_cuda(v_transposed_permutted, v_fp8, v_scale, kv_len, 2.25, 1)
-
         o_s = torch.empty_like(q)
-        if arch == "sm90":
-            qattn.qk_int8_sv_f8_accum_f32_block_sparse_attn_inst_buf_fuse_v_scale_sm90(
-                q_int8, k_int8, v_fp8, o_s, lut, valid_block_num, q_scale, k_scale, v_scale, 1, False, 1, scale
+
+        if arch in ("sm80", "sm86", "sm87"):
+            pvthreshold = torch.full((q.shape[-3],), 1e6, dtype=torch.float32, device=q.device)
+            v_fp16 = v.to(torch.float16)
+            qattn.qk_int8_sv_f16_accum_f16_block_sparse_attn_inst_buf_with_pv_threshold(
+                q_int8, k_int8, v_fp16, o_s, lut, valid_block_num, pvthreshold, q_scale, k_scale, 1, False, 1, scale, 0
             )
         else:
-            pvthreshold = torch.full((q.shape[-3],), 1e6, dtype=torch.float32, device=q.device)
-            if SAGE2PP_ENABLED:
-                qk_int8_sv_f8_accum_f16_block_sparse_attn_inst_buf_fuse_v_scale_with_pv_threshold(
-                    q_int8, k_int8, v_fp8, o_s, lut, valid_block_num, pvthreshold, q_scale, k_scale, v_scale, 1, False, 1, scale, 0
+            b, h_kv, kv_len, head_dim = v.shape
+            padded_len = (kv_len + 127) // 128 * 128
+            v_transposed_permutted = torch.empty((b, h_kv, head_dim, padded_len), dtype=v.dtype, device=v.device)
+            fused.transpose_pad_permute_cuda(v, v_transposed_permutted, 1)
+            v_fp8 = torch.empty(v_transposed_permutted.shape, dtype=torch.float8_e4m3fn, device=v.device)
+            v_scale = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device)
+            fused.scale_fuse_quant_cuda(v_transposed_permutted, v_fp8, v_scale, kv_len, 2.25, 1)
+
+            if arch == "sm90":
+                qattn.qk_int8_sv_f8_accum_f32_block_sparse_attn_inst_buf_fuse_v_scale_sm90(
+                    q_int8, k_int8, v_fp8, o_s, lut, valid_block_num, q_scale, k_scale, v_scale, 1, False, 1, scale
                 )
             else:
-                qattn.qk_int8_sv_f8_accum_f32_block_sparse_attn_inst_buf_fuse_v_scale_with_pv_threshold(
-                    q_int8, k_int8, v_fp8, o_s, lut, valid_block_num, pvthreshold, q_scale, k_scale, v_scale, 1, False, 1, scale, 0
-                )
+                pvthreshold = torch.full((q.shape[-3],), 1e6, dtype=torch.float32, device=q.device)
+                if SAGE2PP_ENABLED:
+                    qk_int8_sv_f8_accum_f16_block_sparse_attn_inst_buf_fuse_v_scale_with_pv_threshold(
+                        q_int8, k_int8, v_fp8, o_s, lut, valid_block_num, pvthreshold, q_scale, k_scale, v_scale, 1, False, 1, scale, 0
+                    )
+                else:
+                    qattn.qk_int8_sv_f8_accum_f32_block_sparse_attn_inst_buf_fuse_v_scale_with_pv_threshold(
+                        q_int8, k_int8, v_fp8, o_s, lut, valid_block_num, pvthreshold, q_scale, k_scale, v_scale, 1, False, 1, scale, 0
+                    )
 
         ########## SPARGE END ##########