Skip to content

Commit

Permalink
bugfix: fix the behavior of MLA kernel when kv-length is 0 (#868)
Browse files Browse the repository at this point in the history
The scheduling algorithm in #863 does not consider that some requests may have
kv-cache length 0; this PR fixes the issue.
  • Loading branch information
yzh119 authored Feb 17, 2025
1 parent 7cd000b commit 6ec3bae
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 3 deletions.
4 changes: 3 additions & 1 deletion include/flashinfer/attention/scheduler.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1134,7 +1134,8 @@ inline cudaError_t MLAPlan(void* float_buffer, size_t float_workspace_size_in_by
qo_indptr_h[i] * num_heads +
std::min((qo_tile_idx + 1) * cluster_tile_q, packed_qo_len);
}
while (remaining_len > 0) {
bool zero_kv_len = (remaining_len == 0);
while (remaining_len > 0 || zero_kv_len) {
auto [cluster_idx, accum_cost] = cluster_cost_heap.pop();
int actual_len = std::min(remaining_len, kv_len_limit);
cluster_cost_heap.insert(
Expand All @@ -1154,6 +1155,7 @@ inline cudaError_t MLAPlan(void* float_buffer, size_t float_workspace_size_in_by
cluster_kv_end[cluster_idx].push_back(kv_start + actual_len);
remaining_len -= actual_len;
kv_start += actual_len;
if (zero_kv_len) break;
}
split_kv_count += int(split_kv);
}
Expand Down
5 changes: 3 additions & 2 deletions tests/test_deepseek_mla.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def generate_kv_from_cache(ckv, kpe, kv_len, batch_size, num_heads):


@pytest.mark.parametrize("batch_size", [1, 17, 37])
@pytest.mark.parametrize("kv_len", [17, 33, 96, 97, 114, 514, 1024])
@pytest.mark.parametrize("kv_len", [0, 17, 33, 96, 97, 114, 514, 1024])
@pytest.mark.parametrize("qo_len", [1, 17, 37, 77])
@pytest.mark.parametrize("num_heads", [4, 32, 128])
@pytest.mark.parametrize("causal", [False, True])
Expand Down Expand Up @@ -243,7 +243,8 @@ def test_batch_mla_page_attention(
o_ref, lse_ref = attention_ref(batch_size, q, k, v, causal, sm_scale)
lse_ref = lse_ref.flatten(0, 1)
torch.testing.assert_close(o, o_ref, rtol=1e-3, atol=1e-3)
torch.testing.assert_close(lse, lse_ref, rtol=1e-3, atol=1e-3)
if kv_len != 0:
torch.testing.assert_close(lse, lse_ref, rtol=1e-3, atol=1e-3)

# test with pre-allocated output
o_buffer = torch.empty_like(o)
Expand Down

0 comments on commit 6ec3bae

Please sign in to comment.