
Commit c0dbc38

feat: Mamba2 SSD Combined Forward Pass (Blackwell CuTe DSL Kernel) (#2709)
## 📌 Description

This PR adds a high-performance **Mamba2 Structured State-Space Duality (SSD) combined forward kernel** targeting Blackwell (SM100+) GPUs, implemented in NVIDIA's CuTe DSL. The kernel fuses the entire SSD chunk-scan forward pass, including cumulative-sum preprocessing, chunk state computation, inter-chunk state passing, and output projection, into a single persistent kernel launch.

### Features

- **Datatypes**: bf16 I/O, bf16/fp16 state cache (fp32 state not yet supported)
- **D tensor fusion**: optional additive bias with per-head (`d_has_hdim=True`) or broadcast (`d_has_hdim=False`) modes
- **Z gating**: optional sigmoid gating on the output
- **Initial states**: optional user-provided initial hidden states
- **Variable-length sequences**: packed varlen via `seq_idx` + precomputed chunk metadata
- **Batched mode**: uniform-length sequences without varlen overhead
- **CUDA graph compatible**: all allocations cached, no host-device sync in the hot path

### Architecture

The kernel uses **16 warps** with warp-level task specialization:

| Warps | Role |
|-------|------|
| 0 | TMA loads (B, C, x, dt, A, D, z) |
| 1–4 | MMA: CB = C^T B (state-space matrix) |
| 5–8 | MMA: P = CB * decay (intra-chunk transitions) |
| 9 | Preprocessing (dt softplus/limit, dA cumsum, decay masks) |
| 10–13 | MMA: states = B^T (x * dt) + decay * prev_state |
| 14–15 | Epilogue MMA: out = C * state, write output via TMA |

A persistent tile scheduler distributes chunks across SMs. Shape-polymorphic compilation (via CuTe DSL fake tensors with symbolic dims) means **one compilation covers all batch/seqlen combinations**.
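For readers unfamiliar with SSD, the stages the kernel fuses (dA cumsum, the causal `CB * decay` intra-chunk matmul, chunk-state computation, and inter-chunk state passing) can be sketched in plain NumPy. This is an illustrative single-head reference with a scalar A per head (as in Mamba2), not the kernel's actual tiling or datatypes:

```python
import numpy as np

def ssd_sequential(x, dt, A, B, C):
    """Token-by-token SSM recurrence: h_t = exp(dt_t*A)*h + dt_t * x_t B_t^T, y_t = h_t C_t."""
    h = np.zeros((x.shape[1], B.shape[1]))  # (headdim, dstate)
    y = np.empty_like(x)
    for t in range(len(x)):
        h = np.exp(dt[t] * A) * h + dt[t] * np.outer(x[t], B[t])
        y[t] = h @ C[t]
    return y

def ssd_chunked(x, dt, A, B, C, chunk_size):
    """Same result, computed chunk-by-chunk in the SSD style."""
    y = np.empty_like(x)
    h = np.zeros((x.shape[1], B.shape[1]))  # inter-chunk state
    for start in range(0, len(x), chunk_size):
        xc, dtc = x[start:start + chunk_size], dt[start:start + chunk_size]
        Bc, Cc = B[start:start + chunk_size], C[start:start + chunk_size]
        s = np.cumsum(dtc * A)  # dA cumsum within the chunk
        # intra-chunk: causal (C B^T) weighted by pairwise decay ("CB * decay")
        P = np.tril((Cc @ Bc.T) * np.exp(s[:, None] - s[None, :]) * dtc[None, :])
        y_intra = P @ xc
        # contribution of the state carried in from previous chunks
        y_state = np.exp(s)[:, None] * (Cc @ h.T)
        y[start:start + chunk_size] = y_intra + y_state
        # chunk-state update + inter-chunk state passing
        w = np.exp(s[-1] - s) * dtc
        h = np.exp(s[-1]) * h + xc.T @ (w[:, None] * Bc)
    return y
```

With random inputs and A < 0, the chunked and sequential paths agree to floating-point tolerance; the real kernel additionally handles multiple heads/groups, the D and z fusions, and varlen packing.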
### Files Added/Modified

| Area | Key files |
|------|-----------|
| Python API | `flashinfer/mamba/ssd_combined.py` (`SSDCombined` class) |
| CuTe DSL kernel | `flashinfer/mamba/ssd_kernel.py` (4.3k lines) |
| Tile scheduler | `flashinfer/mamba/ssd_tile_scheduler.py` |
| Cumsum preprocessing | `flashinfer/triton/kernels/ssd_chunk_state.py` (Triton) |
| Seq-chunk metadata | `include/flashinfer/mamba/seq_chunk_cumsum.cuh`, `csrc/seq_chunk_cumsum.cu` |
| JIT integration | `flashinfer/jit/mamba/seq_chunk_cumsum.py` |
| Tests | `tests/mamba/test_chunk_scan_combined.py` (1.9k lines) |
| Triton reference | `tests/mamba/triton_reference/ssd_*.py` |
| Benchmark | `benchmarks/bench_mamba_ssd_combined.py` |

### Performance

All numbers on **NVIDIA B200**, bf16, `chunk_size=128, nheads=8, headdim=64, dstate=128, ngroups=8`. Timings were collected with CUDA graphs.

**Column definitions:**

- **batch** / **num_seqs**: number of independent sequences (batched) or packed user sequences (varlen)
- **chunks/seq**: number of `chunk_size`-token chunks per sequence (`seqlen / chunk_size`)
- **total chunks**: total chunk count across all sequences (`batch * chunks/seq`)
- **total seqlen**: total number of tokens processed (`batch * chunks/seq * chunk_size`)
- **FlashInfer (ms)**: end-to-end wall time for the fused CuTe DSL kernel (including Triton cumsum preprocessing)
- **Triton (ms)**: end-to-end wall time for the Triton reference (5 separate kernel launches)
- **Speedup**: `Triton / FlashInfer` (>1x means FlashInfer is faster)

#### Batched mode (uniform sequence lengths, no initial states)

| batch | chunks/seq | total chunks | total seqlen | FlashInfer (ms) | Triton (ms) | Speedup |
|------:|-----------:|-------------:|-------------:|----------------:|------------:|--------:|
| 1 | 1 | 1 | 128 | 0.012 | 0.016 | 1.28x |
| 1 | 4 | 4 | 512 | 0.032 | 0.018 | 0.57x |
| 1 | 16 | 16 | 2,048 | 0.112 | 0.031 | 0.28x |
| 1 | 64 | 64 | 8,192 | 0.425 | 0.080 | 0.19x |
| 1 | 256 | 256 | 32,768 | 1.675 | 0.317 | 0.19x |
| 4 | 1 | 1 | 128 | 0.013 | 0.018 | 1.44x |
| 4 | 4 | 4 | 512 | 0.034 | 0.029 | 0.87x |
| 4 | 16 | 16 | 2,048 | 0.112 | 0.073 | 0.65x |
| 4 | 64 | 64 | 8,192 | 0.426 | 0.255 | 0.60x |
| 16 | 1 | 1 | 128 | 0.015 | 0.033 | 2.22x |
| 16 | 4 | 4 | 512 | 0.036 | 0.074 | 2.05x |
| 16 | 16 | 16 | 2,048 | 0.116 | 0.252 | 2.17x |
| 64 | 1 | 1 | 128 | 0.038 | 0.095 | 2.50x |
| 64 | 4 | 4 | 512 | 0.118 | 0.261 | 2.21x |
| 64 | 16 | 16 | 2,048 | 0.437 | 0.952 | 2.18x |
| 128 | 1 | 1 | 128 | 0.062 | 0.173 | 2.81x |
| 128 | 4 | 4 | 512 | 0.201 | 0.502 | 2.49x |
| 128 | 16 | 16 | 2,048 | 0.759 | 1.881 | 2.48x |
| 256 | 1 | 1 | 128 | 0.113 | 0.326 | 2.87x |
| 256 | 4 | 4 | 512 | 0.391 | 0.982 | 2.51x |
| 256 | 16 | 16 | 2,048 | 1.507 | 3.736 | 2.48x |
| 512 | 2 | 2 | 256 | 0.397 | 1.028 | 2.59x |

> At low batch / many chunks per sequence (batch=1, long sequences), the Triton reference is faster because it launches 5 separate small kernels with lower per-kernel overhead. The fused CuTe kernel wins at **batch >= 16**, where parallelism across sequences saturates the GPU.
#### Varlen mode (packed sequences, with initial states)

| num_seqs | chunks/seq | total chunks | total seqlen | FlashInfer (ms) | Triton (ms) | Speedup |
|---------:|-----------:|-------------:|-------------:|----------------:|------------:|--------:|
| 1 | 1 | 1 | 128 | 0.022 | 0.027 | 1.22x |
| 4 | 1 | 4 | 512 | 0.023 | 0.030 | 1.31x |
| 8 | 1 | 8 | 1,024 | 0.024 | 0.041 | 1.71x |
| 32 | 1 | 32 | 4,096 | 0.046 | 0.118 | 2.56x |
| 64 | 1 | 64 | 8,192 | 0.080 | 0.211 | 2.63x |
| 128 | 1 | 128 | 16,384 | 0.134 | 0.388 | 2.90x |
| 256 | 1 | 256 | 32,768 | 0.259 | 0.753 | 2.91x |
| 4 | 8 | 32 | 4,096 | 0.119 | 0.103 | 0.87x |
| 8 | 8 | 64 | 8,192 | 0.121 | 0.177 | 1.46x |
| 16 | 8 | 128 | 16,384 | 0.124 | 0.335 | 2.70x |
| 32 | 8 | 256 | 32,768 | 0.237 | 0.657 | 2.77x |
| 64 | 8 | 512 | 65,536 | 0.463 | 1.283 | 2.77x |
| 32 | 32 | 1,024 | 131,072 | 0.896 | 2.486 | 2.77x |
| 64 | 32 | 2,048 | 262,144 | 1.771 | 4.936 | 2.79x |
| 128 | 32 | 4,096 | 524,288 | 3.099 | 9.819 | 3.17x |

> In the serving-relevant regime (many short sequences packed together), FlashInfer is consistently **2.5–3.2x faster** than Triton. The single-fused-kernel design amortizes launch overhead across all packed sequences.

### Reproducing Benchmarks

The benchmark script is included at `benchmarks/bench_mamba_ssd_combined.py`:

```bash
# Batched mode (CUDA graphs)
python benchmarks/bench_mamba_ssd_combined.py --batched --cuda_graph

# Varlen mode (CUDA graphs)
python benchmarks/bench_mamba_ssd_combined.py --varlen --cuda_graph
```

### Limitations / Future Work

- **fp32 state cache** not yet supported (bf16/fp16 only)
- **Forward pass only**: the backward pass is not included
- **SM100+ required**: Blackwell only (CuTe DSL)
- The low-batch / long-sequence regime (batch=1, many chunks) is slower than Triton due to persistent-kernel overhead vs. 5 small kernel launches

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer!
Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

## Reviewer Notes

The kernel is Blackwell-only. Please check whether I handled all the imports correctly.

## Summary by CodeRabbit

* **New Features**
  * New SSDCombined implementation with varlen, initial states, optional scaling and gating, plus GPU-backed chunked primitives and a tile scheduler for faster chunked SSM/attention workloads.
* **Benchmarks**
  * Added a CLI benchmarking tool with single/multi-config sweeps and profiling modes (NCU, profiler) for per-configuration comparisons.
* **Tests**
  * Large test suite covering batched/varlen paths, dtype combinations, gating, initial states, and end-to-end correctness.
1 parent f01e83e commit c0dbc38

18 files changed

Lines changed: 10983 additions & 0 deletions

benchmarks/bench_mamba_ssd_combined.py

Lines changed: 688 additions & 0 deletions

csrc/seq_chunk_cumsum.cu

Lines changed: 58 additions & 0 deletions
```cpp
/*
 * Copyright (c) 2025 by FlashInfer team.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "flashinfer/mamba/seq_chunk_cumsum.cuh"
#include "tvm_ffi_utils.h"

using namespace flashinfer::mamba;
using tvm::ffi::Optional;

void seq_chunk_cumsum(TensorView seq_idx, TensorView chunk_indices, TensorView chunk_offsets,
                      TensorView output, Optional<TensorView> tile_state, int64_t chunk_size,
                      int64_t num_logical_chunks, int64_t num_seqs) {
  CHECK_INPUT(seq_idx);
  CHECK_INPUT(chunk_indices);
  CHECK_INPUT(chunk_offsets);
  CHECK_INPUT(output);

  auto stream = get_stream(seq_idx.device());

  uint8_t* tile_state_ptr = nullptr;
  std::size_t tile_state_size = 0;
  if (tile_state.has_value()) {
    CHECK_INPUT(tile_state.value());
    tile_state_ptr = static_cast<uint8_t*>(tile_state.value().data_ptr());
    tile_state_size = static_cast<std::size_t>(tile_state.value().shape()[0]);
  }

  cudaError_t status;
  DISPATCH_DLPACK_IDTYPE_TO_CTYPE(seq_idx.dtype(), SeqIdxT, [&] {
    status = SeqChunkCumsumLauncher(static_cast<const SeqIdxT*>(seq_idx.data_ptr()),
                                    static_cast<const int32_t*>(chunk_indices.data_ptr()),
                                    static_cast<const int32_t*>(chunk_offsets.data_ptr()),
                                    static_cast<int32_t*>(output.data_ptr()), tile_state_ptr,
                                    tile_state_size, static_cast<int>(chunk_size),
                                    static_cast<int>(num_logical_chunks),
                                    static_cast<int>(num_seqs), stream);
    return true;
  });

  TVM_FFI_ICHECK(status == cudaSuccess)
      << "SeqChunkCumsumLauncher failed: " << cudaGetErrorString(status);
}

int64_t seq_chunk_cumsum_tile_state_size(int64_t num_seqs) {
  return static_cast<int64_t>(SeqChunkCumsumWorkspaceSize(static_cast<int>(num_seqs)));
}
```
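The preprocessing this binding wraps is, conceptually, a cumulative sum over a packed token stream that must restart at every sequence boundary (signaled by `seq_idx`). A toy NumPy sketch of that segmented-cumsum idea follows; it illustrates only the concept, not the actual GPU metadata layout, whose `chunk_indices`/`chunk_offsets` encoding lives in `seq_chunk_cumsum.cuh`:

```python
import numpy as np

def segmented_cumsum(values, seq_idx):
    """Cumulative sum over a packed stream that restarts whenever
    seq_idx changes, i.e. at every sequence boundary."""
    out = np.empty(len(values))
    run, prev = 0.0, None
    for i, (v, s) in enumerate(zip(values, seq_idx)):
        if s != prev:          # new sequence begins: reset the running sum
            run, prev = 0.0, s
        run += v
        out[i] = run
    return out

# two packed sequences of lengths 3 and 2; the sum restarts at the boundary
print(segmented_cumsum([1, 1, 1, 5, 5], [0, 0, 0, 1, 1]))
```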
Lines changed: 27 additions & 0 deletions
```cpp
/*
 * Copyright (c) 2025 by FlashInfer team.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "tvm_ffi_utils.h"

using tvm::ffi::Optional;

void seq_chunk_cumsum(TensorView seq_idx, TensorView chunk_indices, TensorView chunk_offsets,
                      TensorView output, Optional<TensorView> tile_state, int64_t chunk_size,
                      int64_t num_logical_chunks, int64_t num_seqs);

int64_t seq_chunk_cumsum_tile_state_size(int64_t num_seqs);

TVM_FFI_DLL_EXPORT_TYPED_FUNC(seq_chunk_cumsum, seq_chunk_cumsum);
TVM_FFI_DLL_EXPORT_TYPED_FUNC(seq_chunk_cumsum_tile_state_size, seq_chunk_cumsum_tile_state_size);
```

flashinfer/jit/mamba/__init__.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -18,8 +18,10 @@
     gen_selective_state_update_module,
     gen_selective_state_update_sm90_module,
 )
+from .seq_chunk_cumsum import gen_seq_chunk_cumsum_module

 __all__ = [
     "gen_selective_state_update_module",
     "gen_selective_state_update_sm90_module",
+    "gen_seq_chunk_cumsum_module",
 ]
```
Lines changed: 33 additions & 0 deletions
```python
"""
Copyright (c) 2025 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from .. import env as jit_env
from ..core import JitSpec, gen_jit_spec


def gen_seq_chunk_cumsum_module() -> JitSpec:
    """Generate the JIT module for the seq_chunk_cumsum kernel.

    No Jinja, no dtype parameterization: everything is int32.
    No architecture restrictions: plain CUDA (no tensor cores).
    """
    return gen_jit_spec(
        "mamba_seq_chunk_cumsum",
        [
            jit_env.FLASHINFER_CSRC_DIR / "seq_chunk_cumsum.cu",
            jit_env.FLASHINFER_CSRC_DIR / "seq_chunk_cumsum_jit_binding.cu",
        ],
    )
```

flashinfer/mamba/__init__.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -17,3 +17,10 @@
 from .selective_state_update import selective_state_update

 __all__ = ["selective_state_update"]
+
+try:
+    from .ssd_combined import SSDCombined
+
+    __all__.append("SSDCombined")
+except ImportError:
+    pass
```
