Skip to content

Commit c3aceb6

Browse files
authored
[Models][OP][Optimization] Support DeepSeek-v3.2 model, integrate DSA & Indexer architecture with FlashMLA/DeepGEMM (#6689)
* Support DeepSeek-v3.2 model, integrate DSA & Indexer architecture with FlashMLA/DeepGEMM
1 parent 25c4793 commit c3aceb6

File tree

22 files changed

+8021
-142
lines changed

22 files changed

+8021
-142
lines changed

custom_ops/gpu_ops/append_attn/ds_mla_cache_kernel.cu

Lines changed: 616 additions & 0 deletions
Large diffs are not rendered by default.

custom_ops/gpu_ops/append_attn/ds_mla_cache_kernel.cuh

Lines changed: 548 additions & 0 deletions
Large diffs are not rendered by default.

custom_ops/gpu_ops/cpp_extensions.cc

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1133,6 +1133,47 @@ std::vector<paddle::Tensor> get_attn_mask_q(
11331133
const paddle::optional<paddle::Tensor>& attn_mask_kv,
11341134
const int kv_token_num);
11351135

1136+
// --- DeepSeek-V3.2 DSA / Indexer custom-op prototypes (implementations live
// --- in the corresponding .cu translation units). ---

// In-place ragged top-k: for each ragged row of `input`, writes the indices
// of the `top_k` largest entries into `output_indices`; `offsets`/`lengths`
// describe the ragged layout. `seq_len_decoder`/`batch_id_per_token` are
// optional decode-phase metadata and `maybe_row_states_buffer` an optional
// reusable per-row scratch buffer.
// NOTE(review): semantics inferred from parameter names — confirm against the
// kernel implementation.
void RadixTopkRaggedTransform(
    paddle::Tensor& input,
    paddle::Tensor& output_indices,
    const paddle::Tensor& offsets,
    paddle::Tensor& lengths,
    paddle::optional<paddle::Tensor>& seq_len_decoder,
    paddle::optional<paddle::Tensor>& batch_id_per_token,
    paddle::optional<paddle::Tensor>& maybe_row_states_buffer,
    int top_k,
    int q_num_heads = 0);

// Writes MLA KV entries (`kv_nope` + `kv_pe`) into the paged `kv_cache`
// using `slot_mapping`/`block_tables`, optionally quantizing per
// `cache_quant_type_str`/`scale`. Presumably prefill vs. decode behavior is
// switched by `is_prefill` — verify against ds_mla_cache_kernel.cu.
std::vector<paddle::Tensor> DSMLAWriteCacheKernel(
    const paddle::Tensor& kv_nope,
    const paddle::Tensor& kv_pe,
    const paddle::Tensor& kv_cache,
    const paddle::Tensor& slot_mapping,
    const paddle::Tensor& seq_lens,
    const paddle::Tensor& seq_lens_decoder,
    const paddle::Tensor& batch_id_per_token,
    const paddle::Tensor& cu_seqlens_q,
    const paddle::Tensor& block_tables,
    const paddle::optional<paddle::Tensor>& kv_signal_data,
    const paddle::optional<paddle::Tensor>& scale,
    const std::string& cache_quant_type_str,
    const int max_seq_len,
    const bool is_prefill);

// Quantizes indexer keys `k` in blocks of `quant_block_size` and stores them
// into `kv_cache` at positions given by `slot_mapping`; `scale_fmt` selects
// the on-cache scale encoding (format values defined by the kernel).
std::vector<paddle::Tensor> IndexerKQuantAndCacheKernel(
    const paddle::Tensor& k,
    const paddle::Tensor& kv_cache,
    const paddle::Tensor& slot_mapping,
    const int64_t quant_block_size,
    const std::string& scale_fmt);

// Context-parallel gather: reads quantized indexer keys (and their scales)
// back out of the paged `kv_cache` into the contiguous `dst_k`/`dst_scale`
// buffers, addressed via `block_table` and `cu_seq_lens`.
std::vector<paddle::Tensor> CpGatherIndexerKQuantCacheKernel(
    const paddle::Tensor& kv_cache,
    paddle::Tensor& dst_k,
    paddle::Tensor& dst_scale,
    const paddle::Tensor& block_table,
    const paddle::Tensor& cu_seq_lens);
1176+
11361177
PYBIND11_MODULE(fastdeploy_ops, m) {
11371178
m.def("get_expert_token_num",
11381179
&GetExpertTokenNum,
@@ -1736,4 +1777,18 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
17361777
m.def("custom_numpy_to_tensor",
17371778
&CustomNumpyToTensor,
17381779
"custom_numpy_to_tensor function");
1780+
1781+
m.def("radix_topk_ragged_transform",
1782+
&RadixTopkRaggedTransform,
1783+
"radix_topk_ragged_transform function");
1784+
1785+
m.def("dsk_attn_write_cache", &DSMLAWriteCacheKernel, "dsk_attn_write_cache");
1786+
1787+
m.def("indexer_k_quant_and_cache",
1788+
&IndexerKQuantAndCacheKernel,
1789+
"indexer_k_quant_and_cache");
1790+
1791+
m.def("cp_gather_indexer_k_quant_cache",
1792+
&CpGatherIndexerKQuantCacheKernel,
1793+
"cp_gather_indexer_k_quant_cache");
17391794
}

custom_ops/gpu_ops/helper.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -662,7 +662,8 @@ inline const char *getEnvVar(const char *varName) {
662662

663663
inline bool checkAttentionBackend() {
664664
const char *backend = getEnvVar("FD_ATTENTION_BACKEND");
665-
if (backend && std::strcmp(backend, "MLA_ATTN") == 0) {
665+
if (backend && (std::strcmp(backend, "MLA_ATTN") == 0 ||
666+
std::strcmp(backend, "DSA_ATTN") == 0)) {
666667
return true;
667668
}
668669
return false;
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
/*
2+
* Copyright (c) 2024 by FlashInfer team.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#ifndef FLASHINFER_EXCEPTION_H_
17+
#define FLASHINFER_EXCEPTION_H_
18+
19+
#include <exception>
20+
#include <iostream>
21+
#include <sstream>
22+
23+
// Throws flashinfer::Error annotated with the current function, file and
// line. `message` must be convertible to std::string (the Error constructor
// takes a const std::string&).
#define FLASHINFER_ERROR(message) \
  throw flashinfer::Error(__FUNCTION__, __FILE__, __LINE__, message)
25+
26+
// Streams zero or more values into `oss`, separating consecutive values with
// a single space (no leading or trailing separator). With no values this is
// a no-op.
inline void write_to_stream(std::ostringstream& oss) { (void)oss; }

// Streams `head`, then recurses on the remaining values; `if constexpr`
// drops both the separator and the recursive call once `head` is the last
// argument, so the single-value case needs no dedicated overload.
template <typename Head, typename... Tail>
void write_to_stream(std::ostringstream& oss, Head&& head, Tail&&... tail) {
  oss << std::forward<Head>(head);
  if constexpr (sizeof...(tail) > 0) {
    oss << " ";
    write_to_stream(oss, std::forward<Tail>(tail)...);
  }
}
41+
42+
// Helper macro to handle empty __VA_ARGS__: throws via FLASHINFER_ERROR when
// `condition` is false.
// BUGFIX: wrapped in do/while(0) so the expansion is a single statement —
// the previous bare `if (...) { ... }` form would silently steal a following
// `else` when used in an unbraced if/else body. The sibling macros
// (FLASHINFER_CHECK / FLASHINFER_WARN) already use this idiom.
#define FLASHINFER_CHECK_IMPL(condition, message) \
  do {                                            \
    if (!(condition)) {                           \
      FLASHINFER_ERROR(message);                  \
    }                                             \
  } while (0)
47+
48+
// Main macro that handles both cases: if `condition` is false, formats the
// optional extra arguments (space-separated, via write_to_stream) into the
// error message and throws flashinfer::Error with function/file/line context.
// With no extra arguments the message falls back to
// "Check failed: <condition text>". `##__VA_ARGS__` is the GNU/Clang/MSVC
// extension that swallows the trailing comma when the va-args are empty.
// (Comments must stay above the #define — a // inside the macro would
// comment out the line-continuation backslash.)
#define FLASHINFER_CHECK(condition, ...)    \
  do {                                      \
    if (!(condition)) {                     \
      std::ostringstream oss;               \
      write_to_stream(oss, ##__VA_ARGS__);  \
      std::string msg = oss.str();          \
      if (msg.empty()) {                    \
        msg = "Check failed: " #condition;  \
      }                                     \
      FLASHINFER_ERROR(msg);                \
    }                                       \
  } while (0)
61+
62+
// Warning macro — non-fatal counterpart of FLASHINFER_CHECK: formats the
// arguments the same way (space-separated via write_to_stream, defaulting to
// "Warning triggered" when empty) and prints a flashinfer::Warning with
// function/file/line context to stderr instead of throwing.
#define FLASHINFER_WARN(...)                                            \
  do {                                                                  \
    std::ostringstream oss;                                             \
    write_to_stream(oss, ##__VA_ARGS__);                                \
    std::string msg = oss.str();                                        \
    if (msg.empty()) {                                                  \
      msg = "Warning triggered";                                        \
    }                                                                   \
    flashinfer::Warning(__FUNCTION__, __FILE__, __LINE__, msg).emit();  \
  } while (0)
73+
74+
namespace flashinfer {
75+
// Exception type thrown by FLASHINFER_ERROR / FLASHINFER_CHECK. Carries a
// pre-formatted "Error in function '<func>' at <file>:<line>: <message>"
// string so what() can stay noexcept.
class Error : public std::exception {
 private:
  std::string message_;

 public:
  // Builds the full diagnostic string eagerly at construction time.
  Error(const std::string& func,
        const std::string& file,
        int line,
        const std::string& message)
      : message_("Error in function '" + func + "' at " + file + ":" +
                 std::to_string(line) + ": " + message) {}

  const char* what() const noexcept override { return message_.c_str(); }
};
94+
95+
// Non-fatal diagnostic used by FLASHINFER_WARN. Pre-formats
// "Warning in function '<func>' at <file>:<line>: <message>" at construction;
// emit() writes it to stderr.
class Warning {
 private:
  std::string message_;

 public:
  Warning(const std::string& func,
          const std::string& file,
          int line,
          const std::string& message)
      : message_("Warning in function '" + func + "' at " + file + ":" +
                 std::to_string(line) + ": " + message) {}

  // Prints the message plus a newline to std::cerr (std::endl also flushes).
  void emit() const { std::cerr << message_ << std::endl; }
};
112+
113+
} // namespace flashinfer
114+
115+
#endif // FLASHINFER_EXCEPTION_H_
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
2+
#include "indexer_topk.cuh"
3+
4+
#include <cuda_bf16.h>
5+
6+
#include "paddle/extension.h"
7+
8+
#include "paddle/phi/api/ext/op_meta_info.h"
9+
#include "paddle/utils/optional.h"
10+
11+
#include "append_attn/mem_util.cuh"
12+
#include "append_attn/mma_tensor_op.cuh"
13+
#include "append_attn/utils.cuh"
14+
#include "helper.h"
15+
16+
// using namespace flashinfer;
17+
#ifndef PD_BUILD_STATIC_OP
18+
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
19+
#endif
20+
21+
// Dtype-dispatch shim for radix_topk_ragged_transform: maps the compile-time
// paddle::DataType tag `T` to its device element type and forwards every
// argument unchanged to flashinfer::sampling::TopKRaggedTransformDispatch.
// Returns the cudaError_t reported by that dispatch (caller is responsible
// for checking it).
template <paddle::DataType T>
cudaError_t DispatchTopK(paddle::Tensor& input,
                         paddle::Tensor& output_indices,
                         const paddle::Tensor& offsets,
                         paddle::Tensor& lengths,
                         uint32_t num_rows,
                         const int32_t* seq_len_decoder,
                         const int32_t* batch_id_per_token,
                         uint32_t top_k,
                         uint32_t q_num_heads,
                         uint32_t max_len,
                         flashinfer::sampling::RadixRowState* row_states_ptr,
                         cudaStream_t stream) {
  // PDTraits maps the dtype tag to both the device-side element type
  // (DataType_) and the host-side paddle storage type (data_t).
  typedef PDTraits<T> traits_;
  typedef typename traits_::DataType DataType_;
  typedef typename traits_::data_t data_t;

  cudaError_t status;
  status =
      flashinfer::sampling::TopKRaggedTransformDispatch<DataType_, int32_t>(
          reinterpret_cast<DataType_*>(input.data<data_t>()),
          static_cast<int32_t*>(output_indices.data<int32_t>()),
          static_cast<const int32_t*>(offsets.data<int32_t>()),
          static_cast<int32_t*>(lengths.data<int32_t>()),
          num_rows,
          seq_len_decoder,
          batch_id_per_token,
          static_cast<uint32_t>(top_k),
          static_cast<uint32_t>(q_num_heads),
          max_len,
          row_states_ptr,
          stream);
  return status;
}
55+
56+
// Custom-op entry point for radix_topk_ragged_transform.
//
// For each ragged row of `input` (shape (num_rows, max_len); bfloat16 or
// float32), selects the top-k entries and writes the results into the
// caller-provided `output_indices` / `lengths` tensors in place.
// `seq_len_decoder` / `batch_id_per_token` are optional int32 decode-phase
// metadata; `maybe_row_states_buffer` is an optional raw-byte scratch buffer
// reinterpreted as per-row RadixRowState.
// Throws (via PD_THROW / PD_CHECK) on unsupported dtypes or CUDA launch
// failure.
void RadixTopkRaggedTransform(
    paddle::Tensor& input,
    paddle::Tensor& output_indices,
    const paddle::Tensor& offsets,
    paddle::Tensor& lengths,
    paddle::optional<paddle::Tensor>& seq_len_decoder,
    paddle::optional<paddle::Tensor>& batch_id_per_token,
    paddle::optional<paddle::Tensor>& maybe_row_states_buffer,
    int top_k,
    int q_num_heads = 0) {
  const unsigned int num_rows = input.dims()[0];  // input: (num_rows, max_len)
  const unsigned int max_len = input.dims()[1];

  // BUGFIX: the stream was previously cached in a function-local `static`,
  // which pinned every subsequent call to the stream of the *first*
  // invocation — a wrong-stream launch (and potential race) whenever a later
  // input lives on a different stream. Read it fresh on each call.
  cudaStream_t stream = input.stream();
  const auto input_dtype = input.dtype();

  // Optional reusable per-row radix state (opaque byte buffer supplied by
  // the caller, reinterpreted as RadixRowState).
  flashinfer::sampling::RadixRowState* row_states_ptr = nullptr;
  if (maybe_row_states_buffer) {
    auto& row_states_tensor = maybe_row_states_buffer.get();
    row_states_ptr = reinterpret_cast<flashinfer::sampling::RadixRowState*>(
        row_states_tensor.data<uint8_t>());
  }

  // Optional decode-phase metadata pointers (null when absent).
  const int32_t* seq_len_ptr = nullptr;
  if (seq_len_decoder) {
    seq_len_ptr =
        static_cast<const int32_t*>(seq_len_decoder.get().data<int32_t>());
  }
  const int32_t* batch_id_per_token_ptr = nullptr;
  if (batch_id_per_token) {
    batch_id_per_token_ptr =
        static_cast<const int32_t*>(batch_id_per_token.get().data<int32_t>());
  }

  cudaError_t status = cudaSuccess;
  if (input_dtype == paddle::DataType::BFLOAT16) {
    status = DispatchTopK<paddle::DataType::BFLOAT16>(input,
                                                      output_indices,
                                                      offsets,
                                                      lengths,
                                                      num_rows,
                                                      seq_len_ptr,
                                                      batch_id_per_token_ptr,
                                                      top_k,
                                                      q_num_heads,
                                                      max_len,
                                                      row_states_ptr,
                                                      stream);
  } else if (input_dtype == paddle::DataType::FLOAT32) {
    status = DispatchTopK<paddle::DataType::FLOAT32>(input,
                                                     output_indices,
                                                     offsets,
                                                     lengths,
                                                     num_rows,
                                                     seq_len_ptr,
                                                     batch_id_per_token_ptr,
                                                     top_k,
                                                     q_num_heads,
                                                     max_len,
                                                     row_states_ptr,
                                                     stream);
  } else {
    // BUGFIX: unsupported dtypes previously fell through silently (leaving
    // `status` uninitialized and the outputs untouched); fail loudly instead.
    PD_THROW(
        "radix_topk_ragged_transform only supports bfloat16/float32 input.");
  }
  // BUGFIX: the kernel launch status was previously discarded.
  PD_CHECK(status == cudaSuccess,
           "radix_topk_ragged_transform dispatch failed: ",
           cudaGetErrorString(status));
}
134+
135+
// Registers the op with paddle's custom-op machinery. `output_indices` and
// `lengths` are declared as *inputs* even though the kernel writes them in
// place (the C++ function takes them by non-const reference), so the op
// declares no outputs. The Attrs order must match the trailing scalar
// parameters of RadixTopkRaggedTransform (top_k, q_num_heads).
PD_BUILD_STATIC_OP(radix_topk_ragged_transform)
    .Inputs({"input",
             "output_indices",
             "offsets",
             "lengths",
             paddle::Optional("seq_len_decoder"),
             paddle::Optional("batch_id_per_token"),
             paddle::Optional("maybe_row_states_buffer")})
    .Attrs({"top_k : int", "q_num_heads : int"})
    .SetKernelFn(PD_KERNEL(RadixTopkRaggedTransform));

0 commit comments

Comments
 (0)