Merge pull request #629 from yuerqiqi/feat/concat-slice

chenghuaWang · web-flow · commit d1615d271844 · 2026-02-03T21:20:39.000+08:00
[Ascend] Implement Concat and Slice operators
diff --git a/mllm/backends/ascend/AscendBackend.cpp b/mllm/backends/ascend/AscendBackend.cpp
@@ -14,12 +14,15 @@
 #include "mllm/backends/ascend/ops/AscendViewOp.hpp"
 #include "mllm/backends/ascend/ops/AscendMatMulOp.hpp"
 #include "mllm/backends/ascend/ops/AscendSoftmaxOp.hpp"
+#include "mllm/backends/ascend/ops/AscendConcatOp.hpp"
+#include "mllm/backends/ascend/ops/AscendSliceOp.hpp"
 
 namespace mllm::ascend {
 
 AscendBackend::AscendBackend() : Backend(kAscend, createAscendAllocator()) {
  regOpFactory<AscendAddOpFactory,AscendSubOpFactory,AscendMulOpFactory,AscendX2XOpFactory,AscendSiLUOpFactory,
-              AscendLinearOpFactory,AscendRMSNormOpFactory,AscendViewOpFactory,AscendMatMulOpFactory,AscendSoftmaxOpFactory>();
+              AscendLinearOpFactory,AscendRMSNormOpFactory,AscendViewOpFactory,AscendMatMulOpFactory,AscendSoftmaxOpFactory,
+              AscendConcatOpFactory, AscendSliceOpFactory>();
   auto& devices = AscendDeviceMetaInfo::instance().devices;
   for (const auto& device : devices) {
     const auto bytes_to_mb = [](size_t bytes) { return bytes / (1024.0 * 1024.0); };
diff --git a/mllm/backends/ascend/ops/AscendConcatOp.cpp b/mllm/backends/ascend/ops/AscendConcatOp.cpp
@@ -0,0 +1,129 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#include "mllm/backends/ascend/ops/AscendConcatOp.hpp"
+
+#include <iostream>
+#include <acl/acl.h>
+#include <atb/atb_infer.h>
+#include <atb/types.h>
+#include <atb/utils.h>
+#include <atb/infer_op_params.h>
+
+#include "mllm/utils/Common.hpp"
+#include "mllm/core/DataTypes.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp"
+#include "mllm/backends/ascend/AscendCommon.hpp"
+
+namespace mllm::ascend {
+
+// Hands the options to the aops::ConcatOp base; the constructor body itself does nothing.
+AscendConcatOp::AscendConcatOp(const aops::ConcatOpOptions& options)
+    : aops::ConcatOp(options) {
+}
+
+// Setup hook: delegates to BaseOp::setup without adding Ascend-specific work.
+void AscendConcatOp::setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  this->BaseOp::setup(inputs, outputs);
+}
+
+/**
+ * Concatenates `inputs` along `options().dim` and writes the result into `outputs[0]`.
+ *
+ * - One input: the data is copied to the output with a device-to-device aclrtMemcpy
+ *   (skipped when source and destination pointers are identical).
+ * - Several inputs: they are folded left to right with the two-input ATB Concat operation.
+ *   Every step but the last writes into a freshly allocated intermediate tensor; the last
+ *   step writes into `outputs[0]`.
+ *
+ * A negative `dim` counts from the last axis of `inputs[0]`. ATB failures and unusable
+ * workspaces are reported through MLLM_ERROR_EXIT.
+ */
+void AscendConcatOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  MLLM_RT_ASSERT(inputs.size() >= 1);
+  MLLM_RT_ASSERT_EQ(outputs.size(), 1);
+
+  if (inputs.size() == 1) {
+    const size_t data_size = inputs[0].bytes();
+    const void* src_data = inputs[0].ptr<void>();
+    void* dst_data = outputs[0].ptr<void>();
+
+    // Nothing to do when the output already aliases the input buffer.
+    if (src_data != dst_data) {
+      auto ret = aclrtMemcpy(dst_data, data_size, src_data, data_size, ACL_MEMCPY_DEVICE_TO_DEVICE);
+      if (ret != ACL_SUCCESS) {
+        MLLM_ACL_CHECK(ret);
+      }
+      syncGlobalAtbStream();
+    }
+    return;
+  }
+
+  const int32_t rank = static_cast<int32_t>(inputs[0].rank());
+  int32_t concat_dim = options().dim;
+  if (concat_dim < 0) {
+    concat_dim += rank;
+  }
+  // Reject out-of-range axes before `concat_dim` is used to index shape vectors below.
+  MLLM_RT_ASSERT(concat_dim >= 0 && concat_dim < rank);
+
+  // Runs one two-input ATB Concat: out = concat(left, right) along `concat_dim`.
+  auto run_concat = [&](const Tensor& left, const Tensor& right, Tensor& out) {
+    atb::infer::ConcatParam param;
+    param.concatDim = concat_dim;
+
+    atb::Operation* op = nullptr;
+    auto st = atb::CreateOperation(param, &op);
+    if (st != atb::NO_ERROR || op == nullptr) {
+      MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(Concat) failed, status={}", static_cast<int>(st));
+    }
+
+    atb::Context* atb_ctx = getGlobalAtbContext();
+
+    atb::Tensor atb_left;
+    atb::Tensor atb_right;
+    atb::Tensor atb_out;
+    fillAtbTensor(left, atb_left);
+    fillAtbTensor(right, atb_right);
+    fillAtbTensor(out, atb_out);
+
+    // Fill the variant pack directly instead of building SVectors and copying them in.
+    atb::VariantPack vp;
+    vp.inTensors.push_back(atb_left);
+    vp.inTensors.push_back(atb_right);
+    vp.outTensors.push_back(atb_out);
+
+    uint64_t workspaceSize = 0;
+    st = op->Setup(vp, workspaceSize, atb_ctx);
+    if (st != atb::NO_ERROR) {
+      MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB ConcatOp Setup failed, status={}", static_cast<int>(st));
+    }
+
+    void* workspace = nullptr;
+    int workspace_block_id = -1;
+    if (workspaceSize > 0) {
+      // The block allocator is called with a 32-bit size; refuse requests that would be
+      // silently truncated and leave ATB with a workspace smaller than it asked for.
+      const auto workspace_bytes = static_cast<uint32_t>(workspaceSize);
+      if (static_cast<uint64_t>(workspace_bytes) != workspaceSize) {
+        MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB ConcatOp workspace too large, size={}", workspaceSize);
+      }
+
+      auto& mem_mgr = getAscendMemoryManager();
+      mem_mgr.allocateBlock(workspace_bytes, workspace_block_id);
+      mem_mgr.getBlockPtr(workspace_block_id, workspace);
+      if (workspace == nullptr) {
+        MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB ConcatOp workspace allocation failed, size={}", workspaceSize);
+      }
+    }
+
+    {
+      ASCEND_TIME_SCOPE("AscendConcatOp::forward");
+      st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
+    }
+
+    if (st != atb::NO_ERROR) {
+      MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB ConcatOp Execute failed, status={}", static_cast<int>(st));
+    }
+
+    // Synchronise before the workspace block is handed back to the memory manager.
+    syncGlobalAtbStream();
+
+    if (workspace_block_id != -1) {
+      auto& mem_mgr = getAscendMemoryManager();
+      mem_mgr.freeBlock(workspace_block_id);
+    }
+
+    atb::DestroyOperation(op);
+  };
+
+  // `current` holds the running concatenation; `current_shape` tracks its shape.
+  std::vector<int32_t> current_shape = inputs[0].shape();
+  Tensor current = inputs[0];
+
+  for (size_t i = 1; i < inputs.size(); ++i) {
+    current_shape[concat_dim] += inputs[i].shape()[concat_dim];
+
+    if (i == inputs.size() - 1) {
+      // Final step writes straight into the caller-provided output.
+      run_concat(current, inputs[i], outputs[0]);
+    } else {
+      Tensor temp = Tensor::empty(current_shape, outputs[0].dtype(), outputs[0].device()).alloc();
+      run_concat(current, inputs[i], temp);
+      current = temp;
+    }
+  }
+}
+
+}  // namespace mllm::ascend
diff --git a/mllm/backends/ascend/ops/AscendConcatOp.hpp b/mllm/backends/ascend/ops/AscendConcatOp.hpp
@@ -0,0 +1,27 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/core/BaseOp.hpp"
+#include "mllm/core/aops/ConcatOp.hpp"
+#include "mllm/core/OpTypes.hpp"
+
+namespace mllm::ascend {
+
+/// Concat operator for the Ascend backend, derived from aops::ConcatOp.
+class AscendConcatOp final : public aops::ConcatOp {
+ public:
+  /// Builds the op from the given concat options.
+  explicit AscendConcatOp(const aops::ConcatOpOptions& options);
+
+  /// Setup hook invoked with the op's input and output tensors.
+  void setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+
+  /// Performs the concatenation of `inputs` into `outputs`.
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+};
+
+/// Factory for OpTypes::kConcat that produces AscendConcatOp instances.
+class AscendConcatOpFactory final : public TypedOpFactory<OpTypes::kConcat, aops::ConcatOpOptions> {
+ public:
+  /// Creates a new AscendConcatOp from `options` and returns it as a BaseOp pointer.
+  std::shared_ptr<BaseOp> createOpImpl(const aops::ConcatOpOptions& options) override {
+    std::shared_ptr<BaseOp> created = std::make_shared<AscendConcatOp>(options);
+    return created;
+  }
+};
+
+}  // namespace mllm::ascend
diff --git a/mllm/backends/ascend/ops/AscendSliceOp.cpp b/mllm/backends/ascend/ops/AscendSliceOp.cpp
@@ -0,0 +1,136 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#include "mllm/backends/ascend/ops/AscendSliceOp.hpp"
+
+#include <acl/acl.h>
+#include <atb/atb_infer.h>
+#include <atb/types.h>
+#include <atb/utils.h>
+#include <atb/infer_op_params.h>
+
+#include "mllm/utils/Common.hpp"
+#include "mllm/core/DataTypes.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/backends/ascend/memory/AscendMemoryManager.hpp"
+#include "mllm/backends/ascend/AscendCommon.hpp"
+
+namespace mllm::ascend {
+
+// Hands the options to the aops::SliceOp base; the constructor body itself does nothing.
+AscendSliceOp::AscendSliceOp(const aops::SliceOpOptions& options)
+    : aops::SliceOp(options) {
+}
+
+// Setup hook: delegates to BaseOp::setup without adding Ascend-specific work.
+void AscendSliceOp::setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  this->BaseOp::setup(inputs, outputs);
+}
+
+/**
+ * Derives the sliced shape from `inputs[0]` and the configured index pairs, then appends
+ * a tensor of that shape (created with Tensor::empty, same dtype and device as the input)
+ * to `outputs`.
+ *
+ * Per axis: kAll selects the full extent, negative bounds are offset by the axis length,
+ * both bounds are clamped to [0, axis length], and the length is max(0, end - start).
+ */
+void AscendSliceOp::reshape(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  auto& src = inputs[0];
+  auto dims = src.shape();
+  auto ranges = options().indices_;
+
+  // One (start, end) pair is required for every axis of the input.
+  MLLM_RT_ASSERT_EQ(ranges.size(), dims.size());
+
+  std::vector<int> sliced_dims;
+  for (size_t axis = 0; axis < dims.size(); ++axis) {
+    const int extent = static_cast<int>(dims[axis]);
+    int32_t begin = ranges[axis].start_;
+    int32_t finish = ranges[axis].end_;
+
+    // Resolve the kAll sentinel first, then wrap negative bounds around the axis length.
+    begin = (begin == kAll) ? 0 : begin;
+    finish = (finish == kAll) ? extent : finish;
+    begin = (begin < 0) ? begin + extent : begin;
+    finish = (finish < 0) ? finish + extent : finish;
+
+    // Clamp both bounds into [0, extent].
+    begin = std::max(0, std::min(begin, extent));
+    finish = std::max(0, std::min(finish, extent));
+
+    // An inverted range yields an empty axis rather than a negative length.
+    sliced_dims.push_back(std::max(0, finish - begin));
+  }
+
+  outputs.emplace_back(Tensor::empty(sliced_dims, src.dtype(), src.device()));
+}
+
+/**
+ * Executes the slice with the ATB Slice operation and writes the result into `outputs[0]`.
+ *
+ * For every axis of `inputs[0]` the configured (start, end) pair is resolved the same way
+ * as in reshape(): kAll selects the full axis, negative bounds are offset by the axis
+ * length, and both bounds are clamped to [0, axis length]. ATB receives the resolved start
+ * as the offset and max(0, end - start) as the size.
+ *
+ * ATB failures and unusable workspaces are reported through MLLM_ERROR_EXIT.
+ */
+void AscendSliceOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
+  // Guard the unchecked `inputs[0]` / `outputs[0]` / `slice_index[i]` accesses below.
+  MLLM_RT_ASSERT_EQ(inputs.empty(), false);
+  MLLM_RT_ASSERT_EQ(outputs.size(), 1);
+
+  const auto& input = inputs[0];
+  const auto& shape = input.shape();
+  const auto& slice_index = options().indices_;
+  MLLM_RT_ASSERT_EQ(slice_index.size(), shape.size());
+
+  atb::infer::SliceParam param;
+  for (size_t i = 0; i < shape.size(); ++i) {
+    const int32_t dim_size = shape[i];
+    int32_t start = slice_index[i].start_;
+    int32_t end = slice_index[i].end_;
+
+    // Resolve the kAll sentinel before treating negative values as offsets from the end.
+    if (start == kAll) { start = 0; }
+    if (end == kAll) { end = dim_size; }
+
+    if (start < 0) { start += dim_size; }
+    if (end < 0) { end += dim_size; }
+
+    start = std::max(0, std::min(start, dim_size));
+    end = std::max(0, std::min(end, dim_size));
+
+    param.offsets.push_back(start);
+    param.size.push_back(std::max(0, end - start));
+  }
+
+  atb::Operation* op = nullptr;
+  auto st = atb::CreateOperation(param, &op);
+  if (st != atb::NO_ERROR || op == nullptr) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB CreateOperation(Slice) failed, status={}", static_cast<int>(st));
+  }
+
+  atb::Context* atb_ctx = getGlobalAtbContext();
+
+  // Fill the variant pack directly instead of staging tensors in temporary containers.
+  atb::VariantPack vp;
+  for (const auto& in : inputs) {
+    atb::Tensor atb_input;
+    fillAtbTensor(in, atb_input);
+    vp.inTensors.push_back(atb_input);
+  }
+
+  atb::Tensor atb_output;
+  fillAtbTensor(outputs[0], atb_output);
+  vp.outTensors.push_back(atb_output);
+
+  uint64_t workspaceSize = 0;
+  st = op->Setup(vp, workspaceSize, atb_ctx);
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SliceOp Setup failed, status={}", static_cast<int>(st));
+  }
+
+  void* workspace = nullptr;
+  int workspace_block_id = -1;
+  if (workspaceSize > 0) {
+    // The block allocator is called with a 32-bit size; refuse requests that would be
+    // silently truncated and leave ATB with a workspace smaller than it asked for.
+    const auto workspace_bytes = static_cast<uint32_t>(workspaceSize);
+    if (static_cast<uint64_t>(workspace_bytes) != workspaceSize) {
+      MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SliceOp workspace too large, size={}", workspaceSize);
+    }
+
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.allocateBlock(workspace_bytes, workspace_block_id);
+    mem_mgr.getBlockPtr(workspace_block_id, workspace);
+    if (workspace == nullptr) {
+      MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SliceOp workspace allocation failed, size={}", workspaceSize);
+    }
+  }
+
+  {
+    ASCEND_TIME_SCOPE("AscendSliceOp::forward");
+    st = op->Execute(vp, reinterpret_cast<uint8_t*>(workspace), workspaceSize, atb_ctx);
+  }
+
+  if (st != atb::NO_ERROR) {
+    MLLM_ERROR_EXIT(ExitCode::kAscendError, "ATB SliceOp Execute failed, status={}", static_cast<int>(st));
+  }
+
+  // Synchronise before the workspace block is handed back to the memory manager.
+  syncGlobalAtbStream();
+
+  if (workspace_block_id != -1) {
+    auto& mem_mgr = getAscendMemoryManager();
+    mem_mgr.freeBlock(workspace_block_id);
+  }
+
+  atb::DestroyOperation(op);
+}
+
+}  // namespace mllm::ascend
diff --git a/mllm/backends/ascend/ops/AscendSliceOp.hpp b/mllm/backends/ascend/ops/AscendSliceOp.hpp
@@ -0,0 +1,28 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/core/BaseOp.hpp"
+#include "mllm/core/aops/SliceOp.hpp"
+#include "mllm/core/OpTypes.hpp"
+
+namespace mllm::ascend {
+
+/// Slice operator for the Ascend backend, derived from aops::SliceOp.
+class AscendSliceOp final : public aops::SliceOp {
+ public:
+  /// Builds the op from the given slice options.
+  explicit AscendSliceOp(const aops::SliceOpOptions& options);
+
+  /// Setup hook invoked with the op's input and output tensors.
+  void setup(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+
+  /// Computes the sliced output shape and appends the output tensor to `outputs`.
+  void reshape(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+
+  /// Performs the slice of `inputs[0]` into `outputs[0]`.
+  void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
+};
+
+/// Factory for OpTypes::kSlice that produces AscendSliceOp instances.
+class AscendSliceOpFactory final : public TypedOpFactory<OpTypes::kSlice, aops::SliceOpOptions> {
+ public:
+  /// Creates a new AscendSliceOp from `options` and returns it as a BaseOp pointer.
+  std::shared_ptr<BaseOp> createOpImpl(const aops::SliceOpOptions& options) override {
+    std::shared_ptr<BaseOp> created = std::make_shared<AscendSliceOp>(options);
+    return created;
+  }
+};
+
+}  // namespace mllm::ascend
diff --git a/tests/ascend/AscendConcatKernelTest.hpp b/tests/ascend/AscendConcatKernelTest.hpp
@@ -0,0 +1,41 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/mllm.hpp"
+#include "mllm/core/Tensor.hpp"
+#include "mllm/nn/Functional.hpp"
+#include "KernelTestHelper.hpp" // Has KernelTest base class
+
+/// Kernel test fixture that checks the Ascend concat result against the CPU result.
+class AscendConcatKernelTest : public KernelTest {
+ public:
+  /**
+   * Concatenates random float16 tensors of the given shapes along `dim` on the CPU and on
+   * the Ascend device, then compares both results with test::allClose (tolerances 1e-2).
+   *
+   * @param input_shapes One shape per tensor to concatenate.
+   * @param dim          Axis passed to nn::functional::concat.
+   * @return true when the two results are reported as close, false otherwise.
+   */
+  bool ConcatFloat16Test(const std::vector<mllm::Tensor::shape_t>& input_shapes, int dim) {
+    using namespace mllm;
+
+    // Operands are generated on the CPU with Tensor::random using bounds -1.0 and 1.0.
+    std::vector<Tensor> cpu_operands;
+    for (const auto& dims : input_shapes) {
+      cpu_operands.push_back(Tensor::random(dims, -1.0, 1.0, kFloat16, kCPU));
+    }
+
+    // Reference result computed on the CPU.
+    auto reference = nn::functional::concat(cpu_operands, dim);
+
+    // Same operands moved to the Ascend device.
+    std::vector<Tensor> device_operands;
+    for (auto& operand : cpu_operands) {
+      device_operands.push_back(operand.to(kAscend));
+    }
+
+    auto device_result = nn::functional::concat(device_operands, dim);
+    auto fetched = device_result.to(kCPU);
+
+    auto verdict = test::allClose(fetched, reference, 1e-2, 1e-2);
+    if (verdict.is_close) {
+      std::cout << "[ConcatTest] PASSED dim=" << dim << std::endl;
+      return true;
+    }
+
+    std::cout << "[ConcatTest] FAILED! dim=" << dim << std::endl;
+    return false;
+  }
+};
diff --git a/tests/ascend/AscendSliceKernelTest.hpp b/tests/ascend/AscendSliceKernelTest.hpp
diff --git a/tests/ascend/KernelTest.cpp b/tests/ascend/KernelTest.cpp