Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions runtime/onert/backend/cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
nnfw_find_package(Ruy REQUIRED)

file(GLOB SOURCES "*.cc")
list(APPEND SOURCES ops/GGMLHelper.cc ops/OperationUtils.cc)
list(APPEND SOURCES ops/OperationUtils.cc)
macro(OP NAME)
list(APPEND SOURCES ops/${NAME}Layer.cc)
endmacro(OP)
Expand All @@ -20,8 +20,6 @@ target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE nnfw_coverage)
target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE ruy)
target_link_libraries(${LIB_ONERT_BACKEND_CPU} INTERFACE ruy_instrumentation)
target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE ndarray)
# Set public: ExternalContext is used in train backend
target_link_libraries(${LIB_ONERT_BACKEND_CPU} PUBLIC ggml)

set_target_properties(${LIB_ONERT_BACKEND_CPU} PROPERTIES
OUTPUT_NAME backend_cpu
Expand Down
7 changes: 0 additions & 7 deletions runtime/onert/backend/cpu/ExternalContext.cc
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,4 @@ void ExternalContext::setMaxNumThreads(int max_num_threads)
_ruy_context->set_max_num_threads(_max_num_threads);
}

// Lazily creates the process-wide GGML context held by this ExternalContext.
// Idempotent: repeated calls after the first are no-ops.
// The context is created with mem_size = 0 / mem_buffer = nullptr and
// no_alloc = true — presumably so ggml only tracks tensor metadata while
// onert owns the actual buffers; confirm against ggml_init documentation.
// Ownership: the unique_ptr uses ggml_free as its deleter, so the context
// is released automatically when ExternalContext is destroyed.
void ExternalContext::initGgmlContext()
{
if (_ggml_context == nullptr)
_ggml_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>(
ggml_init({.mem_size = 0, .mem_buffer = nullptr, .no_alloc = true}), &ggml_free);
}

} // namespace onert::backend::cpu
4 changes: 0 additions & 4 deletions runtime/onert/backend/cpu/ExternalContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

#include <util/ConfigSource.h>
#include <ruy/context.h>
#include <ggml.h>

#include <memory>

Expand All @@ -36,14 +35,11 @@ class ExternalContext

int32_t maxNumThreads() const { return _max_num_threads; }

void initGgmlContext();

ruy::Context *ruy_context() const { return _ruy_context.get(); }

private:
int32_t _max_num_threads;
const std::unique_ptr<ruy::Context> _ruy_context;
std::unique_ptr<ggml_context, decltype(&ggml_free)> _ggml_context{nullptr, &ggml_free};
};

} // namespace onert::backend::cpu
Expand Down
59 changes: 15 additions & 44 deletions runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

#include "FullyConnectedLayer.h"

#include "GGMLHelper.h"
#include "../Tensor.h"
#include "../KernelGenerator.h"
#include "../Validator.h"
Expand All @@ -28,7 +27,21 @@
namespace onert::backend::cpu
{

void Validator::visit(const ir::operation::FullyConnected &) { _supported = true; }
void Validator::visit(const ir::operation::FullyConnected &node)
{
  using ir::operation::FullyConnected;

  // GGML block-quantized weights (Q4_0 / Q8_0) are not handled by this
  // backend; every other weight type is accepted.
  const auto &weight_operand =
    _graph.operands().at(node.getInputs().at(FullyConnected::Input::WEIGHT));

  const auto weight_type = weight_operand.typeInfo().type();
  const bool is_ggml_quantized = weight_type == ir::DataType::QUANT_GGML_Q4_0 ||
                                 weight_type == ir::DataType::QUANT_GGML_Q8_0;

  _supported = !is_ggml_quantized;
}

void KernelGenerator::visit(const ir::operation::FullyConnected &node)
{
Expand Down Expand Up @@ -206,39 +219,6 @@ void FullyConnectedLayer::fullyConnectedSparseWeight()
throw std::runtime_error{"FullyConnected: unsupported sparsity"};
}

// Runs FullyConnected with GGML block-quantized weights by building a
// one-node ggml compute graph (GGML_OP_MUL_MAT) and executing it on the
// ggml CPU backend.
// Preconditions: no bias tensor (throws otherwise); the external context's
// ggml context must already be initialized by configure().
// Throws std::runtime_error when a bias is present.
void FullyConnectedLayer::fullyConnectedGGMLWeight()
{
if (_bias)
throw std::runtime_error{"FullyConnected: GGML weights format does not support bias yet."};

// convert tensor — wrap onert tensors as ggml_tensor views (no data copy,
// per the getGGMLTensor helper naming; confirm in GGMLHelper.cc)
auto input = getGGMLTensor(_input);
auto weights = getGGMLTensor(_weights);
auto output = getGGMLTensor(_output);
{
// output = weights x input; ggml MUL_MAT takes the (possibly quantized)
// matrix as src[0] and the activation as src[1]
output.op = GGML_OP_MUL_MAT;
output.src[0] = &weights;
output.src[1] = &input;
}
auto *nodes = &output;

// create graph — a minimal single-node cgraph built by hand instead of via
// ggml's graph builder, so no ggml allocations are needed here
struct ggml_cgraph graph;
{
memset(&graph, 0, sizeof(graph));
graph.n_nodes = 1;
graph.nodes = &nodes;
}

// get cplan — ggml sizes the scratch buffer it needs; we provide it from a
// stack-scoped vector that lives until compute finishes
auto cplan = ggml_graph_plan(&graph, _external_context->maxNumThreads());
std::vector<uint8_t> buf(cplan.work_size);
cplan.work_data = buf.data();

// compute — blocking call; result is written into _output's buffer via the
// wrapped tensor
ggml_graph_compute(&graph, &cplan);
}

void FullyConnectedLayer::fullyConnected16x1Float32()
{
#if defined(__aarch64__) && defined(USE_NEON)
Expand Down Expand Up @@ -279,10 +259,6 @@ void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortabl
}
#endif
_external_context = external_context;

if (_weights->data_type() == OperandType::QUANT_GGML_Q4_0 ||
_weights->data_type() == OperandType::QUANT_GGML_Q8_0)
_external_context->initGgmlContext();
}

void FullyConnectedLayer::run()
Expand All @@ -295,11 +271,6 @@ void FullyConnectedLayer::run()
{
fullyConnectedSparseWeight();
}
else if (_weights->data_type() == OperandType::QUANT_GGML_Q4_0 ||
_weights->data_type() == OperandType::QUANT_GGML_Q8_0)
{
fullyConnectedGGMLWeight();
}
else if (_input->data_type() == OperandType::FLOAT32)
{
_is_shuffled16x1float32 ? fullyConnected16x1Float32() : fullyConnectedFloat32();
Expand Down
74 changes: 16 additions & 58 deletions runtime/onert/backend/cpu/ops/GatherLayer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
#include "GatherLayer.h"

#include "OperationUtils.h"
#include "GGMLHelper.h"
#include "../KernelGenerator.h"
#include "../Validator.h"

Expand All @@ -26,7 +25,20 @@
namespace onert::backend::cpu
{

void Validator::visit(const ir::operation::Gather &) { _supported = true; }
void Validator::visit(const ir::operation::Gather &node)
{
  using ir::operation::Gather;

  // Gather on a GGML Q4_0 block-quantized input is not supported by this
  // backend; all other input types are accepted.
  const auto &input_operand =
    _graph.operands().at(node.getInputs().at(Gather::Input::INPUT));

  _supported = input_operand.typeInfo().type() != ir::DataType::QUANT_GGML_Q4_0;
}

void KernelGenerator::visit(const ir::operation::Gather &node)
{
Expand All @@ -43,7 +55,7 @@ void KernelGenerator::visit(const ir::operation::Gather &node)

auto fn = std::make_unique<ops::GatherLayer>();

fn->configure(input_tensor, indices_tensor, output_tensor, axis, _external_context.get());
fn->configure(input_tensor, indices_tensor, output_tensor, axis);

_return_fn = std::move(fn);
}
Expand All @@ -54,16 +66,12 @@ namespace onert::backend::cpu::ops
{

void GatherLayer::configure(const IPortableTensor *input, const IPortableTensor *indices,
IPortableTensor *output, int32_t axis, ExternalContext *ctx)
IPortableTensor *output, int32_t axis)
{
_input = input;
_indices = indices;
_axis = axis;
_output = output;
_ctx = ctx;

if (_input->data_type() == OperandType::QUANT_GGML_Q4_0)
ctx->initGgmlContext();
}

template <typename InputType> void GatherLayer::runByInputType()
Expand Down Expand Up @@ -97,53 +105,6 @@ template <typename InputType> void GatherLayer::runByInputType()
}
}

// Runs Gather on a GGML block-quantized input by building a one-node ggml
// compute graph (GGML_OP_GET_ROWS) and executing it on the ggml CPU backend.
// Throws std::runtime_error when the supporting conditions below are not met.
void GatherLayer::runByGGMLQuantInputType()
{
// Supporting condition
// Input: rank 2
// Indices: rank < 4, or rank 4 with dim(0) = 1; INT32
// Axis: 0
if (getShape(_input).DimensionsCount() != 2)
throw std::runtime_error("Gather: block quantized input tensor must be rank 2");

if (getShape(_indices).DimensionsCount() >= 4 &&
(getShape(_indices).DimensionsCount() != 4 || getShape(_indices).Dims(0) != 1))
throw std::runtime_error("Gather: invalid indices tensor shape");

if (_indices->data_type() != ir::DataType::INT32)
throw std::runtime_error("Gather: indices tensor must be int32 type");

if (_axis != 0)
throw std::runtime_error("Gather: axis must be 0");

// convert tensor — wrap onert tensors as ggml_tensor views
auto input = getGGMLTensor(_input);
auto indices = getGGMLTensor(_indices);
auto output = getGGMLTensor(_output);
{
// output = rows of `input` selected by `indices` (GET_ROWS ≙ gather on axis 0)
output.op = GGML_OP_GET_ROWS;
output.src[0] = &input;
output.src[1] = &indices;
}
auto *nodes = &output;

// create graph — minimal hand-built single-node cgraph; zero-init then set
// only the fields ggml reads
struct ggml_cgraph graph;
{
memset(&graph, 0, sizeof(graph));
graph.n_nodes = 1;
graph.nodes = &nodes;
}

// get cplan — ggml reports the work-buffer size it needs; supply it from a
// local vector that outlives the compute call
auto cplan = ggml_graph_plan(&graph, _ctx->maxNumThreads());
std::vector<uint8_t> buf(cplan.work_size);
cplan.work_data = buf.data();

// compute — blocking; writes the gathered rows into _output's buffer
ggml_graph_compute(&graph, &cplan);
}

void GatherLayer::run()
{
switch (_input->data_type())
Expand All @@ -157,9 +118,6 @@ void GatherLayer::run()
case OperandType::INT32:
runByInputType<int32_t>();
break;
case OperandType::QUANT_GGML_Q4_0:
runByGGMLQuantInputType();
break;
case OperandType::BOOL8:
runByInputType<bool>();
break;
Expand Down
8 changes: 2 additions & 6 deletions runtime/onert/backend/cpu/ops/GatherLayer.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@
#ifndef __ONERT_BACKEND_CPU_OPS_GATHERLAYER_H__
#define __ONERT_BACKEND_CPU_OPS_GATHERLAYER_H__

#include "../ExternalContext.h"

#include <backend/IPortableTensor.h>

#include <exec/IFunction.h>
Expand All @@ -29,28 +27,26 @@ namespace onert::backend::cpu::ops
class GatherLayer : public ::onert::exec::IFunction
{
public:
GatherLayer() : _input{nullptr}, _indices{nullptr}, _output{nullptr}, _axis{-1}, _ctx{nullptr}
GatherLayer() : _input{nullptr}, _indices{nullptr}, _output{nullptr}, _axis{-1}
{
// DO NOTHING
}

public:
void configure(const IPortableTensor *input, const IPortableTensor *indices,
IPortableTensor *output, int32_t axis, ExternalContext *ctx);
IPortableTensor *output, int32_t axis);

void run() override;

private:
template <typename OpType> void runByInputType();
void runByGGMLQuantInputType();

private:
const IPortableTensor *_input;
const IPortableTensor *_indices;
IPortableTensor *_output;

int32_t _axis;
ExternalContext *_ctx;
};

} // namespace onert::backend::cpu::ops
Expand Down
1 change: 1 addition & 0 deletions runtime/onert/backend/ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
set(LIB_ONERT_BACKEND_GGML onert_backend_ggml)

file(GLOB SOURCES "*.cc")
list(APPEND SOURCES ops/GGMLHelper.cc)
macro(OP NAME)
list(APPEND SOURCES ops/${NAME}Layer.cc)
endmacro(OP)
Expand Down
2 changes: 2 additions & 0 deletions runtime/onert/backend/ggml/Operation.lst
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
OP(FullyConnected)
OP(Gather)
Loading