diff --git a/runtime/onert/backend/cpu/CMakeLists.txt b/runtime/onert/backend/cpu/CMakeLists.txt
index 6fbf85edfe9..6744b5c7e13 100644
--- a/runtime/onert/backend/cpu/CMakeLists.txt
+++ b/runtime/onert/backend/cpu/CMakeLists.txt
@@ -1,7 +1,7 @@
 nnfw_find_package(Ruy REQUIRED)
 
 file(GLOB SOURCES "*.cc")
-list(APPEND SOURCES ops/GGMLHelper.cc ops/OperationUtils.cc)
+list(APPEND SOURCES ops/OperationUtils.cc)
 macro(OP NAME)
   list(APPEND SOURCES ops/${NAME}Layer.cc)
 endmacro(OP)
@@ -20,8 +20,6 @@ target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE nnfw_coverage)
 target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE ruy)
 target_link_libraries(${LIB_ONERT_BACKEND_CPU} INTERFACE ruy_instrumentation)
 target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE ndarray)
-# Set public: ExternalContext is used in train backend
-target_link_libraries(${LIB_ONERT_BACKEND_CPU} PUBLIC ggml)
 
 set_target_properties(${LIB_ONERT_BACKEND_CPU} PROPERTIES
   OUTPUT_NAME backend_cpu
diff --git a/runtime/onert/backend/cpu/ExternalContext.cc b/runtime/onert/backend/cpu/ExternalContext.cc
index 7df60c0f61e..de55970f801 100644
--- a/runtime/onert/backend/cpu/ExternalContext.cc
+++ b/runtime/onert/backend/cpu/ExternalContext.cc
@@ -70,11 +70,4 @@ void ExternalContext::setMaxNumThreads(int max_num_threads)
   _ruy_context->set_max_num_threads(_max_num_threads);
 }
 
-void ExternalContext::initGgmlContext()
-{
-  if (_ggml_context == nullptr)
-    _ggml_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>(
-      ggml_init({.mem_size = 0, .mem_buffer = nullptr, .no_alloc = true}), &ggml_free);
-}
-
 } // namespace onert::backend::cpu
diff --git a/runtime/onert/backend/cpu/ExternalContext.h b/runtime/onert/backend/cpu/ExternalContext.h
index 2f0fa7dcf7c..cdf01e25c6a 100644
--- a/runtime/onert/backend/cpu/ExternalContext.h
+++ b/runtime/onert/backend/cpu/ExternalContext.h
@@ -19,7 +19,6 @@
 
 #include <util/ConfigSource.h>
 #include <ruy/context.h>
-#include <ggml.h>
 
 #include <memory>
 
@@ -36,14 +35,11 @@ class ExternalContext
 
   int32_t maxNumThreads() const { return _max_num_threads; }
 
-  void initGgmlContext();
-
   ruy::Context *ruy_context() const { return _ruy_context.get(); }
 
 private:
   int32_t _max_num_threads;
   const std::unique_ptr<ruy::Context> _ruy_context;
-  std::unique_ptr<ggml_context, decltype(&ggml_free)> _ggml_context{nullptr, &ggml_free};
 };
 
 } // namespace onert::backend::cpu
diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
index 00eddd79202..6898dd4217d 100644
--- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
+++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
@@ -16,7 +16,6 @@
 
 #include "FullyConnectedLayer.h"
 
-#include "GGMLHelper.h"
 #include "../Tensor.h"
 #include "../KernelGenerator.h"
 #include "../Validator.h"
@@ -28,7 +27,21 @@
 namespace onert::backend::cpu
 {
 
-void Validator::visit(const ir::operation::FullyConnected &) { _supported = true; }
+// Accepts FullyConnected on the cpu backend unless the weights operand is GGML
+// block-quantized (QUANT_GGML_Q4_0 / QUANT_GGML_Q8_0).
+void Validator::visit(const ir::operation::FullyConnected &node)
+{
+  using ir::operation::FullyConnected;
+
+  const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
+  const auto weight_type = _graph.operands().at(weight_index).typeInfo().type();
+
+  // The cpu FullyConnectedLayer has no code path for these weight types.
+  const bool is_ggml_quantized = weight_type == ir::DataType::QUANT_GGML_Q4_0 ||
+                                 weight_type == ir::DataType::QUANT_GGML_Q8_0;
+
+  _supported = !is_ggml_quantized;
+}
 
 void KernelGenerator::visit(const ir::operation::FullyConnected &node)
 {
@@ -206,39 +219,6 @@ void FullyConnectedLayer::fullyConnectedSparseWeight()
     throw std::runtime_error{"FullyConnected: unsupported sparsity"};
 }
 
-void FullyConnectedLayer::fullyConnectedGGMLWeight()
-{
-  if (_bias)
-    throw std::runtime_error{"FullyConnected: GGML weights format does not support bias yet."};
-
-  // convert tensor
-  auto input = getGGMLTensor(_input);
-  auto weights = getGGMLTensor(_weights);
-  auto output = getGGMLTensor(_output);
-  {
-    output.op = GGML_OP_MUL_MAT;
-    output.src[0] = &weights;
-    output.src[1] = &input;
-  }
-  auto *nodes = &output;
-
-  // create graph
-  struct ggml_cgraph graph;
-  {
-    memset(&graph, 0, sizeof(graph));
-    graph.n_nodes = 1;
-    graph.nodes = &nodes;
-  }
-
-  // get cplan
-  auto cplan = ggml_graph_plan(&graph, _external_context->maxNumThreads());
-  std::vector<uint8_t> buf(cplan.work_size);
-  cplan.work_data = buf.data();
-
-  // compute
-  ggml_graph_compute(&graph, &cplan);
-}
-
 void FullyConnectedLayer::fullyConnected16x1Float32()
 {
 #if defined(__aarch64__) && defined(USE_NEON)
@@ -279,10 +259,6 @@ void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortabl
   }
 #endif
   _external_context = external_context;
-
-  if (_weights->data_type() == OperandType::QUANT_GGML_Q4_0 ||
-      _weights->data_type() == OperandType::QUANT_GGML_Q8_0)
-    _external_context->initGgmlContext();
 }
 
 void FullyConnectedLayer::run()
@@ -295,11 +271,6 @@ void FullyConnectedLayer::run()
   {
     fullyConnectedSparseWeight();
   }
-  else if (_weights->data_type() == OperandType::QUANT_GGML_Q4_0 ||
-           _weights->data_type() == OperandType::QUANT_GGML_Q8_0)
-  {
-    fullyConnectedGGMLWeight();
-  }
   else if (_input->data_type() == OperandType::FLOAT32)
   {
     _is_shuffled16x1float32 ? fullyConnected16x1Float32() : fullyConnectedFloat32();
diff --git a/runtime/onert/backend/cpu/ops/GatherLayer.cc b/runtime/onert/backend/cpu/ops/GatherLayer.cc
index 2a43fa71147..20f5d673734 100644
--- a/runtime/onert/backend/cpu/ops/GatherLayer.cc
+++ b/runtime/onert/backend/cpu/ops/GatherLayer.cc
@@ -17,7 +17,6 @@
 #include "GatherLayer.h"
 
 #include "OperationUtils.h"
-#include "GGMLHelper.h"
 #include "../KernelGenerator.h"
 #include "../Validator.h"
 
@@ -26,7 +25,20 @@
 namespace onert::backend::cpu
 {
 
-void Validator::visit(const ir::operation::Gather &) { _supported = true; }
+// Accepts Gather on the cpu backend unless the input operand is GGML Q4_0
+// block-quantized.
+void Validator::visit(const ir::operation::Gather &node)
+{
+  using ir::operation::Gather;
+
+  const auto input_index{node.getInputs().at(Gather::Input::INPUT)};
+  const auto input_type = _graph.operands().at(input_index).typeInfo().type();
+
+  // The cpu GatherLayer::run() has no case for this input type.
+  const bool is_ggml_quantized = (input_type == ir::DataType::QUANT_GGML_Q4_0);
+
+  _supported = !is_ggml_quantized;
+}
 
 void KernelGenerator::visit(const ir::operation::Gather &node)
 {
@@ -43,7 +55,7 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
 
   auto fn = std::make_unique<ops::GatherLayer>();
 
-  fn->configure(input_tensor, indices_tensor, output_tensor, axis, _external_context.get());
+  fn->configure(input_tensor, indices_tensor, output_tensor, axis);
 
   _return_fn = std::move(fn);
 }
@@ -54,16 +66,12 @@ namespace onert::backend::cpu::ops
 {
 
 void GatherLayer::configure(const IPortableTensor *input, const IPortableTensor *indices,
-                            IPortableTensor *output, int32_t axis, ExternalContext *ctx)
+                            IPortableTensor *output, int32_t axis)
 {
   _input = input;
   _indices = indices;
   _axis = axis;
   _output = output;
-  _ctx = ctx;
-
-  if (_input->data_type() == OperandType::QUANT_GGML_Q4_0)
-    ctx->initGgmlContext();
 }
 
 template <typename InputType> void GatherLayer::runByInputType()
@@ -97,53 +105,6 @@ template <typename InputType> void GatherLayer::runByInputType()
   }
 }
 
-void GatherLayer::runByGGMLQuantInputType()
-{
-  // Supporting condition
-  // Input: rank 2
-  // Indice: rank < 4 or rank 4 with dim(0) = 1, INT32
-  // Axis: 0
-  if (getShape(_input).DimensionsCount() != 2)
-    throw std::runtime_error("Gather: block quantized input tensor must be rank 2");
-
-  if (getShape(_indices).DimensionsCount() >= 4 &&
-      (getShape(_indices).DimensionsCount() != 4 || getShape(_indices).Dims(0) != 1))
-    throw std::runtime_error("Gather: invalid indices tensor shape");
-
-  if (_indices->data_type() != ir::DataType::INT32)
-    throw std::runtime_error("Gather: indices tensor must be int32 type");
-
-  if (_axis != 0)
-    throw std::runtime_error("Gather: axis must be 0");
-
-  // convert tensor
-  auto input = getGGMLTensor(_input);
-  auto indices = getGGMLTensor(_indices);
-  auto output = getGGMLTensor(_output);
-  {
-    output.op = GGML_OP_GET_ROWS;
-    output.src[0] = &input;
-    output.src[1] = &indices;
-  }
-  auto *nodes = &output;
-
-  // create graph
-  struct ggml_cgraph graph;
-  {
-    memset(&graph, 0, sizeof(graph));
-    graph.n_nodes = 1;
-    graph.nodes = &nodes;
-  }
-
-  // get cplan
-  auto cplan = ggml_graph_plan(&graph, _ctx->maxNumThreads());
-  std::vector<uint8_t> buf(cplan.work_size);
-  cplan.work_data = buf.data();
-
-  // compute
-  ggml_graph_compute(&graph, &cplan);
-}
-
 void GatherLayer::run()
 {
   switch (_input->data_type())
@@ -157,9 +118,6 @@ void GatherLayer::run()
     case OperandType::INT32:
       runByInputType<int32_t>();
       break;
-    case OperandType::QUANT_GGML_Q4_0:
-      runByGGMLQuantInputType();
-      break;
     case OperandType::BOOL8:
       runByInputType<bool>();
       break;
diff --git a/runtime/onert/backend/cpu/ops/GatherLayer.h b/runtime/onert/backend/cpu/ops/GatherLayer.h
index 0761b0a8b5c..dcb7dc2bca6 100644
--- a/runtime/onert/backend/cpu/ops/GatherLayer.h
+++ b/runtime/onert/backend/cpu/ops/GatherLayer.h
@@ -17,8 +17,6 @@
 #ifndef __ONERT_BACKEND_CPU_OPS_GATHERLAYER_H__
 #define __ONERT_BACKEND_CPU_OPS_GATHERLAYER_H__
 
-#include "../ExternalContext.h"
-
 #include <backend/IPortableTensor.h>
 
 #include <exec/IFunction.h>
@@ -29,20 +27,19 @@ namespace onert::backend::cpu::ops
 class GatherLayer : public ::onert::exec::IFunction
 {
 public:
-  GatherLayer() : _input{nullptr}, _indices{nullptr}, _output{nullptr}, _axis{-1}, _ctx{nullptr}
+  GatherLayer() : _input{nullptr}, _indices{nullptr}, _output{nullptr}, _axis{-1}
   {
     // DO NOTHING
   }
 
 public:
   void configure(const IPortableTensor *input, const IPortableTensor *indices,
-                 IPortableTensor *output, int32_t axis, ExternalContext *ctx);
+                 IPortableTensor *output, int32_t axis);
 
   void run() override;
 
 private:
   template <typename OpType> void runByInputType();
-  void runByGGMLQuantInputType();
 
 private:
   const IPortableTensor *_input;
@@ -50,7 +47,6 @@ class GatherLayer : public ::onert::exec::IFunction
   IPortableTensor *_output;
 
   int32_t _axis;
-  ExternalContext *_ctx;
 };
 
 } // namespace onert::backend::cpu::ops
diff --git a/runtime/onert/backend/ggml/CMakeLists.txt b/runtime/onert/backend/ggml/CMakeLists.txt
index dab1412cb94..738490e123c 100644
--- a/runtime/onert/backend/ggml/CMakeLists.txt
+++ b/runtime/onert/backend/ggml/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(LIB_ONERT_BACKEND_GGML onert_backend_ggml)
 
 file(GLOB SOURCES "*.cc")
+list(APPEND SOURCES ops/GGMLHelper.cc)
 macro(OP NAME)
   list(APPEND SOURCES ops/${NAME}Layer.cc)
 endmacro(OP)
diff --git a/runtime/onert/backend/ggml/Operation.lst b/runtime/onert/backend/ggml/Operation.lst
index e69de29bb2d..b0c2fd22507 100644
--- a/runtime/onert/backend/ggml/Operation.lst
+++ b/runtime/onert/backend/ggml/Operation.lst
@@ -0,0 +1,2 @@
+OP(FullyConnected)
+OP(Gather)
diff --git a/runtime/onert/backend/ggml/ops/FullyConnectedLayer.cc b/runtime/onert/backend/ggml/ops/FullyConnectedLayer.cc
new file mode 100644
index 00000000000..656e908a84c
--- /dev/null
+++ b/runtime/onert/backend/ggml/ops/FullyConnectedLayer.cc
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "FullyConnectedLayer.h"
+
+#include "GGMLHelper.h"
+#include "../KernelGenerator.h"
+#include "../Validator.h"
+
+namespace onert::backend::ggml
+{
+
+// Accepts FullyConnected only in the form the ggml kernel can execute:
+// Q4_0/Q8_0 block-quantized weights, no bias and no fused activation.
+void Validator::visit(const ir::operation::FullyConnected &node)
+{
+  using ir::operation::FullyConnected;
+
+  const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
+  const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};
+  const auto weight_type = _graph.operands().at(weight_index).typeInfo().type();
+
+  const bool ggml_weights = weight_type == ir::DataType::QUANT_GGML_Q4_0 ||
+                            weight_type == ir::DataType::QUANT_GGML_Q8_0;
+
+  // FullyConnectedLayer throws at run time when a bias tensor is present, so a
+  // node with a bias must not be reported as supported.
+  _supported =
+    ggml_weights && bias_index.undefined() && node.param().activation == ir::Activation::NONE;
+}
+
+// Creates the ggml FullyConnectedLayer for `node`. Only the default weights
+// format is accepted; an undefined bias operand is passed on as nullptr.
+void KernelGenerator::visit(const ir::operation::FullyConnected &node)
+{
+  using ir::operation::FullyConnected;
+
+  const auto &inputs = node.getInputs();
+  const auto &param = node.param();
+
+  const auto ofm_index{node.getOutputs().at(0)};
+  const auto ifm_index{inputs.at(FullyConnected::Input::INPUT)};
+  const auto ker_index{inputs.at(FullyConnected::Input::WEIGHT)};
+  const auto bias_index{inputs.at(FullyConnected::Input::BIAS)};
+  if (param.weights_format != ir::FullyConnectedWeightsFormat::Default)
+    throw std::runtime_error("Unsupported FullyConnected Weights Format");
+
+  auto ofm = _tensor_reg->getPortableTensor(ofm_index);
+  auto ifm = _tensor_reg->getPortableTensor(ifm_index);
+  auto ker = _tensor_reg->getPortableTensor(ker_index);
+  auto bias = bias_index.undefined() ? nullptr : _tensor_reg->getPortableTensor(bias_index);
+
+  auto layer = std::make_unique<ops::FullyConnectedLayer>();
+  layer->configure(ifm, ker, bias, param.activation, ofm, _external_context);
+  _return_fn = std::move(layer);
+}
+
+} // namespace onert::backend::ggml
+
+namespace onert::backend::ggml::ops
+{
+
+FullyConnectedLayer::FullyConnectedLayer()
+  : _input{nullptr}, _weights{nullptr}, _bias{nullptr}, _output{nullptr},
+    _activation{ir::Activation::NONE}, _external_context{nullptr}
+{
+  // Members stay null/NONE until configure() supplies the real tensors and context.
+}
+
+FullyConnectedLayer::~FullyConnectedLayer() = default;
+
+// Computes the layer by running a one-node GGML graph: MUL_MAT(weights, input).
+// Throws std::runtime_error when a bias tensor was configured.
+void FullyConnectedLayer::fullyConnectedGGMLWeight()
+{
+  if (_bias != nullptr)
+    throw std::runtime_error{"FullyConnected: GGML weights format does not support bias yet."};
+
+  // Convert the onert tensors into ggml_tensor values.
+  auto src_input = getGGMLTensor(_input);
+  auto src_weights = getGGMLTensor(_weights);
+  auto dst = getGGMLTensor(_output);
+
+  // Describe the output as MUL_MAT with weights as src[0] and input as src[1].
+  dst.op = GGML_OP_MUL_MAT;
+  dst.src[0] = &src_weights;
+  dst.src[1] = &src_input;
+
+  // Zero-initialized graph whose node array is the single pointer `node_list`.
+  auto *node_list = &dst;
+  struct ggml_cgraph graph;
+  memset(&graph, 0, sizeof(graph));
+  graph.n_nodes = 1;
+  graph.nodes = &node_list;
+
+  // Plan with the context's thread limit; `work` backs the plan's work_data.
+  auto plan = ggml_graph_plan(&graph, _external_context->maxNumThreads());
+  std::vector<uint8_t> work(plan.work_size);
+  plan.work_data = work.data();
+
+  // `work` and the local tensors must stay alive across this call.
+  ggml_graph_compute(&graph, &plan);
+}
+
+void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortableTensor *weights,
+                                    const IPortableTensor *bias, ir::Activation activation,
+                                    IPortableTensor *output,
+                                    const std::shared_ptr<ExternalContext> &external_context)
+{
+  _output = output;
+  _input = input;
+  _weights = weights; // run() dispatches on this tensor's data type
+  _bias = bias;       // may be nullptr; the kernel throws on a non-null bias
+  _external_context = external_context; // shared ownership; read for maxNumThreads()
+  _activation = activation; // stored only; not read elsewhere in FullyConnectedLayer.cc
+}
+
+void FullyConnectedLayer::run()
+{
+  // Only GGML block-quantized (Q4_0 / Q8_0) weights have a kernel in this layer.
+  const auto weights_type = _weights->data_type();
+  const bool is_ggml_quantized = weights_type == ir::DataType::QUANT_GGML_Q4_0 ||
+                                 weights_type == ir::DataType::QUANT_GGML_Q8_0;
+
+  if (!is_ggml_quantized)
+    throw std::runtime_error{"FullyConnected: unsupported data type"};
+
+  fullyConnectedGGMLWeight();
+}
+
+void FullyConnectedLayer::prepare()
+{
+  // Intentionally empty: this layer does no work ahead of run().
+}
+
+} // namespace onert::backend::ggml::ops
diff --git a/runtime/onert/backend/ggml/ops/FullyConnectedLayer.h b/runtime/onert/backend/ggml/ops/FullyConnectedLayer.h
new file mode 100644
index 00000000000..18e12aaeb07
--- /dev/null
+++ b/runtime/onert/backend/ggml/ops/FullyConnectedLayer.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GGML_OPS_FULLYCONNECTEDLAYER_H__
+#define __ONERT_BACKEND_GGML_OPS_FULLYCONNECTEDLAYER_H__
+
+#include "../ExternalContext.h"
+
+#include <backend/IPortableTensor.h>
+#include <exec/IFunction.h>
+#include <ir/InternalType.h>
+
+namespace onert::backend::ggml::ops
+{
+
+// exec::IFunction that evaluates FullyConnected through GGML (see run()).
+class FullyConnectedLayer : public ::onert::exec::IFunction
+{
+public:
+  FullyConnectedLayer();
+  ~FullyConnectedLayer();
+
+  // Stores the operand tensors, activation and shared context used by run().
+  void configure(const IPortableTensor *input, const IPortableTensor *weights,
+                 const IPortableTensor *bias, ir::Activation activation, IPortableTensor *output,
+                 const std::shared_ptr<ExternalContext> &external_context);
+
+  void run() override;
+  void prepare() override;
+
+  // GGML-weight kernel; the definition lives in FullyConnectedLayer.cc.
+  void fullyConnectedGGMLWeight();
+
+protected:
+  const IPortableTensor *_input;
+  const IPortableTensor *_weights;
+  const IPortableTensor *_bias;
+  IPortableTensor *_output;
+  ir::Activation _activation;
+  std::shared_ptr<ExternalContext> _external_context;
+};
+
+} // namespace onert::backend::ggml::ops
+
+#endif // __ONERT_BACKEND_GGML_OPS_FULLYCONNECTEDLAYER_H__
diff --git a/runtime/onert/backend/cpu/ops/GGMLHelper.cc b/runtime/onert/backend/ggml/ops/GGMLHelper.cc
similarity index 95%
rename from runtime/onert/backend/cpu/ops/GGMLHelper.cc
rename to runtime/onert/backend/ggml/ops/GGMLHelper.cc
index 8b66f024171..459baa8c2d3 100644
--- a/runtime/onert/backend/cpu/ops/GGMLHelper.cc
+++ b/runtime/onert/backend/ggml/ops/GGMLHelper.cc
@@ -16,7 +16,7 @@
 
 #include "GGMLHelper.h"
 
-namespace onert::backend::cpu::ops
+namespace onert::backend::ggml::ops
 {
 
 ggml_type getGGMLType(ir::DataType type)
@@ -64,4 +64,4 @@ struct ggml_tensor getGGMLTensor(const IPortableTensor *tensor)
   return res;
 }
 
-} // namespace onert::backend::cpu::ops
+} // namespace onert::backend::ggml::ops
diff --git a/runtime/onert/backend/cpu/ops/GGMLHelper.h b/runtime/onert/backend/ggml/ops/GGMLHelper.h
similarity index 82%
rename from runtime/onert/backend/cpu/ops/GGMLHelper.h
rename to runtime/onert/backend/ggml/ops/GGMLHelper.h
index d692dc23d7d..1e55cce84ea 100644
--- a/runtime/onert/backend/cpu/ops/GGMLHelper.h
+++ b/runtime/onert/backend/ggml/ops/GGMLHelper.h
@@ -14,18 +14,18 @@
  * limitations under the License.
  */
 
-#ifndef __ONERT_BACKEND_CPU_GGML_HELPER_H__
-#define __ONERT_BACKEND_CPU_GGML_HELPER_H__
+#ifndef __ONERT_BACKEND_GGML_GGML_HELPER_H__
+#define __ONERT_BACKEND_GGML_GGML_HELPER_H__
 
 #include <backend/IPortableTensor.h>
 
 #include <ggml.h>
 
-namespace onert::backend::cpu::ops
+namespace onert::backend::ggml::ops
 {
 
 struct ggml_tensor getGGMLTensor(const IPortableTensor *tensor);
 
-} // namespace onert::backend::cpu::ops
+} // namespace onert::backend::ggml::ops
 
 #endif
diff --git a/runtime/onert/backend/ggml/ops/GatherLayer.cc b/runtime/onert/backend/ggml/ops/GatherLayer.cc
new file mode 100644
index 00000000000..25446d78a12
--- /dev/null
+++ b/runtime/onert/backend/ggml/ops/GatherLayer.cc
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GatherLayer.h"
+
+#include "GGMLHelper.h"
+#include "OperationUtils.h"
+#include "../KernelGenerator.h"
+#include "../Validator.h"
+
+namespace onert::backend::ggml
+{
+
+// Accepts Gather on the ggml backend only when the input operand is GGML Q4_0
+// block-quantized.
+void Validator::visit(const ir::operation::Gather &node)
+{
+  using ir::operation::Gather;
+
+  const auto input_index{node.getInputs().at(Gather::Input::INPUT)};
+  const auto input_type = _graph.operands().at(input_index).typeInfo().type();
+
+  // GatherLayer::run() throws for every other input type.
+  const bool is_q4_0 = (input_type == ir::DataType::QUANT_GGML_Q4_0);
+
+  _supported = is_q4_0;
+}
+
+// Creates the ggml GatherLayer for `node` and hands it back through _return_fn.
+void KernelGenerator::visit(const ir::operation::Gather &node)
+{
+  const auto &inputs = node.getInputs();
+  const auto ofm_index{node.getOutputs().at(0)};
+  const auto ifm_index{inputs.at(ir::operation::Gather::Input::INPUT)};
+  const auto idx_index{inputs.at(ir::operation::Gather::Input::INDICES)};
+
+  auto ofm = _tensor_reg->getPortableTensor(ofm_index);
+  auto ifm = _tensor_reg->getPortableTensor(ifm_index);
+  auto idx = _tensor_reg->getPortableTensor(idx_index);
+
+  // A negative axis is resolved against the rank of the input operand.
+  const auto axis = ops::getAxis(_ctx.at(ifm_index).shape().rank(), node.param().axis);
+
+  auto layer = std::make_unique<ops::GatherLayer>();
+  layer->configure(ifm, idx, ofm, axis, _external_context.get());
+  _return_fn = std::move(layer);
+}
+
+} // namespace onert::backend::ggml
+
+namespace onert::backend::ggml::ops
+{
+
+void GatherLayer::configure(const IPortableTensor *input, const IPortableTensor *indices,
+                            IPortableTensor *output, int32_t axis, ExternalContext *ctx)
+{
+  _output = output;
+  _input = input;
+  _indices = indices;
+  _ctx = ctx;   // borrowed pointer; read for maxNumThreads() when the kernel runs
+  _axis = axis; // the kernel throws unless this is 0
+}
+
+// Gathers from a block-quantized input by running a one-node GGML GET_ROWS graph.
+// Throws std::runtime_error unless all of the following hold:
+//   input rank == 2; indices rank < 4, or rank 4 with dim(0) == 1;
+//   indices type INT32; axis == 0.
+void GatherLayer::runByGGMLQuantInputType()
+{
+  if (_input->getShape().rank() != 2)
+    throw std::runtime_error("Gather: block quantized input tensor must be rank 2");
+
+  const auto &indices_shape = _indices->getShape();
+  const auto indices_rank = indices_shape.rank();
+  const bool rank4_with_leading_one = indices_rank == 4 && indices_shape.dim(0) == 1;
+  if (indices_rank >= 4 && !rank4_with_leading_one)
+    throw std::runtime_error("Gather: invalid indices tensor shape");
+
+  if (_indices->data_type() != ir::DataType::INT32)
+    throw std::runtime_error("Gather: indices tensor must be int32 type");
+
+  if (_axis != 0)
+    throw std::runtime_error("Gather: axis must be 0");
+
+  // Convert the onert tensors into ggml_tensor values.
+  auto table = getGGMLTensor(_input);
+  auto rows = getGGMLTensor(_indices);
+  auto gathered = getGGMLTensor(_output);
+
+  // Describe the output as GET_ROWS with input as src[0] and indices as src[1].
+  gathered.op = GGML_OP_GET_ROWS;
+  gathered.src[0] = &table;
+  gathered.src[1] = &rows;
+
+  // Zero-initialized graph whose node array is the single pointer `node_list`.
+  auto *node_list = &gathered;
+  struct ggml_cgraph graph;
+  memset(&graph, 0, sizeof(graph));
+  graph.n_nodes = 1;
+  graph.nodes = &node_list;
+
+  // Plan with the context's thread limit; `work` backs the plan's work_data.
+  auto plan = ggml_graph_plan(&graph, _ctx->maxNumThreads());
+  std::vector<uint8_t> work(plan.work_size);
+  plan.work_data = work.data();
+
+  // `work` and the local tensors must stay alive across this call.
+  ggml_graph_compute(&graph, &plan);
+}
+
+// Executes the gather for the configured tensors.
+// Throws std::runtime_error for any input type other than QUANT_GGML_Q4_0.
+void GatherLayer::run()
+{
+  const auto input_type = _input->data_type();
+  // Q4_0 is the only input type this layer implements.
+  if (input_type != ir::DataType::QUANT_GGML_Q4_0)
+    throw std::runtime_error("Gather: unsupported input data type");
+
+  runByGGMLQuantInputType();
+}
+
+} // namespace onert::backend::ggml::ops
diff --git a/runtime/onert/backend/ggml/ops/GatherLayer.h b/runtime/onert/backend/ggml/ops/GatherLayer.h
new file mode 100644
index 00000000000..eb5fcd81492
--- /dev/null
+++ b/runtime/onert/backend/ggml/ops/GatherLayer.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GGML_OPS_GATHERLAYER_H__
+#define __ONERT_BACKEND_GGML_OPS_GATHERLAYER_H__
+
+#include "../ExternalContext.h"
+
+#include <backend/IPortableTensor.h>
+
+#include <exec/IFunction.h>
+
+namespace onert::backend::ggml::ops
+{
+
+// exec::IFunction performing Gather on a GGML Q4_0 block-quantized input.
+class GatherLayer : public ::onert::exec::IFunction
+{
+public:
+  GatherLayer() : _input{nullptr}, _indices{nullptr}, _output{nullptr}, _axis{-1}, _ctx{nullptr}
+  {
+    // Members are placeholders until configure() is called.
+  }
+
+  // Records the tensors, the gather axis and the borrowed context used by run().
+  void configure(const IPortableTensor *input, const IPortableTensor *indices,
+                 IPortableTensor *output, int32_t axis, ExternalContext *ctx);
+
+  void run() override;
+
+private:
+  // GGML GET_ROWS path; the only kernel this layer defines.
+  void runByGGMLQuantInputType();
+
+  const IPortableTensor *_input;
+  const IPortableTensor *_indices;
+  IPortableTensor *_output;
+
+  int32_t _axis;
+  ExternalContext *_ctx;
+};
+
+} // namespace onert::backend::ggml::ops
+
+#endif // __ONERT_BACKEND_GGML_OPS_GATHERLAYER_H__
diff --git a/runtime/onert/backend/ggml/ops/OperationUtils.h b/runtime/onert/backend/ggml/ops/OperationUtils.h
new file mode 100644
index 00000000000..96a953f29e2
--- /dev/null
+++ b/runtime/onert/backend/ggml/ops/OperationUtils.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GGML_OPS_OPERATION_UTILS_H__
+#define __ONERT_BACKEND_GGML_OPS_OPERATION_UTILS_H__
+
+#include <cstdint>
+
+namespace onert::backend::ggml::ops
+{
+
+// Returns `axis` unchanged when it is non-negative, otherwise `axis + rank`.
+inline int32_t getAxis(uint32_t rank, int32_t axis)
+{
+  // Negative axes count back from the last dimension. The explicit cast keeps
+  // the addition signed instead of relying on implicit unsigned conversion.
+  if (axis < 0)
+    return axis + static_cast<int32_t>(rank);
+
+  // No range validation is done here; callers get the value back as given.
+  return axis;
+}
+
+} // namespace onert::backend::ggml::ops
+
+#endif // __ONERT_BACKEND_GGML_OPS_OPERATION_UTILS_H__
diff --git a/runtime/tests/nnfw_api/src/GenModelTests/one_op_tests/Gather.test.cc b/runtime/tests/nnfw_api/src/GenModelTests/one_op_tests/Gather.test.cc
index ac6dbd345f7..0868aa51cd6 100644
--- a/runtime/tests/nnfw_api/src/GenModelTests/one_op_tests/Gather.test.cc
+++ b/runtime/tests/nnfw_api/src/GenModelTests/one_op_tests/Gather.test.cc
@@ -74,7 +74,7 @@ TEST_F(GenModelTest, OneOp_Gather_Q4_0)
   tc.addInput<int32_t>({2});
   tc.addOutput<float>(std::vector<float>{params.begin() + 64, params.begin() + 96});
   _context->addTestCase(tc);
-  _context->setBackends({"cpu"});
+  _context->setBackends({"ggml"});
 
   SUCCEED();
 }
@@ -95,7 +95,7 @@ TEST_F(GenModelTest, neg_OneOp_Gather_Q4_0_InvalidOutType)
   cgen.setInputsAndOutputs({indice}, {output});
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
-  _context->setBackends({"cpu"});
+  _context->setBackends({"ggml"});
   _context->expectFailModelLoad();
 
   SUCCEED();
@@ -115,7 +115,7 @@ TEST_F(GenModelTest, neg_OneOp_Gather_Q4_0_shape)
   cgen.setInputsAndOutputs({indice}, {output});
 
   _context = std::make_unique<GenModelTestContext>(cgen.finish());
-  _context->setBackends({"cpu"});
+  _context->setBackends({"ggml"});
   _context->expectFailCompile();
 
   SUCCEED();