Skip to content

Commit 1693a07

Browse files
authored
[onert] Move ggml support to ggml backend (#16287)
This commit moves ggml weight operator support from the CPU backend to the ggml backend.

ONE-DCO-1.0-Signed-off-by: Hyeongseok Oh <hseok82.oh@samsung.com>
1 parent 1ba159f commit 1693a07

16 files changed

Lines changed: 486 additions & 131 deletions

runtime/onert/backend/cpu/CMakeLists.txt

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
nnfw_find_package(Ruy REQUIRED)
22

33
file(GLOB SOURCES "*.cc")
4-
list(APPEND SOURCES ops/GGMLHelper.cc ops/OperationUtils.cc)
4+
list(APPEND SOURCES ops/OperationUtils.cc)
55
macro(OP NAME)
66
list(APPEND SOURCES ops/${NAME}Layer.cc)
77
endmacro(OP)
@@ -20,8 +20,6 @@ target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE nnfw_coverage)
2020
target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE ruy)
2121
target_link_libraries(${LIB_ONERT_BACKEND_CPU} INTERFACE ruy_instrumentation)
2222
target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE ndarray)
23-
# Set public: ExternalContext is used in train backend
24-
target_link_libraries(${LIB_ONERT_BACKEND_CPU} PUBLIC ggml)
2523

2624
set_target_properties(${LIB_ONERT_BACKEND_CPU} PROPERTIES
2725
OUTPUT_NAME backend_cpu

runtime/onert/backend/cpu/ExternalContext.cc

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -70,11 +70,4 @@ void ExternalContext::setMaxNumThreads(int max_num_threads)
7070
_ruy_context->set_max_num_threads(_max_num_threads);
7171
}
7272

73-
void ExternalContext::initGgmlContext()
74-
{
75-
if (_ggml_context == nullptr)
76-
_ggml_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>(
77-
ggml_init({.mem_size = 0, .mem_buffer = nullptr, .no_alloc = true}), &ggml_free);
78-
}
79-
8073
} // namespace onert::backend::cpu

runtime/onert/backend/cpu/ExternalContext.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919

2020
#include <util/ConfigSource.h>
2121
#include <ruy/context.h>
22-
#include <ggml.h>
2322

2423
#include <memory>
2524

@@ -36,14 +35,11 @@ class ExternalContext
3635

3736
int32_t maxNumThreads() const { return _max_num_threads; }
3837

39-
void initGgmlContext();
40-
4138
ruy::Context *ruy_context() const { return _ruy_context.get(); }
4239

4340
private:
4441
int32_t _max_num_threads;
4542
const std::unique_ptr<ruy::Context> _ruy_context;
46-
std::unique_ptr<ggml_context, decltype(&ggml_free)> _ggml_context{nullptr, &ggml_free};
4743
};
4844

4945
} // namespace onert::backend::cpu

runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc

Lines changed: 15 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616

1717
#include "FullyConnectedLayer.h"
1818

19-
#include "GGMLHelper.h"
2019
#include "../Tensor.h"
2120
#include "../KernelGenerator.h"
2221
#include "../Validator.h"
@@ -28,7 +27,21 @@
2827
namespace onert::backend::cpu
2928
{
3029

31-
void Validator::visit(const ir::operation::FullyConnected &) { _supported = true; }
30+
void Validator::visit(const ir::operation::FullyConnected &node)
31+
{
32+
using ir::operation::FullyConnected;
33+
34+
const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
35+
const auto weight_node = &_graph.operands().at(weight_index);
36+
37+
_supported = false;
38+
39+
if (weight_node->typeInfo().type() == ir::DataType::QUANT_GGML_Q4_0 ||
40+
weight_node->typeInfo().type() == ir::DataType::QUANT_GGML_Q8_0)
41+
return;
42+
43+
_supported = true;
44+
}
3245

3346
void KernelGenerator::visit(const ir::operation::FullyConnected &node)
3447
{
@@ -206,39 +219,6 @@ void FullyConnectedLayer::fullyConnectedSparseWeight()
206219
throw std::runtime_error{"FullyConnected: unsupported sparsity"};
207220
}
208221

209-
void FullyConnectedLayer::fullyConnectedGGMLWeight()
210-
{
211-
if (_bias)
212-
throw std::runtime_error{"FullyConnected: GGML weights format does not support bias yet."};
213-
214-
// convert tensor
215-
auto input = getGGMLTensor(_input);
216-
auto weights = getGGMLTensor(_weights);
217-
auto output = getGGMLTensor(_output);
218-
{
219-
output.op = GGML_OP_MUL_MAT;
220-
output.src[0] = &weights;
221-
output.src[1] = &input;
222-
}
223-
auto *nodes = &output;
224-
225-
// create graph
226-
struct ggml_cgraph graph;
227-
{
228-
memset(&graph, 0, sizeof(graph));
229-
graph.n_nodes = 1;
230-
graph.nodes = &nodes;
231-
}
232-
233-
// get cplan
234-
auto cplan = ggml_graph_plan(&graph, _external_context->maxNumThreads());
235-
std::vector<uint8_t> buf(cplan.work_size);
236-
cplan.work_data = buf.data();
237-
238-
// compute
239-
ggml_graph_compute(&graph, &cplan);
240-
}
241-
242222
void FullyConnectedLayer::fullyConnected16x1Float32()
243223
{
244224
#if defined(__aarch64__) && defined(USE_NEON)
@@ -279,10 +259,6 @@ void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortabl
279259
}
280260
#endif
281261
_external_context = external_context;
282-
283-
if (_weights->data_type() == OperandType::QUANT_GGML_Q4_0 ||
284-
_weights->data_type() == OperandType::QUANT_GGML_Q8_0)
285-
_external_context->initGgmlContext();
286262
}
287263

288264
void FullyConnectedLayer::run()
@@ -295,11 +271,6 @@ void FullyConnectedLayer::run()
295271
{
296272
fullyConnectedSparseWeight();
297273
}
298-
else if (_weights->data_type() == OperandType::QUANT_GGML_Q4_0 ||
299-
_weights->data_type() == OperandType::QUANT_GGML_Q8_0)
300-
{
301-
fullyConnectedGGMLWeight();
302-
}
303274
else if (_input->data_type() == OperandType::FLOAT32)
304275
{
305276
_is_shuffled16x1float32 ? fullyConnected16x1Float32() : fullyConnectedFloat32();

runtime/onert/backend/cpu/ops/GatherLayer.cc

Lines changed: 16 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
#include "GatherLayer.h"
1818

1919
#include "OperationUtils.h"
20-
#include "GGMLHelper.h"
2120
#include "../KernelGenerator.h"
2221
#include "../Validator.h"
2322

@@ -26,7 +25,20 @@
2625
namespace onert::backend::cpu
2726
{
2827

29-
void Validator::visit(const ir::operation::Gather &) { _supported = true; }
28+
void Validator::visit(const ir::operation::Gather &node)
29+
{
30+
using ir::operation::Gather;
31+
32+
const auto input_index{node.getInputs().at(Gather::Input::INPUT)};
33+
const auto input_node = &_graph.operands().at(input_index);
34+
35+
_supported = false;
36+
37+
if (input_node->typeInfo().type() == ir::DataType::QUANT_GGML_Q4_0)
38+
return;
39+
40+
_supported = true;
41+
}
3042

3143
void KernelGenerator::visit(const ir::operation::Gather &node)
3244
{
@@ -43,7 +55,7 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
4355

4456
auto fn = std::make_unique<ops::GatherLayer>();
4557

46-
fn->configure(input_tensor, indices_tensor, output_tensor, axis, _external_context.get());
58+
fn->configure(input_tensor, indices_tensor, output_tensor, axis);
4759

4860
_return_fn = std::move(fn);
4961
}
@@ -54,16 +66,12 @@ namespace onert::backend::cpu::ops
5466
{
5567

5668
void GatherLayer::configure(const IPortableTensor *input, const IPortableTensor *indices,
57-
IPortableTensor *output, int32_t axis, ExternalContext *ctx)
69+
IPortableTensor *output, int32_t axis)
5870
{
5971
_input = input;
6072
_indices = indices;
6173
_axis = axis;
6274
_output = output;
63-
_ctx = ctx;
64-
65-
if (_input->data_type() == OperandType::QUANT_GGML_Q4_0)
66-
ctx->initGgmlContext();
6775
}
6876

6977
template <typename InputType> void GatherLayer::runByInputType()
@@ -97,53 +105,6 @@ template <typename InputType> void GatherLayer::runByInputType()
97105
}
98106
}
99107

100-
void GatherLayer::runByGGMLQuantInputType()
101-
{
102-
// Supporting condition
103-
// Input: rank 2
104-
// Indice: rank < 4 or rank 4 with dim(0) = 1, INT32
105-
// Axis: 0
106-
if (getShape(_input).DimensionsCount() != 2)
107-
throw std::runtime_error("Gather: block quantized input tensor must be rank 2");
108-
109-
if (getShape(_indices).DimensionsCount() >= 4 &&
110-
(getShape(_indices).DimensionsCount() != 4 || getShape(_indices).Dims(0) != 1))
111-
throw std::runtime_error("Gather: invalid indices tensor shape");
112-
113-
if (_indices->data_type() != ir::DataType::INT32)
114-
throw std::runtime_error("Gather: indices tensor must be int32 type");
115-
116-
if (_axis != 0)
117-
throw std::runtime_error("Gather: axis must be 0");
118-
119-
// convert tensor
120-
auto input = getGGMLTensor(_input);
121-
auto indices = getGGMLTensor(_indices);
122-
auto output = getGGMLTensor(_output);
123-
{
124-
output.op = GGML_OP_GET_ROWS;
125-
output.src[0] = &input;
126-
output.src[1] = &indices;
127-
}
128-
auto *nodes = &output;
129-
130-
// create graph
131-
struct ggml_cgraph graph;
132-
{
133-
memset(&graph, 0, sizeof(graph));
134-
graph.n_nodes = 1;
135-
graph.nodes = &nodes;
136-
}
137-
138-
// get cplan
139-
auto cplan = ggml_graph_plan(&graph, _ctx->maxNumThreads());
140-
std::vector<uint8_t> buf(cplan.work_size);
141-
cplan.work_data = buf.data();
142-
143-
// compute
144-
ggml_graph_compute(&graph, &cplan);
145-
}
146-
147108
void GatherLayer::run()
148109
{
149110
switch (_input->data_type())
@@ -157,9 +118,6 @@ void GatherLayer::run()
157118
case OperandType::INT32:
158119
runByInputType<int32_t>();
159120
break;
160-
case OperandType::QUANT_GGML_Q4_0:
161-
runByGGMLQuantInputType();
162-
break;
163121
case OperandType::BOOL8:
164122
runByInputType<bool>();
165123
break;

runtime/onert/backend/cpu/ops/GatherLayer.h

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@
1717
#ifndef __ONERT_BACKEND_CPU_OPS_GATHERLAYER_H__
1818
#define __ONERT_BACKEND_CPU_OPS_GATHERLAYER_H__
1919

20-
#include "../ExternalContext.h"
21-
2220
#include <backend/IPortableTensor.h>
2321

2422
#include <exec/IFunction.h>
@@ -29,28 +27,26 @@ namespace onert::backend::cpu::ops
2927
class GatherLayer : public ::onert::exec::IFunction
3028
{
3129
public:
32-
GatherLayer() : _input{nullptr}, _indices{nullptr}, _output{nullptr}, _axis{-1}, _ctx{nullptr}
30+
GatherLayer() : _input{nullptr}, _indices{nullptr}, _output{nullptr}, _axis{-1}
3331
{
3432
// DO NOTHING
3533
}
3634

3735
public:
3836
void configure(const IPortableTensor *input, const IPortableTensor *indices,
39-
IPortableTensor *output, int32_t axis, ExternalContext *ctx);
37+
IPortableTensor *output, int32_t axis);
4038

4139
void run() override;
4240

4341
private:
4442
template <typename OpType> void runByInputType();
45-
void runByGGMLQuantInputType();
4643

4744
private:
4845
const IPortableTensor *_input;
4946
const IPortableTensor *_indices;
5047
IPortableTensor *_output;
5148

5249
int32_t _axis;
53-
ExternalContext *_ctx;
5450
};
5551

5652
} // namespace onert::backend::cpu::ops

runtime/onert/backend/ggml/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
set(LIB_ONERT_BACKEND_GGML onert_backend_ggml)
22

33
file(GLOB SOURCES "*.cc")
4+
list(APPEND SOURCES ops/GGMLHelper.cc)
45
macro(OP NAME)
56
list(APPEND SOURCES ops/${NAME}Layer.cc)
67
endmacro(OP)
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
OP(FullyConnected)
2+
OP(Gather)

0 commit comments

Comments (0)