Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions runtime/onert/backend/cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
nnfw_find_package(Ruy REQUIRED)

file(GLOB SOURCES "*.cc")
list(APPEND SOURCES ops/GGMLHelper.cc ops/OperationUtils.cc)
list(APPEND SOURCES ops/OperationUtils.cc)
macro(OP NAME)
list(APPEND SOURCES ops/${NAME}Layer.cc)
endmacro(OP)
Expand All @@ -20,8 +20,6 @@ target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE nnfw_coverage)
target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE ruy)
target_link_libraries(${LIB_ONERT_BACKEND_CPU} INTERFACE ruy_instrumentation)
target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE ndarray)
# Set public: ExternalContext is used in train backend
target_link_libraries(${LIB_ONERT_BACKEND_CPU} PUBLIC ggml)

set_target_properties(${LIB_ONERT_BACKEND_CPU} PROPERTIES
OUTPUT_NAME backend_cpu
Expand Down
7 changes: 0 additions & 7 deletions runtime/onert/backend/cpu/ExternalContext.cc
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,4 @@ void ExternalContext::setMaxNumThreads(int max_num_threads)
_ruy_context->set_max_num_threads(_max_num_threads);
}

// Lazily creates the process-wide GGML context held by this ExternalContext.
// Idempotent: repeated calls after the first are no-ops.
// The context is created with mem_size = 0 / mem_buffer = nullptr and
// no_alloc = true — presumably so ggml only tracks tensor metadata while
// onert owns the actual buffers; confirm against ggml_init documentation.
// Ownership: the unique_ptr uses ggml_free as its deleter, so the context
// is released automatically when ExternalContext is destroyed.
void ExternalContext::initGgmlContext()
{
if (_ggml_context == nullptr)
_ggml_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>(
ggml_init({.mem_size = 0, .mem_buffer = nullptr, .no_alloc = true}), &ggml_free);
}

} // namespace onert::backend::cpu
4 changes: 0 additions & 4 deletions runtime/onert/backend/cpu/ExternalContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

#include <util/ConfigSource.h>
#include <ruy/context.h>
#include <ggml.h>

#include <memory>

Expand All @@ -36,14 +35,11 @@ class ExternalContext

int32_t maxNumThreads() const { return _max_num_threads; }

void initGgmlContext();

ruy::Context *ruy_context() const { return _ruy_context.get(); }

private:
int32_t _max_num_threads;
const std::unique_ptr<ruy::Context> _ruy_context;
std::unique_ptr<ggml_context, decltype(&ggml_free)> _ggml_context{nullptr, &ggml_free};
};

} // namespace onert::backend::cpu
Expand Down
59 changes: 15 additions & 44 deletions runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

#include "FullyConnectedLayer.h"

#include "GGMLHelper.h"
#include "../Tensor.h"
#include "../KernelGenerator.h"
#include "../Validator.h"
Expand All @@ -28,7 +27,21 @@
namespace onert::backend::cpu
{

void Validator::visit(const ir::operation::FullyConnected &) { _supported = true; }
void Validator::visit(const ir::operation::FullyConnected &node)
{
  using ir::operation::FullyConnected;

  // GGML block-quantized weights (Q4_0 / Q8_0) are not handled by this
  // backend; every other weight type is accepted.
  const auto &weight_operand =
    _graph.operands().at(node.getInputs().at(FullyConnected::Input::WEIGHT));

  const auto weight_type = weight_operand.typeInfo().type();
  const bool is_ggml_quantized = weight_type == ir::DataType::QUANT_GGML_Q4_0 ||
                                 weight_type == ir::DataType::QUANT_GGML_Q8_0;

  _supported = !is_ggml_quantized;
}

void KernelGenerator::visit(const ir::operation::FullyConnected &node)
{
Expand Down Expand Up @@ -206,39 +219,6 @@ void FullyConnectedLayer::fullyConnectedSparseWeight()
throw std::runtime_error{"FullyConnected: unsupported sparsity"};
}

// Runs FullyConnected with GGML block-quantized weights by building a
// one-node ggml compute graph (GGML_OP_MUL_MAT) and executing it on the
// ggml CPU backend.
// Preconditions: no bias tensor (throws otherwise); the external context's
// ggml context must already be initialized by configure().
// Throws std::runtime_error when a bias is present.
void FullyConnectedLayer::fullyConnectedGGMLWeight()
{
if (_bias)
throw std::runtime_error{"FullyConnected: GGML weights format does not support bias yet."};

// convert tensor — wrap onert tensors as ggml_tensor views (no data copy,
// per the getGGMLTensor helper naming; confirm in GGMLHelper.cc)
auto input = getGGMLTensor(_input);
auto weights = getGGMLTensor(_weights);
auto output = getGGMLTensor(_output);
{
// output = weights x input; ggml MUL_MAT takes the (possibly quantized)
// matrix as src[0] and the activation as src[1]
output.op = GGML_OP_MUL_MAT;
output.src[0] = &weights;
output.src[1] = &input;
}
auto *nodes = &output;

// create graph — a minimal single-node cgraph built by hand instead of via
// ggml's graph builder, so no ggml allocations are needed here
struct ggml_cgraph graph;
{
memset(&graph, 0, sizeof(graph));
graph.n_nodes = 1;
graph.nodes = &nodes;
}

// get cplan — ggml sizes the scratch buffer it needs; we provide it from a
// stack-scoped vector that lives until compute finishes
auto cplan = ggml_graph_plan(&graph, _external_context->maxNumThreads());
std::vector<uint8_t> buf(cplan.work_size);
cplan.work_data = buf.data();

// compute — blocking call; result is written into _output's buffer via the
// wrapped tensor
ggml_graph_compute(&graph, &cplan);
}

void FullyConnectedLayer::fullyConnected16x1Float32()
{
#if defined(__aarch64__) && defined(USE_NEON)
Expand Down Expand Up @@ -279,10 +259,6 @@ void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortabl
}
#endif
_external_context = external_context;

if (_weights->data_type() == OperandType::QUANT_GGML_Q4_0 ||
_weights->data_type() == OperandType::QUANT_GGML_Q8_0)
_external_context->initGgmlContext();
}

void FullyConnectedLayer::run()
Expand All @@ -295,11 +271,6 @@ void FullyConnectedLayer::run()
{
fullyConnectedSparseWeight();
}
else if (_weights->data_type() == OperandType::QUANT_GGML_Q4_0 ||
_weights->data_type() == OperandType::QUANT_GGML_Q8_0)
{
fullyConnectedGGMLWeight();
}
else if (_input->data_type() == OperandType::FLOAT32)
{
_is_shuffled16x1float32 ? fullyConnected16x1Float32() : fullyConnectedFloat32();
Expand Down
74 changes: 16 additions & 58 deletions runtime/onert/backend/cpu/ops/GatherLayer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
#include "GatherLayer.h"

#include "OperationUtils.h"
#include "GGMLHelper.h"
#include "../KernelGenerator.h"
#include "../Validator.h"

Expand All @@ -26,7 +25,20 @@
namespace onert::backend::cpu
{

void Validator::visit(const ir::operation::Gather &) { _supported = true; }
void Validator::visit(const ir::operation::Gather &node)
{
  using ir::operation::Gather;

  // Gather on a GGML Q4_0 block-quantized input is not supported by this
  // backend; all other input types are accepted.
  const auto &input_operand =
    _graph.operands().at(node.getInputs().at(Gather::Input::INPUT));

  _supported = input_operand.typeInfo().type() != ir::DataType::QUANT_GGML_Q4_0;
}

void KernelGenerator::visit(const ir::operation::Gather &node)
{
Expand All @@ -43,7 +55,7 @@ void KernelGenerator::visit(const ir::operation::Gather &node)

auto fn = std::make_unique<ops::GatherLayer>();

fn->configure(input_tensor, indices_tensor, output_tensor, axis, _external_context.get());
fn->configure(input_tensor, indices_tensor, output_tensor, axis);

_return_fn = std::move(fn);
}
Expand All @@ -54,16 +66,12 @@ namespace onert::backend::cpu::ops
{

void GatherLayer::configure(const IPortableTensor *input, const IPortableTensor *indices,
IPortableTensor *output, int32_t axis, ExternalContext *ctx)
IPortableTensor *output, int32_t axis)
{
_input = input;
_indices = indices;
_axis = axis;
_output = output;
_ctx = ctx;

if (_input->data_type() == OperandType::QUANT_GGML_Q4_0)
ctx->initGgmlContext();
}

template <typename InputType> void GatherLayer::runByInputType()
Expand Down Expand Up @@ -97,53 +105,6 @@ template <typename InputType> void GatherLayer::runByInputType()
}
}

// Runs Gather on a GGML block-quantized input by building a one-node ggml
// compute graph (GGML_OP_GET_ROWS) and executing it on the ggml CPU backend.
// Throws std::runtime_error when the supporting conditions below are not met.
void GatherLayer::runByGGMLQuantInputType()
{
// Supporting condition
// Input: rank 2
// Indices: rank < 4, or rank 4 with dim(0) = 1; INT32
// Axis: 0
if (getShape(_input).DimensionsCount() != 2)
throw std::runtime_error("Gather: block quantized input tensor must be rank 2");

if (getShape(_indices).DimensionsCount() >= 4 &&
(getShape(_indices).DimensionsCount() != 4 || getShape(_indices).Dims(0) != 1))
throw std::runtime_error("Gather: invalid indices tensor shape");

if (_indices->data_type() != ir::DataType::INT32)
throw std::runtime_error("Gather: indices tensor must be int32 type");

if (_axis != 0)
throw std::runtime_error("Gather: axis must be 0");

// convert tensor — wrap onert tensors as ggml_tensor views
auto input = getGGMLTensor(_input);
auto indices = getGGMLTensor(_indices);
auto output = getGGMLTensor(_output);
{
// output = rows of `input` selected by `indices` (GET_ROWS ≙ gather on axis 0)
output.op = GGML_OP_GET_ROWS;
output.src[0] = &input;
output.src[1] = &indices;
}
auto *nodes = &output;

// create graph — minimal hand-built single-node cgraph; zero-init then set
// only the fields ggml reads
struct ggml_cgraph graph;
{
memset(&graph, 0, sizeof(graph));
graph.n_nodes = 1;
graph.nodes = &nodes;
}

// get cplan — ggml reports the work-buffer size it needs; supply it from a
// local vector that outlives the compute call
auto cplan = ggml_graph_plan(&graph, _ctx->maxNumThreads());
std::vector<uint8_t> buf(cplan.work_size);
cplan.work_data = buf.data();

// compute — blocking; writes the gathered rows into _output's buffer
ggml_graph_compute(&graph, &cplan);
}

void GatherLayer::run()
{
switch (_input->data_type())
Expand All @@ -157,9 +118,6 @@ void GatherLayer::run()
case OperandType::INT32:
runByInputType<int32_t>();
break;
case OperandType::QUANT_GGML_Q4_0:
runByGGMLQuantInputType();
break;
case OperandType::BOOL8:
runByInputType<bool>();
break;
Expand Down
8 changes: 2 additions & 6 deletions runtime/onert/backend/cpu/ops/GatherLayer.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@
#ifndef __ONERT_BACKEND_CPU_OPS_GATHERLAYER_H__
#define __ONERT_BACKEND_CPU_OPS_GATHERLAYER_H__

#include "../ExternalContext.h"

#include <backend/IPortableTensor.h>

#include <exec/IFunction.h>
Expand All @@ -29,28 +27,26 @@ namespace onert::backend::cpu::ops
class GatherLayer : public ::onert::exec::IFunction
{
public:
GatherLayer() : _input{nullptr}, _indices{nullptr}, _output{nullptr}, _axis{-1}, _ctx{nullptr}
GatherLayer() : _input{nullptr}, _indices{nullptr}, _output{nullptr}, _axis{-1}
{
// DO NOTHING
}

public:
void configure(const IPortableTensor *input, const IPortableTensor *indices,
IPortableTensor *output, int32_t axis, ExternalContext *ctx);
IPortableTensor *output, int32_t axis);

void run() override;

private:
template <typename OpType> void runByInputType();
void runByGGMLQuantInputType();

private:
const IPortableTensor *_input;
const IPortableTensor *_indices;
IPortableTensor *_output;

int32_t _axis;
ExternalContext *_ctx;
};

} // namespace onert::backend::cpu::ops
Expand Down
1 change: 1 addition & 0 deletions runtime/onert/backend/ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
set(LIB_ONERT_BACKEND_GGML onert_backend_ggml)

file(GLOB SOURCES "*.cc")
list(APPEND SOURCES ops/GGMLHelper.cc)
macro(OP NAME)
list(APPEND SOURCES ops/${NAME}Layer.cc)
endmacro(OP)
Expand Down
2 changes: 2 additions & 0 deletions runtime/onert/backend/ggml/Operation.lst
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
OP(FullyConnected)
OP(Gather)
Loading