Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions runtime/onert/backend/cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
nnfw_find_package(Ruy REQUIRED)

file(GLOB SOURCES "*.cc")
list(APPEND SOURCES ops/GGMLHelper.cc ops/OperationUtils.cc)
list(APPEND SOURCES ops/OperationUtils.cc)
macro(OP NAME)
list(APPEND SOURCES ops/${NAME}Layer.cc)
endmacro(OP)
Expand All @@ -20,8 +20,6 @@ target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE nnfw_coverage)
target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE ruy)
target_link_libraries(${LIB_ONERT_BACKEND_CPU} INTERFACE ruy_instrumentation)
target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE ndarray)
# Set public: ExternalContext is used in train backend
target_link_libraries(${LIB_ONERT_BACKEND_CPU} PUBLIC ggml)

set_target_properties(${LIB_ONERT_BACKEND_CPU} PROPERTIES
OUTPUT_NAME backend_cpu
Expand Down
7 changes: 0 additions & 7 deletions runtime/onert/backend/cpu/ExternalContext.cc
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,4 @@ void ExternalContext::setMaxNumThreads(int max_num_threads)
_ruy_context->set_max_num_threads(_max_num_threads);
}

// Creates the ggml context on first use; calls made while a context already exists do nothing.
void ExternalContext::initGgmlContext()
{
  if (_ggml_context != nullptr)
    return;

  // The context is requested with no backing buffer (mem_size 0, no_alloc set) and is
  // handed to ggml_free when the owning pointer is destroyed or replaced.
  _ggml_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>(
    ggml_init({.mem_size = 0, .mem_buffer = nullptr, .no_alloc = true}), &ggml_free);
}

} // namespace onert::backend::cpu
4 changes: 0 additions & 4 deletions runtime/onert/backend/cpu/ExternalContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

#include <util/ConfigSource.h>
#include <ruy/context.h>
#include <ggml.h>

#include <memory>

Expand All @@ -36,14 +35,11 @@ class ExternalContext

int32_t maxNumThreads() const { return _max_num_threads; }

void initGgmlContext();

ruy::Context *ruy_context() const { return _ruy_context.get(); }

private:
int32_t _max_num_threads;
const std::unique_ptr<ruy::Context> _ruy_context;
std::unique_ptr<ggml_context, decltype(&ggml_free)> _ggml_context{nullptr, &ggml_free};
};

} // namespace onert::backend::cpu
Expand Down
59 changes: 15 additions & 44 deletions runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

#include "FullyConnectedLayer.h"

#include "GGMLHelper.h"
#include "../Tensor.h"
#include "../KernelGenerator.h"
#include "../Validator.h"
Expand All @@ -28,7 +27,21 @@
namespace onert::backend::cpu
{

void Validator::visit(const ir::operation::FullyConnected &) { _supported = true; }
// Marks FullyConnected as unsupported when its weight operand has data type
// QUANT_GGML_Q4_0 or QUANT_GGML_Q8_0; any other weight type is reported as supported.
void Validator::visit(const ir::operation::FullyConnected &node)
{
  const auto &weight_operand =
    _graph.operands().at(node.getInputs().at(ir::operation::FullyConnected::Input::WEIGHT));
  const auto weight_type = weight_operand.typeInfo().type();

  const bool has_ggml_weight = (weight_type == ir::DataType::QUANT_GGML_Q4_0) ||
                               (weight_type == ir::DataType::QUANT_GGML_Q8_0);

  _supported = !has_ggml_weight;
}

void KernelGenerator::visit(const ir::operation::FullyConnected &node)
{
Expand Down Expand Up @@ -206,39 +219,6 @@ void FullyConnectedLayer::fullyConnectedSparseWeight()
throw std::runtime_error{"FullyConnected: unsupported sparsity"};
}

void FullyConnectedLayer::fullyConnectedGGMLWeight()
{
if (_bias)
throw std::runtime_error{"FullyConnected: GGML weights format does not support bias yet."};

// convert tensor
auto input = getGGMLTensor(_input);
auto weights = getGGMLTensor(_weights);
auto output = getGGMLTensor(_output);
{
output.op = GGML_OP_MUL_MAT;
output.src[0] = &weights;
output.src[1] = &input;
}
auto *nodes = &output;

// create graph
struct ggml_cgraph graph;
{
memset(&graph, 0, sizeof(graph));
graph.n_nodes = 1;
graph.nodes = &nodes;
}

// get cplan
auto cplan = ggml_graph_plan(&graph, _external_context->maxNumThreads());
std::vector<uint8_t> buf(cplan.work_size);
cplan.work_data = buf.data();

// compute
ggml_graph_compute(&graph, &cplan);
}

void FullyConnectedLayer::fullyConnected16x1Float32()
{
#if defined(__aarch64__) && defined(USE_NEON)
Expand Down Expand Up @@ -279,10 +259,6 @@ void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortabl
}
#endif
_external_context = external_context;

if (_weights->data_type() == OperandType::QUANT_GGML_Q4_0 ||
_weights->data_type() == OperandType::QUANT_GGML_Q8_0)
_external_context->initGgmlContext();
}

void FullyConnectedLayer::run()
Expand All @@ -295,11 +271,6 @@ void FullyConnectedLayer::run()
{
fullyConnectedSparseWeight();
}
else if (_weights->data_type() == OperandType::QUANT_GGML_Q4_0 ||
_weights->data_type() == OperandType::QUANT_GGML_Q8_0)
{
fullyConnectedGGMLWeight();
}
else if (_input->data_type() == OperandType::FLOAT32)
{
_is_shuffled16x1float32 ? fullyConnected16x1Float32() : fullyConnectedFloat32();
Expand Down
74 changes: 16 additions & 58 deletions runtime/onert/backend/cpu/ops/GatherLayer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
#include "GatherLayer.h"

#include "OperationUtils.h"
#include "GGMLHelper.h"
#include "../KernelGenerator.h"
#include "../Validator.h"

Expand All @@ -26,7 +25,20 @@
namespace onert::backend::cpu
{

void Validator::visit(const ir::operation::Gather &) { _supported = true; }
// Marks Gather as unsupported when its input operand has data type QUANT_GGML_Q4_0;
// any other input type is reported as supported.
void Validator::visit(const ir::operation::Gather &node)
{
  const auto &input_operand =
    _graph.operands().at(node.getInputs().at(ir::operation::Gather::Input::INPUT));

  const bool has_ggml_input = input_operand.typeInfo().type() == ir::DataType::QUANT_GGML_Q4_0;

  _supported = !has_ggml_input;
}

void KernelGenerator::visit(const ir::operation::Gather &node)
{
Expand All @@ -43,7 +55,7 @@ void KernelGenerator::visit(const ir::operation::Gather &node)

auto fn = std::make_unique<ops::GatherLayer>();

fn->configure(input_tensor, indices_tensor, output_tensor, axis, _external_context.get());
fn->configure(input_tensor, indices_tensor, output_tensor, axis);

_return_fn = std::move(fn);
}
Expand All @@ -54,16 +66,12 @@ namespace onert::backend::cpu::ops
{

void GatherLayer::configure(const IPortableTensor *input, const IPortableTensor *indices,
IPortableTensor *output, int32_t axis, ExternalContext *ctx)
IPortableTensor *output, int32_t axis)
{
_input = input;
_indices = indices;
_axis = axis;
_output = output;
_ctx = ctx;

if (_input->data_type() == OperandType::QUANT_GGML_Q4_0)
ctx->initGgmlContext();
}

template <typename InputType> void GatherLayer::runByInputType()
Expand Down Expand Up @@ -97,53 +105,6 @@ template <typename InputType> void GatherLayer::runByInputType()
}
}

void GatherLayer::runByGGMLQuantInputType()
{
// Supporting condition
// Input: rank 2
// Indice: rank < 4 or rank 4 with dim(0) = 1, INT32
// Axis: 0
if (getShape(_input).DimensionsCount() != 2)
throw std::runtime_error("Gather: block quantized input tensor must be rank 2");

if (getShape(_indices).DimensionsCount() >= 4 &&
(getShape(_indices).DimensionsCount() != 4 || getShape(_indices).Dims(0) != 1))
throw std::runtime_error("Gather: invalid indices tensor shape");

if (_indices->data_type() != ir::DataType::INT32)
throw std::runtime_error("Gather: indices tensor must be int32 type");

if (_axis != 0)
throw std::runtime_error("Gather: axis must be 0");

// convert tensor
auto input = getGGMLTensor(_input);
auto indices = getGGMLTensor(_indices);
auto output = getGGMLTensor(_output);
{
output.op = GGML_OP_GET_ROWS;
output.src[0] = &input;
output.src[1] = &indices;
}
auto *nodes = &output;

// create graph
struct ggml_cgraph graph;
{
memset(&graph, 0, sizeof(graph));
graph.n_nodes = 1;
graph.nodes = &nodes;
}

// get cplan
auto cplan = ggml_graph_plan(&graph, _ctx->maxNumThreads());
std::vector<uint8_t> buf(cplan.work_size);
cplan.work_data = buf.data();

// compute
ggml_graph_compute(&graph, &cplan);
}

void GatherLayer::run()
{
switch (_input->data_type())
Expand All @@ -157,9 +118,6 @@ void GatherLayer::run()
case OperandType::INT32:
runByInputType<int32_t>();
break;
case OperandType::QUANT_GGML_Q4_0:
runByGGMLQuantInputType();
break;
case OperandType::BOOL8:
runByInputType<bool>();
break;
Expand Down
8 changes: 2 additions & 6 deletions runtime/onert/backend/cpu/ops/GatherLayer.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@
#ifndef __ONERT_BACKEND_CPU_OPS_GATHERLAYER_H__
#define __ONERT_BACKEND_CPU_OPS_GATHERLAYER_H__

#include "../ExternalContext.h"

#include <backend/IPortableTensor.h>

#include <exec/IFunction.h>
Expand All @@ -29,28 +27,26 @@ namespace onert::backend::cpu::ops
class GatherLayer : public ::onert::exec::IFunction
{
public:
GatherLayer() : _input{nullptr}, _indices{nullptr}, _output{nullptr}, _axis{-1}, _ctx{nullptr}
GatherLayer() : _input{nullptr}, _indices{nullptr}, _output{nullptr}, _axis{-1}
{
// DO NOTHING
}

public:
void configure(const IPortableTensor *input, const IPortableTensor *indices,
IPortableTensor *output, int32_t axis, ExternalContext *ctx);
IPortableTensor *output, int32_t axis);

void run() override;

private:
template <typename OpType> void runByInputType();
void runByGGMLQuantInputType();

private:
const IPortableTensor *_input;
const IPortableTensor *_indices;
IPortableTensor *_output;

int32_t _axis;
ExternalContext *_ctx;
};

} // namespace onert::backend::cpu::ops
Expand Down
1 change: 1 addition & 0 deletions runtime/onert/backend/ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
set(LIB_ONERT_BACKEND_GGML onert_backend_ggml)

file(GLOB SOURCES "*.cc")
list(APPEND SOURCES ops/GGMLHelper.cc)
macro(OP NAME)
list(APPEND SOURCES ops/${NAME}Layer.cc)
endmacro(OP)
Expand Down
2 changes: 2 additions & 0 deletions runtime/onert/backend/ggml/Operation.lst
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
OP(FullyConnected)
OP(Gather)
Loading