From 9cf40d1621a29fbdbcfed4278421c9f16bc69eff Mon Sep 17 00:00:00 2001
From: Anupam Pandey
Date: Mon, 7 Apr 2025 10:33:19 +0530
Subject: [PATCH] Enable HiFi SIMD for CONV operator

This change enables the HiFi (HiFi3/HiFi4/HiFi5) SIMD kernels for the CONV
operator during inference of an FP32xFP32 model on Xtensa cores that provide
a vector FPU (HIFI_VFPU).
---
 tensorflow/lite/micro/kernels/conv.h          |   9 ++
 tensorflow/lite/micro/kernels/xtensa/conv.cc  |  37 +-----
 .../kernels/xtensa/conv_float32_reference.cc  |  84 +++++++++++++
 .../lite/micro/kernels/xtensa/conv_hifi.cc    | 115 +++++++++++++++++-
 ...t8_int16.cc => conv_int8_int16_float32.cc} |  27 ++++
 .../lite/micro/kernels/xtensa/xtensa_conv.h   |  12 ++
 .../lite/micro/tools/make/ext_libs/xtensa.inc |   3 +-
 7 files changed, 254 insertions(+), 33 deletions(-)
 create mode 100755 tensorflow/lite/micro/kernels/xtensa/conv_float32_reference.cc
 rename tensorflow/lite/micro/kernels/xtensa/{conv_int8_int16.cc => conv_int8_int16_float32.cc} (76%)

diff --git a/tensorflow/lite/micro/kernels/conv.h b/tensorflow/lite/micro/kernels/conv.h
index 0090053e03c..a47a9d05e48 100644
--- a/tensorflow/lite/micro/kernels/conv.h
+++ b/tensorflow/lite/micro/kernels/conv.h
@@ -129,6 +129,15 @@ inline TFLMRegistration Register_CONV_2D_INT8() { return Register_CONV_2D(); }
 inline TFLMRegistration Register_CONV_2D_INT16() { return Register_CONV_2D(); }
 #endif  // defined(CMSIS_NN) || defined(XTENSA)
 
+#if defined(XTENSA)
+// Returns a TFLMRegistration struct for kernel variant that only supports
+// float32 activations and float32 weights and uses the latency optimized
+// implementations.
+TFLMRegistration Register_CONV_2D_FLOAT32();
+#else
+inline TFLMRegistration Register_CONV_2D_FLOAT32() { return Register_CONV_2D(); }
+#endif
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_LITE_MICRO_KERNELS_CONV_H_
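Usage note (not part of the diff): the new registration is consumed through
MicroMutableOpResolver exactly like the existing int8/int16 variants. A
minimal sketch, assuming the standard TFLM resolver API in which AddConv2D()
takes an optional registration argument:

    #include "tensorflow/lite/micro/kernels/conv.h"
    #include "tensorflow/lite/micro/micro_mutable_op_resolver.h"

    // Route CONV_2D through the float32-optimized kernel variant. On
    // non-XTENSA builds, Register_CONV_2D_FLOAT32() simply forwards to the
    // generic Register_CONV_2D(), so the call is portable across targets.
    tflite::MicroMutableOpResolver<1> op_resolver;
    op_resolver.AddConv2D(tflite::Register_CONV_2D_FLOAT32());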
diff --git a/tensorflow/lite/micro/kernels/xtensa/conv.cc b/tensorflow/lite/micro/kernels/xtensa/conv.cc
index 39618d41f66..23cf570308f 100644
--- a/tensorflow/lite/micro/kernels/xtensa/conv.cc
+++ b/tensorflow/lite/micro/kernels/xtensa/conv.cc
@@ -52,37 +52,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   switch (input->type) {
     case kTfLiteFloat32: {
-#ifdef USE_TFLM_COMPRESSION
-
-      MicroContext* micro_context = GetMicroContext(context);
-
-      const CompressionTensorData* weights_comp_td =
-          micro_context->GetTensorCompressionData(node, kConvWeightsTensor);
-      const CompressionTensorData* bias_comp_td =
-          micro_context->GetTensorCompressionData(node, kConvBiasTensor);
-
-#endif  // USE_TFLM_COMPRESSION
-      tflite::reference_ops::Conv(
-          ConvParamsFloat(params, op_data.reference_op_data),
-          tflite::micro::GetTensorShape(input),
-          tflite::micro::GetTensorData<float>(input),
-          tflite::micro::GetTensorShape(filter),
-#ifdef USE_TFLM_COMPRESSION
-          tflite::micro::GetTensorData<float>(
-              micro_context, filter, weights_comp_td,
-              op_data.reference_op_data.weights_scratch_index),
-          tflite::micro::GetTensorShape(bias),
-          tflite::micro::GetOptionalTensorData<float>(
-              micro_context, bias, bias_comp_td,
-              op_data.reference_op_data.bias_scratch_index),
-#else  // USE_TFLM_COMPRESSION
-          tflite::micro::GetTensorData<float>(filter),
-          tflite::micro::GetTensorShape(bias),
-          tflite::micro::GetOptionalTensorData<float>(bias),
-#endif  // USE_TFLM_COMPRESSION
-          tflite::micro::GetTensorShape(output),
-          tflite::micro::GetTensorData<float>(output),
-          tflite::micro::GetTensorShape(nullptr), nullptr);
+#if HIFI_VFPU && (defined(HIFI3) || defined(HIFI4) || defined(HIFI5))
+      ConvEvalHifiFloat32(context, node, params, op_data, input, filter,
+                          bias, output);
+#else
+      return ConvReferenceEvalFloat32(context, node);
+#endif
       break;
     }
     case kTfLiteInt8: {
diff --git a/tensorflow/lite/micro/kernels/xtensa/conv_float32_reference.cc b/tensorflow/lite/micro/kernels/xtensa/conv_float32_reference.cc
new file mode 100755
index 00000000000..61d82cb6a37
--- /dev/null
+++ b/tensorflow/lite/micro/kernels/xtensa/conv_float32_reference.cc
@@ -0,0 +1,84 @@
+/* Copyright 2025 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
+#include "tensorflow/lite/kernels/internal/reference/conv.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/padding.h"
+#include "tensorflow/lite/micro/kernels/conv.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/micro_log.h"
+#include "tensorflow/lite/micro/kernels/xtensa/xtensa_conv.h"
+
+namespace tflite {
+
+TfLiteStatus ConvReferenceEvalFloat32(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kConvInputTensor);
+
+  const auto& params =
+      *(reinterpret_cast<TfLiteConvParams*>(node->builtin_data));
+  const auto& op_data = *(reinterpret_cast<XtensaConvOpData*>(node->user_data));
+
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kConvOutputTensor);
+  const TfLiteEvalTensor* filter =
+      tflite::micro::GetEvalInput(context, node, kConvWeightsTensor);
+  const TfLiteEvalTensor* bias =
+      tflite::micro::GetEvalInput(context, node, kConvBiasTensor);
+
+#ifdef USE_TFLM_COMPRESSION
+
+  MicroContext* micro_context = GetMicroContext(context);
+
+  const CompressionTensorData* weights_comp_td =
+      micro_context->GetTensorCompressionData(node, kConvWeightsTensor);
+  const CompressionTensorData* bias_comp_td =
+      micro_context->GetTensorCompressionData(node, kConvBiasTensor);
+
+#endif  // USE_TFLM_COMPRESSION
+  tflite::reference_ops::Conv(
+      ConvParamsFloat(params, op_data.reference_op_data),
+      tflite::micro::GetTensorShape(input),
+      tflite::micro::GetTensorData<float>(input),
+      tflite::micro::GetTensorShape(filter),
+#ifdef USE_TFLM_COMPRESSION
+      tflite::micro::GetTensorData<float>(
+          micro_context, filter, weights_comp_td,
+          op_data.reference_op_data.weights_scratch_index),
+      tflite::micro::GetTensorShape(bias),
+      tflite::micro::GetOptionalTensorData<float>(
+          micro_context, bias, bias_comp_td,
+          op_data.reference_op_data.bias_scratch_index),
+#else  // USE_TFLM_COMPRESSION
+      tflite::micro::GetTensorData<float>(filter),
+      tflite::micro::GetTensorShape(bias),
+      tflite::micro::GetOptionalTensorData<float>(bias),
+#endif  // USE_TFLM_COMPRESSION
+      tflite::micro::GetTensorShape(output),
+      tflite::micro::GetTensorData<float>(output),
+      tflite::micro::GetTensorShape(nullptr), nullptr);
+  return kTfLiteOk;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/micro/kernels/xtensa/conv_hifi.cc b/tensorflow/lite/micro/kernels/xtensa/conv_hifi.cc
index b5d4b5ea859..3fb286c7e51 100644
--- a/tensorflow/lite/micro/kernels/xtensa/conv_hifi.cc
+++ b/tensorflow/lite/micro/kernels/xtensa/conv_hifi.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/portable_tensor_utils.h"
+#include "tensorflow/lite/kernels/internal/reference/conv.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/micro/kernels/conv.h"
@@ -59,7 +60,8 @@ TfLiteStatus ConvPrepareHifi(TfLiteContext* context, TfLiteNode* node) {
   inputs_and_bias_ok =
       inputs_and_bias_ok &&
       (input->type == kTfLiteInt8 ||
-       (input->type == kTfLiteInt16 && bias->type == kTfLiteInt64));
+       (input->type == kTfLiteInt16 && bias->type == kTfLiteInt64) ||
+       input->type == kTfLiteFloat32);
 #else
   inputs_and_bias_ok = inputs_and_bias_ok && (input->type == kTfLiteInt8);
 #endif  // defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
@@ -81,6 +83,7 @@ TfLiteStatus ConvPrepareHifi(TfLiteContext* context, TfLiteNode* node) {
   const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
   const int filter_height = filter_shape.Dims(1);
   const int filter_width = filter_shape.Dims(2);
+  const int filter_depth = filter_shape.Dims(3);
   const int output_height = output_shape.Dims(1);
   const int output_width = output_shape.Dims(2);
   const int output_channels = output_shape.Dims(3);
@@ -133,6 +136,13 @@ TfLiteStatus ConvPrepareHifi(TfLiteContext* context, TfLiteNode* node) {
       }
       TF_LITE_ENSURE(context, required_scratch > 0);
     }
+#if HIFI_VFPU && (defined(HIFI3) || defined(HIFI4) || defined(HIFI5))
+    if ((input->type == kTfLiteFloat32) && (input_depth == filter_depth)) {
+      required_scratch = xa_nn_conv2d_std_getsize(
+          input_height, input_depth, filter_height, filter_width, stride_height,
+          pad_height, output_height, output_channels, PREC_F32);
+    }
+#endif
   }
   TF_LITE_ENSURE_OK(
       context, context->RequestScratchBufferInArena(
@@ -400,5 +410,108 @@ TfLiteStatus ConvEvalHifiInt8(TfLiteContext* context, TfLiteNode* node,
   return kTfLiteOk;
 }
 
+#if HIFI_VFPU
+TfLiteStatus ConvEvalHifiFloat32(TfLiteContext* context, TfLiteNode* node,
+                                 const TfLiteConvParams& params,
+                                 const XtensaConvOpData& data,
+                                 const TfLiteEvalTensor* input,
+                                 const TfLiteEvalTensor* filter,
+                                 const TfLiteEvalTensor* bias,
+                                 TfLiteEvalTensor* output) {
+  const RuntimeShape& input_shape = tflite::micro::GetTensorShape(input);
+  const RuntimeShape& filter_shape = tflite::micro::GetTensorShape(filter);
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int pad_width = data.reference_op_data.padding.width;
+  const int pad_height = data.reference_op_data.padding.height;
+
+  const RuntimeShape& output_shape = tflite::micro::GetTensorShape(output);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int filter_depth = filter_shape.Dims(3);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+
+  const float32_t* input_data = tflite::micro::GetTensorData<float32_t>(input);
+  const float32_t* filter_data =
+      tflite::micro::GetTensorData<float32_t>(filter);
+  const float32_t* bias_data = tflite::micro::GetTensorData<float32_t>(bias);
+  float32_t* output_data = tflite::micro::GetTensorData<float32_t>(output);
+
+  ConvParams op_params;
+  CalculateActivationRange(params.activation, &op_params.float_activation_min,
+                           &op_params.float_activation_max);
+
+  const int output_data_format = 0;
+  const int out_length = output_height * output_width * output_depth;
+  if (filter_height == 1 && filter_width == 1) {
+    for (int batch = 0; batch < batches; ++batch) {
+      float32_t* p_out_temp;
+      p_out_temp = &output_data[batch * out_length];
+
+      TF_LITE_ENSURE_EQ(
+          context,
+          xa_nn_conv2d_pointwise_f32(
+              p_out_temp, const_cast<float32_t*>(filter_data),
+              const_cast<float32_t*>(&input_data[batch * input_height *
+                                                 input_width * input_depth]),
+              const_cast<float32_t*>(bias_data), input_height, input_width,
+              input_depth, output_depth, output_data_format),
+          0);
+    }
+    xa_nn_vec_activation_min_max_f32_f32(
+        output_data, output_data, op_params.float_activation_min,
+        op_params.float_activation_max,
+        (batches * output_height * output_width * output_depth));
+  } else if ((filter_depth == input_depth) &&
+             ((params.dilation_width_factor == 1) &&
+              (params.dilation_height_factor == 1))) {
+    void* p_scratch = static_cast<void*>(
+        context->GetScratchBuffer(context, data.scratch_tensor_index));
+
+    for (int batch = 0; batch < batches; ++batch) {
+      float32_t* p_out_temp;
+      p_out_temp = &output_data[batch * out_length];
+      TF_LITE_ENSURE_EQ(
+          context,
+          xa_nn_conv2d_std_f32(
+              p_out_temp,
+              &input_data[batch * input_height * input_width * input_depth],
+              const_cast<float32_t*>(filter_data), bias_data, input_height,
+              input_width, input_depth, filter_height, filter_width,
+              output_depth, stride_width, stride_height, pad_width, pad_height,
+              output_height, output_width, output_data_format,
+              static_cast<void*>(p_scratch)),
+          0);
+    }
+    xa_nn_vec_activation_min_max_f32_f32(
+        output_data, output_data, op_params.float_activation_min,
+        op_params.float_activation_max,
+        (batches * output_height * output_width * output_depth));
+  } else {
+    TFLITE_DCHECK(node->user_data != nullptr);
+    const auto& op_data =
+        *(reinterpret_cast<XtensaConvOpData*>(node->user_data));
+    tflite::reference_ops::Conv(
+        ConvParamsFloat(params, op_data.reference_op_data),
+        tflite::micro::GetTensorShape(input),
+        tflite::micro::GetTensorData<float>(input),
+        tflite::micro::GetTensorShape(filter),
+        tflite::micro::GetTensorData<float>(filter),
+        tflite::micro::GetTensorShape(bias),
+        tflite::micro::GetOptionalTensorData<float>(bias),
+        tflite::micro::GetTensorShape(output),
+        tflite::micro::GetTensorData<float>(output),
+        tflite::micro::GetTensorShape(nullptr), nullptr);
+  }
+
+  return kTfLiteOk;
+}
+#endif  // HIFI_VFPU
+
 }  // namespace tflite
 
 #endif  // defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
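Background note on the dispatch above (not part of the diff): a 1x1
convolution with unit stride is an independent input_depth -> output_depth
matrix product at every pixel, which is why ConvEvalHifiFloat32 routes it to
xa_nn_conv2d_pointwise_f32 and skips the im2col scratch buffer that the
general xa_nn_conv2d_std_f32 path requests in ConvPrepareHifi. A
self-contained sketch of that equivalence, with hypothetical names:

    #include <cstddef>
    #include <vector>

    // 1x1 conv over a single NHWC batch: every pixel's C_in vector is
    // multiplied by a C_out x C_in filter matrix; there is no spatial window,
    // so no patch extraction (im2col) is needed.
    std::vector<float> Conv1x1(const std::vector<float>& input,   // H*W*C_in
                               const std::vector<float>& filter,  // C_out*C_in
                               const std::vector<float>& bias,    // C_out
                               int pixels, int c_in, int c_out) {
      std::vector<float> out(static_cast<std::size_t>(pixels) * c_out);
      for (int p = 0; p < pixels; ++p) {
        for (int oc = 0; oc < c_out; ++oc) {
          float acc = bias[oc];
          for (int ic = 0; ic < c_in; ++ic) {
            acc += input[p * c_in + ic] * filter[oc * c_in + ic];
          }
          out[p * c_out + oc] = acc;
        }
      }
      return out;
    }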
diff --git a/tensorflow/lite/micro/kernels/xtensa/conv_int8_int16.cc b/tensorflow/lite/micro/kernels/xtensa/conv_int8_int16_float32.cc
similarity index 76%
rename from tensorflow/lite/micro/kernels/xtensa/conv_int8_int16.cc
rename to tensorflow/lite/micro/kernels/xtensa/conv_int8_int16_float32.cc
index ed64f01bc2b..5a9b0ba9e7a 100644
--- a/tensorflow/lite/micro/kernels/xtensa/conv_int8_int16.cc
+++ b/tensorflow/lite/micro/kernels/xtensa/conv_int8_int16_float32.cc
@@ -75,6 +75,28 @@ TfLiteStatus EvalInt16(TfLiteContext* context, TfLiteNode* node) {
 #endif
 }
 
+TfLiteStatus EvalFloat32(TfLiteContext* context, TfLiteNode* node) {
+#if HIFI_VFPU && (defined(HIFI3) || defined(HIFI4) || defined(HIFI5))
+  const auto& op_data = *(reinterpret_cast<XtensaConvOpData*>(node->user_data));
+  const auto& params =
+      *(reinterpret_cast<TfLiteConvParams*>(node->builtin_data));
+
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kConvInputTensor);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kConvOutputTensor);
+  const TfLiteEvalTensor* filter =
+      tflite::micro::GetEvalInput(context, node, kConvWeightsTensor);
+  const TfLiteEvalTensor* bias =
+      tflite::micro::GetEvalInput(context, node, kConvBiasTensor);
+
+  return ConvEvalHifiFloat32(context, node, params, op_data, input, filter,
+                             bias, output);
+#else
+  return ConvReferenceEvalFloat32(context, node);
+#endif
+}
+
 }  // namespace
 
 TFLMRegistration Register_CONV_2D_INT8() {
@@ -86,4 +108,9 @@ TFLMRegistration Register_CONV_2D_INT16() {
                                    EvalInt16);
 }
 
+TFLMRegistration Register_CONV_2D_FLOAT32() {
+  return tflite::micro::RegisterOp(ConvInitXtensa, ConvPrepareXtensa,
+                                   EvalFloat32);
+}
+
 }  // namespace tflite
diff --git a/tensorflow/lite/micro/kernels/xtensa/xtensa_conv.h b/tensorflow/lite/micro/kernels/xtensa/xtensa_conv.h
index f804a6d430c..4f27658d584 100644
--- a/tensorflow/lite/micro/kernels/xtensa/xtensa_conv.h
+++ b/tensorflow/lite/micro/kernels/xtensa/xtensa_conv.h
@@ -59,6 +59,16 @@ TfLiteStatus ConvEvalHifiInt16(TfLiteContext* context, TfLiteNode* node,
                                const TfLiteEvalTensor* bias,
                                TfLiteEvalTensor* output);
 
+#if HIFI_VFPU
+TfLiteStatus ConvEvalHifiFloat32(TfLiteContext* context, TfLiteNode* node,
+                                 const TfLiteConvParams& params,
+                                 const XtensaConvOpData& data,
+                                 const TfLiteEvalTensor* input,
+                                 const TfLiteEvalTensor* filter,
+                                 const TfLiteEvalTensor* bias,
+                                 TfLiteEvalTensor* output);
+#endif
+
 #endif  // defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
 
 #if defined(VISION_P6)
@@ -79,6 +89,8 @@ TfLiteStatus ConvReferenceEvalInt8(TfLiteContext* context, TfLiteNode* node);
 
 TfLiteStatus ConvReferenceEvalInt16(TfLiteContext* context, TfLiteNode* node);
 
+TfLiteStatus ConvReferenceEvalFloat32(TfLiteContext* context, TfLiteNode* node);
+
 void* ConvInitXtensa(TfLiteContext* context, const char* buffer,
                      size_t length);
 TfLiteStatus ConvPrepareXtensa(TfLiteContext* context, TfLiteNode* node);
diff --git a/tensorflow/lite/micro/tools/make/ext_libs/xtensa.inc b/tensorflow/lite/micro/tools/make/ext_libs/xtensa.inc
index 70e1880c800..532c39da985 100644
--- a/tensorflow/lite/micro/tools/make/ext_libs/xtensa.inc
+++ b/tensorflow/lite/micro/tools/make/ext_libs/xtensa.inc
@@ -4,9 +4,10 @@
 MICROLITE_CC_KERNEL_SRCS += \
   $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/add_vision.cc \
   $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_common_xtensa.cc \
+  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_float32_reference.cc \
  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_hifi.cc \
   $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_int16_reference.cc \
-  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_int8_int16.cc \
+  $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_int8_int16_float32.cc \
   $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_int8_reference.cc \
   $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/conv_vision.cc \
   $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/xtensa/depthwise_conv_hifi.cc \
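Guard note (not part of the diff): ConvEvalHifiFloat32 is declared only when
HIFI_VFPU is non-zero, so every call site pairs the SIMD path with the
portable fallback, as the conv.cc and EvalFloat32 hunks above do. Condensed,
the call-site pattern this patch uses is:

    // Prefer the HiFi SIMD kernel when the core has a vector FPU; otherwise
    // fall back to the reference float32 implementation.
    #if HIFI_VFPU && (defined(HIFI3) || defined(HIFI4) || defined(HIFI5))
      ConvEvalHifiFloat32(context, node, params, op_data, input, filter, bias,
                          output);
    #else
      return ConvReferenceEvalFloat32(context, node);
    #endif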