@@ -16,6 +16,8 @@ limitations under the License.
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
 
 #include <algorithm>
+#include <cmath>
+#include <cstdint>
 
 #include "ruy/profiler/instrumentation.h"  // from @ruy
 #include "tensorflow/lite/kernels/internal/common.h"
@@ -62,6 +64,59 @@ inline void FullyConnected(
   }
 }
 
+// This implementation receives the scales as floats and performs the
+// requantization in floating point to avoid loss of precision.
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const uint8_t* input_data, const RuntimeShape& filter_shape,
+    const uint8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    float input_scale, float output_scale, float filter_scale,
+    uint8_t* output_data) {
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  // TODO(b/62193649): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
+                                       output_shape, output_dim_count - 1);
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      int32_t acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        int32_t input_val = input_data[b * accum_depth + d];
+        int32_t filter_val = filter_data[out_c * accum_depth + d];
+        acc += (filter_val + filter_offset) * (input_val + input_offset);
+      }
+      if (bias_data) {
+        acc += bias_data[out_c];
+      }
+      const double effective_output_scale = static_cast<double>(input_scale) *
+                                            static_cast<double>(filter_scale) /
+                                            static_cast<double>(output_scale);
+      int32_t acc_scaled = static_cast<int32_t>(
+          std::round(static_cast<double>(acc) * effective_output_scale));
+      acc_scaled += output_offset;
+      acc_scaled = std::max(acc_scaled, output_activation_min);
+      acc_scaled = std::min(acc_scaled, output_activation_max);
+      output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc_scaled);
+    }
+  }
+}
+
 inline void FullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
     const uint8_t* input_data, const RuntimeShape& filter_shape,
@@ -164,6 +219,60 @@ inline void FullyConnected(
   }
 }
 
+// This implementation receives the scales as floats and performs the
+// requantization in floating point to avoid loss of precision.
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const uint8_t* input_data, const RuntimeShape& filter_shape,
+    const uint8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, const RuntimeShape& output_shape,
+    float input_scale, float output_scale, float filter_scale,
+    int16_t* output_data) {
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(output_offset, 0);
+  // TODO(b/62193649): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
+                                       output_shape, output_dim_count - 1);
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      // Internal accumulation. Unlike the uint8_t-output variant above, this
+      // path requires a bias: the accumulator starts from the bias value.
+      int32_t accum = bias_data[out_c];
+      // Accumulation loop.
+      for (int d = 0; d < accum_depth; ++d) {
+        int16_t input_val = input_data[b * accum_depth + d] + input_offset;
+        int16_t filter_val =
+            filter_data[out_c * accum_depth + d] + filter_offset;
+        accum += filter_val * input_val;
+      }
+      const double effective_output_scale = static_cast<double>(input_scale) *
+                                            static_cast<double>(filter_scale) /
+                                            static_cast<double>(output_scale);
+      int32_t acc_scaled = static_cast<int32_t>(
+          std::round(static_cast<double>(accum) * effective_output_scale));
+      // Saturate, cast to int16_t, and store to output array.
+      acc_scaled = std::max(acc_scaled, output_activation_min - output_offset);
+      acc_scaled = std::min(acc_scaled, output_activation_max - output_offset);
+      acc_scaled += output_offset;
+      output_data[out_c + output_depth * b] = static_cast<int16_t>(acc_scaled);
+    }
+  }
+}
+
 inline void ShuffledFullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
     const uint8_t* input_data, const RuntimeShape& weights_shape,
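Both new overloads share the same requantization step: the integer accumulator is rescaled by effective_output_scale = input_scale * filter_scale / output_scale in double precision, rounded, offset, and clamped. The standalone sketch below walks through that arithmetic with made-up quantization parameters (real values come from a model's quantization metadata), so the 253 it prints can be checked by hand: 1000 * 0.125 = 125, plus an output offset of 128, clamped to [0, 255].

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical quantization parameters, for illustration only.
  const float input_scale = 0.5f;
  const float filter_scale = 0.25f;
  const float output_scale = 1.0f;
  const int32_t output_offset = 128;
  const int32_t activation_min = 0;
  const int32_t activation_max = 255;

  // Example raw accumulator, as produced by the integer dot product + bias.
  const int32_t acc = 1000;

  // Requantize in double precision, mirroring the kernels above.
  const double effective_output_scale = static_cast<double>(input_scale) *
                                        static_cast<double>(filter_scale) /
                                        static_cast<double>(output_scale);
  int32_t acc_scaled = static_cast<int32_t>(
      std::round(static_cast<double>(acc) * effective_output_scale));
  acc_scaled += output_offset;
  acc_scaled = std::max(acc_scaled, activation_min);
  acc_scaled = std::min(acc_scaled, activation_max);
  std::printf("quantized output: %d\n", acc_scaled);  // prints 253
  return 0;
}

Doing the rescale in double sidesteps the precision loss the header comment refers to, at the cost of floating-point work per output element.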
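For completeness, a hedged usage sketch of the new uint8_t overload. It assumes the tflite::reference_ops namespace and the RuntimeShape initializer-list constructor from the surrounding TFLite headers; the shapes, data, and quantization parameters are illustrative, not taken from any real model. Note the unusual scale argument order in the signature: input_scale, then output_scale, then filter_scale.

#include <cstdint>

#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"

void ExampleUint8FullyConnected() {
  // Hypothetical shapes: one batch of depth 4, producing 2 outputs.
  const tflite::RuntimeShape input_shape({1, 4});
  const tflite::RuntimeShape filter_shape({2, 4});
  const tflite::RuntimeShape bias_shape({2});
  const tflite::RuntimeShape output_shape({1, 2});

  const uint8_t input_data[4] = {10, 20, 30, 40};
  const uint8_t filter_data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  const int32_t bias_data[2] = {100, -100};
  uint8_t output_data[2];

  tflite::FullyConnectedParams params;
  params.input_offset = -128;  // zero points, illustrative only
  params.weights_offset = -128;
  params.output_offset = 128;
  params.quantized_activation_min = 0;
  params.quantized_activation_max = 255;

  // Scales are passed as (input, output, filter), matching the signature.
  tflite::reference_ops::FullyConnected(
      params, input_shape, input_data, filter_shape, filter_data, bias_shape,
      bias_data, output_shape, /*input_scale=*/0.5f, /*output_scale=*/1.0f,
      /*filter_scale=*/0.25f, output_data);
}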