/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_MATMUL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_MATMUL_H_

#include <algorithm>
#include <cstdint>

#include "tflite/kernels/internal/common.h"
#include "tflite/kernels/internal/compatibility.h"
#include "tflite/kernels/internal/portable_tensor_utils.h"
#include "tflite/kernels/internal/runtime_shape.h"
#include "tflite/kernels/internal/types.h"

namespace tflite {
namespace reference_ops {
namespace batch_matmul {
// Determine which dimension is the broadcast dimension.
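// e.g. broadcast_dim(3, 3) == 3, broadcast_dim(1, 4) == 4,
// broadcast_dim(5, 1) == 5; any other mismatch trips the DCHECK below.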
inline int broadcast_dim(int lhs_dim, int rhs_dim) {
if (lhs_dim == rhs_dim) return lhs_dim;
if (lhs_dim == 1) return rhs_dim;
TFLITE_DCHECK_EQ(rhs_dim, 1);
return lhs_dim;
}
// Compute the "extent" for iterating on this dimension.
// If we are broadcasting, then don't advance (i.e. return 0).
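// Otherwise the extent is the stride of dimension x: the product of all
// dimensions to its right. For a shape of [2, 1, 3, 4, 5], extent(shape, 0)
// == 60, extent(shape, 1) == 0 (broadcast), and extent(shape, 2) == 20.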
inline int extent(const RuntimeShape& shape, int x) {
if (shape.Dims(x) == 1) {
return 0;
}
int prod = 1;
for (int i = x + 1; i < shape.DimensionsCount(); ++i) {
prod *= shape.Dims(i);
}
return prod;
}
} // namespace batch_matmul
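
// Generic reference batch matmul. Both shapes are extended to rank 5,
// [B0, B1, B2, rows, cols], and the three leading batch dimensions broadcast
// independently. Per the indexing below, the LHS holds [lhs_rows, accum_depth]
// matrices in row-major order, while the RHS buffer is expected with the
// accumulation dimension innermost, i.e. each logical [accum_depth, rhs_cols]
// matrix stored transposed as [rhs_cols, accum_depth]; the output is written
// with the lhs-row index contiguous.
//
// Minimal usage sketch for the float path (hypothetical values):
//   float lhs[6] = {...};     // one [2, 3] matrix, row-major
//   float rhs_t[12] = {...};  // logical [3, 4], stored transposed as [4, 3]
//   float out[8];
//   BatchMatMul<float, float, float>(RuntimeShape({1, 2, 3}), lhs,
//                                    RuntimeShape({1, 3, 4}), rhs_t,
//                                    RuntimeShape({1, 2, 4}), out);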
template <typename Ta, typename Tb, typename Tout>
inline void BatchMatMul(const RuntimeShape& lhs_shape, const Ta* lhs_data,
const RuntimeShape& rhs_shape, const Tb* rhs_data,
const RuntimeShape& output_shape, Tout* output_data) {
const RuntimeShape extended_lhs_shape =
RuntimeShape::ExtendedShape(5, lhs_shape);
const RuntimeShape extended_rhs_shape =
RuntimeShape::ExtendedShape(5, rhs_shape);
const int batch_dim0 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
const int batch_dim1 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
const int batch_dim2 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);
// Set params for each matrix multiply.
const int lhs_rows = extended_lhs_shape.Dims(3);
const int rhs_cols = extended_rhs_shape.Dims(4);
const int accum_depth = extended_lhs_shape.Dims(4);
for (int b0 = 0; b0 < batch_dim0; ++b0) {
const Ta* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
const Tb* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
for (int b1 = 0; b1 < batch_dim1; ++b1) {
const Ta* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
const Tb* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
for (int b2 = 0; b2 < batch_dim2; ++b2) {
const Ta* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
const Tb* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
Tout* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) +
b1 * batch_dim2 + b2) *
lhs_rows * rhs_cols;
for (int j = 0; j < rhs_cols; ++j) {
for (int i = 0; i < lhs_rows; ++i) {
Tout total = 0;
for (int k = 0; k < accum_depth; ++k) {
total += static_cast<Tout>(lhs_ptr2[accum_depth * i + k]) *
static_cast<Tout>(rhs_ptr2[j * accum_depth + k]);
}
int idx = lhs_rows * j + i;
out_ptr[idx] = total;
}
}
}
}
}
}
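
// Hybrid overload: int8 LHS (weights) times dynamically quantized int8 RHS
// (activations), accumulated in int32 and dequantized into float output.
// scaling_factors and input_offset hold one quantization parameter per RHS
// column, and row_sums caches the per-row LHS sums used to correct for the
// asymmetric input offset (see the inner loop). The result is accumulated
// into output_data, so output_data is expected to be zero-initialized by the
// caller.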
inline void BatchMatMul(const RuntimeShape& lhs_shape, const int8_t* lhs_data,
const RuntimeShape& rhs_shape, const int8_t* rhs_data,
const float* scaling_factors,
const int32_t* input_offset, int32_t* row_sums,
const RuntimeShape& output_shape, float* output_data,
bool* compute_row_sums,
const float* per_channel_scales) {
const RuntimeShape extended_lhs_shape =
RuntimeShape::ExtendedShape(5, lhs_shape);
const RuntimeShape extended_rhs_shape =
RuntimeShape::ExtendedShape(5, rhs_shape);
const int batch_dim0 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
const int batch_dim1 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
const int batch_dim2 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);
// Set params for each matrix multiply.
const int lhs_rows = extended_lhs_shape.Dims(3);
const int rhs_cols = extended_rhs_shape.Dims(4);
const int accum_depth = extended_lhs_shape.Dims(4);
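// Per-column quantization params and per-row sums follow the same broadcast
// pattern as their operands: the step is zero along any batch dimension in
// which the corresponding operand is broadcast.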
const int ioff_ext0 = rhs_ext0 == 0 ? 0 : rhs_cols;
const int ioff_ext1 = rhs_ext1 == 0 ? 0 : rhs_cols;
const int ioff_ext2 = rhs_ext2 == 0 ? 0 : rhs_cols;
const int woff_ext0 = lhs_ext0 == 0 ? 0 : lhs_rows;
const int woff_ext1 = lhs_ext1 == 0 ? 0 : lhs_rows;
const int woff_ext2 = lhs_ext2 == 0 ? 0 : lhs_rows;
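// (Re)compute the per-row sums of the LHS weights. When compute_row_sums is
// provided it acts as a dirty flag: the sums are computed once, the flag is
// cleared, and the cached row_sums are reused on subsequent calls while the
// weights stay constant.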
if (!compute_row_sums || *compute_row_sums) {
int num_weights_matrices = 1;
for (int i = 1; i < extended_lhs_shape.DimensionsCount() - 2; ++i) {
num_weights_matrices *= extended_lhs_shape.Dims(i);
}
tensor_utils::ReductionSumVector(
lhs_data, row_sums, num_weights_matrices * lhs_rows, accum_depth);
if (compute_row_sums) {
*compute_row_sums = false;
}
}
for (int b0 = 0; b0 < batch_dim0; ++b0) {
const int8_t* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
const int8_t* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
const int32_t* ioff_ptr0 = input_offset + (b0 * ioff_ext0);
const float* scale_ptr0 = scaling_factors + (b0 * ioff_ext0);
const int32_t* woff_ptr0 = row_sums + (b0 * woff_ext0);
for (int b1 = 0; b1 < batch_dim1; ++b1) {
const int8_t* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
const int8_t* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
const int32_t* ioff_ptr1 = ioff_ptr0 + (b1 * ioff_ext1);
const float* scale_ptr1 = scale_ptr0 + (b1 * ioff_ext1);
const int32_t* woff_ptr1 = woff_ptr0 + (b1 * woff_ext1);
for (int b2 = 0; b2 < batch_dim2; ++b2) {
const int8_t* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
const int8_t* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
const int32_t* ioff_ptr2 = ioff_ptr1 + (b2 * ioff_ext2);
const float* scale_ptr2 = scale_ptr1 + (b2 * ioff_ext2);
const int32_t* woff_ptr2 = woff_ptr1 + (b2 * woff_ext2);
float* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) +
b1 * batch_dim2 + b2) *
lhs_rows * rhs_cols;
for (int j = 0; j < rhs_cols; ++j) {
const float batch_scaling_factor = scale_ptr2[j];
const float batch_offset = static_cast<float>(ioff_ptr2[j]);
for (int i = 0; i < lhs_rows; ++i) {
int32_t total = 0;
for (int k = 0; k < accum_depth; ++k) {
total +=
lhs_ptr2[accum_depth * i + k] * rhs_ptr2[j * accum_depth + k];
}
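// Correct for the asymmetric zero point of this RHS column:
// sum_k(w * (q - off)) == sum_k(w * q) - off * sum_k(w), where sum_k(w)
// is the cached row sum.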
int32_t row_sum = woff_ptr2[i];
total -= row_sum * batch_offset;
int idx = lhs_rows * j + i;
float scale = batch_scaling_factor;
if (per_channel_scales) {
scale *= per_channel_scales[i];
}
out_ptr[idx] += scale * total;
}
}
}
}
}
}
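
// Fully quantized overload: integer LHS/RHS with zero-point offsets taken
// from params, accumulation in AccumT (wide enough to avoid overflow, e.g.
// int32_t for int8 inputs), then requantization to the output scale via
// MultiplyByQuantizedMultiplier and clamping to the fused activation range.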
template <typename lhsT, typename AccumT, typename rhsT = lhsT,
typename outputT = lhsT>
inline void BatchMatMul(const FullyConnectedParams& params,
const RuntimeShape& lhs_shape, const lhsT* lhs_data,
const RuntimeShape& rhs_shape, const rhsT* rhs_data,
const RuntimeShape& output_shape,
outputT* output_data) {
const RuntimeShape extended_lhs_shape =
RuntimeShape::ExtendedShape(5, lhs_shape);
const RuntimeShape extended_rhs_shape =
RuntimeShape::ExtendedShape(5, rhs_shape);
const int batch_dim0 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
const int batch_dim1 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
const int batch_dim2 = batch_matmul::broadcast_dim(
extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);
// Set params for each matrix multiply.
const int lhs_rows = extended_lhs_shape.Dims(3);
const int rhs_cols = extended_rhs_shape.Dims(4);
const int accum_depth = extended_lhs_shape.Dims(4);
const int32_t input_offset = params.input_offset;
const int32_t filter_offset = params.weights_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
for (int b0 = 0; b0 < batch_dim0; ++b0) {
const lhsT* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
const rhsT* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
for (int b1 = 0; b1 < batch_dim1; ++b1) {
const lhsT* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
const rhsT* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
for (int b2 = 0; b2 < batch_dim2; ++b2) {
const lhsT* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
const rhsT* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
outputT* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) +
b1 * batch_dim2 + b2) *
lhs_rows * rhs_cols;
for (int j = 0; j < rhs_cols; ++j) {
for (int i = 0; i < lhs_rows; ++i) {
AccumT total = 0;
for (int k = 0; k < accum_depth; ++k) {
AccumT lhs_val = lhs_ptr2[accum_depth * i + k];
AccumT rhs_val = rhs_ptr2[accum_depth * j + k];
total += (lhs_val + filter_offset) * (rhs_val + input_offset);
}
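// Requantize: scale the accumulator into the output domain, add the
// output zero-point offset, then clamp to the activation range.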
int32_t total_scaled = MultiplyByQuantizedMultiplier(
total, output_multiplier, output_shift);
total_scaled += output_offset;
total_scaled = std::max(total_scaled, output_activation_min);
total_scaled = std::min(total_scaled, output_activation_max);
const int idx = lhs_rows * j + i;
out_ptr[idx] = static_cast<outputT>(total_scaled);
}
}
}
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_MATMUL_H_