[GNA] support for GNA_SW_EXACT mode for FQ networks #1

Draft: wants to merge 3 commits into base: feature/esmirno/gna-fq-nolpt

4 changes: 2 additions & 2 deletions inference-engine/src/gna_plugin/backend/dnn_types.h
@@ -45,11 +45,11 @@ struct DnnActivation {
} pow;
struct {
int32_t levels;
// if input is perchannel quantisation - input pointers contains per-channer ranges
// if input is per-channel quantization - input pointers contains per-channel ranges
int8_t inputPerChannel;
float *input_low;
float *input_high;
// if output is perchannel quantisation - output pointers contains per-channer ranges
// if output is per-channel quantization - output pointers contains per-channel ranges
int8_t outputPerChannel;
float *output_low;
float *output_high;
1 change: 1 addition & 0 deletions inference-engine/src/gna_plugin/descriptions/gna_flags.hpp
@@ -15,6 +15,7 @@ struct GNAFlags {
bool uniformPwlDesign = false;
bool gna_openmp_multithreading = false;
bool sw_fp32 = false;
bool fake_quantized = false;
bool performance_counting = false;
};
} // namespace GNAPluginNS
58 changes: 52 additions & 6 deletions inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp
@@ -83,6 +83,10 @@ struct QuantI8 : public QuantDescTmpl<P_TYPE(I16), P_TYPE(I32), P_TYPE(I8), gna
}
};

// to support proper trait instantiation for the quantization function callback
struct FakeQuantI16 : public QuantI16 {};
struct FakeQuantI8 : public QuantI8 {};

template <class A, class B>
struct QuantPair {
using MandatoryType = A;
@@ -115,17 +119,16 @@ inline bool shouldAlwaysAllocate<gna_compound_bias_t>() {
*/
template <class T>
class Quant {
public:
template<class ...Args>
void operator()(Args && ... args) const { }
};

template<>
class Quant<QuantI16> {
public:
template<class ...Args>
void operator()(Args && ... args) const {
QuantizeAffine16(std::forward<Args>(args)...);
QuantizationCallback<int16_t, int32_t> {
std::forward<Args>(args)...
}.runQuantize();
}
};

@@ -134,10 +137,35 @@ class Quant<QuantI8> {
public:
template<class ...Args>
void operator()(Args && ... args) const {
QuantizeAffine8(std::forward<Args>(args)...);
QuantizationCallback<int8_t, gna_compound_bias_t> {
std::forward<Args>(args)...
}.runQuantize();
}
};

template<>
class Quant<FakeQuantI16> {
public:
template<class ...Args>
void operator()(Args && ... args) const {
QuantizationCallback<int16_t, int32_t> {
std::forward<Args>(args)...
}.runFakeQuantize();
}
};

template<>
class Quant<FakeQuantI8> {
public:
template<class ...Args>
void operator()(Args && ... args) const {
QuantizationCallback<int8_t, gna_compound_bias_t>{
std::forward<Args>(args)...
}.runFakeQuantize();
}
};


template <typename T>
inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob::Ptr fp32_blob, InferenceEngine::Precision precision, float scale_factor) {
auto prec_blob = InferenceEngine::make_shared_blob<T>({ precision,
@@ -273,6 +301,14 @@ inline void quantizeWeightsBiases(const QuantDesc & quantDesc,

auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
{
float *ptr_per_channel_weights_quants_min = nullptr;
float *ptr_per_channel_weights_quants_max = nullptr;

if (!quantData->_weights_quants_min.empty()) {
ptr_per_channel_weights_quants_min = &quantData->_weights_quants_min.front();
ptr_per_channel_weights_quants_max = &quantData->_weights_quants_max.front();
}

fnc(wl->_weights->buffer().as<float *>(),
wl->_biases ? wl->_biases->buffer().as<float *>() : nullptr,
intWeights->buffer(),
@@ -283,7 +319,12 @@
num_rows,
num_columns,
num_rows_padded,
num_columns_padded);
num_columns_padded,
quantData->levels,
nullptr,
nullptr,
ptr_per_channel_weights_quants_min,
ptr_per_channel_weights_quants_max);
}
wl->_weights = intWeights;
wl->_biases = intBiases;
@@ -563,4 +604,9 @@ class LayersQuantizer : public frontend::DataQuantizerBase {
using QuantI16 = frontend::QuantPair<frontend::QuantI16, frontend::QuantI16>;
using QuantI8 = frontend::QuantPair<frontend::QuantI8, frontend::QuantI16>;


using FakeQuantI16 = frontend::QuantPair<frontend::FakeQuantI16, frontend::FakeQuantI16>;
using FakeQuantI8 = frontend::QuantPair<frontend::FakeQuantI8, frontend::FakeQuantI16>;


} // namespace GNAPluginNS
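
For reference, the Quant<T> specializations above are a tag-dispatch layer: the FakeQuantI16/FakeQuantI8 tags reuse the I16/I8 precision traits but route the same argument pack into runFakeQuantize() instead of runQuantize(). A minimal, self-contained illustration of the pattern (Callback below is a simplified stand-in for QuantizationCallback, not the real struct):

    #include <cstdio>
    #include <utility>

    // stand-in for QuantizationCallback<...>: an aggregate holding the arguments,
    // with one entry point per quantization flavour
    struct Callback {
        float scale;
        void runQuantize() const     { std::printf("plain quantize, scale=%f\n", scale); }
        void runFakeQuantize() const { std::printf("fake quantize,  scale=%f\n", scale); }
    };

    // tag types: FakeQuantI8 inherits QuantI8 so it reuses the same precision
    // traits but selects a different Quant<> specialization below
    struct QuantI8 {};
    struct FakeQuantI8 : QuantI8 {};

    template <class T>
    class Quant {
    public:
        template <class... Args>
        void operator()(Args&&... args) const {}   // default: no-op
    };

    template <>
    class Quant<QuantI8> {
    public:
        template <class... Args>
        void operator()(Args&&... args) const {
            Callback{std::forward<Args>(args)...}.runQuantize();
        }
    };

    template <>
    class Quant<FakeQuantI8> {
    public:
        template <class... Args>
        void operator()(Args&&... args) const {
            Callback{std::forward<Args>(args)...}.runFakeQuantize();
        }
    };

    int main() {
        Quant<QuantI8>{}(2.0f);       // -> runQuantize
        Quant<FakeQuantI8>{}(2.0f);   // -> runFakeQuantize
        return 0;
    }
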
88 changes: 72 additions & 16 deletions inference-engine/src/gna_plugin/frontend/quantization.cpp
@@ -5,20 +5,25 @@
#include <cstring>
#include <iostream>
#include <details/ie_exception.hpp>
#include <gna_plugin_log.hpp>
#include <limits>
#include "backend/gna_types.h"
#include "quantization.h"

void QuantizeAffine16(float *ptr_float_weights,
float *ptr_float_biases,
int16_t *ptr_int_weights,
int32_t *ptr_int_biases,
float input_scale_factor,
float *ptr_weight_scale_factor,
float *ptr_output_scale_factor,
uint32_t num_rows,
uint32_t num_columns,
uint32_t num_rows_padded,
uint32_t num_columns_padded) {
#ifdef DEBUG
#define QUANTWARNING(...) (fprintf(stderr, __VA_ARGS__))
#else
#define QUANTWARNING(...)
#endif


template<>
void QuantizationCallback<int16_t, int32_t>::runFakeQuantize() const {
THROW_GNA_EXCEPTION << "int16 fake quantized models not yet supported";
}

template<>
void QuantizationCallback<int16_t, int32_t>::runQuantize() const {
uint32_t num_saturate = 0;

if (*ptr_weight_scale_factor == 1.0) {
@@ -149,11 +154,62 @@ void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t
}
}

void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases,
int8_t *ptr_int_weights, gna_compound_bias_t *ptr_int_biases,
float input_scale_factor, float *ptr_weight_scale_factor,
float *ptr_output_scale_factor, uint32_t num_rows, uint32_t num_columns,
uint32_t num_rows_padded, uint32_t num_columns_padded) {
template<>
void QuantizationCallback<int8_t, gna_compound_bias_t>::runFakeQuantize() const {
// TODO: possibly remove this zero point
const float zeroPoint = MAX_VAL_1B_WEIGHT;
uint32_t num_saturate = 0;

if (fq_ptr_output_high == nullptr || fq_ptr_output_low == nullptr) {
THROW_GNA_EXCEPTION << "Fake quantized output range not set";
}
if (fq_levels == 0 || fq_levels == 1) {
THROW_GNA_EXCEPTION << "Fake quantized levels not set";
}

for (uint32_t i = 0; i < num_rows; i++) {
for (uint32_t j = 0; j < num_columns; j++) {
auto offset = i * num_columns + j;
auto normalizedWeight = ptr_float_weights[offset] - zeroPoint;
// range checking
if (normalizedWeight > MAX_VAL_1B_WEIGHT || normalizedWeight < -MAX_VAL_1B_WEIGHT) {
THROW_GNA_EXCEPTION << "unsupported weights range for I8 quantisation: " << ptr_float_weights[offset];
}
ptr_int_weights[offset] = static_cast<int8_t>(normalizedWeight);
}
if (fq_ptr_output_high == nullptr || fq_ptr_output_low == nullptr) {
THROW_GNA_EXCEPTION << "Fake quantized output range not set";
}
if (fq_levels == 0 || fq_levels == 1) {
THROW_GNA_EXCEPTION << "Fake quantized levels not set";
}
auto channel_scale = (fq_levels - 1) / (fq_ptr_output_high[i] - fq_ptr_output_low[i]);
auto channel_scale_multiplier = *ptr_weight_scale_factor / channel_scale;

ptr_int_biases[i].multiplier = static_cast<uint8_t> (channel_scale_multiplier);
}

if (ptr_float_biases != nullptr) {
for (uint32_t j = 0; j < num_rows; j++) {
float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
if (value > 2147483647.0) {
ptr_int_biases[j].bias = 2147483647L;
num_saturate++;
} else if (value < -2147483648.0) {
ptr_int_biases[j].bias = -2147483648LL;
num_saturate++;
} else {
ptr_int_biases[j].bias = (int32_t) value;
}
}
}
if (num_saturate > 0) {
QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine8()\n", num_saturate, num_rows * num_columns + num_rows);
}
}
template<>
void QuantizationCallback<int8_t, gna_compound_bias_t>::runQuantize() const {
if (ptr_int_biases == nullptr) {
THROW_IE_EXCEPTION << "Int biases are empty";
}
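
For reference, the per-output-row multiplier written into gna_compound_bias_t above comes from that row's FakeQuantize range: channel_scale = (fq_levels - 1) / (output_high[i] - output_low[i]) and multiplier = weight_scale / channel_scale, truncated to uint8_t. A small numeric illustration with made-up values:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // illustrative values for a single output row, not taken from the PR
        const int32_t fq_levels    = 255;      // FakeQuantize levels
        const float   output_low   = -1.0f;    // fq_ptr_output_low[i]
        const float   output_high  =  1.0f;    // fq_ptr_output_high[i]
        const float   weight_scale =  254.0f;  // *ptr_weight_scale_factor

        // scale implied by this row's FakeQuantize output range
        const float channel_scale = (fq_levels - 1) / (output_high - output_low);     // 127.0

        // per-row multiplier stored in gna_compound_bias_t::multiplier
        const auto multiplier = static_cast<uint8_t>(weight_scale / channel_scale);   // 2

        std::printf("channel_scale=%.1f multiplier=%u\n",
                    channel_scale, static_cast<unsigned>(multiplier));
        return 0;
    }
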
49 changes: 29 additions & 20 deletions inference-engine/src/gna_plugin/frontend/quantization.h
@@ -16,25 +16,34 @@
#define MAX_VAL_2B_WEIGHT 16384
#define MAX_VAL_2B_FEAT 16384
#define MAX_VAL_4B_BIAS 1073741824
#ifdef DEBUG
#define QUANTWARNING(...) (fprintf(stderr, __VA_ARGS__))
#else
#define QUANTWARNING(...)
#endif

void QuantizeAffine16(float *ptr_float_weights,
float *ptr_float_biases,
int16_t *ptr_int_weights,
int32_t *ptr_int_biases,
float input_scale_factor,
float *ptr_weight_scale_factor,
float *ptr_output_scale_factor,
uint32_t num_rows,
uint32_t num_columns,
uint32_t num_rows_padded,
uint32_t num_columns_padded);

template <class WeightsType, class BiasType>
struct QuantizationCallback {
float *ptr_float_weights;
float *ptr_float_biases;
WeightsType* ptr_int_weights;
BiasType* ptr_int_biases;
float input_scale_factor;
float *ptr_weight_scale_factor;
float *ptr_output_scale_factor;
uint32_t num_rows;
uint32_t num_columns;
uint32_t num_rows_padded;
uint32_t num_columns_padded;

// TODO: copied from fake quantize activation
int32_t fq_levels;
float *fq_ptr_input_low;
float *fq_ptr_input_high;
float *fq_ptr_output_low;
float *fq_ptr_output_high;

void runQuantize() const;
void runFakeQuantize() const;
};

template class QuantizationCallback<int16_t, int32_t>;
template class QuantizationCallback<int8_t, gna_compound_bias_t>;

float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements);
void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t num_elements, float scale_factor);
void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases, int8_t *ptr_int_weights, gna_compound_bias_t *ptr_int_biases,
float input_scale_factor, float *ptr_weight_scale_factor, float *ptr_output_scale_factor,
uint32_t num_rows, uint32_t num_columns, uint32_t num_rows_padded, uint32_t num_columns_padded);
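
Since QuantizationCallback is a plain aggregate, call sites (see layer_quantizer.hpp above) rely on positional brace-initialization: the argument order must match the member order exactly, and the four fq_* pointers may be nullptr on the non-FakeQuantize path. A hedged usage sketch, assuming it is compiled inside the plugin so quantization.h and its dependencies resolve, and that the padded sizes equal the real ones:

    #include <cstdint>
    #include "quantization.h"   // declares QuantizationCallback<> (include path assumed)

    void quantize_2x2_example() {
        float weights[4] = {0.5f, -0.25f, 0.75f, -1.0f};
        float biases[2]  = {0.1f, -0.1f};
        int16_t int_weights[4] = {};
        int32_t int_biases[2]  = {};
        float weight_scale = 1.0f;   // per the check in runQuantize(), 1.0 means the scale is recomputed internally (assumed)
        float output_scale = 1.0f;

        QuantizationCallback<int16_t, int32_t> {
            weights,            // ptr_float_weights
            biases,             // ptr_float_biases
            int_weights,        // ptr_int_weights
            int_biases,         // ptr_int_biases
            1024.0f,            // input_scale_factor
            &weight_scale,      // ptr_weight_scale_factor
            &output_scale,      // ptr_output_scale_factor
            2, 2,               // num_rows, num_columns
            2, 2,               // num_rows_padded, num_columns_padded
            0,                  // fq_levels (unused by runQuantize)
            nullptr, nullptr,   // fq_ptr_input_low / fq_ptr_input_high
            nullptr, nullptr    // fq_ptr_output_low / fq_ptr_output_high
        }.runQuantize();
    }
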
@@ -15,6 +15,13 @@ struct Quantization {
struct QuantizedLayerParams {
Quantization _src_quant;
Quantization _dst_quant;

// per channel weights quant data
int32_t levels;
std::vector<float> _weights_quants_min;
std::vector<float> _weights_quants_max;

// deprecate this
Quantization _weights_quant;
Quantization _bias_quant;
float _o_shift = 0.0f;
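
The new _weights_quants_min/_weights_quants_max vectors carry per-output-channel FakeQuantize ranges; quantizeWeightsBiases (layer_quantizer.hpp above) forwards them as raw pointers, or nullptr when the vectors are empty. A simplified illustration of that producer/consumer handshake (the struct below is a stand-in, not the real QuantizedLayerParams):

    #include <cstdio>
    #include <vector>

    // stand-in for the per-channel part of QuantizedLayerParams
    struct PerChannelQuant {
        int levels = 0;
        std::vector<float> weights_quants_min;
        std::vector<float> weights_quants_max;
    };

    // mirrors the pointer-or-nullptr pattern used in quantizeWeightsBiases()
    void consume(const PerChannelQuant& q) {
        const float* mins = q.weights_quants_min.empty() ? nullptr : q.weights_quants_min.data();
        const float* maxs = q.weights_quants_max.empty() ? nullptr : q.weights_quants_max.data();
        if (mins && maxs) {
            std::printf("per-channel: %zu channels, %d levels\n",
                        q.weights_quants_min.size(), q.levels);
        } else {
            std::printf("no per-channel data, fall back to per-tensor quantization\n");
        }
    }

    int main() {
        PerChannelQuant q;
        consume(q);                              // empty -> per-tensor path
        q.levels = 255;
        q.weights_quants_min = {-1.0f, -0.5f};   // one entry per output channel
        q.weights_quants_max = { 1.0f,  0.5f};
        consume(q);                              // -> per-channel path
        return 0;
    }
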
48 changes: 24 additions & 24 deletions inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp
@@ -605,7 +605,8 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
MAX_VAL_4B_BIAS,
wl->_biases->size());
if (quant->_bias_quant.scale != -1.0f) {
quant->_bias_quant.scale = std::min(quant->_weights_quant.scale * quant->_src_quant.scale, quant->_bias_quant.scale);
quant->_bias_quant.scale =
std::min(quant->_weights_quant.scale * quant->_src_quant.scale, quant->_bias_quant.scale);
quant->_weights_quant.scale = quant->_bias_quant.scale / quant->_src_quant.scale;
}
}
@@ -616,44 +617,43 @@
}

double weights_reducer = 1.0;
auto conv = dynamic_cast<InferenceEngine::ConvolutionLayer*>(wl);
auto conv = dynamic_cast<InferenceEngine::ConvolutionLayer *>(wl);
if (conv) {
auto dims = conv->insData.front().lock()->getDims();

weights_reducer = MAX_VAL_2B_FEAT * scaleRange * dims[1] / std::numeric_limits<int32_t>::max();
weights_reducer = std::max(1.0, weights_reducer);
}
quant->_weights_quant.scale /= weights_reducer;
}


double tmp_dst_quant_scale = quant->_weights_quant.scale * quantDataForInputLayer->_dst_quant.scale;
double tmp_dst_quant_scale = quant->_weights_quant.scale * quant->_src_quant.scale;

if (weightsSize == 1 &&
static_cast<uint64_t>(tmp_dst_quant_scale * quant->_src_quant.scale) >
static_cast<uint64_t>(std::numeric_limits<int32_t>::max()-1) * _scale_change_req_threshold) {
gnawarn() << "Output scale for " << wl->name
<< " too large and are being reduced. Else saturations likely will happen \n";
// reduce weight scale according experimental heuristic
if (quant->_dst_quant.scale * quant->_src_quant.scale /
if (weightsSize == 1 &&
static_cast<uint64_t>(tmp_dst_quant_scale * quant->_src_quant.scale) >
static_cast<uint64_t>(std::numeric_limits<int32_t>::max() - 1) * _scale_change_req_threshold) {
gnawarn() << "Output scale for " << wl->name
<< " too large and are being reduced. Else saturations likely will happen \n";
// reduce weight scale according experimental heuristic
if (quant->_dst_quant.scale * quant->_src_quant.scale /
static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_100) {
quant->_weights_quant.scale *= _scale_reduction_50;
tmp_dst_quant_scale *= _scale_reduction_50;
} else if (quant->_dst_quant.scale * quant->_src_quant.scale /
quant->_weights_quant.scale *= _scale_reduction_50;
tmp_dst_quant_scale *= _scale_reduction_50;
} else if (quant->_dst_quant.scale * quant->_src_quant.scale /
static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_150) {
quant->_weights_quant.scale *= _scale_reduction_45;
tmp_dst_quant_scale *= _scale_reduction_45;
} else if (quant->_dst_quant.scale * quant->_src_quant.scale /
quant->_weights_quant.scale *= _scale_reduction_45;
tmp_dst_quant_scale *= _scale_reduction_45;
} else if (quant->_dst_quant.scale * quant->_src_quant.scale /
static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_200) {
quant->_weights_quant.scale *= _scale_reduction_40;
tmp_dst_quant_scale *= _scale_reduction_40;
} else {
quant->_weights_quant.scale *= _scale_reduction_35;
tmp_dst_quant_scale *= _scale_reduction_35;
quant->_weights_quant.scale *= _scale_reduction_40;
tmp_dst_quant_scale *= _scale_reduction_40;
} else {
quant->_weights_quant.scale *= _scale_reduction_35;
tmp_dst_quant_scale *= _scale_reduction_35;
}
}
}

quant->_dst_quant.scale = tmp_dst_quant_scale;
quant->_dst_quant.scale = quant->_weights_quant.scale * quant->_src_quant.scale;

return true;
}
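
For convolutions, the block above caps the weight scale with weights_reducer = MAX_VAL_2B_FEAT * scaleRange * dims[1] / INT32_MAX, clamped to at least 1.0, so the int32 accumulator cannot overflow across the input channels. A numeric illustration with assumed values (scaleRange is taken here as the 16-bit weight range; the actual constant depends on weightsSize):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <limits>

    int main() {
        // assumed illustrative values, not taken from the PR
        const double MAX_VAL_2B_FEAT = 16384.0;  // from quantization.h above
        const double scaleRange      = 16384.0;  // assumed: 16-bit weight range
        const double input_channels  = 128.0;    // dims[1] of the convolution input

        double weights_reducer =
            MAX_VAL_2B_FEAT * scaleRange * input_channels / std::numeric_limits<int32_t>::max();
        weights_reducer = std::max(1.0, weights_reducer);  // never increase the scale

        // the weight scale is then divided by this factor: scale /= weights_reducer
        std::printf("weights_reducer = %.2f\n", weights_reducer);  // ~16.00
        return 0;
    }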