diff --git a/inference-engine/src/gna_plugin/backend/dnn_types.h b/inference-engine/src/gna_plugin/backend/dnn_types.h
index 24e49210731cf8..530102e0adc4d0 100644
--- a/inference-engine/src/gna_plugin/backend/dnn_types.h
+++ b/inference-engine/src/gna_plugin/backend/dnn_types.h
@@ -45,11 +45,11 @@ struct DnnActivation {
         } pow;
         struct {
             int32_t levels;
-            // if input is perchannel quantisation - input pointers contains per-channer ranges
+            // if input is per-channel quantization - input pointers contain per-channel ranges
             int8_t inputPerChannel;
             float *input_low;
             float *input_high;
-            // if output is perchannel quantisation - output pointers contains per-channer ranges
+            // if output is per-channel quantization - output pointers contain per-channel ranges
             int8_t outputPerChannel;
             float *output_low;
             float *output_high;
diff --git a/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp b/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp
index 7536880fe1f5c3..b4999df47a2508 100644
--- a/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp
+++ b/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp
@@ -15,6 +15,7 @@ struct GNAFlags {
     bool uniformPwlDesign = false;
     bool gna_openmp_multithreading = false;
     bool sw_fp32 = false;
+    bool fake_quantized = false;
     bool performance_counting = false;
 };
 }  // namespace GNAPluginNS
diff --git a/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp b/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp
index 01a72e54ad12bc..0ccae929951929 100644
--- a/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp
+++ b/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp
@@ -83,6 +83,10 @@ struct QuantI8 : public QuantDescTmpl
     }
 };
 
+struct FakeQuantI16 : public QuantI16 {};
+struct FakeQuantI8 : public QuantI8 {};
+
+
 template <class A, class B>
 struct QuantPair {
     using MandatoryType = A;
@@ -115,9 +119,6 @@ inline bool shouldAlwaysAllocate() {
  */
 template<class A>
 class Quant {
- public:
-    template<class ...Args>
-    void operator()(Args && ... args) const { }
 };
 
 template<>
@@ -125,7 +126,9 @@ class Quant<QuantI16> {
  public:
    template<class ...Args>
    void operator()(Args && ... args) const {
-        QuantizeAffine16(std::forward<Args>(args)...);
+        QuantizationCallback<int16_t, int32_t> {
+            std::forward<Args>(args)...
+        }.runQuantize();
    }
 };
 
@@ -134,10 +137,35 @@ class Quant<QuantI8> {
  public:
    template<class ...Args>
    void operator()(Args && ... args) const {
-        QuantizeAffine8(std::forward<Args>(args)...);
+        QuantizationCallback<int8_t, gna_compound_bias_t> {
+            std::forward<Args>(args)...
+        }.runQuantize();
    }
 };
 
+template<>
+class Quant<FakeQuantI16> {
+ public:
+    template<class ...Args>
+    void operator()(Args && ... args) const {
+        QuantizationCallback<int16_t, int32_t> {
+            std::forward<Args>(args)...
+        }.runFakeQuantize();
+    }
+};
+
+template<>
+class Quant<FakeQuantI8> {
+ public:
+    template<class ...Args>
+    void operator()(Args && ... args) const {
+        QuantizationCallback<int8_t, gna_compound_bias_t>{
+            std::forward<Args>(args)...
+        }.runFakeQuantize();
+    }
+};
+
+
 template <class T>
 inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob::Ptr fp32_blob, InferenceEngine::Precision precision, float scale_factor) {
     auto prec_blob = InferenceEngine::make_shared_blob<T>({ precision,
@@ -273,6 +301,14 @@ inline void quantizeWeightsBiases(const QuantDesc & quantDesc,
     auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
     {
+        float *ptr_per_channel_weights_quants_min = nullptr;
+        float *ptr_per_channel_weights_quants_max = nullptr;
+
+        if (!quantData->_weights_quants_min.empty()) {
+            ptr_per_channel_weights_quants_min = &quantData->_weights_quants_min.front();
+            ptr_per_channel_weights_quants_max = &quantData->_weights_quants_max.front();
+        }
+
         fnc(wl->_weights->buffer().as<float *>(),
             wl->_biases ? wl->_biases->buffer().as<float *>() : nullptr,
             intWeights->buffer(),
@@ -283,7 +319,12 @@ inline void quantizeWeightsBiases(const QuantDesc & quantDesc,
             num_rows,
             num_columns,
             num_rows_padded,
-            num_columns_padded);
+            num_columns_padded,
+            quantData->levels,
+            nullptr,
+            nullptr,
+            ptr_per_channel_weights_quants_min,
+            ptr_per_channel_weights_quants_max);
     }
     wl->_weights = intWeights;
     wl->_biases = intBiases;
@@ -563,4 +604,9 @@ class LayersQuantizer : public frontend::DataQuantizerBase {
 
 using QuantI16 = frontend::QuantPair;
 using QuantI8 = frontend::QuantPair;
+
+using FakeQuantI16 = frontend::QuantPair;
+using FakeQuantI8 = frontend::QuantPair;
+
+
 }  // namespace GNAPluginNS
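Editor's note on the dispatch pattern used above: each `Quant<>` specialization aggregate-initializes a `QuantizationCallback` from the forwarded arguments and then calls either `runQuantize()` or `runFakeQuantize()`. Below is a minimal, self-contained sketch of that pattern with simplified, hypothetical types of my own (not the plugin's real `QuantizationCallback`/`QuantPair` definitions):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <utility>

// Simplified stand-in for QuantizationCallback: arguments are packed into one aggregate,
// and the caller decides which quantization routine to run on it.
template <class WeightsType>
struct Callback {
    const float *src;
    WeightsType *dst;
    std::size_t count;
    float scale;

    void runQuantize() const {       // scale and narrow FP32 values
        for (std::size_t i = 0; i < count; ++i)
            dst[i] = static_cast<WeightsType>(src[i] * scale);
    }
    void runFakeQuantize() const {   // values already hold integer levels; just narrow them
        for (std::size_t i = 0; i < count; ++i)
            dst[i] = static_cast<WeightsType>(src[i]);
    }
};

struct QuantI8Tag {};                // stand-ins for the QuantI8 / FakeQuantI8 descriptors
struct FakeQuantI8Tag {};

template <class Desc> class Quant {};   // primary template intentionally empty

template <> class Quant<QuantI8Tag> {
 public:
    template <class ...Args>
    void operator()(Args && ...args) const {
        Callback<int8_t>{std::forward<Args>(args)...}.runQuantize();
    }
};

template <> class Quant<FakeQuantI8Tag> {
 public:
    template <class ...Args>
    void operator()(Args && ...args) const {
        Callback<int8_t>{std::forward<Args>(args)...}.runFakeQuantize();
    }
};

int main() {
    const float src[3] = {0.5f, -0.25f, 1.0f};
    int8_t dst[3] = {};
    Quant<QuantI8Tag>{}(&src[0], &dst[0], std::size_t{3}, 100.0f);
    std::printf("%d %d %d\n", dst[0], dst[1], dst[2]);   // prints: 50 -25 100
    return 0;
}
```

Packing the arguments into a struct keeps the int16 and int8 paths behind one uniform call site while letting the fake-quantize variants reuse the same argument list.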
diff --git a/inference-engine/src/gna_plugin/frontend/quantization.cpp b/inference-engine/src/gna_plugin/frontend/quantization.cpp
index c0e24a75a75b06..128eb101463ae3 100644
--- a/inference-engine/src/gna_plugin/frontend/quantization.cpp
+++ b/inference-engine/src/gna_plugin/frontend/quantization.cpp
@@ -5,20 +5,25 @@
 #include
 #include
 #include
+#include
+#include
 #include "backend/gna_types.h"
 #include "quantization.h"
 
-void QuantizeAffine16(float *ptr_float_weights,
-                      float *ptr_float_biases,
-                      int16_t *ptr_int_weights,
-                      int32_t *ptr_int_biases,
-                      float input_scale_factor,
-                      float *ptr_weight_scale_factor,
-                      float *ptr_output_scale_factor,
-                      uint32_t num_rows,
-                      uint32_t num_columns,
-                      uint32_t num_rows_padded,
-                      uint32_t num_columns_padded) {
+#ifdef DEBUG
+#define QUANTWARNING(...) (fprintf(stderr, __VA_ARGS__))
+#else
+#define QUANTWARNING(...)
+#endif
+
+
+template<>
+void QuantizationCallback<int16_t, int32_t>::runFakeQuantize() const {
+    THROW_GNA_EXCEPTION << "int16 fake quantized models are not yet supported";
+}
+
+template<>
+void QuantizationCallback<int16_t, int32_t>::runQuantize() const {
     uint32_t num_saturate = 0;
 
     if (*ptr_weight_scale_factor == 1.0) {
@@ -149,11 +154,62 @@ void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t
     }
 }
 
-void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases,
-                     int8_t *ptr_int_weights, gna_compound_bias_t *ptr_int_biases,
-                     float input_scale_factor, float *ptr_weight_scale_factor,
-                     float *ptr_output_scale_factor, uint32_t num_rows, uint32_t num_columns,
-                     uint32_t num_rows_padded, uint32_t num_columns_padded) {
+template<>
+void QuantizationCallback<int8_t, gna_compound_bias_t>::runFakeQuantize() const {
+    // TODO: possibly remove this zero point
+    const float zeroPoint = MAX_VAL_1B_WEIGHT;
+    uint32_t num_saturate = 0;
+
+    if (fq_ptr_output_high == nullptr || fq_ptr_output_low == nullptr) {
+        THROW_GNA_EXCEPTION << "Fake quantized output range not set";
+    }
+    if (fq_levels == 0 || fq_levels == 1) {
+        THROW_GNA_EXCEPTION << "Fake quantized levels not set";
+    }
+
+    for (uint32_t i = 0; i < num_rows; i++) {
+        for (uint32_t j = 0; j < num_columns; j++) {
+            auto offset = i * num_columns + j;
+            auto normalizedWeight = ptr_float_weights[offset] - zeroPoint;
+            // range checking
+            if (normalizedWeight > MAX_VAL_1B_WEIGHT || normalizedWeight < -MAX_VAL_1B_WEIGHT) {
+                THROW_GNA_EXCEPTION << "unsupported weights range for I8 quantisation: " << ptr_float_weights[offset];
+            }
+            ptr_int_weights[offset] = static_cast<int8_t>(normalizedWeight);
+        }
+        if (fq_ptr_output_high == nullptr || fq_ptr_output_low == nullptr) {
+            THROW_GNA_EXCEPTION << "Fake quantized output range not set";
+        }
+        if (fq_levels == 0 || fq_levels == 1) {
+            THROW_GNA_EXCEPTION << "Fake quantized levels not set";
+        }
+        auto channel_scale = (fq_levels - 1) / (fq_ptr_output_high[i] - fq_ptr_output_low[i]);
+        auto channel_scale_multiplier = *ptr_weight_scale_factor / channel_scale;
+
+        ptr_int_biases[i].multiplier = static_cast<uint8_t>(channel_scale_multiplier);
+    }
+
+    if (ptr_float_biases != nullptr) {
+        for (uint32_t j = 0; j < num_rows; j++) {
+            float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
+            float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
+            if (value > 2147483647.0) {
+                ptr_int_biases[j].bias = 2147483647L;
+                num_saturate++;
+            } else if (value < -2147483648.0) {
+                ptr_int_biases[j].bias = -2147483648LL;
+                num_saturate++;
+            } else {
+                ptr_int_biases[j].bias = (int32_t) value;
+            }
+        }
+    }
+    if (num_saturate > 0) {
+        QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine8()\n", num_saturate, num_rows * num_columns + num_rows);
+    }
+}
+
+template<>
+void QuantizationCallback<int8_t, gna_compound_bias_t>::runQuantize() const {
     if (ptr_int_biases == nullptr) {
         THROW_IE_EXCEPTION << "Int biases are empty";
     }
diff --git a/inference-engine/src/gna_plugin/frontend/quantization.h b/inference-engine/src/gna_plugin/frontend/quantization.h
index d9316739024e39..2ca27c20d6e489 100644
--- a/inference-engine/src/gna_plugin/frontend/quantization.h
+++ b/inference-engine/src/gna_plugin/frontend/quantization.h
@@ -16,25 +16,34 @@
 #define MAX_VAL_2B_WEIGHT 16384
 #define MAX_VAL_2B_FEAT 16384
 #define MAX_VAL_4B_BIAS 1073741824
-#ifdef DEBUG
-#define QUANTWARNING(...) (fprintf(stderr, __VA_ARGS__))
-#else
-#define QUANTWARNING(...)
-#endif
-
-void QuantizeAffine16(float *ptr_float_weights,
-                      float *ptr_float_biases,
-                      int16_t *ptr_int_weights,
-                      int32_t *ptr_int_biases,
-                      float input_scale_factor,
-                      float *ptr_weight_scale_factor,
-                      float *ptr_output_scale_factor,
-                      uint32_t num_rows,
-                      uint32_t num_columns,
-                      uint32_t num_rows_padded,
-                      uint32_t num_columns_padded);
+
+template <class WeightsType, class BiasType>
+struct QuantizationCallback {
+    float *ptr_float_weights;
+    float *ptr_float_biases;
+    WeightsType* ptr_int_weights;
+    BiasType* ptr_int_biases;
+    float input_scale_factor;
+    float *ptr_weight_scale_factor;
+    float *ptr_output_scale_factor;
+    uint32_t num_rows;
+    uint32_t num_columns;
+    uint32_t num_rows_padded;
+    uint32_t num_columns_padded;
+
+    // TODO: copied from the fake quantize activation
+    int32_t fq_levels;
+    float *fq_ptr_input_low;
+    float *fq_ptr_input_high;
+    float *fq_ptr_output_low;
+    float *fq_ptr_output_high;
+
+    void runQuantize() const;
+    void runFakeQuantize() const;
+};
+
+template class QuantizationCallback<int16_t, int32_t>;
+template class QuantizationCallback<int8_t, gna_compound_bias_t>;
+
 float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements);
 void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t num_elements, float scale_factor);
-void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases, int8_t *ptr_int_weights, gna_compound_bias_t *ptr_int_biases,
-                     float input_scale_factor, float *ptr_weight_scale_factor, float *ptr_output_scale_factor,
-                     uint32_t num_rows, uint32_t num_columns, uint32_t num_rows_padded, uint32_t num_columns_padded);
diff --git a/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp b/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp
index 024d5b81f0b69a..1d0d2503ebbe89 100644
--- a/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp
+++ b/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp
@@ -15,6 +15,13 @@ struct Quantization {
 struct QuantizedLayerParams {
     Quantization _src_quant;
     Quantization _dst_quant;
+
+    // per-channel weights quantization data
+    int32_t levels;
+    std::vector<float> _weights_quants_min;
+    std::vector<float> _weights_quants_max;
+
+    // TODO: deprecate this
     Quantization _weights_quant;
     Quantization _bias_quant;
     float _o_shift = 0.0f;
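Editor's note: as a rough, standalone illustration of the per-channel arithmetic in the int8 `runFakeQuantize()` above, each output row gets its own scale derived from the FakeQuantize output range, and the compound-bias multiplier is the ratio of the common weight scale to that per-channel scale. The names and numbers below are illustrative only, not the plugin's API:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int32_t levels = 255;                    // int8 fake-quantize levels
    const float common_weight_scale = 127.0f;      // assumed common scale for the whole tensor
    std::vector<float> out_low  = {-1.0f, -0.5f};  // per-row FakeQuantize output_low
    std::vector<float> out_high = { 1.0f,  0.5f};  // per-row FakeQuantize output_high

    for (size_t row = 0; row < out_low.size(); ++row) {
        float channel_scale = (levels - 1) / (out_high[row] - out_low[row]);
        float multiplier    = common_weight_scale / channel_scale;   // would go into gna_compound_bias_t::multiplier
        std::printf("row %zu: channel_scale = %.1f, multiplier = %.3f\n", row, channel_scale, multiplier);
    }
    // row 0: channel_scale = 254 / 2 = 127, multiplier = 1.000
    // row 1: channel_scale = 254 / 1 = 254, multiplier = 0.500
    return 0;
}
```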
diff --git a/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp b/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp
index 8ec4a690cab78f..0c9fa6a27ee184 100644
--- a/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp
+++ b/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp
@@ -605,7 +605,8 @@ class ScaleFactorPerLayer {
                                                                 MAX_VAL_4B_BIAS,
                                                                 wl->_biases->size());
                 if (quant->_bias_quant.scale != -1.0f) {
-                    quant->_bias_quant.scale = std::min(quant->_weights_quant.scale * quant->_src_quant.scale, quant->_bias_quant.scale);
+                    quant->_bias_quant.scale =
+                        std::min(quant->_weights_quant.scale * quant->_src_quant.scale, quant->_bias_quant.scale);
                     quant->_weights_quant.scale = quant->_bias_quant.scale / quant->_src_quant.scale;
                 }
             }
@@ -616,7 +617,7 @@ class ScaleFactorPerLayer {
             }
 
             double weights_reducer = 1.0;
-            auto conv = dynamic_cast(wl);
+            auto conv = dynamic_cast(wl);
             if (conv) {
                 auto dims = conv->insData.front().lock()->getDims();
 
@@ -624,36 +625,35 @@ class ScaleFactorPerLayer {
                 weights_reducer = std::max(1.0, weights_reducer);
             }
             quant->_weights_quant.scale /= weights_reducer;
-        }
-
-        double tmp_dst_quant_scale = quant->_weights_quant.scale * quantDataForInputLayer->_dst_quant.scale;
+            double tmp_dst_quant_scale = quant->_weights_quant.scale * quant->_src_quant.scale;
 
-        if (weightsSize == 1 &&
-            static_cast(tmp_dst_quant_scale * quant->_src_quant.scale) >
-            static_cast(std::numeric_limits::max()-1) * _scale_change_req_threshold) {
-            gnawarn() << "Output scale for " << wl->name
-                << " too large and are being reduced. Else saturations likely will happen \n";
-            // reduce weight scale according experimental heuristic
-            if (quant->_dst_quant.scale * quant->_src_quant.scale /
+            if (weightsSize == 1 &&
+                static_cast(tmp_dst_quant_scale * quant->_src_quant.scale) >
+                static_cast(std::numeric_limits::max() - 1) * _scale_change_req_threshold) {
+                gnawarn() << "Output scale for " << wl->name
+                    << " too large and are being reduced. Else saturations likely will happen \n";
+                // reduce weight scale according experimental heuristic
+                if (quant->_dst_quant.scale * quant->_src_quant.scale /
                     static_cast(std::numeric_limits::max()) < _scale_change_threshold_100) {
-                quant->_weights_quant.scale *= _scale_reduction_50;
-                tmp_dst_quant_scale *= _scale_reduction_50;
-            } else if (quant->_dst_quant.scale * quant->_src_quant.scale /
+                    quant->_weights_quant.scale *= _scale_reduction_50;
+                    tmp_dst_quant_scale *= _scale_reduction_50;
+                } else if (quant->_dst_quant.scale * quant->_src_quant.scale /
                     static_cast(std::numeric_limits::max()) < _scale_change_threshold_150) {
-                quant->_weights_quant.scale *= _scale_reduction_45;
-                tmp_dst_quant_scale *= _scale_reduction_45;
-            } else if (quant->_dst_quant.scale * quant->_src_quant.scale /
+                    quant->_weights_quant.scale *= _scale_reduction_45;
+                    tmp_dst_quant_scale *= _scale_reduction_45;
+                } else if (quant->_dst_quant.scale * quant->_src_quant.scale /
                     static_cast(std::numeric_limits::max()) < _scale_change_threshold_200) {
-                quant->_weights_quant.scale *= _scale_reduction_40;
-                tmp_dst_quant_scale *= _scale_reduction_40;
-            } else {
-                quant->_weights_quant.scale *= _scale_reduction_35;
-                tmp_dst_quant_scale *= _scale_reduction_35;
+                    quant->_weights_quant.scale *= _scale_reduction_40;
+                    tmp_dst_quant_scale *= _scale_reduction_40;
+                } else {
+                    quant->_weights_quant.scale *= _scale_reduction_35;
+                    tmp_dst_quant_scale *= _scale_reduction_35;
+                }
             }
         }
-        quant->_dst_quant.scale = tmp_dst_quant_scale;
+        quant->_dst_quant.scale = quant->_weights_quant.scale * quant->_src_quant.scale;
 
         return true;
     }
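Editor's note: the rewritten block above keeps the invariant `dst_quant.scale = weights_quant.scale * src_quant.scale` and shrinks the weight scale when the combined value approaches the int32 accumulator limit. A hedged, standalone sketch of that idea follows; the threshold and reduction factor are placeholders of my own, not the plugin's tuned `_scale_change_*` / `_scale_reduction_*` constants:

```cpp
#include <cstdint>
#include <iostream>
#include <limits>

// Illustrative only: reduce the weights scale when the combined scale would push
// the int32 accumulator close to saturation, then recompute the destination scale.
static double propagateDstScale(double src_scale, double &weights_scale) {
    const double int32_max = static_cast<double>(std::numeric_limits<int32_t>::max());
    double dst_scale = weights_scale * src_scale;
    if (dst_scale * src_scale > (int32_max - 1)) {   // placeholder threshold
        weights_scale *= 0.5;                        // placeholder reduction factor
        dst_scale = weights_scale * src_scale;
    }
    return dst_scale;
}

int main() {
    double weights_scale = 16384.0;
    double dst = propagateDstScale(1024.0, weights_scale);
    // weights scale is halved to 8192, dst scale becomes 8388608
    std::cout << "weights_scale=" << weights_scale << " dst_scale=" << dst << "\n";
    return 0;
}
```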
diff --git a/inference-engine/src/gna_plugin/gna_plugin.cpp b/inference-engine/src/gna_plugin/gna_plugin.cpp
index e4eb06031d4010..748779900c881d 100644
--- a/inference-engine/src/gna_plugin/gna_plugin.cpp
+++ b/inference-engine/src/gna_plugin/gna_plugin.cpp
@@ -40,6 +40,7 @@
 #if GNA_LIB_VER == 2
 #include
+#include
 
 uint32_t ToByteSize(const Gna2DataType type) {
     switch (type) {
@@ -339,6 +340,80 @@ void GNAPlugin::InitGNADevice() {
     graphCompiler.setGNAMemoryPtr(gnamem);
 }
 
+void GNAPlugin::UpdateGnaQuantModeFromNetwork(InferenceEngine::ICNNNetwork & network) {
+    // fp32 emulation mode doesn't need any modifications to the configuration
+    if (config.gnaFlags.sw_fp32) return;
+
+    // search for FQ layers
+    // only the int16 and int8 cases are supported
+    auto it = details::CNNNetworkIterator(&network);
+    auto end = details::CNNNetworkIterator();
+    for (; it != end; it++) {
+        if (!LayerInfo(*it).isFakeQuantize()) {
+            continue;
+        }
+
+        GNAFakeQuantizeLayer fqLayer(*it);
+        auto inputLayer = fqLayer.getInputLayer();
+
+        // this fake quantize represents data quantization - not weights
+        if (!LayerInfo(inputLayer).isConst()) {
+            continue;
+        }
+        // checking weight precision - the weights are already quantized - so the quantisation mode chosen so far needs to be adjusted
+        const auto int8Levels = 255;
+        const auto int16Levels = 65535;
+        if (fqLayer.getLevels() != int8Levels && fqLayer.getLevels() != int16Levels) {
+            THROW_GNA_LAYER_EXCEPTION(*it)
+                << "unsupported quantisation scheme: number of levels is " << fqLayer.getLevels() << " while only "
+                << int8Levels << " or " << int16Levels << " are supported";
+        }
+        // also in mixed mode i8 should be stated as the target precision
+        if (fqLayer.getLevels() == int8Levels) {
+            config.gnaPrecision = InferenceEngine::Precision::I8;
+        }
+        gnaFlags->fake_quantized = true;
+        config.gnaFlags.fake_quantized = true;
+    }
+}
+
+void GNAPlugin::UpdateInputScaleFromNetwork(InferenceEngine::ICNNNetwork & network) {
+    // fp32 emulation mode doesn't need any modifications to the configuration
+    if (config.gnaFlags.sw_fp32) return;
+
+    // search for FQ layers
+    // only the int16 and int8 cases are supported
+    InputsDataMap inputs;
+    network.getInputsInfo(inputs);
+    for (auto && input : inputs) {
+        auto data = input.second->getInputData();
+        size_t inputIdx = 0;
+        for (auto && nextToInputLayer : getInputTo(data)) {
+            if (!LayerInfo(nextToInputLayer.second).isFakeQuantize()) {
+                inputIdx++;
+                continue;
+            }
+            // replacing the scale factor with the one from this FQ layer
+            GNAFakeQuantizeLayer fqLayer(nextToInputLayer.second);
+            auto inputRange = fqLayer.getInputRange();
+            auto outputRange = fqLayer.getOutputRange();
+            if (inputRange.first.size() != 1 || inputRange.second.size() != 1 ||
+                outputRange.first.size() != 1 || outputRange.second.size() != 1) {
+                THROW_GNA_LAYER_EXCEPTION(nextToInputLayer.second)
+                    << "unsupported per-channel quantisation for input layer : " << input.second->name();
+            }
+            float scaleInput = (inputRange.second[0] - inputRange.first[0]) / (fqLayer.getLevels() - 1);
+            float scaleOutputs = (outputRange.second[0] - outputRange.first[0]) / (fqLayer.getLevels() - 1);
+
+            // TODO: proper mapping into scale factors
+            config.inputScaleFactors[inputIdx] = 1 / scaleInput;
+            inputsDesc->inputScaleFactors[inputIdx] = 1 / scaleInput;
+
+            inputIdx++;
+        }
+    }
+}
+
 void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
     std::shared_ptr convertedNetwork;
     if (_network.getFunction()) {
@@ -356,6 +431,10 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
         THROW_GNA_EXCEPTION << error.c_str();
     }
 
+    // FQ networks now replace certain flags in the plugin - these flags will be overwritten
+    UpdateGnaQuantModeFromNetwork(network);
+    UpdateInputScaleFromNetwork(network);
+
     // network optimisation phases
     int passIdx = 0;
     auto run_passes = [&] (const CNNNetPtr& network, bool runBeforeCopy) {
@@ -369,6 +448,7 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
 
         // fake quantisation aware passes
         passes->registerPass();
+        passes->registerPass<MoveFakeQuantizeLayerIntoQuantParamsPass>();
         passes->registerPass();
         passes->registerPass();
 
@@ -405,6 +485,19 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
         // to run all passes need to have two calls to pass manager
         run_passes(newNet, true);
         run_passes(newNet, false);
+    } else if (gnaFlags->fake_quantized) {
+        switch (config.gnaPrecision) {
+            case Precision::I16:
+                ModelQuantizer<FakeQuantI16> q16;
+                newNet = q16.quantize(network, run_passes, inputsDesc->inputScaleFactors);
+                break;
+            case Precision::I8:
+                ModelQuantizer<FakeQuantI8> q8;
+                newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors);
+                break;
+            default:
+                THROW_GNA_EXCEPTION << "unsupported GNA precision for quantisation: " << config.gnaPrecision;
+        }
     } else {
         switch (config.gnaPrecision) {
             case Precision::I16:
@@ -416,8 +509,7 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
                 newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors);
                 break;
             default:
-                THROW_GNA_EXCEPTION << "no mans land for GNA precision";
-                break;
+                THROW_GNA_EXCEPTION << "unsupported GNA precision for quantisation: " << config.gnaPrecision;
         }
     }
 
@@ -967,7 +1059,7 @@ uint32_t GNAPlugin::QueueInference(const InferenceEngine::BlobMap &inputs, Infer
 #ifdef PLOT
     dnn->BeginNewWrite(dnn_dump_write_index);
     if (dnn->num_components() != 0) {
-        dnn->WriteDnnText("Net_.txt", kDnnFloat);
+        dnn->WriteDnnText("Net_.txt", kDnnInt);
     }
     dnn_dump_write_index++;
 #endif
diff --git a/inference-engine/src/gna_plugin/gna_plugin.hpp b/inference-engine/src/gna_plugin/gna_plugin.hpp
index 99eda6c07d54f9..5945af7537b1aa 100644
--- a/inference-engine/src/gna_plugin/gna_plugin.hpp
+++ b/inference-engine/src/gna_plugin/gna_plugin.hpp
@@ -221,6 +221,8 @@ class GNAPlugin : public InferenceEngine::IInferencePlugin {
                      int idx = 0);
 
     void UpdateFieldsFromConfig();
+    void UpdateGnaQuantModeFromNetwork(InferenceEngine::ICNNNetwork &);
+    void UpdateInputScaleFromNetwork(InferenceEngine::ICNNNetwork &);
 };
 
 }  // namespace GNAPluginNS
diff --git a/inference-engine/src/gna_plugin/gna_plugin_log.hpp b/inference-engine/src/gna_plugin/gna_plugin_log.hpp
index b3d5dc249ed9e6..ab399636b61cbf 100644
--- a/inference-engine/src/gna_plugin/gna_plugin_log.hpp
+++ b/inference-engine/src/gna_plugin/gna_plugin_log.hpp
@@ -72,5 +72,5 @@ if (!(expr)) { \
 }
 #define THROW_GNA_EXCEPTION THROW_IE_EXCEPTION << "[GNAPlugin] in function " << __PRETTY_FUNCTION__<< ": "
 #define THROW_GNA_LAYER_EXCEPTION(layer) THROW_GNA_EXCEPTION << LAYER_NAME(layer)
-#define LAYER_NAME(layer) layer->type << " layer : \"" << layer->name << "\" "
+#define LAYER_NAME(layer) (layer)->type << " layer : \"" << (layer)->name << "\" "
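Editor's note: `UpdateInputScaleFromNetwork` above derives the input scale factor as the reciprocal of one FakeQuantize quantization step. A small illustrative sketch of that formula (function name and values are mine, not plugin API):

```cpp
#include <cstdio>

// Illustrative only: derive an input scale factor from a FakeQuantize input range,
// mirroring scaleInput = (max - min) / (levels - 1) and scale = 1 / scaleInput above.
static float inputScaleFromFqRange(float range_min, float range_max, int levels) {
    float step = (range_max - range_min) / (levels - 1);   // size of one quantization step
    return 1.0f / step;                                     // scale factor applied to FP32 inputs
}

int main() {
    // e.g. a [-1, 1] input range quantized with 65535 levels (the int16 case)
    std::printf("scale = %.1f\n", inputScaleFromFqRange(-1.0f, 1.0f, 65535));  // prints: scale = 32767.0
    return 0;
}
```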
diff --git a/inference-engine/src/gna_plugin/layers/gna_fake_quantize_layer.hpp b/inference-engine/src/gna_plugin/layers/gna_fake_quantize_layer.hpp
index 20520c5e1e716f..892f153667aac0 100644
--- a/inference-engine/src/gna_plugin/layers/gna_fake_quantize_layer.hpp
+++ b/inference-engine/src/gna_plugin/layers/gna_fake_quantize_layer.hpp
@@ -16,17 +16,19 @@ class GNAFakeQuantizeLayer {
     InferenceEngine::CNNLayerPtr fqLayer;
  public :
     GNAFakeQuantizeLayer(InferenceEngine::CNNLayerPtr fqLayer)
-        : fqLayer(fqLayer) {}
+        : fqLayer(fqLayer) {
+        if (!LayerInfo(fqLayer).isFakeQuantize()) {
+            THROW_GNA_LAYER_EXCEPTION(fqLayer) << "cannot parse as fake quantize";
+        }
+    }
 
     /**
      * @brief convert FQ layer directly to gna-pwl activation layer
      */
     DnnActivation parseAsActivation() const {
         DnnActivation fqActivation;
-        if (!LayerInfo(fqLayer).isFakeQuantize()) {
-            THROW_GNA_LAYER_EXCEPTION(fqLayer) << "cannot parse as fake quantize";
-        }
-        fqActivation.args.fakeQuantize.levels = fqLayer->GetParamAsInt("levels");
+
+        fqActivation.args.fakeQuantize.levels = fqLayer->GetParamAsInt("levels");
         auto inputShape = getShapeForRange(fqLayer, 1);
         auto outputShape = getShapeForRange(fqLayer, 3);
 
@@ -52,8 +54,50 @@ class GNAFakeQuantizeLayer {
         return LayerUtils::getParamFromInputAsBlob(fqLayer, 0);
     }
 
+    /**
+     * fake quantize has 5 inputs; 4 of them are always constant layers, and 1 may be a tensor connection
+     */
+    InferenceEngine::CNNLayerPtr getInputLayer() const {
+        return getInputLayerAt(fqLayer, 0);
+    }
+
+    int32_t getLevels() {
+        return fqLayer->GetParamAsInt("levels");
+    }
+
+    std::pair<std::vector<float>, std::vector<float>> getInputRange() {
+        return getRange(fqLayer, 1);
+    }
+
+    std::pair<std::vector<float>, std::vector<float>> getOutputRange() {
+        return getRange(fqLayer, 3);
+    }
+
+    operator InferenceEngine::CNNLayerPtr () const {
+        return fqLayer;
+    }
+
+    InferenceEngine::CNNLayerPtr operator -> () const {
+        return fqLayer;
+    }
+    InferenceEngine::CNNLayerPtr operator * () const {
+        return fqLayer;
+    }
  protected :
+    static std::pair<std::vector<float>, std::vector<float>> getRange(InferenceEngine::CNNLayerPtr input, size_t idx) {
+        auto shape = getShapeForRange(input, idx);
+        auto rangeSize = InferenceEngine::details::product(shape.begin(), shape.end());
+
+        auto minPtr = getParamFromInputAsFloats(input, idx);
+        std::vector<float> minValues(minPtr, minPtr + rangeSize);
+
+        auto maxPtr = getParamFromInputAsFloats(input, idx + 1);
+        std::vector<float> maxValues(maxPtr, maxPtr + rangeSize);
+
+        return {minValues, maxValues};
+    }
+
     static float* getParamFromInputAsFloats(InferenceEngine::CNNLayerPtr input, size_t idx) {
         auto data = LayerUtils::getParamFromInputAsBlob(input, idx);
         return data->buffer().as<float*>();
@@ -64,6 +108,23 @@ class GNAFakeQuantizeLayer {
         return data->getTensorDesc().getDims();
     }
 
+    static InferenceEngine::CNNLayerPtr getInputLayerAt(InferenceEngine::CNNLayerPtr input, size_t idx) {
+        if (input->insData.size() <= idx) {
+            THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from input " << idx;
+        }
+        auto iLayerData = input->insData[idx].lock();
+        if (!iLayerData) {
+            THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from input " << idx
+                << ": cannot dereference data weak-pointer";
+        }
+        auto iLayer = getCreatorLayer(iLayerData).lock();
+        if (!iLayer) {
+            THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from input " << idx
+                << ": cannot dereference creator layer weak-pointer";
+        }
+        return iLayer;
+    }
+
     static InferenceEngine::SizeVector getShapeForRange(InferenceEngine::CNNLayerPtr input, size_t idx) {
         auto lowShape = getShapeFromInput(input, idx);
         auto highShape = getShapeFromInput(input, idx + 1);
diff --git a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp
index 281125605b8059..b6b18643ef6822 100644
--- a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp
+++ b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp
@@ -35,6 +35,7 @@
 #include "gna_pass_manager.hpp"
 #include "layers/gna_layer_info.hpp"
 #include "gna_upstream_iterator.hpp"
+#include "frontend/quantization.h"
 
 using namespace InferenceEngine;
 
@@ -1450,6 +1451,17 @@ void FuseFQIntoWeightsPass::run() {
         return LayerInfo(ptr).isNonFunctional();
     };
 
+    auto assignWeightsAndBiases = [](CNNLayerPtr layer, Blob::Ptr weights, Blob::Ptr biases) {
+        auto weightableLayer = std::dynamic_pointer_cast<WeightableLayer>(layer);
+        if (nullptr == weightableLayer) {
+            THROW_GNA_LAYER_EXCEPTION(layer) << " not a weightable layer";
+        }
+        weightableLayer->_weights = weights;
+        weightableLayer->_biases = biases;
+        weightableLayer->blobs["weights"] = weights;
+        weightableLayer->blobs["biases"] = biases;
+    };
+
     for (auto &l : *pLayers) {
         if (!LayerInfo(l).isFakeQuantize()) {
             continue;
         }
@@ -1479,7 +1491,10 @@ void FuseFQIntoWeightsPass::run() {
         pass_trace() << "found " << LAYER_NAME(fqLayer) << " that will be converted to weights of "
             << LAYER_NAME(weightableLayer) << "\n";
 
+        GNAFakeQuantizeLayer gnaFakeQuantizeLayer(fqLayer);
+        auto biases = LayerUtils::getParamFromInputAsBlob(weightableLayer, 2);
+        auto quantizedWeights = gnaFakeQuantizeLayer.getConstInputData();
 
         // 1. broke existing connections - by detaching fq subgraph from rest of graph
         auto prevData = weightableLayer->insData[1].lock();
@@ -1495,37 +1510,109 @@ void FuseFQIntoWeightsPass::run() {
         auto outputSize = details::product(weightDims.begin(), weightDims.end());
 
         // depending on compute precision weights will be recreated
-        auto quantized = InferenceEngine::getInjectedData(fqLayer);
+        // for integer mode - weights might be simply copied - to avoid further quantisation overhead
+        auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(weightableLayer);
         if (quantized) {
-            THROW_GNA_LAYER_EXCEPTION(fqLayer) << " not supported for non FP32 precision yet";
+            // assign the already quantized weights
+            assignWeightsAndBiases(weightableLayer, quantizedWeights, biases);
+
+            // modify scale factors for the quantized component
+            auto outputRange = gnaFakeQuantizeLayer.getOutputRange();
+            quantized->_weights_quants_min.insert(
+                quantized->_weights_quants_min.end(), outputRange.first.begin(), outputRange.first.end());
+
+            quantized->_weights_quants_max.insert(
+                quantized->_weights_quants_max.end(), outputRange.second.begin(), outputRange.second.end());
+
+            quantized->levels = gnaFakeQuantizeLayer.getLevels();
+            auto inputRange = gnaFakeQuantizeLayer.getInputRange();
+
+            // find the minimum scale factor among channels
+            if (quantized->_weights_quants_min.empty()) {
+                THROW_GNA_LAYER_EXCEPTION(fqLayer) << " per-channel weights scales are missing";
+            }
+            auto getScale = [&quantized](uint32_t i) {
+                return (quantized->levels - 1) / (quantized->_weights_quants_max[i] - quantized->_weights_quants_min[i]);
+            };
+            float min_channel_scale = getScale(0);
+            for (uint32_t i = 1; i < quantized->_weights_quants_min.size(); i++) {
+                min_channel_scale = std::min(min_channel_scale, getScale(i));
+            }
+            // common scale calculation
+            quantized->_weights_quant.scale = min_channel_scale * MAX_OUT_MULTIPLIER;
+            continue;
         }
 
         intel_dnn_component_t component;
-        GNAFakeQuantizeLayer gnaFakeQuantizeLayer(fqLayer);
-
         component.num_columns_in = weightDims[1];
         component.num_rows_in = weightDims[0];
 
         intel_piecewiselinear_t *transform = reinterpret_cast<intel_piecewiselinear_t *>(&component.op.pwl);
         transform->func_id = gnaFakeQuantizeLayer.parseAsActivation();
 
-        auto inputData = gnaFakeQuantizeLayer.getConstInputData();
-        auto inputBuffer = inputData->buffer();
-        component.ptr_inputs = inputBuffer.as();
+        auto quantizedWeightsData = quantizedWeights->buffer();
+        component.ptr_inputs = quantizedWeightsData.as();
 
-        auto resultBlob = make_shared_blob(TensorDesc(Precision::FP32, {outputSize}, Layout::C));
-        resultBlob->allocate();
+        auto dequantizedWeights = make_shared_blob<float>(TensorDesc(Precision::FP32, {outputSize}, Layout::C));
+        dequantizedWeights->allocate();
 
-        auto resultBuffer = resultBlob->buffer();
+        auto resultBuffer = dequantizedWeights->buffer();
         component.ptr_outputs = resultBuffer.as();
 
         PwlApply32(&component, 0, component.num_rows_in - 1, 0, component.num_columns_in - 1);
 
-        // 3. assign quantized const blob to weightable layer
-        auto weigtableLayerCasted = std::dynamic_pointer_cast(weightableLayer);
-        weigtableLayerCasted->_weights = resultBlob;
-        weigtableLayerCasted->_biases = biases;
-        weigtableLayerCasted->blobs["weights"] = resultBlob;
-        weigtableLayerCasted->blobs["biases"] = biases;
+        // 3. assign dequantized const blob to weightable layer
+        assignWeightsAndBiases(weightableLayer, dequantizedWeights, biases);
+    }
+}
+
+void MoveFakeQuantizeLayerIntoQuantParamsPass::run() {
+    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(pLayers->front());
+    if (!quantized) {
+        return;
+    }
+
+    auto donotSkip = [](CNNLayerPtr) {
+        return false;
+    };
+    for (auto &&l : *pLayers) {
+        if (!LayerInfo(l).isFakeQuantize()) {
+            continue;
+        }
+        GNAFakeQuantizeLayer fqLayer(l);
+
+        auto nextLayer = CNNNetGetNextLayerSkipCertain(*fqLayer, 0, 0, donotSkip).first;
+        auto prevLayer = CNNNetPrevLayerSkipCertain(*fqLayer, 0, donotSkip);
+
+        if (prevLayer->outData.size() != 1) {
+            THROW_GNA_LAYER_EXCEPTION(prevLayer) << " fake quantize input connected to something else is not supported";
+        }
+        auto insDatas = CNNLayerFindInsDataIdxes(fqLayer->outData.front(), nextLayer);
+
+        if (insDatas.size() != 1) {
+            THROW_GNA_LAYER_EXCEPTION(fqLayer) << " fake quantize connection to layer: " << LAYER_NAME(nextLayer) << " is not correct";
+        }
+        nextLayer->insData[insDatas.front()] = prevLayer->outData.front();
+        getInputTo(prevLayer->outData.front()).clear();
+        getInputTo(prevLayer->outData.front())[nextLayer->name] = nextLayer;
+
+        // after the layer gets removed, absorb its params into the QuantParams structure
+
+        // replacing the scale factor with the one from this FQ layer
+        auto inputRange = fqLayer.getInputRange();
+        auto outputRange = fqLayer.getOutputRange();
+        if (inputRange.first.size() != 1 || inputRange.second.size() != 1 ||
+            outputRange.first.size() != 1 || outputRange.second.size() != 1) {
+            THROW_GNA_LAYER_EXCEPTION(fqLayer)
+                << "unsupported per-channel quantisation for layer : " << nextLayer->name;
+        }
+        float scaleInput = (inputRange.second[0] - inputRange.first[0]) / (fqLayer.getLevels() - 1);
+        float scaleOutputs = (outputRange.second[0] - outputRange.first[0]) / (fqLayer.getLevels() - 1);
+
+        auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(nextLayer);
+
+        // TODO: proper mapping into scale factors
+        quantParams->_src_quant.scale = 1 / scaleInput;
+    }
+}
diff --git a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.hpp b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.hpp
index fc5d131ba5f013..6111ba2c2a9219 100644
--- a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.hpp
+++ b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.hpp
@@ -185,6 +185,11 @@ DECL_PASS(BroadcastConst);
 */
 DECL_PASS(FuseFQIntoWeights);
 
+/**
+* @brief removes all fake quantize layers while moving their settings into the QuantParams of the consuming layer
+*/
+DECL_PASS(MoveFakeQuantizeLayerIntoQuantParams);
+
 struct PassManagerSettings {
     Policy policy;
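Editor's note: in the FP32 branch of FuseFQIntoWeightsPass above, the stored integer-level weights are dequantized by running them through the FakeQuantize transfer function (in the plugin this goes through PwlApply32 with the activation built by parseAsActivation()). Below is a standalone sketch of that standard FakeQuantize mapping, with illustrative names and values of my own:

```cpp
#include <cmath>
#include <cstdio>

// Standalone reference of the standard FakeQuantize transfer function; not the
// plugin's PwlApply32 path, only an illustration of the mapping it applies.
static float fakeQuantize(float x, float in_lo, float in_hi, float out_lo, float out_hi, int levels) {
    if (x <= in_lo) return out_lo;
    if (x >= in_hi) return out_hi;
    float in_step  = (in_hi - in_lo) / (levels - 1);    // size of one input quantization step
    float out_step = (out_hi - out_lo) / (levels - 1);  // size of one output quantization step
    return std::round((x - in_lo) / in_step) * out_step + out_lo;
}

int main() {
    // weights stored as integer levels 0..254 dequantized back to the [-1, 1] range
    std::printf("%f\n", fakeQuantize(127.0f, 0.0f, 254.0f, -1.0f, 1.0f, 255));  // ~0.0
    std::printf("%f\n", fakeQuantize(254.0f, 0.0f, 254.0f, -1.0f, 1.0f, 255));  // 1.0
    return 0;
}
```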