[GNA] support for GNA_SW_EXACT mode for FQ networks #1

Draft: wants to merge 3 commits into base: feature/esmirno/gna-fq-nolpt

4 changes: 2 additions & 2 deletions inference-engine/src/gna_plugin/backend/dnn_types.h
@@ -45,11 +45,11 @@ struct DnnActivation {
} pow;
struct {
int32_t levels;
// if input is perchannel quantisation - input pointers contains per-channer ranges
// if input is per-channel quantization - input pointers contains per-channel ranges
int8_t inputPerChannel;
float *input_low;
float *input_high;
// if output is perchannel quantisation - output pointers contains per-channer ranges
// if output is per-channel quantization - output pointers contains per-channel ranges
int8_t outputPerChannel;
float *output_low;
float *output_high;
1 change: 1 addition & 0 deletions inference-engine/src/gna_plugin/descriptions/gna_flags.hpp
@@ -15,6 +15,7 @@ struct GNAFlags {
bool uniformPwlDesign = false;
bool gna_openmp_multithreading = false;
bool sw_fp32 = false;
bool fake_quantized = false;
bool performance_counting = false;
};
} // namespace GNAPluginNS
58 changes: 52 additions & 6 deletions inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp
@@ -83,6 +83,10 @@ struct QuantI8 : public QuantDescTmpl<P_TYPE(I16), P_TYPE(I32), P_TYPE(I8), gna
}
};

// to support proper trait instantiation for the quantization function callback
struct FakeQuantI16 : public QuantI16 {};
struct FakeQuantI8 : public QuantI8 {};

template <class A, class B>
struct QuantPair {
using MandatoryType = A;
@@ -115,17 +119,16 @@ inline bool shouldAlwaysAllocate<gna_compound_bias_t>() {
*/
template <class T>
class Quant {
public:
template<class ...Args>
void operator()(Args && ... args) const { }
};

template<>
class Quant<QuantI16> {
public:
template<class ...Args>
void operator()(Args && ... args) const {
QuantizeAffine16(std::forward<Args>(args)...);
QuantizationCallback<int16_t, int32_t> {
std::forward<Args>(args)...
}.runQuantize();
}
};

@@ -134,10 +137,35 @@ class Quant<QuantI8> {
public:
template<class ...Args>
void operator()(Args && ... args) const {
QuantizeAffine8(std::forward<Args>(args)...);
QuantizationCallback<int8_t, gna_compound_bias_t> {
std::forward<Args>(args)...
}.runQuantize();
}
};

template<>
class Quant<FakeQuantI16> {
public:
template<class ...Args>
void operator()(Args && ... args) const {
QuantizationCallback<int16_t, int32_t> {
std::forward<Args>(args)...
}.runFakeQuantize();
}
};

template<>
class Quant<FakeQuantI8> {
public:
template<class ...Args>
void operator()(Args && ... args) const {
QuantizationCallback<int8_t, gna_compound_bias_t>{
std::forward<Args>(args)...
}.runFakeQuantize();
}
};


template <typename T>
inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob::Ptr fp32_blob, InferenceEngine::Precision precision, float scale_factor) {
auto prec_blob = InferenceEngine::make_shared_blob<T>({ precision,
@@ -273,6 +301,14 @@ inline void quantizeWeightsBiases(const QuantDesc & quantDesc,

auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
{
float *ptr_per_channel_weights_quants_min = nullptr;
float *ptr_per_channel_weights_quants_max = nullptr;

if (!quantData->_weights_quants_min.empty()) {
ptr_per_channel_weights_quants_min = &quantData->_weights_quants_min.front();
ptr_per_channel_weights_quants_max = &quantData->_weights_quants_max.front();
}

fnc(wl->_weights->buffer().as<float *>(),
wl->_biases ? wl->_biases->buffer().as<float *>() : nullptr,
intWeights->buffer(),
@@ -283,7 +319,12 @@
num_rows,
num_columns,
num_rows_padded,
num_columns_padded);
num_columns_padded,
quantData->levels,
nullptr,
nullptr,
ptr_per_channel_weights_quants_min,
ptr_per_channel_weights_quants_max);
}
wl->_weights = intWeights;
wl->_biases = intBiases;
@@ -563,4 +604,9 @@ class LayersQuantizer : public frontend::DataQuantizerBase {
using QuantI16 = frontend::QuantPair<frontend::QuantI16, frontend::QuantI16>;
using QuantI8 = frontend::QuantPair<frontend::QuantI8, frontend::QuantI16>;


using FakeQuantI16 = frontend::QuantPair<frontend::FakeQuantI16, frontend::FakeQuantI16>;
using FakeQuantI8 = frontend::QuantPair<frontend::FakeQuantI8, frontend::FakeQuantI16>;


} // namespace GNAPluginNS
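
For reference, the Quant<T> specializations above are a tag-dispatch layer: the FakeQuantI16/FakeQuantI8 tags reuse the I16/I8 precision traits but route the same argument pack into runFakeQuantize() instead of runQuantize(). A minimal, self-contained illustration of the pattern (Callback below is a simplified stand-in for QuantizationCallback, not the real struct):

    #include <cstdio>
    #include <utility>

    // stand-in for QuantizationCallback<...>: an aggregate holding the arguments,
    // with one entry point per quantization flavour
    struct Callback {
        float scale;
        void runQuantize() const     { std::printf("plain quantize, scale=%f\n", scale); }
        void runFakeQuantize() const { std::printf("fake quantize,  scale=%f\n", scale); }
    };

    // tag types: FakeQuantI8 inherits QuantI8 so it reuses the same precision
    // traits but selects a different Quant<> specialization below
    struct QuantI8 {};
    struct FakeQuantI8 : QuantI8 {};

    template <class T>
    class Quant {
    public:
        template <class... Args>
        void operator()(Args&&... args) const {}   // default: no-op
    };

    template <>
    class Quant<QuantI8> {
    public:
        template <class... Args>
        void operator()(Args&&... args) const {
            Callback{std::forward<Args>(args)...}.runQuantize();
        }
    };

    template <>
    class Quant<FakeQuantI8> {
    public:
        template <class... Args>
        void operator()(Args&&... args) const {
            Callback{std::forward<Args>(args)...}.runFakeQuantize();
        }
    };

    int main() {
        Quant<QuantI8>{}(2.0f);       // -> runQuantize
        Quant<FakeQuantI8>{}(2.0f);   // -> runFakeQuantize
        return 0;
    }
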
88 changes: 72 additions & 16 deletions inference-engine/src/gna_plugin/frontend/quantization.cpp
@@ -5,20 +5,25 @@
#include <cstring>
#include <iostream>
#include <details/ie_exception.hpp>
#include <gna_plugin_log.hpp>
#include <limits>
#include "backend/gna_types.h"
#include "quantization.h"

void QuantizeAffine16(float *ptr_float_weights,
float *ptr_float_biases,
int16_t *ptr_int_weights,
int32_t *ptr_int_biases,
float input_scale_factor,
float *ptr_weight_scale_factor,
float *ptr_output_scale_factor,
uint32_t num_rows,
uint32_t num_columns,
uint32_t num_rows_padded,
uint32_t num_columns_padded) {
#ifdef DEBUG
#define QUANTWARNING(...) (fprintf(stderr, __VA_ARGS__))
#else
#define QUANTWARNING(...)
#endif


template<>
void QuantizationCallback<int16_t, int32_t>::runFakeQuantize() const {
THROW_GNA_EXCEPTION << "int16 fake quantized models not yet supported";
}

template<>
void QuantizationCallback<int16_t, int32_t>::runQuantize() const {
uint32_t num_saturate = 0;

if (*ptr_weight_scale_factor == 1.0) {
@@ -149,11 +154,62 @@ void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t
}
}

void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases,
int8_t *ptr_int_weights, gna_compound_bias_t *ptr_int_biases,
float input_scale_factor, float *ptr_weight_scale_factor,
float *ptr_output_scale_factor, uint32_t num_rows, uint32_t num_columns,
uint32_t num_rows_padded, uint32_t num_columns_padded) {
template<>
void QuantizationCallback<int8_t, gna_compound_bias_t>::runFakeQuantize() const {
// TODO: possibly remove this zero point
const float zeroPoint = MAX_VAL_1B_WEIGHT;
uint32_t num_saturate = 0;

if (fq_ptr_output_high == nullptr || fq_ptr_output_low == nullptr) {
THROW_GNA_EXCEPTION << "Fake quantized output range not set";
}
if (fq_levels == 0 || fq_levels == 1) {
THROW_GNA_EXCEPTION << "Fake quantized levels not set";
}

for (uint32_t i = 0; i < num_rows; i++) {
for (uint32_t j = 0; j < num_columns; j++) {
auto offset = i * num_columns + j;
auto normalizedWeight = ptr_float_weights[offset] - zeroPoint;
// range checking
if (normalizedWeight > MAX_VAL_1B_WEIGHT || normalizedWeight < -MAX_VAL_1B_WEIGHT) {
THROW_GNA_EXCEPTION << "unsupported weights range for I8 quantisation: " << ptr_float_weights[offset];
}
ptr_int_weights[offset] = static_cast<int8_t>(normalizedWeight);
}
if (fq_ptr_output_high == nullptr || fq_ptr_output_low == nullptr) {
THROW_GNA_EXCEPTION << "Fake quantized output range not set";
}
if (fq_levels == 0 || fq_levels == 1) {
THROW_GNA_EXCEPTION << "Fake quantized levels not set";
}
auto channel_scale = (fq_levels - 1) / (fq_ptr_output_high[i] - fq_ptr_output_low[i]);
auto channel_scale_multiplier = *ptr_weight_scale_factor / channel_scale;

ptr_int_biases[i].multiplier = static_cast<uint8_t> (channel_scale_multiplier);
}

if (ptr_float_biases != nullptr) {
for (uint32_t j = 0; j < num_rows; j++) {
float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
if (value > 2147483647.0) {
ptr_int_biases[j].bias = 2147483647L;
num_saturate++;
} else if (value < -2147483648.0) {
ptr_int_biases[j].bias = -2147483648LL;
num_saturate++;
} else {
ptr_int_biases[j].bias = (int32_t) value;
}
}
}
if (num_saturate > 0) {
QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine8()\n", num_saturate, num_rows * num_columns + num_rows);
}
}
template<>
void QuantizationCallback<int8_t, gna_compound_bias_t>::runQuantize() const {
if (ptr_int_biases == nullptr) {
THROW_IE_EXCEPTION << "Int biases are empty";
}
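
For reference, the per-output-row multiplier written into gna_compound_bias_t above comes from that row's FakeQuantize range: channel_scale = (fq_levels - 1) / (output_high[i] - output_low[i]) and multiplier = weight_scale / channel_scale, truncated to uint8_t. A small numeric illustration with made-up values:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // illustrative values for a single output row, not taken from the PR
        const int32_t fq_levels    = 255;      // FakeQuantize levels
        const float   output_low   = -1.0f;    // fq_ptr_output_low[i]
        const float   output_high  =  1.0f;    // fq_ptr_output_high[i]
        const float   weight_scale =  254.0f;  // *ptr_weight_scale_factor

        // scale implied by this row's FakeQuantize output range
        const float channel_scale = (fq_levels - 1) / (output_high - output_low);     // 127.0

        // per-row multiplier stored in gna_compound_bias_t::multiplier
        const auto multiplier = static_cast<uint8_t>(weight_scale / channel_scale);   // 2

        std::printf("channel_scale=%.1f multiplier=%u\n",
                    channel_scale, static_cast<unsigned>(multiplier));
        return 0;
    }
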
49 changes: 29 additions & 20 deletions inference-engine/src/gna_plugin/frontend/quantization.h
@@ -16,25 +16,34 @@
#define MAX_VAL_2B_WEIGHT 16384
#define MAX_VAL_2B_FEAT 16384
#define MAX_VAL_4B_BIAS 1073741824
#ifdef DEBUG
#define QUANTWARNING(...) (fprintf(stderr, __VA_ARGS__))
#else
#define QUANTWARNING(...)
#endif

void QuantizeAffine16(float *ptr_float_weights,
float *ptr_float_biases,
int16_t *ptr_int_weights,
int32_t *ptr_int_biases,
float input_scale_factor,
float *ptr_weight_scale_factor,
float *ptr_output_scale_factor,
uint32_t num_rows,
uint32_t num_columns,
uint32_t num_rows_padded,
uint32_t num_columns_padded);

template <class WeightsType, class BiasType>
struct QuantizationCallback {
float *ptr_float_weights;
float *ptr_float_biases;
WeightsType* ptr_int_weights;
BiasType* ptr_int_biases;
float input_scale_factor;
float *ptr_weight_scale_factor;
float *ptr_output_scale_factor;
uint32_t num_rows;
uint32_t num_columns;
uint32_t num_rows_padded;
uint32_t num_columns_padded;

// TODO: copied from fake quantize activation
int32_t fq_levels;
float *fq_ptr_input_low;
float *fq_ptr_input_high;
float *fq_ptr_output_low;
float *fq_ptr_output_high;

void runQuantize() const;
void runFakeQuantize() const;
};

template class QuantizationCallback<int16_t, int32_t>;
template class QuantizationCallback<int8_t, gna_compound_bias_t>;

float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements);
void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t num_elements, float scale_factor);
void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases, int8_t *ptr_int_weights, gna_compound_bias_t *ptr_int_biases,
float input_scale_factor, float *ptr_weight_scale_factor, float *ptr_output_scale_factor,
uint32_t num_rows, uint32_t num_columns, uint32_t num_rows_padded, uint32_t num_columns_padded);
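
Since QuantizationCallback is a plain aggregate, call sites (see layer_quantizer.hpp above) rely on positional brace-initialization: the argument order must match the member order exactly, and the four fq_* pointers may be nullptr on the non-FakeQuantize path. A hedged usage sketch, assuming it is compiled inside the plugin so quantization.h and its dependencies resolve, and that the padded sizes equal the real ones:

    #include <cstdint>
    #include "quantization.h"   // declares QuantizationCallback<> (include path assumed)

    void quantize_2x2_example() {
        float weights[4] = {0.5f, -0.25f, 0.75f, -1.0f};
        float biases[2]  = {0.1f, -0.1f};
        int16_t int_weights[4] = {};
        int32_t int_biases[2]  = {};
        float weight_scale = 1.0f;   // per the check in runQuantize(), 1.0 means the scale is recomputed internally (assumed)
        float output_scale = 1.0f;

        QuantizationCallback<int16_t, int32_t> {
            weights,            // ptr_float_weights
            biases,             // ptr_float_biases
            int_weights,        // ptr_int_weights
            int_biases,         // ptr_int_biases
            1024.0f,            // input_scale_factor
            &weight_scale,      // ptr_weight_scale_factor
            &output_scale,      // ptr_output_scale_factor
            2, 2,               // num_rows, num_columns
            2, 2,               // num_rows_padded, num_columns_padded
            0,                  // fq_levels (unused by runQuantize)
            nullptr, nullptr,   // fq_ptr_input_low / fq_ptr_input_high
            nullptr, nullptr    // fq_ptr_output_low / fq_ptr_output_high
        }.runQuantize();
    }
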
@@ -15,6 +15,13 @@ struct Quantization {
struct QuantizedLayerParams {
Quantization _src_quant;
Quantization _dst_quant;

// per channel weights quant data
int32_t levels;
std::vector<float> _weights_quants_min;
std::vector<float> _weights_quants_max;

// deprecate this
Quantization _weights_quant;
Quantization _bias_quant;
float _o_shift = 0.0f;
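
The new _weights_quants_min/_weights_quants_max vectors carry per-output-channel FakeQuantize ranges; quantizeWeightsBiases (layer_quantizer.hpp above) forwards them as raw pointers, or nullptr when the vectors are empty. A simplified illustration of that producer/consumer handshake (the struct below is a stand-in, not the real QuantizedLayerParams):

    #include <cstdio>
    #include <vector>

    // stand-in for the per-channel part of QuantizedLayerParams
    struct PerChannelQuant {
        int levels = 0;
        std::vector<float> weights_quants_min;
        std::vector<float> weights_quants_max;
    };

    // mirrors the pointer-or-nullptr pattern used in quantizeWeightsBiases()
    void consume(const PerChannelQuant& q) {
        const float* mins = q.weights_quants_min.empty() ? nullptr : q.weights_quants_min.data();
        const float* maxs = q.weights_quants_max.empty() ? nullptr : q.weights_quants_max.data();
        if (mins && maxs) {
            std::printf("per-channel: %zu channels, %d levels\n",
                        q.weights_quants_min.size(), q.levels);
        } else {
            std::printf("no per-channel data, fall back to per-tensor quantization\n");
        }
    }

    int main() {
        PerChannelQuant q;
        consume(q);                              // empty -> per-tensor path
        q.levels = 255;
        q.weights_quants_min = {-1.0f, -0.5f};   // one entry per output channel
        q.weights_quants_max = { 1.0f,  0.5f};
        consume(q);                              // -> per-channel path
        return 0;
    }
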
48 changes: 24 additions & 24 deletions inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp
@@ -605,7 +605,8 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
MAX_VAL_4B_BIAS,
wl->_biases->size());
if (quant->_bias_quant.scale != -1.0f) {
quant->_bias_quant.scale = std::min(quant->_weights_quant.scale * quant->_src_quant.scale, quant->_bias_quant.scale);
quant->_bias_quant.scale =
std::min(quant->_weights_quant.scale * quant->_src_quant.scale, quant->_bias_quant.scale);
quant->_weights_quant.scale = quant->_bias_quant.scale / quant->_src_quant.scale;
}
}
@@ -616,44 +617,43 @@
}

double weights_reducer = 1.0;
auto conv = dynamic_cast<InferenceEngine::ConvolutionLayer*>(wl);
auto conv = dynamic_cast<InferenceEngine::ConvolutionLayer *>(wl);
if (conv) {
auto dims = conv->insData.front().lock()->getDims();

weights_reducer = MAX_VAL_2B_FEAT * scaleRange * dims[1] / std::numeric_limits<int32_t>::max();
weights_reducer = std::max(1.0, weights_reducer);
}
quant->_weights_quant.scale /= weights_reducer;
}


double tmp_dst_quant_scale = quant->_weights_quant.scale * quantDataForInputLayer->_dst_quant.scale;
double tmp_dst_quant_scale = quant->_weights_quant.scale * quant->_src_quant.scale;

if (weightsSize == 1 &&
static_cast<uint64_t>(tmp_dst_quant_scale * quant->_src_quant.scale) >
static_cast<uint64_t>(std::numeric_limits<int32_t>::max()-1) * _scale_change_req_threshold) {
gnawarn() << "Output scale for " << wl->name
<< " too large and are being reduced. Else saturations likely will happen \n";
// reduce weight scale according experimental heuristic
if (quant->_dst_quant.scale * quant->_src_quant.scale /
if (weightsSize == 1 &&
static_cast<uint64_t>(tmp_dst_quant_scale * quant->_src_quant.scale) >
static_cast<uint64_t>(std::numeric_limits<int32_t>::max() - 1) * _scale_change_req_threshold) {
gnawarn() << "Output scale for " << wl->name
<< " too large and are being reduced. Else saturations likely will happen \n";
// reduce weight scale according experimental heuristic
if (quant->_dst_quant.scale * quant->_src_quant.scale /
static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_100) {
quant->_weights_quant.scale *= _scale_reduction_50;
tmp_dst_quant_scale *= _scale_reduction_50;
} else if (quant->_dst_quant.scale * quant->_src_quant.scale /
quant->_weights_quant.scale *= _scale_reduction_50;
tmp_dst_quant_scale *= _scale_reduction_50;
} else if (quant->_dst_quant.scale * quant->_src_quant.scale /
static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_150) {
quant->_weights_quant.scale *= _scale_reduction_45;
tmp_dst_quant_scale *= _scale_reduction_45;
} else if (quant->_dst_quant.scale * quant->_src_quant.scale /
quant->_weights_quant.scale *= _scale_reduction_45;
tmp_dst_quant_scale *= _scale_reduction_45;
} else if (quant->_dst_quant.scale * quant->_src_quant.scale /
static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_200) {
quant->_weights_quant.scale *= _scale_reduction_40;
tmp_dst_quant_scale *= _scale_reduction_40;
} else {
quant->_weights_quant.scale *= _scale_reduction_35;
tmp_dst_quant_scale *= _scale_reduction_35;
quant->_weights_quant.scale *= _scale_reduction_40;
tmp_dst_quant_scale *= _scale_reduction_40;
} else {
quant->_weights_quant.scale *= _scale_reduction_35;
tmp_dst_quant_scale *= _scale_reduction_35;
}
}
}

quant->_dst_quant.scale = tmp_dst_quant_scale;
quant->_dst_quant.scale = quant->_weights_quant.scale * quant->_src_quant.scale;

return true;
}
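
For convolutions, the block above caps the weight scale with weights_reducer = MAX_VAL_2B_FEAT * scaleRange * dims[1] / INT32_MAX, clamped to at least 1.0, so the int32 accumulator cannot overflow across the input channels. A numeric illustration with assumed values (scaleRange is taken here as the 16-bit weight range; the actual constant depends on weightsSize):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <limits>

    int main() {
        // assumed illustrative values, not taken from the PR
        const double MAX_VAL_2B_FEAT = 16384.0;  // from quantization.h above
        const double scaleRange      = 16384.0;  // assumed: 16-bit weight range
        const double input_channels  = 128.0;    // dims[1] of the convolution input

        double weights_reducer =
            MAX_VAL_2B_FEAT * scaleRange * input_channels / std::numeric_limits<int32_t>::max();
        weights_reducer = std::max(1.0, weights_reducer);  // never increase the scale

        // the weight scale is then divided by this factor: scale /= weights_reducer
        std::printf("weights_reducer = %.2f\n", weights_reducer);  // ~16.00
        return 0;
    }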