diff --git a/inference-engine/src/gna_plugin/backend/dnn_types.h b/inference-engine/src/gna_plugin/backend/dnn_types.h
index 24e49210731cf8..530102e0adc4d0 100644
--- a/inference-engine/src/gna_plugin/backend/dnn_types.h
+++ b/inference-engine/src/gna_plugin/backend/dnn_types.h
@@ -45,11 +45,11 @@ struct DnnActivation {
         } pow;
         struct {
             int32_t levels;
-            // if input is perchannel quantisation - input pointers contains per-channer ranges
+            // if input is per-channel quantization - input pointers contain per-channel ranges
             int8_t inputPerChannel;
             float *input_low;
             float *input_high;
-            // if output is perchannel quantisation - output pointers contains per-channer ranges
+            // if output is per-channel quantization - output pointers contain per-channel ranges
             int8_t outputPerChannel;
             float *output_low;
             float *output_high;
diff --git a/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp b/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp
index 7536880fe1f5c3..b4999df47a2508 100644
--- a/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp
+++ b/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp
@@ -15,6 +15,7 @@ struct GNAFlags {
     bool uniformPwlDesign = false;
     bool gna_openmp_multithreading = false;
     bool sw_fp32 = false;
+    bool fake_quantized = false;
     bool performance_counting = false;
 };
 }  // namespace GNAPluginNS
diff --git a/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp b/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp
index 01a72e54ad12bc..0ccae929951929 100644
--- a/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp
+++ b/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp
@@ -83,6 +83,10 @@ struct QuantI8 : public QuantDescTmpl
     }
 };
 
+struct FakeQuantI16 : public QuantI16 {};
+struct FakeQuantI8 : public QuantI8 {};
+
+
 template <class A, class B>
 struct QuantPair {
     using MandatoryType = A;
@@ -115,9 +119,6 @@ inline bool shouldAlwaysAllocate() {
  */
 template<class A>
 class Quant {
- public:
-    template<class ...Args>
-    void operator()(Args && ... args) const { }
 };
 
 template<>
@@ -125,7 +126,9 @@ class Quant<QuantI16> {
  public:
    template<class ...Args>
    void operator()(Args && ... args) const {
-        QuantizeAffine16(std::forward<Args>(args)...);
+        QuantizationCallback<int16_t, int32_t> {
+            std::forward<Args>(args)...
+        }.runQuantize();
    }
 };
 
@@ -134,10 +137,35 @@ class Quant<QuantI8> {
  public:
    template<class ...Args>
    void operator()(Args && ... args) const {
-        QuantizeAffine8(std::forward<Args>(args)...);
+        QuantizationCallback<int8_t, gna_compound_bias_t> {
+            std::forward<Args>(args)...
+        }.runQuantize();
    }
 };
 
+template<>
+class Quant<FakeQuantI16> {
+ public:
+    template<class ...Args>
+    void operator()(Args && ... args) const {
+        QuantizationCallback<int16_t, int32_t> {
+            std::forward<Args>(args)...
+        }.runFakeQuantize();
+    }
+};
+
+template<>
+class Quant<FakeQuantI8> {
+ public:
+    template<class ...Args>
+    void operator()(Args && ... args) const {
+        QuantizationCallback<int8_t, gna_compound_bias_t>{
+            std::forward<Args>(args)...
+        }.runFakeQuantize();
+    }
+};
+
+
 template <class T>
 inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob::Ptr fp32_blob, InferenceEngine::Precision precision, float scale_factor) {
     auto prec_blob = InferenceEngine::make_shared_blob<T>({ precision,
@@ -273,6 +301,14 @@ inline void quantizeWeightsBiases(const QuantDesc & quantDesc,
     auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
     {
+        float *ptr_per_channel_weights_quants_min = nullptr;
+        float *ptr_per_channel_weights_quants_max = nullptr;
+
+        if (!quantData->_weights_quants_min.empty()) {
+            ptr_per_channel_weights_quants_min = &quantData->_weights_quants_min.front();
+            ptr_per_channel_weights_quants_max = &quantData->_weights_quants_max.front();
+        }
+
         fnc(wl->_weights->buffer().as<float *>(),
             wl->_biases ? wl->_biases->buffer().as<float *>() : nullptr,
             intWeights->buffer(),
@@ -283,7 +319,12 @@ inline void quantizeWeightsBiases(const QuantDesc & quantDesc,
             num_rows,
             num_columns,
             num_rows_padded,
-            num_columns_padded);
+            num_columns_padded,
+            quantData->levels,
+            nullptr,
+            nullptr,
+            ptr_per_channel_weights_quants_min,
+            ptr_per_channel_weights_quants_max);
     }
     wl->_weights = intWeights;
     wl->_biases = intBiases;
@@ -563,4 +604,9 @@ class LayersQuantizer : public frontend::DataQuantizerBase {
 
 using QuantI16 = frontend::QuantPair;
 using QuantI8 = frontend::QuantPair;
+
+using FakeQuantI16 = frontend::QuantPair;
+using FakeQuantI8 = frontend::QuantPair;
+
+
 }  // namespace GNAPluginNS
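Editor's note on the dispatch pattern used above: each `Quant<>` specialization aggregate-initializes a `QuantizationCallback` from the forwarded arguments and then calls either `runQuantize()` or `runFakeQuantize()`. Below is a minimal, self-contained sketch of that pattern with simplified, hypothetical types of my own (not the plugin's real `QuantizationCallback`/`QuantPair` definitions):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <utility>

// Simplified stand-in for QuantizationCallback: arguments are packed into one aggregate,
// and the caller decides which quantization routine to run on it.
template <class WeightsType>
struct Callback {
    const float *src;
    WeightsType *dst;
    std::size_t count;
    float scale;

    void runQuantize() const {       // scale and narrow FP32 values
        for (std::size_t i = 0; i < count; ++i)
            dst[i] = static_cast<WeightsType>(src[i] * scale);
    }
    void runFakeQuantize() const {   // values already hold integer levels; just narrow them
        for (std::size_t i = 0; i < count; ++i)
            dst[i] = static_cast<WeightsType>(src[i]);
    }
};

struct QuantI8Tag {};                // stand-ins for the QuantI8 / FakeQuantI8 descriptors
struct FakeQuantI8Tag {};

template <class Desc> class Quant {};   // primary template intentionally empty

template <> class Quant<QuantI8Tag> {
 public:
    template <class ...Args>
    void operator()(Args && ...args) const {
        Callback<int8_t>{std::forward<Args>(args)...}.runQuantize();
    }
};

template <> class Quant<FakeQuantI8Tag> {
 public:
    template <class ...Args>
    void operator()(Args && ...args) const {
        Callback<int8_t>{std::forward<Args>(args)...}.runFakeQuantize();
    }
};

int main() {
    const float src[3] = {0.5f, -0.25f, 1.0f};
    int8_t dst[3] = {};
    Quant<QuantI8Tag>{}(&src[0], &dst[0], std::size_t{3}, 100.0f);
    std::printf("%d %d %d\n", dst[0], dst[1], dst[2]);   // prints: 50 -25 100
    return 0;
}
```

Packing the arguments into a struct keeps the int16 and int8 paths behind one uniform call site while letting the fake-quantize variants reuse the same argument list.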
diff --git a/inference-engine/src/gna_plugin/frontend/quantization.cpp b/inference-engine/src/gna_plugin/frontend/quantization.cpp
index c0e24a75a75b06..128eb101463ae3 100644
--- a/inference-engine/src/gna_plugin/frontend/quantization.cpp
+++ b/inference-engine/src/gna_plugin/frontend/quantization.cpp
@@ -5,20 +5,25 @@
 #include
 #include
 #include
+#include
+#include
 #include "backend/gna_types.h"
 #include "quantization.h"
 
-void QuantizeAffine16(float *ptr_float_weights,
-                      float *ptr_float_biases,
-                      int16_t *ptr_int_weights,
-                      int32_t *ptr_int_biases,
-                      float input_scale_factor,
-                      float *ptr_weight_scale_factor,
-                      float *ptr_output_scale_factor,
-                      uint32_t num_rows,
-                      uint32_t num_columns,
-                      uint32_t num_rows_padded,
-                      uint32_t num_columns_padded) {
+#ifdef DEBUG
+#define QUANTWARNING(...) (fprintf(stderr, __VA_ARGS__))
+#else
+#define QUANTWARNING(...)
+#endif
+
+
+template<>
+void QuantizationCallback<int16_t, int32_t>::runFakeQuantize() const {
+    THROW_GNA_EXCEPTION << "int16 fake quantized models are not yet supported";
+}
+
+template<>
+void QuantizationCallback<int16_t, int32_t>::runQuantize() const {
     uint32_t num_saturate = 0;
 
     if (*ptr_weight_scale_factor == 1.0) {
@@ -149,11 +154,62 @@ void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t
     }
 }
 
-void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases,
-                     int8_t *ptr_int_weights, gna_compound_bias_t *ptr_int_biases,
-                     float input_scale_factor, float *ptr_weight_scale_factor,
-                     float *ptr_output_scale_factor, uint32_t num_rows, uint32_t num_columns,
-                     uint32_t num_rows_padded, uint32_t num_columns_padded) {
+template<>
+void QuantizationCallback<int8_t, gna_compound_bias_t>::runFakeQuantize() const {
+    // TODO: possibly remove this zero point
+    const float zeroPoint = MAX_VAL_1B_WEIGHT;
+    uint32_t num_saturate = 0;
+
+    if (fq_ptr_output_high == nullptr || fq_ptr_output_low == nullptr) {
+        THROW_GNA_EXCEPTION << "Fake quantized output range not set";
+    }
+    if (fq_levels == 0 || fq_levels == 1) {
+        THROW_GNA_EXCEPTION << "Fake quantized levels not set";
+    }
+
+    for (uint32_t i = 0; i < num_rows; i++) {
+        for (uint32_t j = 0; j < num_columns; j++) {
+            auto offset = i * num_columns + j;
+            auto normalizedWeight = ptr_float_weights[offset] - zeroPoint;
+            // range checking
+            if (normalizedWeight > MAX_VAL_1B_WEIGHT || normalizedWeight < -MAX_VAL_1B_WEIGHT) {
+                THROW_GNA_EXCEPTION << "unsupported weights range for I8 quantisation: " << ptr_float_weights[offset];
+            }
+            ptr_int_weights[offset] = static_cast<int8_t>(normalizedWeight);
+        }
+        if (fq_ptr_output_high == nullptr || fq_ptr_output_low == nullptr) {
+            THROW_GNA_EXCEPTION << "Fake quantized output range not set";
+        }
+        if (fq_levels == 0 || fq_levels == 1) {
+            THROW_GNA_EXCEPTION << "Fake quantized levels not set";
+        }
+        auto channel_scale = (fq_levels - 1) / (fq_ptr_output_high[i] - fq_ptr_output_low[i]);
+        auto channel_scale_multiplier = *ptr_weight_scale_factor / channel_scale;
+
+        ptr_int_biases[i].multiplier = static_cast<uint8_t>(channel_scale_multiplier);
+    }
+
+    if (ptr_float_biases != nullptr) {
+        for (uint32_t j = 0; j < num_rows; j++) {
+            float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
+            float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
+            if (value > 2147483647.0) {
+                ptr_int_biases[j].bias = 2147483647L;
+                num_saturate++;
+            } else if (value < -2147483648.0) {
+                ptr_int_biases[j].bias = -2147483648LL;
+                num_saturate++;
+            } else {
+                ptr_int_biases[j].bias = (int32_t) value;
+            }
+        }
+    }
+    if (num_saturate > 0) {
+        QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine8()\n", num_saturate, num_rows * num_columns + num_rows);
+    }
+}
+
+template<>
+void QuantizationCallback<int8_t, gna_compound_bias_t>::runQuantize() const {
     if (ptr_int_biases == nullptr) {
         THROW_IE_EXCEPTION << "Int biases are empty";
     }
diff --git a/inference-engine/src/gna_plugin/frontend/quantization.h b/inference-engine/src/gna_plugin/frontend/quantization.h
index d9316739024e39..2ca27c20d6e489 100644
--- a/inference-engine/src/gna_plugin/frontend/quantization.h
+++ b/inference-engine/src/gna_plugin/frontend/quantization.h
@@ -16,25 +16,34 @@
 #define MAX_VAL_2B_WEIGHT 16384
 #define MAX_VAL_2B_FEAT 16384
 #define MAX_VAL_4B_BIAS 1073741824
-#ifdef DEBUG
-#define QUANTWARNING(...) (fprintf(stderr, __VA_ARGS__))
-#else
-#define QUANTWARNING(...)
-#endif
-
-void QuantizeAffine16(float *ptr_float_weights,
-                      float *ptr_float_biases,
-                      int16_t *ptr_int_weights,
-                      int32_t *ptr_int_biases,
-                      float input_scale_factor,
-                      float *ptr_weight_scale_factor,
-                      float *ptr_output_scale_factor,
-                      uint32_t num_rows,
-                      uint32_t num_columns,
-                      uint32_t num_rows_padded,
-                      uint32_t num_columns_padded);
+
+template <class WeightsType, class BiasType>
+struct QuantizationCallback {
+    float *ptr_float_weights;
+    float *ptr_float_biases;
+    WeightsType* ptr_int_weights;
+    BiasType* ptr_int_biases;
+    float input_scale_factor;
+    float *ptr_weight_scale_factor;
+    float *ptr_output_scale_factor;
+    uint32_t num_rows;
+    uint32_t num_columns;
+    uint32_t num_rows_padded;
+    uint32_t num_columns_padded;
+
+    // TODO: copied from the fake quantize activation
+    int32_t fq_levels;
+    float *fq_ptr_input_low;
+    float *fq_ptr_input_high;
+    float *fq_ptr_output_low;
+    float *fq_ptr_output_high;
+
+    void runQuantize() const;
+    void runFakeQuantize() const;
+};
+
+template class QuantizationCallback<int16_t, int32_t>;
+template class QuantizationCallback<int8_t, gna_compound_bias_t>;
+
 float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements);
 void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t num_elements, float scale_factor);
-void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases, int8_t *ptr_int_weights, gna_compound_bias_t *ptr_int_biases,
-                     float input_scale_factor, float *ptr_weight_scale_factor, float *ptr_output_scale_factor,
-                     uint32_t num_rows, uint32_t num_columns, uint32_t num_rows_padded, uint32_t num_columns_padded);
diff --git a/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp b/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp
index 024d5b81f0b69a..1d0d2503ebbe89 100644
--- a/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp
+++ b/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp
@@ -15,6 +15,13 @@ struct Quantization {
 struct QuantizedLayerParams {
     Quantization _src_quant;
     Quantization _dst_quant;
+
+    // per-channel weights quantization data
+    int32_t levels;
+    std::vector<float> _weights_quants_min;
+    std::vector<float> _weights_quants_max;
+
+    // TODO: deprecate this
     Quantization _weights_quant;
     Quantization _bias_quant;
     float _o_shift = 0.0f;
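Editor's note: as a rough, standalone illustration of the per-channel arithmetic in the int8 `runFakeQuantize()` above, each output row gets its own scale derived from the FakeQuantize output range, and the compound-bias multiplier is the ratio of the common weight scale to that per-channel scale. The names and numbers below are illustrative only, not the plugin's API:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int32_t levels = 255;                    // int8 fake-quantize levels
    const float common_weight_scale = 127.0f;      // assumed common scale for the whole tensor
    std::vector<float> out_low  = {-1.0f, -0.5f};  // per-row FakeQuantize output_low
    std::vector<float> out_high = { 1.0f,  0.5f};  // per-row FakeQuantize output_high

    for (size_t row = 0; row < out_low.size(); ++row) {
        float channel_scale = (levels - 1) / (out_high[row] - out_low[row]);
        float multiplier    = common_weight_scale / channel_scale;   // would go into gna_compound_bias_t::multiplier
        std::printf("row %zu: channel_scale = %.1f, multiplier = %.3f\n", row, channel_scale, multiplier);
    }
    // row 0: channel_scale = 254 / 2 = 127, multiplier = 1.000
    // row 1: channel_scale = 254 / 1 = 254, multiplier = 0.500
    return 0;
}
```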
diff --git a/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp b/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp
index 8ec4a690cab78f..0c9fa6a27ee184 100644
--- a/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp
+++ b/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp
@@ -605,7 +605,8 @@ class ScaleFactorPerLayer {
                                                                 MAX_VAL_4B_BIAS,
                                                                 wl->_biases->size());
                 if (quant->_bias_quant.scale != -1.0f) {
-                    quant->_bias_quant.scale = std::min(quant->_weights_quant.scale * quant->_src_quant.scale, quant->_bias_quant.scale);
+                    quant->_bias_quant.scale =
+                        std::min(quant->_weights_quant.scale * quant->_src_quant.scale, quant->_bias_quant.scale);
                     quant->_weights_quant.scale = quant->_bias_quant.scale / quant->_src_quant.scale;
                 }
             }
@@ -616,7 +617,7 @@ class ScaleFactorPerLayer {
             }
 
             double weights_reducer = 1.0;
-            auto conv = dynamic_cast(wl);
+            auto conv = dynamic_cast(wl);
             if (conv) {
                 auto dims = conv->insData.front().lock()->getDims();
 
@@ -624,36 +625,35 @@ class ScaleFactorPerLayer {
                 weights_reducer = std::max(1.0, weights_reducer);
             }
             quant->_weights_quant.scale /= weights_reducer;
-        }
-
-        double tmp_dst_quant_scale = quant->_weights_quant.scale * quantDataForInputLayer->_dst_quant.scale;
+            double tmp_dst_quant_scale = quant->_weights_quant.scale * quant->_src_quant.scale;
 
-        if (weightsSize == 1 &&
-            static_cast(tmp_dst_quant_scale * quant->_src_quant.scale) >
-            static_cast(std::numeric_limits::max()-1) * _scale_change_req_threshold) {
-            gnawarn() << "Output scale for " << wl->name
-                << " too large and are being reduced. Else saturations likely will happen \n";
-            // reduce weight scale according experimental heuristic
-            if (quant->_dst_quant.scale * quant->_src_quant.scale /
+            if (weightsSize == 1 &&
+                static_cast(tmp_dst_quant_scale * quant->_src_quant.scale) >
+                static_cast(std::numeric_limits::max() - 1) * _scale_change_req_threshold) {
+                gnawarn() << "Output scale for " << wl->name
+                    << " too large and are being reduced. Else saturations likely will happen \n";
+                // reduce weight scale according experimental heuristic
+                if (quant->_dst_quant.scale * quant->_src_quant.scale /
                     static_cast(std::numeric_limits::max()) < _scale_change_threshold_100) {
-                quant->_weights_quant.scale *= _scale_reduction_50;
-                tmp_dst_quant_scale *= _scale_reduction_50;
-            } else if (quant->_dst_quant.scale * quant->_src_quant.scale /
+                    quant->_weights_quant.scale *= _scale_reduction_50;
+                    tmp_dst_quant_scale *= _scale_reduction_50;
+                } else if (quant->_dst_quant.scale * quant->_src_quant.scale /
                     static_cast(std::numeric_limits::max()) < _scale_change_threshold_150) {
-                quant->_weights_quant.scale *= _scale_reduction_45;
-                tmp_dst_quant_scale *= _scale_reduction_45;
-            } else if (quant->_dst_quant.scale * quant->_src_quant.scale /
+                    quant->_weights_quant.scale *= _scale_reduction_45;
+                    tmp_dst_quant_scale *= _scale_reduction_45;
+                } else if (quant->_dst_quant.scale * quant->_src_quant.scale /
                     static_cast(std::numeric_limits::max()) < _scale_change_threshold_200) {
-                quant->_weights_quant.scale *= _scale_reduction_40;
-                tmp_dst_quant_scale *= _scale_reduction_40;
-            } else {
-                quant->_weights_quant.scale *= _scale_reduction_35;
-                tmp_dst_quant_scale *= _scale_reduction_35;
+                    quant->_weights_quant.scale *= _scale_reduction_40;
+                    tmp_dst_quant_scale *= _scale_reduction_40;
+                } else {
+                    quant->_weights_quant.scale *= _scale_reduction_35;
+                    tmp_dst_quant_scale *= _scale_reduction_35;
+                }
             }
         }
-        quant->_dst_quant.scale = tmp_dst_quant_scale;
+        quant->_dst_quant.scale = quant->_weights_quant.scale * quant->_src_quant.scale;
 
         return true;
     }
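Editor's note: the rewritten block above keeps the invariant `dst_quant.scale = weights_quant.scale * src_quant.scale` and shrinks the weight scale when the combined value approaches the int32 accumulator limit. A hedged, standalone sketch of that idea follows; the threshold and reduction factor are placeholders of my own, not the plugin's tuned `_scale_change_*` / `_scale_reduction_*` constants:

```cpp
#include <cstdint>
#include <iostream>
#include <limits>

// Illustrative only: reduce the weights scale when the combined scale would push
// the int32 accumulator close to saturation, then recompute the destination scale.
static double propagateDstScale(double src_scale, double &weights_scale) {
    const double int32_max = static_cast<double>(std::numeric_limits<int32_t>::max());
    double dst_scale = weights_scale * src_scale;
    if (dst_scale * src_scale > (int32_max - 1)) {   // placeholder threshold
        weights_scale *= 0.5;                        // placeholder reduction factor
        dst_scale = weights_scale * src_scale;
    }
    return dst_scale;
}

int main() {
    double weights_scale = 16384.0;
    double dst = propagateDstScale(1024.0, weights_scale);
    // weights scale is halved to 8192, dst scale becomes 8388608
    std::cout << "weights_scale=" << weights_scale << " dst_scale=" << dst << "\n";
    return 0;
}
```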
diff --git a/inference-engine/src/gna_plugin/gna_plugin.cpp b/inference-engine/src/gna_plugin/gna_plugin.cpp
index e4eb06031d4010..748779900c881d 100644
--- a/inference-engine/src/gna_plugin/gna_plugin.cpp
+++ b/inference-engine/src/gna_plugin/gna_plugin.cpp
@@ -40,6 +40,7 @@
 #if GNA_LIB_VER == 2
 #include
+#include
 
 uint32_t ToByteSize(const Gna2DataType type) {
     switch (type) {
@@ -339,6 +340,80 @@ void GNAPlugin::InitGNADevice() {
     graphCompiler.setGNAMemoryPtr(gnamem);
 }
 
+void GNAPlugin::UpdateGnaQuantModeFromNetwork(InferenceEngine::ICNNNetwork & network) {
+    // fp32 emulation mode doesn't need any modifications to the configuration
+    if (config.gnaFlags.sw_fp32) return;
+
+    // search for FQ layers
+    // only the int16 and int8 cases are supported
+    auto it = details::CNNNetworkIterator(&network);
+    auto end = details::CNNNetworkIterator();
+    for (; it != end; it++) {
+        if (!LayerInfo(*it).isFakeQuantize()) {
+            continue;
+        }
+
+        GNAFakeQuantizeLayer fqLayer(*it);
+        auto inputLayer = fqLayer.getInputLayer();
+
+        // this fake quantize represents data quantization - not weights
+        if (!LayerInfo(inputLayer).isConst()) {
+            continue;
+        }
+        // checking weight precision - the weights are already quantized - so the quantisation mode chosen so far needs to be adjusted
+        const auto int8Levels = 255;
+        const auto int16Levels = 65535;
+        if (fqLayer.getLevels() != int8Levels && fqLayer.getLevels() != int16Levels) {
+            THROW_GNA_LAYER_EXCEPTION(*it)
+                << "unsupported quantisation scheme: number of levels is " << fqLayer.getLevels() << " while only "
+                << int8Levels << " or " << int16Levels << " are supported";
+        }
+        // also in mixed mode i8 should be stated as the target precision
+        if (fqLayer.getLevels() == int8Levels) {
+            config.gnaPrecision = InferenceEngine::Precision::I8;
+        }
+        gnaFlags->fake_quantized = true;
+        config.gnaFlags.fake_quantized = true;
+    }
+}
+
+void GNAPlugin::UpdateInputScaleFromNetwork(InferenceEngine::ICNNNetwork & network) {
+    // fp32 emulation mode doesn't need any modifications to the configuration
+    if (config.gnaFlags.sw_fp32) return;
+
+    // search for FQ layers
+    // only the int16 and int8 cases are supported
+    InputsDataMap inputs;
+    network.getInputsInfo(inputs);
+    for (auto && input : inputs) {
+        auto data = input.second->getInputData();
+        size_t inputIdx = 0;
+        for (auto && nextToInputLayer : getInputTo(data)) {
+            if (!LayerInfo(nextToInputLayer.second).isFakeQuantize()) {
+                inputIdx++;
+                continue;
+            }
+            // replacing the scale factor with the one from this FQ layer
+            GNAFakeQuantizeLayer fqLayer(nextToInputLayer.second);
+            auto inputRange = fqLayer.getInputRange();
+            auto outputRange = fqLayer.getOutputRange();
+            if (inputRange.first.size() != 1 || inputRange.second.size() != 1 ||
+                outputRange.first.size() != 1 || outputRange.second.size() != 1) {
+                THROW_GNA_LAYER_EXCEPTION(nextToInputLayer.second)
+                    << "unsupported per-channel quantisation for input layer : " << input.second->name();
+            }
+            float scaleInput = (inputRange.second[0] - inputRange.first[0]) / (fqLayer.getLevels() - 1);
+            float scaleOutputs = (outputRange.second[0] - outputRange.first[0]) / (fqLayer.getLevels() - 1);
+
+            // TODO: proper mapping into scale factors
+            config.inputScaleFactors[inputIdx] = 1 / scaleInput;
+            inputsDesc->inputScaleFactors[inputIdx] = 1 / scaleInput;
+
+            inputIdx++;
+        }
+    }
+}
+
 void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
     std::shared_ptr convertedNetwork;
     if (_network.getFunction()) {
@@ -356,6 +431,10 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
         THROW_GNA_EXCEPTION << error.c_str();
     }
 
+    // FQ networks now replace certain flags in the plugin - these flags will be overwritten
+    UpdateGnaQuantModeFromNetwork(network);
+    UpdateInputScaleFromNetwork(network);
+
     // network optimisation phases
     int passIdx = 0;
     auto run_passes = [&] (const CNNNetPtr& network, bool runBeforeCopy) {
@@ -369,6 +448,7 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
 
         // fake quantisation aware passes
         passes->registerPass();
+        passes->registerPass<MoveFakeQuantizeLayerIntoQuantParamsPass>();
         passes->registerPass();
         passes->registerPass();
 
@@ -405,6 +485,19 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
         // to run all passes need to have two calls to pass manager
         run_passes(newNet, true);
         run_passes(newNet, false);
+    } else if (gnaFlags->fake_quantized) {
+        switch (config.gnaPrecision) {
+            case Precision::I16:
+                ModelQuantizer<FakeQuantI16> q16;
+                newNet = q16.quantize(network, run_passes, inputsDesc->inputScaleFactors);
+                break;
+            case Precision::I8:
+                ModelQuantizer<FakeQuantI8> q8;
+                newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors);
+                break;
+            default:
+                THROW_GNA_EXCEPTION << "unsupported GNA precision for quantisation: " << config.gnaPrecision;
+        }
     } else {
         switch (config.gnaPrecision) {
             case Precision::I16:
@@ -416,8 +509,7 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
                 newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors);
                 break;
             default:
-                THROW_GNA_EXCEPTION << "no mans land for GNA precision";
-                break;
+                THROW_GNA_EXCEPTION << "unsupported GNA precision for quantisation: " << config.gnaPrecision;
         }
     }
 
@@ -967,7 +1059,7 @@ uint32_t GNAPlugin::QueueInference(const InferenceEngine::BlobMap &inputs, Infer
 #ifdef PLOT
     dnn->BeginNewWrite(dnn_dump_write_index);
     if (dnn->num_components() != 0) {
-        dnn->WriteDnnText("Net_.txt", kDnnFloat);
+        dnn->WriteDnnText("Net_.txt", kDnnInt);
     }
     dnn_dump_write_index++;
 #endif
diff --git a/inference-engine/src/gna_plugin/gna_plugin.hpp b/inference-engine/src/gna_plugin/gna_plugin.hpp
index 99eda6c07d54f9..5945af7537b1aa 100644
--- a/inference-engine/src/gna_plugin/gna_plugin.hpp
+++ b/inference-engine/src/gna_plugin/gna_plugin.hpp
@@ -221,6 +221,8 @@ class GNAPlugin : public InferenceEngine::IInferencePlugin {
                      int idx = 0);
 
     void UpdateFieldsFromConfig();
+    void UpdateGnaQuantModeFromNetwork(InferenceEngine::ICNNNetwork &);
+    void UpdateInputScaleFromNetwork(InferenceEngine::ICNNNetwork &);
 };
 
 }  // namespace GNAPluginNS
diff --git a/inference-engine/src/gna_plugin/gna_plugin_log.hpp b/inference-engine/src/gna_plugin/gna_plugin_log.hpp
index b3d5dc249ed9e6..ab399636b61cbf 100644
--- a/inference-engine/src/gna_plugin/gna_plugin_log.hpp
+++ b/inference-engine/src/gna_plugin/gna_plugin_log.hpp
@@ -72,5 +72,5 @@ if (!(expr)) { \
 }
 #define THROW_GNA_EXCEPTION THROW_IE_EXCEPTION << "[GNAPlugin] in function " << __PRETTY_FUNCTION__<< ": "
 #define THROW_GNA_LAYER_EXCEPTION(layer) THROW_GNA_EXCEPTION << LAYER_NAME(layer)
-#define LAYER_NAME(layer) layer->type << " layer : \"" << layer->name << "\" "
+#define LAYER_NAME(layer) (layer)->type << " layer : \"" << (layer)->name << "\" "
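Editor's note: `UpdateInputScaleFromNetwork` above derives the input scale factor as the reciprocal of one FakeQuantize quantization step. A small illustrative sketch of that formula (function name and values are mine, not plugin API):

```cpp
#include <cstdio>

// Illustrative only: derive an input scale factor from a FakeQuantize input range,
// mirroring scaleInput = (max - min) / (levels - 1) and scale = 1 / scaleInput above.
static float inputScaleFromFqRange(float range_min, float range_max, int levels) {
    float step = (range_max - range_min) / (levels - 1);   // size of one quantization step
    return 1.0f / step;                                     // scale factor applied to FP32 inputs
}

int main() {
    // e.g. a [-1, 1] input range quantized with 65535 levels (the int16 case)
    std::printf("scale = %.1f\n", inputScaleFromFqRange(-1.0f, 1.0f, 65535));  // prints: scale = 32767.0
    return 0;
}
```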
diff --git a/inference-engine/src/gna_plugin/layers/gna_fake_quantize_layer.hpp b/inference-engine/src/gna_plugin/layers/gna_fake_quantize_layer.hpp
index 20520c5e1e716f..892f153667aac0 100644
--- a/inference-engine/src/gna_plugin/layers/gna_fake_quantize_layer.hpp
+++ b/inference-engine/src/gna_plugin/layers/gna_fake_quantize_layer.hpp
@@ -16,17 +16,19 @@ class GNAFakeQuantizeLayer {
     InferenceEngine::CNNLayerPtr fqLayer;
  public :
     GNAFakeQuantizeLayer(InferenceEngine::CNNLayerPtr fqLayer)
-        : fqLayer(fqLayer) {}
+        : fqLayer(fqLayer) {
+        if (!LayerInfo(fqLayer).isFakeQuantize()) {
+            THROW_GNA_LAYER_EXCEPTION(fqLayer) << "cannot parse as fake quantize";
+        }
+    }
 
     /**
      * @brief convert FQ layer directly to gna-pwl activation layer
      */
     DnnActivation parseAsActivation() const {
         DnnActivation fqActivation;
-        if (!LayerInfo(fqLayer).isFakeQuantize()) {
-            THROW_GNA_LAYER_EXCEPTION(fqLayer) << "cannot parse as fake quantize";
-        }
-        fqActivation.args.fakeQuantize.levels = fqLayer->GetParamAsInt("levels");
+
+        fqActivation.args.fakeQuantize.levels = fqLayer->GetParamAsInt("levels");
         auto inputShape = getShapeForRange(fqLayer, 1);
         auto outputShape = getShapeForRange(fqLayer, 3);
 
@@ -52,8 +54,50 @@ class GNAFakeQuantizeLayer {
         return LayerUtils::getParamFromInputAsBlob(fqLayer, 0);
     }
 
+    /**
+     * fake quantize has 5 inputs; 4 of them are always constant layers, and 1 may be a tensor connection
+     */
+    InferenceEngine::CNNLayerPtr getInputLayer() const {
+        return getInputLayerAt(fqLayer, 0);
+    }
+
+    int32_t getLevels() {
+        return fqLayer->GetParamAsInt("levels");
+    }
+
+    std::pair<std::vector<float>, std::vector<float>> getInputRange() {
+        return getRange(fqLayer, 1);
+    }
+
+    std::pair<std::vector<float>, std::vector<float>> getOutputRange() {
+        return getRange(fqLayer, 3);
+    }
+
+    operator InferenceEngine::CNNLayerPtr () const {
+        return fqLayer;
+    }
+
+    InferenceEngine::CNNLayerPtr operator -> () const {
+        return fqLayer;
+    }
+    InferenceEngine::CNNLayerPtr operator * () const {
+        return fqLayer;
+    }
  protected :
+    static std::pair<std::vector<float>, std::vector<float>> getRange(InferenceEngine::CNNLayerPtr input, size_t idx) {
+        auto shape = getShapeForRange(input, idx);
+        auto rangeSize = InferenceEngine::details::product(shape.begin(), shape.end());
+
+        auto minPtr = getParamFromInputAsFloats(input, idx);
+        std::vector<float> minValues(minPtr, minPtr + rangeSize);
+
+        auto maxPtr = getParamFromInputAsFloats(input, idx + 1);
+        std::vector<float> maxValues(maxPtr, maxPtr + rangeSize);
+
+        return {minValues, maxValues};
+    }
+
     static float* getParamFromInputAsFloats(InferenceEngine::CNNLayerPtr input, size_t idx) {
         auto data = LayerUtils::getParamFromInputAsBlob(input, idx);
         return data->buffer().as<float*>();
@@ -64,6 +108,23 @@ class GNAFakeQuantizeLayer {
         return data->getTensorDesc().getDims();
     }
 
+    static InferenceEngine::CNNLayerPtr getInputLayerAt(InferenceEngine::CNNLayerPtr input, size_t idx) {
+        if (input->insData.size() <= idx) {
+            THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from input " << idx;
+        }
+        auto iLayerData = input->insData[idx].lock();
+        if (!iLayerData) {
+            THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from input " << idx
+                << ": cannot dereference data weak-pointer";
+        }
+        auto iLayer = getCreatorLayer(iLayerData).lock();
+        if (!iLayer) {
+            THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from input " << idx
+                << ": cannot dereference creator layer weak-pointer";
+        }
+        return iLayer;
+    }
+
     static InferenceEngine::SizeVector getShapeForRange(InferenceEngine::CNNLayerPtr input, size_t idx) {
         auto lowShape = getShapeFromInput(input, idx);
         auto highShape = getShapeFromInput(input, idx + 1);
diff --git a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp
index 281125605b8059..b6b18643ef6822 100644
--- a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp
+++ b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp
@@ -35,6 +35,7 @@
 #include "gna_pass_manager.hpp"
 #include "layers/gna_layer_info.hpp"
 #include "gna_upstream_iterator.hpp"
+#include "frontend/quantization.h"
 
 using namespace InferenceEngine;
 
@@ -1450,6 +1451,17 @@ void FuseFQIntoWeightsPass::run() {
         return LayerInfo(ptr).isNonFunctional();
     };
 
+    auto assignWeightsAndBiases = [](CNNLayerPtr layer, Blob::Ptr weights, Blob::Ptr biases) {
+        auto weightableLayer = std::dynamic_pointer_cast<WeightableLayer>(layer);
+        if (nullptr == weightableLayer) {
+            THROW_GNA_LAYER_EXCEPTION(layer) << " not a weightable layer";
+        }
+        weightableLayer->_weights = weights;
+        weightableLayer->_biases = biases;
+        weightableLayer->blobs["weights"] = weights;
+        weightableLayer->blobs["biases"] = biases;
+    };
+
     for (auto &l : *pLayers) {
         if (!LayerInfo(l).isFakeQuantize()) {
             continue;
         }
@@ -1479,7 +1491,10 @@ void FuseFQIntoWeightsPass::run() {
         pass_trace() << "found " << LAYER_NAME(fqLayer) << " that will be converted to weights of "
             << LAYER_NAME(weightableLayer) << "\n";
 
+        GNAFakeQuantizeLayer gnaFakeQuantizeLayer(fqLayer);
+        auto biases = LayerUtils::getParamFromInputAsBlob(weightableLayer, 2);
+        auto quantizedWeights = gnaFakeQuantizeLayer.getConstInputData();
 
         // 1. broke existing connections - by detaching fq subgraph from rest of graph
         auto prevData = weightableLayer->insData[1].lock();
@@ -1495,37 +1510,109 @@ void FuseFQIntoWeightsPass::run() {
         auto outputSize = details::product(weightDims.begin(), weightDims.end());
 
         // depending on compute precision weights will be recreated
-        auto quantized = InferenceEngine::getInjectedData(fqLayer);
+        // for integer mode - weights might be simply copied - to avoid further quantisation overhead
+        auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(weightableLayer);
         if (quantized) {
-            THROW_GNA_LAYER_EXCEPTION(fqLayer) << " not supported for non FP32 precision yet";
+            // assign the already quantized weights
+            assignWeightsAndBiases(weightableLayer, quantizedWeights, biases);
+
+            // modify scale factors for the quantized component
+            auto outputRange = gnaFakeQuantizeLayer.getOutputRange();
+            quantized->_weights_quants_min.insert(
+                quantized->_weights_quants_min.end(), outputRange.first.begin(), outputRange.first.end());
+
+            quantized->_weights_quants_max.insert(
+                quantized->_weights_quants_max.end(), outputRange.second.begin(), outputRange.second.end());
+
+            quantized->levels = gnaFakeQuantizeLayer.getLevels();
+            auto inputRange = gnaFakeQuantizeLayer.getInputRange();
+
+            // find the minimum scale factor among channels
+            if (quantized->_weights_quants_min.empty()) {
+                THROW_GNA_LAYER_EXCEPTION(fqLayer) << " per-channel weights scales are missing";
+            }
+            auto getScale = [&quantized](uint32_t i) {
+                return (quantized->levels - 1) / (quantized->_weights_quants_max[i] - quantized->_weights_quants_min[i]);
+            };
+            float min_channel_scale = getScale(0);
+            for (uint32_t i = 1; i < quantized->_weights_quants_min.size(); i++) {
+                min_channel_scale = std::min(min_channel_scale, getScale(i));
+            }
+            // common scale calculation
+            quantized->_weights_quant.scale = min_channel_scale * MAX_OUT_MULTIPLIER;
+            continue;
         }
 
         intel_dnn_component_t component;
-        GNAFakeQuantizeLayer gnaFakeQuantizeLayer(fqLayer);
-
         component.num_columns_in = weightDims[1];
         component.num_rows_in = weightDims[0];
 
         intel_piecewiselinear_t *transform = reinterpret_cast<intel_piecewiselinear_t *>(&component.op.pwl);
         transform->func_id = gnaFakeQuantizeLayer.parseAsActivation();
 
-        auto inputData = gnaFakeQuantizeLayer.getConstInputData();
-        auto inputBuffer = inputData->buffer();
-        component.ptr_inputs = inputBuffer.as();
+        auto quantizedWeightsData = quantizedWeights->buffer();
+        component.ptr_inputs = quantizedWeightsData.as();
 
-        auto resultBlob = make_shared_blob(TensorDesc(Precision::FP32, {outputSize}, Layout::C));
-        resultBlob->allocate();
+        auto dequantizedWeights = make_shared_blob<float>(TensorDesc(Precision::FP32, {outputSize}, Layout::C));
+        dequantizedWeights->allocate();
 
-        auto resultBuffer = resultBlob->buffer();
+        auto resultBuffer = dequantizedWeights->buffer();
         component.ptr_outputs = resultBuffer.as();
 
         PwlApply32(&component, 0, component.num_rows_in - 1, 0, component.num_columns_in - 1);
 
-        // 3. assign quantized const blob to weightable layer
-        auto weigtableLayerCasted = std::dynamic_pointer_cast(weightableLayer);
-        weigtableLayerCasted->_weights = resultBlob;
-        weigtableLayerCasted->_biases = biases;
-        weigtableLayerCasted->blobs["weights"] = resultBlob;
-        weigtableLayerCasted->blobs["biases"] = biases;
+        // 3. assign dequantized const blob to weightable layer
+        assignWeightsAndBiases(weightableLayer, dequantizedWeights, biases);
+    }
+}
+
+void MoveFakeQuantizeLayerIntoQuantParamsPass::run() {
+    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(pLayers->front());
+    if (!quantized) {
+        return;
+    }
+
+    auto donotSkip = [](CNNLayerPtr) {
+        return false;
+    };
+    for (auto &&l : *pLayers) {
+        if (!LayerInfo(l).isFakeQuantize()) {
+            continue;
+        }
+        GNAFakeQuantizeLayer fqLayer(l);
+
+        auto nextLayer = CNNNetGetNextLayerSkipCertain(*fqLayer, 0, 0, donotSkip).first;
+        auto prevLayer = CNNNetPrevLayerSkipCertain(*fqLayer, 0, donotSkip);
+
+        if (prevLayer->outData.size() != 1) {
+            THROW_GNA_LAYER_EXCEPTION(prevLayer) << " fake quantize input connected to something else is not supported";
+        }
+        auto insDatas = CNNLayerFindInsDataIdxes(fqLayer->outData.front(), nextLayer);
+
+        if (insDatas.size() != 1) {
+            THROW_GNA_LAYER_EXCEPTION(fqLayer) << " fake quantize connection to layer: " << LAYER_NAME(nextLayer) << " is not correct";
+        }
+        nextLayer->insData[insDatas.front()] = prevLayer->outData.front();
+        getInputTo(prevLayer->outData.front()).clear();
+        getInputTo(prevLayer->outData.front())[nextLayer->name] = nextLayer;
+
+        // after the layer gets removed, absorb its params into the QuantParams structure
+
+        // replacing the scale factor with the one from this FQ layer
+        auto inputRange = fqLayer.getInputRange();
+        auto outputRange = fqLayer.getOutputRange();
+        if (inputRange.first.size() != 1 || inputRange.second.size() != 1 ||
+            outputRange.first.size() != 1 || outputRange.second.size() != 1) {
+            THROW_GNA_LAYER_EXCEPTION(fqLayer)
+                << "unsupported per-channel quantisation for layer : " << nextLayer->name;
+        }
+        float scaleInput = (inputRange.second[0] - inputRange.first[0]) / (fqLayer.getLevels() - 1);
+        float scaleOutputs = (outputRange.second[0] - outputRange.first[0]) / (fqLayer.getLevels() - 1);
+
+        auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(nextLayer);
+
+        // TODO: proper mapping into scale factors
+        quantParams->_src_quant.scale = 1 / scaleInput;
+    }
+}
diff --git a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.hpp b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.hpp
index fc5d131ba5f013..6111ba2c2a9219 100644
--- a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.hpp
+++ b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.hpp
@@ -185,6 +185,11 @@ DECL_PASS(BroadcastConst);
 */
 DECL_PASS(FuseFQIntoWeights);
 
+/**
+* @brief removes all fake quantize layers while moving their settings into the QuantParams of the consuming layer
+*/
+DECL_PASS(MoveFakeQuantizeLayerIntoQuantParams);
+
 struct PassManagerSettings {
     Policy policy;
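Editor's note: in the FP32 branch of FuseFQIntoWeightsPass above, the stored integer-level weights are dequantized by running them through the FakeQuantize transfer function (in the plugin this goes through PwlApply32 with the activation built by parseAsActivation()). Below is a standalone sketch of that standard FakeQuantize mapping, with illustrative names and values of my own:

```cpp
#include <cmath>
#include <cstdio>

// Standalone reference of the standard FakeQuantize transfer function; not the
// plugin's PwlApply32 path, only an illustration of the mapping it applies.
static float fakeQuantize(float x, float in_lo, float in_hi, float out_lo, float out_hi, int levels) {
    if (x <= in_lo) return out_lo;
    if (x >= in_hi) return out_hi;
    float in_step  = (in_hi - in_lo) / (levels - 1);    // size of one input quantization step
    float out_step = (out_hi - out_lo) / (levels - 1);  // size of one output quantization step
    return std::round((x - in_lo) / in_step) * out_step + out_lo;
}

int main() {
    // weights stored as integer levels 0..254 dequantized back to the [-1, 1] range
    std::printf("%f\n", fakeQuantize(127.0f, 0.0f, 254.0f, -1.0f, 1.0f, 255));  // ~0.0
    std::printf("%f\n", fakeQuantize(254.0f, 0.0f, 254.0f, -1.0f, 1.0f, 255));  // 1.0
    return 0;
}
```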