Skip to content

Commit 92ed322

Browse files
committed
[CPU] FullyConnected: fixed primitive caching for sparse decompression case
[CPU][oneDNN] sparsity: some fixes and removed unused code
[CPU][TESTS] FullyConnected: sparsity weights decompression tests
[CPU] FullyConnected: removed min sparse rate = 0.5 limitation
[CPU] fixed property CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE
1 parent af3c789 commit 92ed322

File tree

6 files changed

+351
-12
lines changed

6 files changed

+351
-12
lines changed

src/inference/include/openvino/runtime/intel_cpu/properties.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ namespace intel_cpu {
4747
*/
4848
static constexpr Property<bool> denormals_optimization{"CPU_DENORMALS_OPTIMIZATION"};
4949

50-
static constexpr Property<float> sparse_weights_decompression_rate{"SPARSE_WEIGHTS_DECOMPRESSION_RATE"};
50+
static constexpr Property<float> sparse_weights_decompression_rate{"CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE"};
5151

5252
} // namespace intel_cpu
5353
} // namespace ov

src/plugins/intel_cpu/src/nodes/fullyconnected.cpp

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -591,7 +591,7 @@ void FullyConnected::createDescriptorInternal(const dnnl::memory::desc &inputDes
591591
dnnl::memory::desc wgh_candidate;
592592
if (useSparseWeights) {
593593
wgh_candidate = { DnnlExtensionUtils::convertToDnnlDims(getInputShapeAtPort(WEIGHTS_ID).getStaticDims()),
594-
wdt, memory::desc::packed(nnzCount) };
594+
wdt, memory::desc::packed() };
595595
} else {
596596
wgh_candidate = { DnnlExtensionUtils::convertToDnnlDims(getInputShapeAtPort(WEIGHTS_ID).getStaticDims()),
597597
wdt, dnnl::memory::format_tag::any };
@@ -930,18 +930,12 @@ bool FullyConnected::useSparseWeightsDecompression() {
930930
zerosCounts++;
931931
}
932932
}
933-
nnzCount = elementsCount - zerosCounts;
934933

935934
DEBUG_LOG(getName(), ", weightsData.size() = ", elementsCount, ", zerosCounts = ",
936-
zerosCounts, ", nnzCount = ", nnzCount);
935+
zerosCounts, ", nnzCount = ", elementsCount - zerosCounts);
937936

938937
weiSparseRate = static_cast<float>(zerosCounts) / static_cast<float>(elementsCount);
939938

940-
// [av] WA: there is no point in using sparse decompression when the sparse rate is low
941-
// todo: add heuristic
942-
if (minSparseRate < 0.5)
943-
minSparseRate = 0.5;
944-
945939
DEBUG_LOG(getName(), " | sparse rate = ", weiSparseRate * 100, "%, min sparse rate = ",
946940
minSparseRate * 100, "%, use sparse weights = ", weiSparseRate >= minSparseRate);
947941

src/plugins/intel_cpu/src/nodes/fullyconnected.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@ class FullyConnected : public Node {
4242

4343
void initSupportedPrimitiveDescriptors() override;
4444
void initOptimalPrimitiveDescriptor() override;
45-
// void createPrimitive() override;
4645
std::shared_ptr<MemoryDesc> getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
4746
std::shared_ptr<MemoryDesc> getDstMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
4847

@@ -112,7 +111,6 @@ class FullyConnected : public Node {
112111

113112
// sparse weights
114113
bool useSparseWeights = false;
115-
int nnzCount = -1;
116114
float minSparseRate = 1.f;
117115
float weiSparseRate = 0.f;
118116
bool useSparseWeightsDecompression();
Lines changed: 343 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,343 @@
1+
// Copyright (C) 2022-2023 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#include "shared_test_classes/single_layer/mat_mul.hpp"
6+
#include "shared_test_classes/base/ov_subgraph.hpp"
7+
#include "ie_precision.hpp"
8+
#include "test_utils/fusing_test_utils.hpp"
9+
#include "ngraph_functions/builders.hpp"
10+
#include <string>
11+
#include <ov_ops/type_relaxed.hpp>
12+
#include "shared_test_classes/base/utils/generate_inputs.hpp"
13+
#include "cpu/cpu_config.hpp"
14+
15+
using namespace ngraph;
16+
using namespace InferenceEngine;
17+
using namespace CPUTestUtils;
18+
using namespace ov::test;
19+
20+
namespace CPULayerTestsDefinitions {
21+
22+
struct ShapeRelatedParams {
23+
std::vector<InputShape> inputShapes;
24+
std::pair<bool, bool> transpose;
25+
};
26+
27+
typedef std::tuple<
28+
ShapeRelatedParams,
29+
ElementType, // Input precision
30+
ElementType, // Weights precision
31+
ElementType, // Output precision
32+
fusingSpecificParams,
33+
CPUSpecificParams,
34+
std::map<std::string, std::string>, // Additional config
35+
float // Weights sparse rate
36+
> MatMulSparseParamSet;
37+
38+
class MatMulSparseCPUTest : public testing::WithParamInterface<MatMulSparseParamSet>,
39+
virtual public SubgraphBaseTest, public CpuTestWithFusing {
40+
public:
41+
/// Builds the human-readable test-case name from the parameter tuple.
///
/// The name lists the partial input shapes ("IS="), the target static shapes
/// ("TS="), the transpose flags, the three precisions, the fusing and CPU
/// specific suffixes, the optional plugin configuration and finally the
/// weights sparse rate.
static std::string getTestCaseName(const testing::TestParamInfo<MatMulSparseParamSet>& obj) {
    const auto& params = obj.param;
    const ShapeRelatedParams& shapeParams = std::get<0>(params);
    const ElementType inPrc = std::get<1>(params);
    const ElementType weiPrc = std::get<2>(params);
    const ElementType outPrc = std::get<3>(params);
    fusingSpecificParams fusing = std::get<4>(params);
    CPUSpecificParams cpu = std::get<5>(params);
    const std::map<std::string, std::string>& pluginConfig = std::get<6>(params);
    const float sparseRate = std::get<7>(params);

    std::ostringstream name;

    name << "IS=";
    for (const auto& inputShape : shapeParams.inputShapes) {
        name << CommonTestUtils::partialShape2str({inputShape.first}) << "_";
    }

    name << "TS=";
    for (const auto& inputShape : shapeParams.inputShapes) {
        const auto& targetShapes = inputShape.second;
        name << "(";
        // Target shapes of one input are joined with '_' (no trailing separator).
        for (size_t i = 0; i < targetShapes.size(); ++i) {
            if (i != 0) {
                name << "_";
            }
            name << CommonTestUtils::vec2str(targetShapes[i]);
        }
        name << ")_";
    }

    name << "transpose_a=" << shapeParams.transpose.first << "_";
    name << "transpose_b=" << shapeParams.transpose.second << "_";
    name << "inType=" << inPrc << "_";
    name << "weiType=" << weiPrc << "_";
    name << "outType=" << outPrc << "_";
    name << CpuTestWithFusing::getTestCaseName(fusing);
    name << CPUTestsBase::getTestCaseName(cpu);

    if (!pluginConfig.empty()) {
        name << "_PluginConf";
        for (const auto& entry : pluginConfig) {
            name << "_" << entry.first << "=" << entry.second;
        }
    }
    name << "_weiSparseRate=" << sparseRate;

    return name.str();
}
85+
86+
protected:
87+
std::string cpuNodeType;
88+
89+
/// Swaps the last two elements of `shape` in place.
/// `shape` must hold at least two elements (checked with IE_ASSERT).
template<typename T>
void transpose(T& shape) {
    IE_ASSERT(shape.size() > 1);
    const auto lastIt = shape.end() - 1;
    const auto beforeLastIt = lastIt - 1;
    std::swap(*beforeLastIt, *lastIt);
}
94+
95+
/// Generates `vec_len` int8 values of which roughly `sparseRate` are zero.
///
/// @param vec_len    number of elements to generate; 0 yields an empty vector
/// @param sparseRate threshold in [0, 1]: an interior element becomes zero when a
///                   uniform draw from [0, 1) is not greater than this value
/// @param upTo       upper bound (inclusive) of the non-zero values; also the value
///                   stored in the last element
/// @param startFrom  lower bound (inclusive) of the non-zero values; also the value
///                   stored in the first element
/// @param seed       seed of the generator that produces the non-zero values
/// @return the generated vector; the achieved zero percentage is printed to stdout
std::vector<int8_t> inline generateSparseVector(size_t vec_len,
                                                float sparseRate = 0.0f,
                                                int8_t upTo = 10,
                                                int8_t startFrom = 1,
                                                int32_t seed = 1) {
    std::vector<int8_t> res(vec_len);
    // An empty request has no first/last element to pin and no rate to report;
    // indexing res[0] / res[vec_len - 1] or dividing by vec_len would be invalid.
    if (vec_len == 0) {
        return res;
    }

    // Generator for the non-zero values.
    std::mt19937 gen(seed);
    std::uniform_int_distribution<long> dist(static_cast<long>(startFrom), static_cast<long>(upTo));

    // Separate, fixed-seed generator that decides which elements are zeroed.
    std::mt19937 gen_f(123);
    std::uniform_real_distribution<float> dist_f(0.f, 1.f);

    size_t countZero = 0;

    // The boundary elements are always set to the range limits and never zeroed.
    res.front() = startFrom;
    res.back() = upTo;
    // `i + 1 < vec_len` instead of `i < vec_len - 1` so the bound cannot wrap around.
    for (size_t i = 1; i + 1 < vec_len; i++) {
        if (dist_f(gen_f) > sparseRate) {
            res[i] = static_cast<int8_t>(dist(gen));
        } else {
            res[i] = 0;
            countZero++;
        }
    }

    std::cout << "Sparse rate = " << countZero * 100 / vec_len << "%" << std::endl;

    return res;
}
124+
125+
std::shared_ptr<Node> makeMatMulRelaxed(const Output<Node>& A,
126+
const ov::PartialShape& inShapeB,
127+
ElementType weiType,
128+
bool transpose_a,
129+
bool transpose_b,
130+
const std::vector<int8_t>& weiData) {
131+
using namespace ngraph;
132+
auto inputParamsFP32 = builder::makeDynamicParams(element::f32, {A.get_partial_shape()});
133+
auto matrixBFP32 = builder::makeDynamicInputLayer(element::f32, helpers::InputLayerType::CONSTANT, inShapeB);
134+
135+
auto matMulRelaxed = std::make_shared<op::TypeRelaxed<opset3::MatMul>>(
136+
*as_type_ptr<opset3::MatMul>(builder::makeMatMul(inputParamsFP32[0], matrixBFP32, transpose_a, transpose_b)),
137+
element::f32);
138+
139+
auto matrixB = ngraph::builder::makeConstant<int8_t>(weiType, inShapeB.get_shape(), weiData);
140+
141+
auto matMul = matMulRelaxed->copy_with_new_inputs({A, matrixB});
142+
143+
return matMul;
144+
}
145+
146+
void SetUp() override {
147+
abs_threshold = 0.5f;
148+
using ngraph::pass::ConvertPrecision;
149+
150+
ShapeRelatedParams shapeRelatedParams;
151+
ElementType inType, weiType, outType;
152+
fusingSpecificParams fusingParams;
153+
CPUSpecificParams cpuParams;
154+
std::map<std::string, std::string> additionalConfig;
155+
float weiSparseRate;
156+
157+
std::tie(shapeRelatedParams, inType, weiType, outType, fusingParams, cpuParams, additionalConfig,
158+
weiSparseRate) = this->GetParam();
159+
std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
160+
161+
configuration.insert(additionalConfig.begin(), additionalConfig.end());
162+
163+
init_input_shapes(shapeRelatedParams.inputShapes);
164+
165+
bool transpA = shapeRelatedParams.transpose.first;
166+
bool transpB = shapeRelatedParams.transpose.second;
167+
168+
if (transpA) {
169+
transpose(inputDynamicShapes[0]);
170+
for (auto& shapes : targetStaticShapes) {
171+
transpose(shapes[0]);
172+
}
173+
}
174+
if (transpB) {
175+
transpose(inputDynamicShapes[1]);
176+
for (auto& shapes : targetStaticShapes) {
177+
transpose(shapes[1]);
178+
}
179+
}
180+
181+
const auto& inShapeA = inputDynamicShapes[0];
182+
const auto& inShapeB = inputDynamicShapes[1];
183+
184+
std::tie(postOpMgrPtr, fusedOps) = fusingParams;
185+
186+
configuration.insert(additionalConfig.begin(), additionalConfig.end());
187+
188+
cpuNodeType = "FullyConnected";
189+
selectedType = makeSelectedTypeStr(selectedType, element::i8);
190+
191+
auto params = builder::makeDynamicParams(inType, {inShapeA});
192+
auto paramOuts = helpers::convert2OutputVector(helpers::castOps2Nodes<opset1::Parameter>(params));
193+
194+
auto matrixB = builder::makeDynamicInputLayer(element::f32, helpers::InputLayerType::CONSTANT, inShapeB);
195+
196+
auto weiData = generateSparseVector(ngraph::shape_size(inShapeB.get_shape()), weiSparseRate);
197+
auto matMul = makeMatMulRelaxed(paramOuts[0], inShapeB, weiType, transpA, transpB, weiData);
198+
199+
function = makeNgraphFunction(element::f32, params, matMul, cpuNodeType);
200+
201+
checkFusingPosition = false;
202+
203+
targetDevice = CommonTestUtils::DEVICE_CPU;
204+
205+
functionRefs = ov::clone_model(*function);
206+
ngraph::pass::ConvertPrecision<ngraph::element::Type_t::i8, ngraph::element::Type_t::f32>().run_on_function(functionRefs);
207+
ngraph::pass::ConvertPrecision<ngraph::element::Type_t::u8, ngraph::element::Type_t::f32>().run_on_function(functionRefs);
208+
functionRefs->validate_nodes_and_infer_types();
209+
}
210+
};
211+
212+
// Runs the test case and then checks the plugin-related results of the
// compiled model for the expected CPU node type.
TEST_P(MatMulSparseCPUTest, CompareWithRefs) {
    SKIP_IF_CURRENT_TEST_IS_DISABLED()

    run();
    CheckPluginRelatedResults(compiledModel, cpuNodeType);
}
218+
219+
namespace {
220+
221+
/* ============= Common params ============= */
222+
223+
std::vector<CPUSpecificParams> filterSpecificParams(bool sparseExpected) {
224+
std::vector<CPUSpecificParams> specificParams;
225+
if (with_cpu_x86_avx512_core_amx()) {
226+
if (sparseExpected) {
227+
specificParams.push_back(CPUSpecificParams{{}, {}, {"brgemm_avx512_amx"}, "brgemm_avx512_amx_sparse"});
228+
} else {
229+
specificParams.push_back(CPUSpecificParams{{}, {}, {"brgemm_avx512_amx"}, "brgemm_avx512_amx"});
230+
}
231+
}
232+
233+
return specificParams;
234+
}
235+
236+
/* ============= FullyConnected ============= */
237+
namespace fullyConnected {
238+
239+
// cpu (sparse) configs
240+
const std::map<std::string, std::string> emptyConfig = {};
241+
const std::map<std::string, std::string> SparseRate50 = {{CPUConfigParams::KEY_CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE, "0.5"}};
242+
const std::map<std::string, std::string> SparseRate80 = {{CPUConfigParams::KEY_CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE, "0.8"}};
243+
244+
const std::vector<ShapeRelatedParams> IS2D_sparse_smoke = {
245+
{static_shapes_to_test_representation({{64, 64}, {64, 64}}), {false, true}},
246+
{static_shapes_to_test_representation({{71, 64}, {64, 64}}), {false, true}},
247+
{static_shapes_to_test_representation({{3, 128}, {128, 64}}), {false, true}},
248+
{static_shapes_to_test_representation({{71, 64}, {64, 128}}), {false, true}},
249+
250+
{
251+
{
252+
{{-1, -1}, {{20, 64}, {20, 64}}},
253+
{{64, 128}, {{64, 128}, {64, 128}}}
254+
},
255+
{false, true}
256+
},
257+
258+
{
259+
{
260+
{{{0, 100}, {0, 64}}, {{20, 64}, {14, 64}, {20, 64}, {14, 64}}},
261+
{{64, 128}, {{64, 128}, {64, 128}, {64, 128}, {64, 128}}}
262+
},
263+
{false, true}
264+
},
265+
};
266+
267+
const auto testParams2D_i8_smoke = ::testing::Combine(::testing::ValuesIn(IS2D_sparse_smoke),
268+
::testing::Values(ElementType::i8, ElementType::u8),
269+
::testing::Values(ElementType::i8),
270+
::testing::Values(ElementType::f32),
271+
::testing::Values(emptyFusingSpec),
272+
::testing::ValuesIn(filterSpecificParams(false)),
273+
::testing::Values(emptyConfig, SparseRate80),
274+
::testing::Values(0.7));
275+
276+
INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_I8, MatMulSparseCPUTest, testParams2D_i8_smoke,
277+
MatMulSparseCPUTest::getTestCaseName);
278+
279+
const auto testParams2D_i8_sparse_smoke = ::testing::Combine(::testing::ValuesIn(IS2D_sparse_smoke),
280+
::testing::Values(ElementType::i8, ElementType::u8),
281+
::testing::Values(ElementType::i8),
282+
::testing::Values(ElementType::f32),
283+
::testing::Values(emptyFusingSpec),
284+
::testing::ValuesIn(filterSpecificParams(true)),
285+
::testing::Values(SparseRate50),
286+
::testing::Values(0.7));
287+
288+
INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_I8_sparse, MatMulSparseCPUTest, testParams2D_i8_sparse_smoke,
289+
MatMulSparseCPUTest::getTestCaseName);
290+
291+
const std::vector<ShapeRelatedParams> IS3D_sparse_smoke = {
292+
{static_shapes_to_test_representation({{1, 64, 64}, {64, 64}}), {false, true}},
293+
{static_shapes_to_test_representation({{3, 71, 64}, {64, 64}}), {false, true}},
294+
{static_shapes_to_test_representation({{3, 5, 128}, {128, 64}}), {false, true}},
295+
{static_shapes_to_test_representation({{1, 71, 64}, {64, 128}}), {false, true}},
296+
297+
{
298+
{
299+
{{-1, -1, 64}, {{1, 5, 64}, {1, 10, 64}, {1, 5, 64}, {1, 10, 64}}},
300+
{{64, 128}, {{64, 128}, {64, 128}}}
301+
},
302+
{false, true}
303+
},
304+
305+
// todo: [av] investigate "Primitive descriptor was not found" error for this case
306+
// {
307+
// {
308+
// {{{0, 60}, {0, 60}, {0, 64}}}, {{1, 3, 64}, {1, 7, 64}}},
309+
// {{64, 64}, {{64, 64}, {64, 64}}}
310+
// },
311+
// {false, true}
312+
// },
313+
};
314+
315+
const auto testParams3D_i8_smoke = ::testing::Combine(::testing::ValuesIn(IS3D_sparse_smoke),
316+
::testing::Values(ElementType::i8, ElementType::u8),
317+
::testing::Values(ElementType::i8),
318+
::testing::Values(ElementType::f32),
319+
::testing::Values(emptyFusingSpec),
320+
::testing::ValuesIn(filterSpecificParams(false)),
321+
::testing::Values(emptyConfig, SparseRate80),
322+
::testing::Values(0.7));
323+
324+
INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_I8, MatMulSparseCPUTest, testParams3D_i8_smoke,
325+
MatMulSparseCPUTest::getTestCaseName);
326+
327+
const auto testParams3D_i8_sparse_smoke = ::testing::Combine(::testing::ValuesIn(IS3D_sparse_smoke),
328+
::testing::Values(ElementType::i8, ElementType::u8),
329+
::testing::Values(ElementType::i8),
330+
::testing::Values(ElementType::f32),
331+
::testing::Values(emptyFusingSpec),
332+
::testing::ValuesIn(filterSpecificParams(true)),
333+
::testing::Values(SparseRate50),
334+
::testing::Values(0.7));
335+
336+
INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_I8_sparse, MatMulSparseCPUTest, testParams3D_i8_sparse_smoke,
337+
MatMulSparseCPUTest::getTestCaseName);
338+
339+
} // namespace fullyConnected
340+
341+
} // namespace
342+
343+
} // namespace CPULayerTestsDefinitions

0 commit comments

Comments
 (0)