
Commit a739ea5: Merge pull request #3747 from alibaba/feature/sync

MNN:Sync: Sync Internal 3.2.2

2 parents e814142 + db0f559
231 files changed: +21696 / -1979 lines

CMakeLists.txt (1 addition, 0 deletions)

```diff
@@ -78,6 +78,7 @@ option(MNN_SUPPORT_BF16 "Enable MNN's bf16 op" OFF)
 option(MNN_LOW_MEMORY "Build MNN support low memory for weight quant model." OFF)
 option(MNN_CPU_WEIGHT_DEQUANT_GEMM "Build MNN CPU weight dequant related gemm kernels." OFF)
 option(MNN_BUILD_AUDIO "Build audio api in MNN." OFF)
+option(MNN_SME2 "Use Arm sme2 instructions" ON)
 
 if (MNN_BUILD_MINI)
     set(MNN_SKIPBUILD_GEOMETRY ON)
```
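
As a rough illustration (not taken from this commit), CMake options like this are typically forwarded to the sources as compile definitions and consumed behind a preprocessor guard; the macro name `MNN_SME2` in the sketch below is an assumption, not something this hunk shows:

```cpp
// Hypothetical sketch: gating an SME2 code path on the new build option.
// Assumes the CMake option is forwarded as a compile definition named
// MNN_SME2 (the exact wiring is not shown in this hunk).
#include <cstdio>

void dispatchGemm() {
#ifdef MNN_SME2
    std::printf("using the Arm SME2 kernel path\n");
#else
    std::printf("using the generic kernel path\n");
#endif
}

int main() {
    dispatchGemm();
    return 0;
}
```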

backupcode/cpubackend/CPULSTM.cpp (1 addition, 1 deletion)

```diff
@@ -100,7 +100,7 @@ ErrorCode CPULSTM::onResize(const std::vector<Tensor *> &inputs, const std::vect
         auto temp = tempBuffer->host<float>();
         auto dest = dst + n * UP_DIV(timeSteps, hP) * numFeatures * hP;
         MNNUnpackC4(temp, source, numFeatures, timeSteps);
-        MNNPackForMatMul_B(dest, temp, timeSteps, numFeatures, true);
+        MNNPackForMatMul_B(dest, temp, timeSteps, 1, numFeatures, true);
     }
 };
 
```

backupcode/cpubackend/bf16/BF16Functions.cpp (2 additions, 1 deletion)

```diff
@@ -159,7 +159,8 @@ void MNNPackC4ForMatMul_A_BF16(float* destOrigin, float const** sourceGroup, con
     return;
 }
 
-void MNNPackForMatMul_B_BF16(float* dest, const float* source, size_t h, size_t l, bool transpose) {
+void MNNPackForMatMul_B_BF16(float* dest, const float* source, size_t h, size_t kernelsize, size_t ic, bool transpose) {
+    auto l = kernelsize * ic;
     auto hP = h / 4;
     auto hR = hP * 4;
     if (hR != h) {
```
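
The two hunks above are one interface migration: `MNNPackForMatMul_B` and its BF16 variant now take the packed dimension as a `(kernelsize, ic)` pair instead of a single `l`, with `l = kernelsize * ic`. A minimal sketch of the call-site change, using stand-in names rather than MNN's real kernels:

```cpp
// Stand-in for the real MNN kernel; only the interface shape matters here.
#include <cstddef>
#include <cstdio>

void PackForMatMul_B(float* dest, const float* source, size_t h,
                     size_t kernelsize, size_t ic, bool transpose) {
    size_t l = kernelsize * ic; // the value the old single `l` parameter carried
    std::printf("packing B: h=%zu, l=%zu, transpose=%d\n", h, l, (int)transpose);
    (void)dest; (void)source;
}

int main() {
    float dst[16] = {};
    float src[16] = {};
    // Old call shape: PackForMatMul_B(dst, src, /*h=*/4, /*l=*/4, true);
    // New call shape: plain GEMM callers (e.g. CPULSTM) pass kernelsize = 1.
    PackForMatMul_B(dst, src, /*h=*/4, /*kernelsize=*/1, /*ic=*/4, true);
    return 0;
}
```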

backupcode/cpubackend/compute/DeconvolutionWithStride.cpp (2 additions, 2 deletions)

```diff
@@ -71,7 +71,7 @@ static void _winograd(const DeconvolutionWithStride::ComputeUnit& unit, int thre
     el[3] = 0;
     size_t parameters[6];
     parameters[0] = eP * sizeof(float);
-    parameters[1] = ic;
+    parameters[1] = ROUND_UP(ic, lP);
     parameters[2] = oc;
     parameters[3] = eP * 4 * sizeof(float);
     parameters[4] = 0;
@@ -130,7 +130,7 @@ static void _gemmAndIm2col(const DeconvolutionWithStride::ComputeUnit& unit, int
     el[3] = 0;
     size_t parameters[6];
     parameters[0] = eP * sizeof(float);
-    parameters[1] = ic;
+    parameters[1] = ROUND_UP(ic, lP);
     parameters[2] = oc;
     parameters[3] = eP * 4 * sizeof(float);
     parameters[4] = 0;
```
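
Both call sites now round the input-channel count up to the packing unit `lP` before handing it to the GEMM kernel. A minimal sketch of that rounding, assuming the usual integer `ROUND_UP` macro (the definition below is an assumption, not copied from MNN's headers):

```cpp
#include <cstdio>
#include <initializer_list>

// Assumed definition: rounds x up to the nearest multiple of y.
#define ROUND_UP(x, y) (((x) + (y) - 1) / (y) * (y))

int main() {
    const int lP = 4; // packing unit along the l (input-channel) axis
    for (int ic : {1, 3, 4, 5, 8}) {
        // The GEMM kernel reads B in blocks of lP channels, so the stride in
        // parameters[1] must cover the padded, not the raw, channel count.
        std::printf("ic=%d -> ROUND_UP(ic, lP)=%d\n", ic, ROUND_UP(ic, lP));
    }
    return 0;
}
```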

codegen/OpFuse.cpp (92 additions, 0 deletions)

```diff
@@ -62,6 +62,84 @@ static void dumpCmd(const Command* cmd) {
     MNN_PRINT("}\n");
 }
 
+void mergeConvolutionAndPrelu(Node* root, MNNForwardType forwardType){
+    if (root->cmd->op != nullptr && root->cmd->op->type() == OpType_Convolution && root->succ.size() == 1) {
+        auto child = root->succ[0];
+        if(child->cmd->op->type() == OpType_PReLU){
+            if(root->cmd->op->externalPath() != nullptr){
+                return;
+            }
+            std::shared_ptr<Command> cmdPlugin;
+            auto inputs = root->cmd->inputs;
+            auto outputs = root->cmd->outputs;
+            auto convOp = root->cmd->op->main_as_Convolution2D();
+            if(convOp->quanParameter() != nullptr || convOp->symmetricQuan() != nullptr || convOp->sparseParameter() != nullptr || convOp->external() != nullptr || convOp->common()->outputCount() != child->cmd->op->main_as_PRelu()->slopeCount()){
+                return;
+            }
+            std::unique_ptr<OpT> fuseOp(new OpT);
+            fuseOp->type = OpType_Extra;
+            fuseOp->name = root->cmd->op->name()->str();
+            ExtraT* extra_param = new ExtraT;
+            extra_param->type = "ExtraConvolution2DPrelu";
+            extra_param->attr.resize(2);
+            // copy convolution2D param
+            AttributeT* convAtr = new AttributeT;
+            BlobT* convParamBlob = new BlobT;
+            {
+                std::unique_ptr<Convolution2DT> convolutionParam(convOp->UnPack());
+                flatbuffers::FlatBufferBuilder builder;
+                auto lastOffset = Convolution2D::Pack(builder, convolutionParam.get());
+                builder.Finish(lastOffset);
+
+                const uint8_t* buffer_ptr = builder.GetBufferPointer();
+                const size_t size = builder.GetSize();
+                convParamBlob->uint8s.resize(size);
+                ::memcpy(convParamBlob->uint8s.data(), buffer_ptr, size);
+            }
+            convAtr->tensor.reset(convParamBlob);
+            extra_param->attr[0].reset(convAtr);
+
+            // copy prelu param
+            AttributeT* preluAtr = new AttributeT;
+            BlobT* preluParamBlob = new BlobT;
+            {
+                std::unique_ptr<PReluT> preluParam(child->cmd->op->main_as_PRelu()->UnPack());
+                flatbuffers::FlatBufferBuilder builder;
+                auto lastOffset = PRelu::Pack(builder, preluParam.get());
+                builder.Finish(lastOffset);
+                const uint8_t* buffer_ptr = builder.GetBufferPointer();
+                const size_t size = builder.GetSize();
+                preluParamBlob->uint8s.resize(size);
+                ::memcpy(preluParamBlob->uint8s.data(), buffer_ptr, size);
+            }
+            preluAtr->tensor.reset(preluParamBlob);
+            extra_param->attr[1].reset(preluAtr);
+
+            fuseOp->main.type = OpParameter_Extra;
+            fuseOp->main.value = extra_param;
+            flatbuffers::FlatBufferBuilder builder;
+            auto lastOffset = Op::Pack(builder, fuseOp.get());
+            builder.Finish(lastOffset);
+            cmdPlugin = GeometryComputerUtils::makeCommand(builder, inputs, outputs);
+
+            root->cmd->op = cmdPlugin->op;
+            root->cmd->inputs = cmdPlugin->inputs;
+            root->cmd->outputs = cmdPlugin->outputs;
+            root->cmd->buffer = cmdPlugin->buffer;
+            child->cmd->op = nullptr;
+            child->cmd->buffer.reset();
+            for(auto &childNode : child->succ){
+                for(auto &input : childNode->cmd->inputs){
+                    if(input == child->cmd->outputs[0]){
+                        input = root->cmd->outputs[0];
+                    }
+                }
+            }
+            root->succ = child->succ;
+        }
+    }
+}
+
 // is legal fused type
 bool isLegal(Command* cmd, MNNForwardType forwardType) {
     auto type = cmd->op->type();
@@ -369,6 +447,20 @@ bool opFuse(std::vector<Schedule::OpCacheInfo>& infos, MNNForwardType type, Back
             graph.push_back(std::move(node));
         }
     }
+
+    if(type == MNN_FORWARD_OPENCL){
+        for(int i = 0; i < graph.size(); ++i){
+            mergeConvolutionAndPrelu(graph[i].get(), type);
+        }
+        for(auto iter = graph.begin(); iter != graph.end();){
+            if(iter->get()->cmd->op == nullptr){
+                iter = graph.erase(iter);
+            }else{
+                ++iter;
+            }
+        }
+    }
+
     std::queue<Node*> postDominateNodeQueue;
     // build dominate tree
     for (int i = static_cast<int>(graph.size()) - 1; i >= 0; i--) {
```
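
The tail of `mergeConvolutionAndPrelu` rewires the graph: every consumer of the PReLU output is redirected to the fused node's output, and the fused node adopts the PReLU node's successors. A toy model of just that step, with simplified node types rather than MNN's `Command`/`Node`:

```cpp
#include <cstdio>
#include <vector>

// Simplified stand-ins for MNN's graph types; an int models a tensor id.
struct Node {
    int output;               // stand-in for cmd->outputs[0]
    std::vector<int> inputs;  // stand-in for cmd->inputs
    std::vector<Node*> succ;
};

// After fusing `child` into `root`, every consumer of child's output must
// read root's output instead, and root adopts child's successor list.
void rewire(Node* root, Node* child) {
    for (auto* grandChild : child->succ) {
        for (auto& input : grandChild->inputs) {
            if (input == child->output) {
                input = root->output;
            }
        }
    }
    root->succ = child->succ;
}

int main() {
    Node consumer{3, {2}, {}};
    Node child{2, {1}, {&consumer}};
    Node root{1, {0}, {&child}};
    rewire(&root, &child);
    std::printf("consumer now reads tensor %d\n", consumer.inputs[0]); // prints 1
    return 0;
}
```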

docs/compile/cmake.md (5 additions, 1 deletion)

```diff
@@ -19,7 +19,8 @@ MNN is built with CMake; the CMake macro definitions are listed below:
 | MNN_SUPPORT_QUNAT_EXTEND | Whether to build quantized versions of non-core operators; defaults to `ON` |
 | MNN_SUPPORT_DEPRECATED_OP | Whether to support deprecated operators such as TFLite quantized ops, for compatibility with historical models (before 1.1.0); defaults to `OFF` |
 | MNN_SUPPORT_DEPRECATED_OPV2 | Whether to build operators deprecated since MNN 3.0, for compatibility with historical models (before 3.0.0); for example, since 3.0.0 Convolution3D and ConvTranspose3D are converted into the corresponding 2D operators by the model converter and no longer need runtime support; defaults to `ON` |
-| MNN_REDUCE_SIZE | Whether to trim the MNN library size by removing gradient-related operators and reducing optimization strategies; defaults to `OFF`. When enabled, MNN_SUPPORT_QUNAT_EXTEND / MNN_SUPPORT_DEPRECATED_OP / MNN_SUPPORT_DEPRECATED_OPV2 are all set to OFF |
+| MNN_REDUCE_SIZE | Whether to trim the MNN library size by removing gradient-related operators and reducing optimization strategies; defaults to `OFF`. When enabled, MNN_SUPPORT_QUANT_EXTEND / MNN_SUPPORT_DEPRECATED_OP / MNN_SUPPORT_DEPRECATED_OPV2 are all set to OFF |
+| MNN_SUPPORT_QUANT_EXTEND | Whether to enable quantized computation support for Binary/Unary and similar operators; defaults to `ON` |
 | MNN_DEBUG_MEMORY | Whether to enable MNN memory debugging; defaults to `OFF` |
 | MNN_DEBUG_TENSOR_SIZE | Whether to enable MNN tensor size debugging; defaults to `OFF` |
 | MNN_GPU_TRACE | Whether to enable MNN GPU debugging; defaults to `OFF` |
@@ -43,6 +44,7 @@ MNN is built with CMake; the CMake macro definitions are listed below:
 | MNN_OPENGL | Whether to build the `OpenGL` backend; defaults to `OFF` |
 | MNN_VULKAN | Whether to build the `Vulkan` backend; defaults to `OFF` |
 | MNN_ARM82 | When building for ARM, whether to build the `Armv8.2` backend to support FP16 computation; defaults to `ON` |
+| MNN_SME2 | When building for ARM, whether to build the `ArmSme2` backend to support computation with the sme2 instruction set; defaults to `ON` |
 | MNN_SUPPORT_FP16_ARMV7 | When building for armeabi-v7a, whether to build the `Armv8.2` backend to support FP16 computation; defaults to `OFF` |
 | MNN_ONEDNN | Whether to use `oneDNN`; defaults to `OFF` |
 | MNN_AVX2 | When `MNN_USE_SSE` is on, whether to add AVX2 instruction support; defaults to `ON` |
@@ -55,6 +57,8 @@ MNN is built with CMake; the CMake macro definitions are listed below:
 | MNN_TENSORRT | Whether to build the `TensorRT` backend; defaults to `OFF` |
 | MNN_COREML | Whether to build the `CoreML` backend; defaults to `OFF` |
 | MNN_NNAPI | Whether to build the `NNAPI` backend; defaults to `OFF` |
+| MNN_QNN | Whether to build the `QNN` backend; defaults to `OFF` |
+| MNN_QNN_CONVERT_MODE | When `MNN_QNN` is on, whether to build the QNN backend in Convert mode; defaults to `OFF` |
 | MNN_BUILD_BENCHMARK | Whether to build MNN benchmarks; defaults to `OFF` |
 | MNN_BUILD_TEST | Whether to build MNN unit tests; defaults to `OFF` |
 | MNN_BUILD_FOR_ANDROID_COMMAND | Whether to build for `Android` via the command line; defaults to `OFF` |
```

docs/compile/other.md (1 addition, 0 deletions)

```diff
@@ -171,6 +171,7 @@
 - `rasterDemo.out` Raster demo
 - `nluDemo.out` NLU model demo
 - `mergeInplaceForCPU` Rewrites operators that can run in place to do so, reducing memory usage; limited to the CPU backend
+- `OpenCLProgramBuildTest.out` Tests whether the OpenCL backend's programs compile successfully on the device
 ## Unit tests
 - Related build options
 - `MNN_BUILD_TEST` Whether to build MNN unit tests
```

docs/pymnn/expr.md (14 additions, 0 deletions)

````diff
@@ -3132,6 +3132,20 @@ roialign
 ```python
 TODO
 ```
+
+---
+### `jsonop(inputs, describe, output_number)`
+
+jsonop
+
+For operators that the MNN model format supports but that have no corresponding expression exposed, the jsonop interface can be used to describe the operator in JSON.
+
+Parameters:
+- `inputs` : List[Var], the input variables, of any type
+- `describe` : str, the JSON description of the operator
+- `output_number` : int, the number of operator outputs
+
+
 ---
 **The following functions are for framework developers; ordinary users are not advised to use them!**
````
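
A hedged usage sketch of the new interface, in Python to match the pymnn docs; the operator JSON below is an illustrative guess, not a schema taken from this hunk:

```python
# Hypothetical usage sketch: the JSON fields below are assumptions for
# illustration only; the real description must follow MNN's op schema.
import MNN.expr as F

x = F.const([1.0, -2.0, 3.0, -4.0], [4])
describe = '{"type": "UnaryOp", "main_type": "UnaryOp", "main": {"opType": "ABS"}}'
outputs = F.jsonop([x], describe, 1)  # output_number = 1
```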

docs/tools/convert.md (2 additions, 0 deletions)

````diff
@@ -85,6 +85,8 @@ Usage:
 
 --useGeluApproximation  When fusing Gelu operators, use Gelu's approximate algorithm; defaults to 1, i.e. `true`
 
+--useOriginRNNImpl  Whether the LSTM and GRU operators use the original operator implementation; off by default. If enabled, performance may improve, but LSTM/GRU can no longer be quantized
+
 ```
 
 **Note 1: the weightQuantBits option is used as --weightQuantBits numBits, with numBits selectable from 2 to 8. This feature only quantizes the float32 weights of conv/matmul/LSTM and only optimizes model size; the weights are decoded back to float32 after the model is loaded, so runtime speed is the same as the float32 model. Internal testing shows essentially lossless accuracy at 8 bits and a 4x reduction in model size. default: 0, i.e. no weight quantization.**
````

express/Executor.cpp (1 addition, 0 deletions)

```diff
@@ -336,6 +336,7 @@ void Executor::RuntimeManager::setCache(std::string cacheName) {
 
     mInside->mCache.reset(new Cache);
     mInside->mCache->cacheFile = cacheName;
+    mInside->mInfo->onSetCachePath(cacheName.c_str(), 0);
     if (nullptr == mInside->mCache->cacheFile.c_str()) {
         MNN_ERROR("Empty cacheFile\n");
         return;
```
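
For context, a typical call site that exercises this path through the public RuntimeManager API; a sketch with an arbitrary cache file name and no error handling:

```cpp
#include <MNN/Interpreter.hpp>
#include <MNN/expr/Executor.hpp>
#include <memory>

int main() {
    MNN::ScheduleConfig config;
    config.type = MNN_FORWARD_OPENCL; // backends with kernel caches benefit most
    std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtmgr(
        MNN::Express::Executor::RuntimeManager::createRuntimeManager(config));
    // With this commit, setCache also forwards the path to the runtime via
    // onSetCachePath, in addition to recording it for the serialized cache.
    rtmgr->setCache("mnn_cache.bin");
    return 0;
}
```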
