
Commit a739ea5: Merge pull request #3747 from alibaba/feature/sync

MNN:Sync: Sync Internal 3.2.2

2 parents e814142 + db0f559
231 files changed: +21696 / -1979 lines

CMakeLists.txt (1 addition, 0 deletions)

```diff
@@ -78,6 +78,7 @@ option(MNN_SUPPORT_BF16 "Enable MNN's bf16 op" OFF)
 option(MNN_LOW_MEMORY "Build MNN support low memory for weight quant model." OFF)
 option(MNN_CPU_WEIGHT_DEQUANT_GEMM "Build MNN CPU weight dequant related gemm kernels." OFF)
 option(MNN_BUILD_AUDIO "Build audio api in MNN." OFF)
+option(MNN_SME2 "Use Arm sme2 instructions" ON)
 
 if (MNN_BUILD_MINI)
     set(MNN_SKIPBUILD_GEOMETRY ON)
```
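
As a rough illustration (not taken from this commit), CMake options like this are typically forwarded to the sources as compile definitions and consumed behind a preprocessor guard; the macro name `MNN_SME2` in the sketch below is an assumption, not something this hunk shows:

```cpp
// Hypothetical sketch: gating an SME2 code path on the new build option.
// Assumes the CMake option is forwarded as a compile definition named
// MNN_SME2 (the exact wiring is not shown in this hunk).
#include <cstdio>

void dispatchGemm() {
#ifdef MNN_SME2
    std::printf("using the Arm SME2 kernel path\n");
#else
    std::printf("using the generic kernel path\n");
#endif
}

int main() {
    dispatchGemm();
    return 0;
}
```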

backupcode/cpubackend/CPULSTM.cpp (1 addition, 1 deletion)

```diff
@@ -100,7 +100,7 @@ ErrorCode CPULSTM::onResize(const std::vector<Tensor *> &inputs, const std::vect
         auto temp = tempBuffer->host<float>();
         auto dest = dst + n * UP_DIV(timeSteps, hP) * numFeatures * hP;
         MNNUnpackC4(temp, source, numFeatures, timeSteps);
-        MNNPackForMatMul_B(dest, temp, timeSteps, numFeatures, true);
+        MNNPackForMatMul_B(dest, temp, timeSteps, 1, numFeatures, true);
     }
 };
 
```

backupcode/cpubackend/bf16/BF16Functions.cpp (2 additions, 1 deletion)

```diff
@@ -159,7 +159,8 @@ void MNNPackC4ForMatMul_A_BF16(float* destOrigin, float const** sourceGroup, con
     return;
 }
 
-void MNNPackForMatMul_B_BF16(float* dest, const float* source, size_t h, size_t l, bool transpose) {
+void MNNPackForMatMul_B_BF16(float* dest, const float* source, size_t h, size_t kernelsize, size_t ic, bool transpose) {
+    auto l = kernelsize * ic;
     auto hP = h / 4;
     auto hR = hP * 4;
     if (hR != h) {
```
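
The two hunks above are one interface migration: `MNNPackForMatMul_B` and its BF16 variant now take the packed dimension as a `(kernelsize, ic)` pair instead of a single `l`, with `l = kernelsize * ic`. A minimal sketch of the call-site change, using stand-in names rather than MNN's real kernels:

```cpp
// Stand-in for the real MNN kernel; only the interface shape matters here.
#include <cstddef>
#include <cstdio>

void PackForMatMul_B(float* dest, const float* source, size_t h,
                     size_t kernelsize, size_t ic, bool transpose) {
    size_t l = kernelsize * ic; // the value the old single `l` parameter carried
    std::printf("packing B: h=%zu, l=%zu, transpose=%d\n", h, l, (int)transpose);
    (void)dest; (void)source;
}

int main() {
    float dst[16] = {};
    float src[16] = {};
    // Old call shape: PackForMatMul_B(dst, src, /*h=*/4, /*l=*/4, true);
    // New call shape: plain GEMM callers (e.g. CPULSTM) pass kernelsize = 1.
    PackForMatMul_B(dst, src, /*h=*/4, /*kernelsize=*/1, /*ic=*/4, true);
    return 0;
}
```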

backupcode/cpubackend/compute/DeconvolutionWithStride.cpp (2 additions, 2 deletions)

```diff
@@ -71,7 +71,7 @@ static void _winograd(const DeconvolutionWithStride::ComputeUnit& unit, int thre
     el[3] = 0;
     size_t parameters[6];
     parameters[0] = eP * sizeof(float);
-    parameters[1] = ic;
+    parameters[1] = ROUND_UP(ic, lP);
     parameters[2] = oc;
     parameters[3] = eP * 4 * sizeof(float);
     parameters[4] = 0;
@@ -130,7 +130,7 @@ static void _gemmAndIm2col(const DeconvolutionWithStride::ComputeUnit& unit, int
     el[3] = 0;
     size_t parameters[6];
     parameters[0] = eP * sizeof(float);
-    parameters[1] = ic;
+    parameters[1] = ROUND_UP(ic, lP);
     parameters[2] = oc;
     parameters[3] = eP * 4 * sizeof(float);
     parameters[4] = 0;
```
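
Both call sites now round the input-channel count up to the packing unit `lP` before handing it to the GEMM kernel. A minimal sketch of that rounding, assuming the usual integer `ROUND_UP` macro (the definition below is an assumption, not copied from MNN's headers):

```cpp
#include <cstdio>
#include <initializer_list>

// Assumed definition: rounds x up to the nearest multiple of y.
#define ROUND_UP(x, y) (((x) + (y) - 1) / (y) * (y))

int main() {
    const int lP = 4; // packing unit along the l (input-channel) axis
    for (int ic : {1, 3, 4, 5, 8}) {
        // The GEMM kernel reads B in blocks of lP channels, so the stride in
        // parameters[1] must cover the padded, not the raw, channel count.
        std::printf("ic=%d -> ROUND_UP(ic, lP)=%d\n", ic, ROUND_UP(ic, lP));
    }
    return 0;
}
```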

codegen/OpFuse.cpp (92 additions, 0 deletions)

```diff
@@ -62,6 +62,84 @@ static void dumpCmd(const Command* cmd) {
     MNN_PRINT("}\n");
 }
 
+void mergeConvolutionAndPrelu(Node* root, MNNForwardType forwardType){
+    if (root->cmd->op != nullptr && root->cmd->op->type() == OpType_Convolution && root->succ.size() == 1) {
+        auto child = root->succ[0];
+        if(child->cmd->op->type() == OpType_PReLU){
+            if(root->cmd->op->externalPath() != nullptr){
+                return;
+            }
+            std::shared_ptr<Command> cmdPlugin;
+            auto inputs = root->cmd->inputs;
+            auto outputs = root->cmd->outputs;
+            auto convOp = root->cmd->op->main_as_Convolution2D();
+            if(convOp->quanParameter() != nullptr || convOp->symmetricQuan() != nullptr || convOp->sparseParameter() != nullptr || convOp->external() != nullptr || convOp->common()->outputCount() != child->cmd->op->main_as_PRelu()->slopeCount()){
+                return;
+            }
+            std::unique_ptr<OpT> fuseOp(new OpT);
+            fuseOp->type = OpType_Extra;
+            fuseOp->name = root->cmd->op->name()->str();
+            ExtraT* extra_param = new ExtraT;
+            extra_param->type = "ExtraConvolution2DPrelu";
+            extra_param->attr.resize(2);
+            // copy convolution2D param
+            AttributeT* convAtr = new AttributeT;
+            BlobT* convParamBlob = new BlobT;
+            {
+                std::unique_ptr<Convolution2DT> convolutionParam(convOp->UnPack());
+                flatbuffers::FlatBufferBuilder builder;
+                auto lastOffset = Convolution2D::Pack(builder, convolutionParam.get());
+                builder.Finish(lastOffset);
+
+                const uint8_t* buffer_ptr = builder.GetBufferPointer();
+                const size_t size = builder.GetSize();
+                convParamBlob->uint8s.resize(size);
+                ::memcpy(convParamBlob->uint8s.data(), buffer_ptr, size);
+            }
+            convAtr->tensor.reset(convParamBlob);
+            extra_param->attr[0].reset(convAtr);
+
+            // copy prelu param
+            AttributeT* preluAtr = new AttributeT;
+            BlobT* preluParamBlob = new BlobT;
+            {
+                std::unique_ptr<PReluT> preluParam(child->cmd->op->main_as_PRelu()->UnPack());
+                flatbuffers::FlatBufferBuilder builder;
+                auto lastOffset = PRelu::Pack(builder, preluParam.get());
+                builder.Finish(lastOffset);
+                const uint8_t* buffer_ptr = builder.GetBufferPointer();
+                const size_t size = builder.GetSize();
+                preluParamBlob->uint8s.resize(size);
+                ::memcpy(preluParamBlob->uint8s.data(), buffer_ptr, size);
+            }
+            preluAtr->tensor.reset(preluParamBlob);
+            extra_param->attr[1].reset(preluAtr);
+
+            fuseOp->main.type = OpParameter_Extra;
+            fuseOp->main.value = extra_param;
+            flatbuffers::FlatBufferBuilder builder;
+            auto lastOffset = Op::Pack(builder, fuseOp.get());
+            builder.Finish(lastOffset);
+            cmdPlugin = GeometryComputerUtils::makeCommand(builder, inputs, outputs);
+
+            root->cmd->op = cmdPlugin->op;
+            root->cmd->inputs = cmdPlugin->inputs;
+            root->cmd->outputs = cmdPlugin->outputs;
+            root->cmd->buffer = cmdPlugin->buffer;
+            child->cmd->op = nullptr;
+            child->cmd->buffer.reset();
+            for(auto &childNode : child->succ){
+                for(auto &input : childNode->cmd->inputs){
+                    if(input == child->cmd->outputs[0]){
+                        input = root->cmd->outputs[0];
+                    }
+                }
+            }
+            root->succ = child->succ;
+        }
+    }
+}
+
 // is legal fused type
 bool isLegal(Command* cmd, MNNForwardType forwardType) {
     auto type = cmd->op->type();
@@ -369,6 +447,20 @@ bool opFuse(std::vector<Schedule::OpCacheInfo>& infos, MNNForwardType type, Back
             graph.push_back(std::move(node));
         }
     }
+
+    if(type == MNN_FORWARD_OPENCL){
+        for(int i = 0; i < graph.size(); ++i){
+            mergeConvolutionAndPrelu(graph[i].get(), type);
+        }
+        for(auto iter = graph.begin(); iter != graph.end();){
+            if(iter->get()->cmd->op == nullptr){
+                iter = graph.erase(iter);
+            }else{
+                ++iter;
+            }
+        }
+    }
+
     std::queue<Node*> postDominateNodeQueue;
     // build dominate tree
     for (int i = static_cast<int>(graph.size()) - 1; i >= 0; i--) {
```
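
The tail of `mergeConvolutionAndPrelu` rewires the graph: every consumer of the PReLU output is redirected to the fused node's output, and the fused node adopts the PReLU node's successors. A toy model of just that step, with simplified node types rather than MNN's `Command`/`Node`:

```cpp
#include <cstdio>
#include <vector>

// Simplified stand-ins for MNN's graph types; an int models a tensor id.
struct Node {
    int output;               // stand-in for cmd->outputs[0]
    std::vector<int> inputs;  // stand-in for cmd->inputs
    std::vector<Node*> succ;
};

// After fusing `child` into `root`, every consumer of child's output must
// read root's output instead, and root adopts child's successor list.
void rewire(Node* root, Node* child) {
    for (auto* grandChild : child->succ) {
        for (auto& input : grandChild->inputs) {
            if (input == child->output) {
                input = root->output;
            }
        }
    }
    root->succ = child->succ;
}

int main() {
    Node consumer{3, {2}, {}};
    Node child{2, {1}, {&consumer}};
    Node root{1, {0}, {&child}};
    rewire(&root, &child);
    std::printf("consumer now reads tensor %d\n", consumer.inputs[0]); // prints 1
    return 0;
}
```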

docs/compile/cmake.md (5 additions, 1 deletion)

```diff
@@ -19,7 +19,8 @@ MNN is built with CMake; the CMake macro definitions are listed below:
 | MNN_SUPPORT_QUNAT_EXTEND | Whether to build quantized versions of non-core operators; defaults to `ON` |
 | MNN_SUPPORT_DEPRECATED_OP | Whether to support deprecated operators such as TFLite quantized ops, for compatibility with historical models (before 1.1.0); defaults to `OFF` |
 | MNN_SUPPORT_DEPRECATED_OPV2 | Whether to build operators deprecated since MNN 3.0, for compatibility with historical models (before 3.0.0); for example, since 3.0.0 Convolution3D and ConvTranspose3D are converted into the corresponding 2D operators by the model converter and no longer need runtime support; defaults to `ON` |
-| MNN_REDUCE_SIZE | Whether to trim the MNN library size by removing gradient-related operators and reducing optimization strategies; defaults to `OFF`. When enabled, MNN_SUPPORT_QUNAT_EXTEND / MNN_SUPPORT_DEPRECATED_OP / MNN_SUPPORT_DEPRECATED_OPV2 are all set to OFF |
+| MNN_REDUCE_SIZE | Whether to trim the MNN library size by removing gradient-related operators and reducing optimization strategies; defaults to `OFF`. When enabled, MNN_SUPPORT_QUANT_EXTEND / MNN_SUPPORT_DEPRECATED_OP / MNN_SUPPORT_DEPRECATED_OPV2 are all set to OFF |
+| MNN_SUPPORT_QUANT_EXTEND | Whether to enable quantized computation support for Binary/Unary and similar operators; defaults to `ON` |
 | MNN_DEBUG_MEMORY | Whether to enable MNN memory debugging; defaults to `OFF` |
 | MNN_DEBUG_TENSOR_SIZE | Whether to enable MNN tensor size debugging; defaults to `OFF` |
 | MNN_GPU_TRACE | Whether to enable MNN GPU debugging; defaults to `OFF` |
@@ -43,6 +44,7 @@ MNN is built with CMake; the CMake macro definitions are listed below:
 | MNN_OPENGL | Whether to build the `OpenGL` backend; defaults to `OFF` |
 | MNN_VULKAN | Whether to build the `Vulkan` backend; defaults to `OFF` |
 | MNN_ARM82 | When building for ARM, whether to build the `Armv8.2` backend to support FP16 computation; defaults to `ON` |
+| MNN_SME2 | When building for ARM, whether to build the `ArmSme2` backend to support computation with the sme2 instruction set; defaults to `ON` |
 | MNN_SUPPORT_FP16_ARMV7 | When building for armeabi-v7a, whether to build the `Armv8.2` backend to support FP16 computation; defaults to `OFF` |
 | MNN_ONEDNN | Whether to use `oneDNN`; defaults to `OFF` |
 | MNN_AVX2 | When `MNN_USE_SSE` is on, whether to add AVX2 instruction support; defaults to `ON` |
@@ -55,6 +57,8 @@ MNN is built with CMake; the CMake macro definitions are listed below:
 | MNN_TENSORRT | Whether to build the `TensorRT` backend; defaults to `OFF` |
 | MNN_COREML | Whether to build the `CoreML` backend; defaults to `OFF` |
 | MNN_NNAPI | Whether to build the `NNAPI` backend; defaults to `OFF` |
+| MNN_QNN | Whether to build the `QNN` backend; defaults to `OFF` |
+| MNN_QNN_CONVERT_MODE | When `MNN_QNN` is on, whether to build the QNN backend in Convert mode; defaults to `OFF` |
 | MNN_BUILD_BENCHMARK | Whether to build MNN benchmarks; defaults to `OFF` |
 | MNN_BUILD_TEST | Whether to build MNN unit tests; defaults to `OFF` |
 | MNN_BUILD_FOR_ANDROID_COMMAND | Whether to build for `Android` via the command line; defaults to `OFF` |
```

docs/compile/other.md (1 addition, 0 deletions)

```diff
@@ -171,6 +171,7 @@
 - `rasterDemo.out` Raster demo
 - `nluDemo.out` NLU model demo
 - `mergeInplaceForCPU` Rewrites operators that can run in place to do so, reducing memory usage; limited to the CPU backend
+- `OpenCLProgramBuildTest.out` Tests whether the OpenCL backend's programs compile successfully on the device
 ## Unit tests
 - Related build options
 - `MNN_BUILD_TEST` Whether to build MNN unit tests
```

docs/pymnn/expr.md (14 additions, 0 deletions)

````diff
@@ -3132,6 +3132,20 @@ roialign
 ```python
 TODO
 ```
+
+---
+### `jsonop(inputs, describe, output_number)`
+
+jsonop
+
+For operators that the MNN model format supports but that have no corresponding expression exposed, the jsonop interface can be used to describe the operator in JSON.
+
+Parameters:
+- `inputs` : List[Var], the input variables, of any type
+- `describe` : str, the JSON description of the operator
+- `output_number` : int, the number of operator outputs
+
+
 ---
 **The following functions are for framework developers; ordinary users are not advised to use them!**
````
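
A hedged usage sketch of the new interface, in Python to match the pymnn docs; the operator JSON below is an illustrative guess, not a schema taken from this hunk:

```python
# Hypothetical usage sketch: the JSON fields below are assumptions for
# illustration only; the real description must follow MNN's op schema.
import MNN.expr as F

x = F.const([1.0, -2.0, 3.0, -4.0], [4])
describe = '{"type": "UnaryOp", "main_type": "UnaryOp", "main": {"opType": "ABS"}}'
outputs = F.jsonop([x], describe, 1)  # output_number = 1
```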

docs/tools/convert.md (2 additions, 0 deletions)

````diff
@@ -85,6 +85,8 @@ Usage:
 
 --useGeluApproximation  When fusing Gelu operators, use Gelu's approximate algorithm; defaults to 1, i.e. `true`
 
+--useOriginRNNImpl  Whether the LSTM and GRU operators use the original operator implementation; off by default. If enabled, performance may improve, but LSTM/GRU can no longer be quantized
+
 ```
 
 **Note 1: the weightQuantBits option is used as --weightQuantBits numBits, with numBits selectable from 2 to 8. This feature only quantizes the float32 weights of conv/matmul/LSTM and only optimizes model size; the weights are decoded back to float32 after the model is loaded, so runtime speed is the same as the float32 model. Internal testing shows essentially lossless accuracy at 8 bits and a 4x reduction in model size. default: 0, i.e. no weight quantization.**
````

express/Executor.cpp (1 addition, 0 deletions)

```diff
@@ -336,6 +336,7 @@ void Executor::RuntimeManager::setCache(std::string cacheName) {
 
     mInside->mCache.reset(new Cache);
     mInside->mCache->cacheFile = cacheName;
+    mInside->mInfo->onSetCachePath(cacheName.c_str(), 0);
     if (nullptr == mInside->mCache->cacheFile.c_str()) {
         MNN_ERROR("Empty cacheFile\n");
         return;
```
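
For context, a typical call site that exercises this path through the public RuntimeManager API; a sketch with an arbitrary cache file name and no error handling:

```cpp
#include <MNN/Interpreter.hpp>
#include <MNN/expr/Executor.hpp>
#include <memory>

int main() {
    MNN::ScheduleConfig config;
    config.type = MNN_FORWARD_OPENCL; // backends with kernel caches benefit most
    std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtmgr(
        MNN::Express::Executor::RuntimeManager::createRuntimeManager(config));
    // With this commit, setCache also forwards the path to the runtime via
    // onSetCachePath, in addition to recording it for the serialized cache.
    rtmgr->setCache("mnn_cache.bin");
    return 0;
}
```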
