
Commit 9a0546b

Merge pull request #3950 from wangzhaode/feature/sync_325
MNN:Sync: Sync Internal 3.2.5

2 parents 0138017 + 6064746


41 files changed: +3079 −1260 lines

docs/index.rst (1 addition, 0 deletions)

@@ -81,6 +81,7 @@
 tools/test
 tools/benchmark
 tools/compress
+tools/mnncompress
 tools/visual
 tools/python

docs/tools/compress.md (363 additions, 934 deletions)

Large diffs are not rendered by default.

docs/tools/mnncompress.md (837 additions, 0 deletions)

Large diffs are not rendered by default.

express/MathOp.cpp (4 additions, 4 deletions)

@@ -1140,7 +1140,7 @@ VARP _ScatterNd(VARP indices, VARP updates, VARP shape, int reducetion) {
     op->main.type = OpParameter_BinaryOp;
     op->type = OpType_ScatterNd;
     auto param = new BinaryOpT;
-    param->opType = reducetion;
+    param->opType = (BinaryOpOperation)reducetion;
     op->main.value = param;
     return (Variable::create(Expr::create(std::move(op), {indices, updates, shape})));
 }
@@ -1150,7 +1150,7 @@ VARP _ScatterNd(VARP indices, VARP updates, VARP shape, VARP input, int reduceti
     op->main.type = OpParameter_BinaryOp;
     op->type = OpType_ScatterNd;
     auto param = new BinaryOpT;
-    param->opType = reducetion;
+    param->opType = (BinaryOpOperation)reducetion;
     op->main.value = param;
     return (Variable::create(Expr::create(std::move(op), {indices, updates, shape, input})));
 }
@@ -1167,7 +1167,7 @@ VARP _ScatterElements(VARP data, VARP indices, VARP updates, int reducetion) {
     op->main.type = OpParameter_BinaryOp;
     op->type = OpType_ScatterElements;
     auto param = new BinaryOpT;
-    param->opType = reducetion;
+    param->opType = (BinaryOpOperation)reducetion;
     op->main.value = param;
     return (Variable::create(Expr::create(std::move(op), {data, indices, updates})));
 }
@@ -1177,7 +1177,7 @@ VARP _ScatterElements(VARP data, VARP indices, VARP updates, VARP axis, int redu
     op->main.type = OpParameter_BinaryOp;
     op->type = OpType_ScatterElements;
     auto param = new BinaryOpT;
-    param->opType = reducetion;
+    param->opType = (BinaryOpOperation)reducetion;
     op->main.value = param;
     return (Variable::create(Expr::create(std::move(op), {data, indices, updates, axis})));
 }
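
All four edits are the same fix: BinaryOpT::opType is now typed as the BinaryOpOperation enum rather than int32_t (see the schema changes below), and C++ does not implicitly convert int to an enum, so the raw reducetion parameter needs an explicit cast. A minimal sketch of the type error being fixed (stand-in types, abridged from the generated header; illustrative only):

    // Abbreviated stand-ins for the generated types; the real enum defines many more values.
    enum BinaryOpOperation : int { BinaryOpOperation_ADD = 0, BinaryOpOperation_SUB = 1, BinaryOpOperation_MUL = 2 };
    struct BinaryOpT { BinaryOpOperation opType = BinaryOpOperation_ADD; };

    void setReduction(BinaryOpT* param, int reducetion) {
        // param->opType = reducetion;                  // ill-formed: no implicit int -> enum conversion
        param->opType = (BinaryOpOperation)reducetion;  // explicit cast, as in the diff
    }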

include/MNN/MNNDefine.h (1 addition, 1 deletion)

@@ -78,6 +78,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
 #define STR(x) STR_IMP(x)
 #define MNN_VERSION_MAJOR 3
 #define MNN_VERSION_MINOR 2
-#define MNN_VERSION_PATCH 4
+#define MNN_VERSION_PATCH 5
 #define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
 #endif /* MNNDefine_h */
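
The only functional change is the patch bump: this sync releases MNN 3.2.5. For context, MNN_VERSION is assembled with the usual two-step stringification idiom; STR_IMP's body sits outside this hunk, but assuming the conventional stringize definition, the expansion goes:

    #define STR_IMP(x) #x   // assumed definition: the standard stringize macro
    #define STR(x) STR_IMP(x)
    // STR(MNN_VERSION_MAJOR) first expands the argument to 3, then stringizes it to "3",
    // so MNN_VERSION becomes "3" "." "2" "." "5", which string-literal concatenation
    // folds into "3.2.5".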

schema/current/TensorflowOp_generated.h (37 additions, 36 deletions)

@@ -592,11 +592,11 @@ inline const char *EnumNamePadValueMode(PadValueMode e) {

 struct BinaryOpT : public flatbuffers::NativeTable {
   typedef BinaryOp TableType;
-  int32_t opType;
+  BinaryOpOperation opType;
   DataType T;
   int32_t activationType;
   BinaryOpT()
-      : opType(0),
+      : opType(BinaryOpOperation_ADD),
         T(DataType_DT_FLOAT),
         activationType(0) {
   }
@@ -607,8 +607,8 @@ struct BinaryOp FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   static const flatbuffers::TypeTable *MiniReflectTypeTable() {
     return BinaryOpTypeTable();
   }
-  int32_t opType() const {
-    return GetField<int32_t>(4, 0);
+  BinaryOpOperation opType() const {
+    return static_cast<BinaryOpOperation>(GetField<int32_t>(4, 0));
   }
   DataType T() const {
     return static_cast<DataType>(GetField<int32_t>(6, 1));
@@ -631,8 +631,8 @@ struct BinaryOp FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
 struct BinaryOpBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_opType(int32_t opType) {
-    fbb_.AddElement<int32_t>(4, opType, 0);
+  void add_opType(BinaryOpOperation opType) {
+    fbb_.AddElement<int32_t>(4, static_cast<int32_t>(opType), 0);
   }
   void add_T(DataType T) {
     fbb_.AddElement<int32_t>(6, static_cast<int32_t>(T), 1);
@@ -654,7 +654,7 @@ struct BinaryOpBuilder {

 inline flatbuffers::Offset<BinaryOp> CreateBinaryOp(
     flatbuffers::FlatBufferBuilder &_fbb,
-    int32_t opType = 0,
+    BinaryOpOperation opType = BinaryOpOperation_ADD,
     DataType T = DataType_DT_FLOAT,
     int32_t activationType = 0) {
   BinaryOpBuilder builder_(_fbb);
@@ -4930,34 +4930,34 @@ inline flatbuffers::Offset<LSTMBlockCell> CreateLSTMBlockCell(flatbuffers::FlatB

 inline const flatbuffers::TypeTable *BinaryOpOperationTypeTable() {
   static const flatbuffers::TypeCode type_codes[] = {
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 },
-    { flatbuffers::ET_CHAR, 0, 0 }
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 0 }
   };
   static const flatbuffers::TypeFunction type_refs[] = {
     BinaryOpOperationTypeTable
@@ -5175,11 +5175,12 @@ inline const flatbuffers::TypeTable *PadValueModeTypeTable() {

 inline const flatbuffers::TypeTable *BinaryOpTypeTable() {
   static const flatbuffers::TypeCode type_codes[] = {
-    { flatbuffers::ET_INT, 0, -1 },
     { flatbuffers::ET_INT, 0, 0 },
+    { flatbuffers::ET_INT, 0, 1 },
     { flatbuffers::ET_INT, 0, -1 }
   };
   static const flatbuffers::TypeFunction type_refs[] = {
+    BinaryOpOperationTypeTable,
     DataTypeTypeTable
   };
   static const char * const names[] = {
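
The regenerated header keeps the wire format intact: opType is still read and written as an int32 at vtable slot 4, with a static_cast layered on top, so previously serialized models load unchanged; only the C++ surface becomes typed. A caller's-eye sketch of the typed builder API (illustrative usage, not part of the diff):

    #include "TensorflowOp_generated.h"  // the regenerated header above
    using namespace MNN;

    void buildBinaryOp(flatbuffers::FlatBufferBuilder& fbb) {
        // A named enum value is now expected where a bare int32_t was accepted before:
        auto op = CreateBinaryOp(fbb, BinaryOpOperation_MUL, DataType_DT_FLOAT, /*activationType=*/0);
        fbb.Finish(op);
    }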

schema/default/TensorflowOp.fbs (2 additions, 2 deletions)

@@ -1,7 +1,7 @@
 include "Tensor.fbs";
 namespace MNN;

-enum BinaryOpOperation : byte {
+enum BinaryOpOperation : int {
     ADD = 0,
     SUB = 1,
     MUL = 2,
@@ -33,7 +33,7 @@ enum BinaryOpOperation : byte {
 }

 table BinaryOp {
-    opType:int;
+    opType:BinaryOpOperation;
     T:DataType=DT_FLOAT;
     // 0 -> No Activation
     // 1 -> Relu
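
Widening the enum from byte to int is what makes retyping the field safe: a flatbuffers field's storage width follows its declared type, and the field was previously opType:int, so an int-backed enum keeps the same 4-byte slot (hence the unchanged GetField<int32_t>(4, 0) in the generated header), whereas a byte-backed enum would have changed the field's width, which is not a compatible schema evolution. A tiny sketch of the invariant (abridged enum; illustrative):

    #include <cstdint>

    // Mirrors the generated C++ enum after the schema change (abridged):
    enum BinaryOpOperation : int32_t { BinaryOpOperation_ADD = 0, BinaryOpOperation_SUB = 1 };

    // The enum's width matches the old opType:int slot exactly:
    static_assert(sizeof(BinaryOpOperation) == sizeof(int32_t), "opType slot width unchanged");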

source/backend/cpu/KVCacheManager.cpp (48 additions, 4 deletions)

@@ -264,7 +264,7 @@ void KVCacheManager::expandKVCacheInDisk(int oldMaxLength, int oldKeySize, int o
     } else if (mConfig.mQuantKey) {
         old_key.reset(Tensor::createDevice<int8_t>({mKvNumHead, UP_DIV(oldMaxLength, hP), UP_DIV(mHeadDim, lP), hP, lP}));
     } else {
-        old_key.reset(Tensor::createDevice<float>({mKvNumHead, UP_DIV(oldMaxLength, hP), UP_DIV(mHeadDim, lP), hP, lP}));
+        old_key.reset(Tensor::createDevice<float>({mKvNumHead, UP_DIV(oldMaxLength, hP), UP_DIV(mHeadDim, lP), hP, lP}));
     }
     if (mConfig.mQuantValue) {
         old_value.reset(Tensor::createDevice<fp8_t>({mKvNumHead, UP_DIV(mHeadDim, hP), UP_DIV(oldMaxLength, lP), hP, lP}));
(whitespace-only change)

@@ -387,7 +387,7 @@ void KVCacheManager::onAlloc(int kv_seq_len) {
     } else {
         mPastValue.reset(Tensor::createDevice<float>({mKvNumHead, UP_DIV(mHeadDim, hP), UP_DIV(mMaxLength, lP), hP, lP}));
     }
-    mBackend->onAcquireBuffer(mPastKey.get(), Backend::STATIC);
+    mBackend->onAcquireBuffer(mPastKey.get(), Backend::STATIC);
     mBackend->onAcquireBuffer(mPastValue.get(), Backend::STATIC);
     if (mHeadDim % lP) {
         memset(mPastKey->host<int8_t>(), 0, mPastKey->length(0) * mPastKey->stride(0) * mBytes);
(whitespace-only change)

@@ -486,6 +486,21 @@ void KVCacheManager::onRealloc(const KVMeta* meta) {
         mPastLength = start;
         return;
     }
+#if 1
+    auto dstIndex = start;
+    for (int n = 0; n < meta->n_reserve; ++n) {
+        auto begin = meta->reserve[2 * n];
+        auto size = meta->reserve[2 * n + 1];
+        auto srcIndex = start + begin;
+        if (mBytes == 2) {
+            moveKV<FLOAT16_T>(srcIndex, dstIndex, size);
+        } else {
+            moveKV<float>(srcIndex, dstIndex, size);
+        }
+        dstIndex += size;
+    }
+    mPastLength = dstIndex;
+#else
     // Don't support not align reserve
     auto align = hP;
     auto dstStart = start;
@@ -503,7 +518,7 @@
     }
     auto end = begin + start + size;
     auto endAlign = UP_DIV(end, align) * align;
-
+
     auto sizeUnit = (endAlign - startAlign) / align;
     auto dstStartAlign = UP_DIV(dstStart, align) * align;
(whitespace-only change)

@@ -539,6 +554,7 @@ void KVCacheManager::onRealloc(const KVMeta* meta) {
         lastValidSrcEnd = begin + start + size;
     }
     mPastLength = dstStart;
+#endif
 }

 void KVCacheManager::onClear() {
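
The onRealloc change above swaps compaction strategies: the #else branch is the old path, which worked on whole hP-aligned blocks and, per its own comment, didn't support unaligned reserve ranges; the new path walks each reserved range token by token through moveKV, so any [begin, begin + size) range works. Because each destination slot is at or below its source (dstIndex starts at start and begin is non-negative), the in-place copy never overwrites a token it still needs, assuming the reserve ranges are sorted, which the Metal implementation's TODO also calls out. A toy model of the compaction (standalone, illustrative values; printing stands in for moveKV):

    #include <cstdio>
    #include <vector>

    int main() {
        int start = 4;                            // tokens [0, start) are already kept
        std::vector<int> reserve = {2, 3, 8, 1};  // (begin, size) pairs, relative to start
        int dstIndex = start;
        for (size_t n = 0; n < reserve.size() / 2; ++n) {
            int begin = reserve[2 * n], size = reserve[2 * n + 1];
            int srcIndex = start + begin;
            for (int i = 0; i < size; ++i) {      // moveKV(srcIndex, dstIndex, size)
                std::printf("token %d -> slot %d\n", srcIndex + i, dstIndex + i);
            }
            dstIndex += size;
        }
        std::printf("new past length: %d\n", dstIndex);  // 8 for these values
        return 0;
    }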
@@ -551,7 +567,7 @@ void KVCacheManager::onClear() {
     } else {
         keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * ROUND_UP(mHeadDim, lP) * hP * mBytes;
     }
-    valueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * ROUND_UP(mMaxLength, lP) * hP * (mConfig.mQuantValue ? 1 : mBytes);
+    valueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * ROUND_UP(mMaxLength, lP) * hP * (mConfig.mQuantValue ? 1 : mBytes);
     unmapKVCache(keySize, valueSize);
     removeKVCacheFile();
     mKVCacheInDisk = false;
(whitespace-only change)

@@ -663,6 +679,34 @@ void KVCacheManager::pack_value(const Tensor* value, int seq_len, int kv_h) { //
     }
 }

+size_t KVCacheManager::keyIndex(int seq, int dim) const {
+    return (seq / hP) * ROUND_UP(mHeadDim, lP) * hP +
+           (dim / lP) * hP * lP +
+           (seq % hP) * lP +
+           (dim % lP);
+}
+
+size_t KVCacheManager::valueIndex(int seq, int dim) const {
+    return (dim / hP) * ROUND_UP(mMaxLength, lP) * hP +
+           (seq / lP) * hP * lP +
+           (dim % hP) * lP +
+           (seq % lP);
+}
+
+template <typename T>
+void KVCacheManager::moveKV(int src, int dst, int size) {
+    for (int h = 0; h < mKvNumHead; ++h) {
+        auto kPtr = reinterpret_cast<T*>(addrOfKey(h));
+        auto vPtr = reinterpret_cast<T*>(addrOfValue(h));
+        for (int i = 0; i < size; i++) {
+            for (int j = 0; j < mHeadDim; j++) {
+                kPtr[keyIndex(dst + i, j)] = kPtr[keyIndex(src + i, j)];
+                vPtr[valueIndex(dst + i, j)] = vPtr[valueIndex(src + i, j)];
+            }
+        }
+    }
+}
+
 void KVCacheManager::onPushBack(const Tensor * key, const Tensor * value, int add) {
     auto core = static_cast<CPUBackend*>(mBackend)->functions();
     int seq_len = add;
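
The two index helpers encode the tiled layouts: per head, keys are packed as [maxlen/hP][headdim/lP][hP][lP] and values as [headdim/hP][maxlen/lP][hP][lP], so a (seq, dim) element has to be located tile by tile rather than with a flat stride. A standalone sketch of the key arithmetic with toy tile sizes (hP, lP, headDim here are illustrative; the real values come from the CPU backend's packing functions):

    #include <cstdio>

    constexpr int hP = 4, lP = 4, headDim = 8;  // headDim is a multiple of lP here,
                                                // so ROUND_UP(headDim, lP) == headDim

    // Same arithmetic as KVCacheManager::keyIndex for [maxlen/hP][headDim/lP][hP][lP]:
    size_t keyIndex(int seq, int dim) {
        return (size_t)(seq / hP) * headDim * hP   // which hP-row tile block
             + (size_t)(dim / lP) * hP * lP        // which lP-column group inside it
             + (seq % hP) * lP + (dim % lP);       // position within the hP x lP tile
    }

    int main() {
        // One moveKV step: token 5's key row is copied into slot 2, element by element.
        for (int j = 0; j < headDim; ++j) {
            std::printf("k[%zu] = k[%zu]\n", keyIndex(2, j), keyIndex(5, j));
        }
        return 0;
    }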

source/backend/cpu/KVCacheManager.hpp (5 additions, 2 deletions)

@@ -39,7 +39,7 @@ class KVCacheManager : public NonCopyable{
 private:
     Backend * mBackend;
     KVCacheConfig mConfig;
-    std::shared_ptr<Tensor> mPastKey;    // {numhead, [maxlen/hP, headdim, hP]} or {numhead, [maxlen/hP8, headdim/lP8, hP8, lP8]}
+    std::shared_ptr<Tensor> mPastKey;    // {numhead, [maxlen/hP, headdim, hP]} or {numhead, [maxlen/hP8, headdim/lP8, hP8, lP8]}
     std::shared_ptr<Tensor> mPastValue;  // numhead, [headdim/hP, maxlen, hP]
     std::shared_ptr<Tensor> mKeyScale;   // {numhead, [maxlen/hP, hP]} or {numhead, [maxlen/hP8, hP8]}
    std::shared_ptr<Tensor> mKeyZeroPoint; // {numhead, [maxlen/hP, hP]} or {numhead, [maxlen/hP8, hP8]}
(whitespace-only change)

@@ -65,10 +65,13 @@ class KVCacheManager : public NonCopyable{
     void expandKVCacheInDisk(int oldMaxLength, int oldKeySize, int oldValueSize, int keySize, int valueSize);
     template <typename T> void pack_key(const Tensor* key, int seq_len, int kv_h);
     template <typename T> void pack_value(const Tensor* value, int seq_len, int kv_h);
+    template <typename T> void moveKV(int src, int dst, int size);
+    size_t keyIndex(int seq, int dim) const;
+    size_t valueIndex(int seq, int dim) const;
 public:
     KVCacheManager(Backend * backend, KVCacheConfig & kvConfig) {
         mBackend = backend;
-        mConfig = kvConfig;
+        mConfig = kvConfig;
     }
     ~KVCacheManager() {
         onClear();
(the mConfig pair is a whitespace-only change)

source/backend/metal/MetalAttention.mm (2 additions, 1 deletion)

@@ -176,14 +176,15 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override {
     auto valueBuf = MetalBackend::getBuffer(mCache->mPastValue.get());
     auto value_ptr = (uint8_t*)[valueBuf.first contents] + valueBuf.second;

+    auto src_start = start;
     // TODO: need to ensure reserve info is sorted
     for (int n = 0; n < mMeta->n_reserve; ++n) {
         auto begin = mMeta->reserve[2 * n];
         auto length = mMeta->reserve[2 * n + 1];
         // past_key : [mCache->mPastLength, mKvNumHead, mHeadDim]
         // past_value : [mKvNumHead, mHeadDim, mCache->mMaxLength]

-        auto copy_src_index = start + begin;
+        auto copy_src_index = src_start + begin;
         auto copy_dst_index = start;
         for(int i = 0; i < length; i++) {
             ::memcpy(key_ptr + (copy_dst_index + i) * mKvNumHead * mHeadDim * byte, key_ptr + (copy_src_index + i) * mKvNumHead * mHeadDim * byte, mKvNumHead * mHeadDim * byte);
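
The fix snapshots the source base before the loop. The hunk doesn't show it, but copy_dst_index = start for every range only makes sense if start advances past each copied range further down; assuming that advance, reusing start as the source base would make every range after the first read from a shifted offset. A toy sketch of the drift (the start += length advance is an assumption about code outside the hunk):

    int start = 4;                     // compacted length so far; also the write cursor
    int src_start = start;             // the fix: pin the source base once
    int reserve[] = {2, 3, 8, 1};      // (begin, length) pairs, relative to the base
    for (int n = 0; n < 2; ++n) {
        int begin = reserve[2 * n], length = reserve[2 * n + 1];
        int copy_src_index = src_start + begin; // old code used start + begin, which for
                                                // n = 1 reads from 15 instead of 12
        int copy_dst_index = start;
        // ... memcpy `length` rows from copy_src_index to copy_dst_index ...
        start += length;                        // assumed advance (outside the hunk)
    }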
