1 change: 1 addition & 0 deletions docs/index.rst
@@ -69,6 +69,7 @@
:name: transformers

transformers/diffusion
transformers/DiT
transformers/llm
transformers/models

83 changes: 83 additions & 0 deletions docs/transformers/DiT.md
@@ -0,0 +1,83 @@
# DiT (Diffusion Transformer) Model

## Model Support and Download

1. stable-diffusion-3.5-medium
```
https://huggingface.co/stabilityai/stable-diffusion-3.5-medium/tree/main
```
## Model Conversion
### Convert the DiT model to ONNX
```sh
optimum-cli export onnx \
--model hf_model_path \
--task stable-diffusion \
--device cuda \
onnx_save_path
```
Note: the script above depends on torch, onnx, diffusers and related libraries. You can create a conda environment for it:
```
conda env create -f env.yaml
conda activate ldm
```
Then run the conversion script inside the conda environment.

### Convert the ONNX model to MNN
Create a folder `mnn_save_path` for the diffusion MNN model; the converted MNN files will be placed in this folder.

Run the script:
```
python3 convert_mnn_sd35.py onnx_save_path mnn_save_path "--weightQuantBits=8"
```

To get further speedups on the OpenCL / Metal backends, add `--transformerFuse`:
```
# for inference on the OpenCL / Metal backends
python3 convert_mnn_sd35.py onnx_save_path mnn_save_path "--weightQuantBits=8 --transformerFuse"
```
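The two invocations above differ only in the flag string passed to `convert_mnn_sd35.py`. When converting several variants, it can help to build that argument list programmatically. A small illustrative Python helper — the helper name and defaults are ours; only the script name and flags come from the steps above:

```python
def convert_cmd(onnx_path, mnn_path, weight_quant_bits=8, transformer_fuse=False):
    """Build the argument list for the convert_mnn_sd35.py invocation shown above."""
    flags = f"--weightQuantBits={weight_quant_bits}"
    if transformer_fuse:  # only useful for OpenCL / Metal backends
        flags += " --transformerFuse"
    # the conversion script expects the combined flags as a single quoted argument
    return ["python3", "convert_mnn_sd35.py", onnx_path, mnn_path, flags]
```

The resulting list can be executed with, e.g., `subprocess.run(convert_cmd("onnx_save_path", "mnn_save_path"), check=True)`.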

## Building the Diffusion Demo
### On Linux/macOS/Windows
```
cd mnn_path
mkdir build
cd build
cmake .. -DMNN_LOW_MEMORY=ON -DMNN_BUILD_DIFFUSION=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_OPENCL=ON -DMNN_SEP_BUILD=OFF -DMNN_SUPPORT_TRANSFORMER_FUSE=ON -DCMAKE_CXX_STANDARD=17
make -j32
```
### On Android
```
cd mnn_path/project/android/build
../build_64.sh -DMNN_LOW_MEMORY=ON -DMNN_BUILD_DIFFUSION=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_OPENCL=ON -DMNN_SEP_BUILD=OFF -DMNN_SUPPORT_TRANSFORMER_FUSE=ON
../updateTest.sh
```
## Running the Diffusion Demo
```
./diffusion_sd35_demo <resource_path> <model_type> <memory_mode> <backend_type> <iteration_num> <random_seed> <output_image_name> <prompt_text>
```
Here, `resource_path` is the path to the folder with the MNN model files. Besides the MNN files, you also need the following:
### Copy resources
To run the stable-diffusion-3.5-medium model, copy the three directories tokenizer, tokenizer_2 and tokenizer_3 from the Hugging Face model into the resource_path folder.
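The copy step can be scripted. A minimal sketch using only Python's standard library — `copy_tokenizers` is a hypothetical helper; the directory names come from the step above:

```python
import shutil
from pathlib import Path

def copy_tokenizers(hf_model_path, resource_path):
    """Copy the three tokenizer directories required by the SD3.5 demo
    from the downloaded Hugging Face model into the MNN resource folder."""
    for name in ("tokenizer", "tokenizer_2", "tokenizer_3"):
        src = Path(hf_model_path) / name
        dst = Path(resource_path) / name
        shutil.copytree(src, dst, dirs_exist_ok=True)
```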
### Parameter settings
1. model_type is the model type; 0 means the stable-diffusion-3.5-medium model.
2. memory_mode indicates how much memory the device has: 0 is memory-saving mode (each model in the demo is initialized right before use and freed afterwards), 1 is memory-rich mode (all models are initialized at startup, so inference is fast and there is no initialization wait at run time), and 2 is a memory/performance trade-off (only some models are initialized at startup).
3. backend_type selects the inference backend, e.g. OpenCL/Metal/CPU.
4. iteration_num is the number of text-to-image iterations; a value between 10 and 20 is usually recommended.
5. random_seed fixes the input noise seed; a negative value means the seed is chosen randomly. If the image generated from a random seed is of poor quality, tune this seed value.
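Taken together, these parameters form the demo's argument vector. A small hypothetical Python helper that assembles and sanity-checks it — the checks only encode the recommendations above, and the defaults are illustrative:

```python
def demo_args(resource_path, prompt, image_name="demo.jpg", model_type=0,
              memory_mode=0, backend_type=3, iteration_num=20, random_seed=-1):
    """Assemble the argument vector for ./diffusion_sd35_demo with basic checks."""
    assert model_type == 0, "only stable-diffusion-3.5-medium (0) is supported"
    assert memory_mode in (0, 1, 2), "memory_mode must be 0, 1 or 2"
    if not 10 <= iteration_num <= 20:
        print("warning: 10 to 20 iterations are usually recommended")
    return ["./diffusion_sd35_demo", resource_path, str(model_type),
            str(memory_mode), str(backend_type), str(iteration_num),
            str(random_seed), image_name, prompt]
```

The list can be passed directly to `subprocess.run` on the host, or joined with spaces for an `adb shell` invocation on Android.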
### Prompt and image name settings
1. output_image_name is the file name of the generated image; by default it is written to the current working directory.
2. prompt_text is the text-to-image prompt; English prompts are recommended.
### Example command
```
# run the stable-diffusion-3.5-medium model on the OpenCL backend (backend_type 3)
./diffusion_sd35_demo mnn_sd3.5_path 0 0 3 20 0 demo.jpg "A fluffy white kitten wearing a yellow oversized hoodie, holding a warm coffee cup, cozy rainy window background, Pixar style 3D render, cute, warm lighting, fuzzy texture."
```
## FAQ
1. The demo fails with errors or a segfault. How can this be solved?
   - The most common cause is insufficient device memory: devices that support OpenCL fp16 need more than 10 GB of memory, while devices without fp16 support need more than 20 GB.
2. Errors appear when using other backends. Why?
   - Other backends do not yet support the fused transformer plugin ops; remove --transformerFuse in the onnx->mnn conversion step.
8 changes: 8 additions & 0 deletions project/android/build_64.sh
@@ -10,6 +10,14 @@ cmake ../../../ \
-DMNN_BUILD_TEST=ON \
-DANDROID_NATIVE_API_LEVEL=android-21 \
-DMNN_BUILD_FOR_ANDROID_COMMAND=true \
-DMNN_LOW_MEMORY=ON \
-DCMAKE_BUILD_TYPE=Debug \
-DMNN_BUILD_DIFFUSION=ON \
-DMNN_BUILD_OPENCV=ON \
-DMNN_IMGCODECS=ON \
-DMNN_OPENCL=ON \
-DMNN_SEP_BUILD=OFF \
-DMNN_SUPPORT_TRANSFORMER_FUSE=ON \
-DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=. $*

make -j4
1 change: 1 addition & 0 deletions project/android/updateTest.sh
@@ -5,6 +5,7 @@ make -j16
adb push ./libllm.so /data/local/tmp/$DIR/libllm.so
adb push ./llm_demo /data/local/tmp/$DIR/llm_demo
adb push ./diffusion_demo /data/local/tmp/$DIR/diffusion_demo
adb push ./diffusion_sd35_demo /data/local/tmp/$DIR/diffusion_sd35_demo
adb push ./libMNN.so /data/local/tmp/$DIR/libMNN.so
adb push ./libMNN_CL.so /data/local/tmp/$DIR/libMNN_CL.so
adb push ./libMNN_Vulkan.so /data/local/tmp/$DIR/libMNN_Vulkan.so
21 changes: 20 additions & 1 deletion transformers/diffusion/engine/CMakeLists.txt
@@ -1,5 +1,21 @@
cmake_minimum_required(VERSION 3.10)

# SentencePiece integration
set(SPM_ENABLE_SHARED OFF CACHE BOOL "" FORCE)
set(SPM_BUILD_TEST OFF CACHE BOOL "" FORCE)
set(SPM_ENABLE_TCMALLOC OFF CACHE BOOL "" FORCE)
set(SPM_PROTOBUF_PROVIDER "internal" CACHE STRING "" FORCE)
set(SPM_ABSL_PROVIDER "internal" CACHE STRING "" FORCE)

# Force C++17 for sentencepiece compatibility
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Prevent sentencepiece from installing itself
set(CMAKE_INSTALL_BINDIR bin_spm)
set(CMAKE_INSTALL_LIBDIR lib_spm)
set(CMAKE_INSTALL_INCLUDEDIR include_spm)

# source files
FILE(GLOB SRCS ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
include_directories(${CMAKE_SOURCE_DIR}/tools/cv/include/)
@@ -21,4 +37,7 @@ else()
endif()

add_executable(diffusion_demo ${CMAKE_CURRENT_LIST_DIR}/diffusion_demo.cpp)
target_link_libraries(diffusion_demo ${MNN_DEPS})

add_executable(diffusion_sd35_demo ${CMAKE_CURRENT_LIST_DIR}/diffusion_sd35_demo.cpp)
target_link_libraries(diffusion_sd35_demo ${MNN_DEPS})
66 changes: 66 additions & 0 deletions transformers/diffusion/engine/diffusion_sd35_demo.cpp
@@ -0,0 +1,66 @@
//
// diffusion_sd35_demo.cpp
//
// Created by zlaa on 2025/12/18.
//

#include <iostream>
#include "diffusion/diffusion_sd35.hpp"
#define MNN_OPEN_TIME_TRACE
#include <MNN/AutoTime.hpp>
#include <MNN/expr/ExecutorScope.hpp>
using namespace MNN::DIFFUSION;

int main(int argc, const char* argv[]) {
if (argc < 9) {
MNN_PRINT("=====================================================================================================================\n");
MNN_PRINT("Usage: ./diffusion_sd35_demo <resource_path> <model_type> <memory_mode> <backend_type> <iteration_num> <random_seed> <output_image_name> <prompt_text>\n");
MNN_PRINT("=====================================================================================================================\n");
return 0;
}

auto resource_path = argv[1];
// auto model_type = argv[2];
auto memory_mode = atoi(argv[3]);
auto backend_type = (MNNForwardType)atoi(argv[4]);
auto iteration_num = atoi(argv[5]);
auto random_seed = atoi(argv[6]);
auto img_name = argv[7];

std::string input_text;
for (int i = 8; i < argc; ++i) {
input_text += argv[i];
if (i < argc - 1) {
input_text += " ";
}
}

MNN_PRINT("Model resource path: %s\n", resource_path);
MNN_PRINT("Model type is stable diffusion 3.5\n");

if(memory_mode == 1) {
MNN_PRINT("(Memory Enough) All diffusion models will be initialized at startup, so there is no initialization wait at run time.\n");
} else {
MNN_PRINT("(Memory Lack) Each diffusion model will be initialized right before use and freed afterwards, so initialization is slower.\n");
}
MNN_PRINT("Backend type: %d\n", (int)backend_type);
MNN_PRINT("Output image name: %s\n", img_name);
MNN_PRINT("Prompt text: %s\n", input_text.c_str());

// Create Diffusion_sd35 instance
// We pass STABLE_DIFFUSION_1_5 as a placeholder for modelType, since DiffusionSD35 is dedicated to SD3.5
std::shared_ptr<DiffusionSD35> diffusion(DiffusionSD35::createDiffusionSD35(resource_path, STABLE_DIFFUSION_1_5, backend_type, memory_mode));

if (!diffusion->load()) {
MNN_ERROR("Failed to load diffusion models\n");
return -1;
}

diffusion->run(input_text, img_name, iteration_num, random_seed, [](int progress){
printf("Progress: %d%%\r", progress);
fflush(stdout);
});

printf("\nDone.\n");
return 0;
}
70 changes: 70 additions & 0 deletions transformers/diffusion/engine/include/diffusion/diffusion_sd35.hpp
@@ -0,0 +1,70 @@
//
// diffusion_sd35.hpp
//
// Created by zlaa on 2025/12/18.
//

#ifndef MNN_DIFFUSION_SD35_HPP
#define MNN_DIFFUSION_SD35_HPP

#include <map>
#include <vector>
#include <MNN/Interpreter.hpp>
#include <MNN/expr/Expr.hpp>
#include <MNN/expr/MathOp.hpp>
#include <MNN/expr/NeuralNetWorkOp.hpp>
#include <MNN/expr/Module.hpp>
#include "diffusion/diffusion.hpp"

using namespace MNN;
using namespace MNN::Express;

namespace MNN {
namespace DIFFUSION {

class MNN_PUBLIC DiffusionSD35 {
public:
DiffusionSD35(std::string modelPath, DiffusionModelType modelType, MNNForwardType backendType, int memoryMode);
virtual ~DiffusionSD35();
static DiffusionSD35* createDiffusionSD35(std::string modelPath, DiffusionModelType modelType, MNNForwardType backendType, int memoryMode);

bool run(const std::string prompt, const std::string imagePath, int iterNum, int randomSeed, std::function<void(int)> progressCallback);
bool load();

private:
// Returns {prompt_embeds, pooled_prompt_embeds}
std::pair<VARP, VARP> encode_prompt(const std::string& prompt);

VARP transformer(VARP hidden_states, VARP encoder_hidden_states, VARP pooled_projections, VARP timestep);
VARP vae_decoder(VARP latent);
VARP step_flow_match(VARP sample, VARP model_output, int index, int num_steps);
bool loadModule(int index);

private:
std::shared_ptr<Executor::RuntimeManager> runtime_manager_;
std::vector<std::shared_ptr<Module>> mModules;
// mModules[0]: text_encoder
// mModules[1]: text_encoder_2
// mModules[2]: text_encoder_3
// mModules[3]: transformer
// mModules[4]: vae_decoder

std::vector<int> mTimeSteps;
VARP mLatentVar;

private:
std::string mModelPath;
DiffusionModelType mModelType;
int mMaxTextLen = 77;
int mMaxTextLenT5 = 256;
int mMemoryMode;
MNNForwardType mBackendType;

std::unique_ptr<Tokenizer> mTokenizer1;
std::unique_ptr<Tokenizer> mTokenizer2;
std::unique_ptr<Tokenizer> mTokenizer3;
};

}
}
#endif
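The header declares a private `step_flow_match` method, which by its name applies one Euler step of a flow-matching sampler. A minimal Python sketch of such a step, assuming a plain linear sigma schedule from 1 down to 0 — the schedule in the actual implementation may be shifted or warped:

```python
def step_flow_match(sample, model_output, index, num_steps):
    """One Euler step of a flow-matching sampler: x <- x + (sigma_next - sigma) * v.

    Assumes a linear sigma schedule from 1.0 down to 0.0 over num_steps steps;
    the real MNN implementation may use a different schedule.
    """
    sigma = 1.0 - index / num_steps
    sigma_next = 1.0 - (index + 1) / num_steps
    # sigma_next < sigma, so the sample moves against the predicted velocity
    return sample + (sigma_next - sigma) * model_output
```

With a constant model output v, iterating all num_steps steps moves the sample by exactly -v, since the sigma decrements sum to 1.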