
Commit 8b9ac26

support ERNIE-MoE
1 parent 1899886 commit 8b9ac26

12 files changed, +360 -35 lines changed

README.md

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ pure C++ implementation based on [@ggerganov](https://github.com/ggerganov)'s [g

 **What's New:**

+* 2025-07-04: ERNIE-MoE
 * 2025-06-30: Hunyuan-A13B, ERNIE-Dense
 * 2025-06-21: [I can hear](./docs/multimodal.md): Qwen2-Audio
 * 2025-06-10: SmolVLM2

convert.py

Lines changed: 91 additions & 9 deletions
@@ -197,6 +197,8 @@ class ModelType(Enum):

     Apriel = 0x2400

+    ERNIE_MoE = 0x2500
+
     BCE_Embedding = 0x10000100
     BCE_ReRanker = 0x10000101
     BGE_M3 = 0x10000102
@@ -285,6 +287,7 @@ def quantize_q4_1(tensor: torch.Tensor) -> torch.CharTensor:
     tensor = torch.cat((scale.half().view(torch.int8), min_values.half().view(torch.int8), tensor), dim=-1)
     return tensor

+@torch.jit.script
 def qkx2_quants(x: torch.Tensor, nmax, rmin, rdelta, nstep: int, use_mad: bool):
     assert x.dim() == 1
     N = x.shape[0]
@@ -297,7 +300,7 @@ def qkx2_quants(x: torch.Tensor, nmax, rmin, rdelta, nstep: int, use_mad: bool):
     if min_x > 0: min_x = torch.tensor(0)
     if min_x == max_x:
         L = torch.zeros(N)
-        return 0.0, -min_x, L
+        return torch.tensor(0.0), -min_x, L

     iscale = nmax / (max_x - min_x)
     scale = 1 / iscale
@@ -322,7 +325,7 @@ def qkx2_quants(x: torch.Tensor, nmax, rmin, rdelta, nstep: int, use_mad: bool):
         this_scale = (sum_w * sum_xl - sum_x * sum_l)/D
         this_min = (sum_l2 * sum_x - sum_l * sum_xl)/D
         if this_min > 0:
-            this_min = 0
+            this_min = torch.tensor(0)
             this_scale = sum_xl / sum_l2

         diff = this_scale * l + this_min - x
@@ -335,19 +338,20 @@ def qkx2_quants(x: torch.Tensor, nmax, rmin, rdelta, nstep: int, use_mad: bool):

     return scale, -min_x, L

-def quantize_q4_k_block(tensor: torch.Tensor) -> torch.CharTensor:
+@torch.jit.script
+def quantize_q4_k_block(tensor: torch.Tensor, GGML_QK_K: int) -> torch.CharTensor:
     assert tensor.shape == (GGML_QK_K, )
     tensor = tensor.view(-1, 32)

-    subblocks = [qkx2_quants(tensor[i], 15, -1.0, 0.1, 20, False) for i in range(tensor.shape[0])]
+    subblocks = [qkx2_quants(tensor[i], torch.tensor(15), torch.tensor(-1.0), torch.tensor(0.1), 20, False) for i in range(tensor.shape[0])]
     scale = torch.stack([x[0] for x in subblocks])
     min_x = torch.stack([x[1] for x in subblocks])

     max_scale = torch.max(scale)
     max_min = torch.max(min_x)

-    inv_scale = 63.0 / max_scale if max_scale > 0 else 0.0
-    inv_min = 64.0 / max_min if max_min > 0 else 0.0
+    inv_scale = torch.tensor(63.0) / max_scale if max_scale > 0 else torch.tensor(0.0)
+    inv_min = torch.tensor(64.0) / max_min if max_min > 0 else torch.tensor(0.0)

     ls = (inv_scale * scale).round().clamp(max=63)
     lm = (inv_min * min_x).round().clamp(max=63)
@@ -380,11 +384,12 @@ def quantize_q4_k_block(tensor: torch.Tensor) -> torch.CharTensor:

     return r

-def quantize_q4_k(tensor: torch.Tensor) -> torch.CharTensor:
+@torch.jit.script
+def quantize_q4_k(tensor: torch.Tensor, GGML_QK_K: int) -> torch.CharTensor:
     # equivalent to dequantize_row_q4_K in ggml-quants.c
     assert tensor.shape[tensor.ndim - 1] % GGML_QK_K == 0
     tensor = tensor.view(-1, GGML_QK_K)
-    blocks = [quantize_q4_k_block(tensor[i]) for i in range(tensor.shape[0])]
+    blocks = [quantize_q4_k_block(tensor[i], GGML_QK_K) for i in range(tensor.shape[0])]
     tensor = torch.cat(blocks, dim=-1)
     return tensor

@@ -411,7 +416,7 @@ def dump_tensor(f, name: str, tensor: torch.Tensor, ggml_type: GGMLType):
         elif ggml_type == GGMLType.Q4_1:
             tensor = quantize_q4_1(tensor)
         elif ggml_type == GGMLType.Q4_K:
-            tensor = quantize_q4_k(tensor)
+            tensor = quantize_q4_k(tensor, GGML_QK_K)
         else:
             raise NotImplementedError(f"Cannot dump tensor of dtype {tensor.dtype}")
     except Exception as e:
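Since `quantize_q4_k` is now TorchScript-compiled, the super-block size is passed in explicitly at every call site instead of being picked up as a module-level constant. A minimal usage sketch (shapes and values are made up; it assumes `convert.py` is importable and that `GGML_QK_K` is the usual 256-element super-block):

```python
import torch

from convert import quantize_q4_k  # assumes convert.py is on the import path

GGML_QK_K = 256  # assumed super-block size; convert.py defines its own constant

# Dummy weight tensor; the last dimension must be a multiple of GGML_QK_K,
# as asserted inside quantize_q4_k.
weights = torch.randn(4, GGML_QK_K)

# After this commit the block size is an explicit argument of the scripted
# function rather than a captured global.
packed = quantize_q4_k(weights, GGML_QK_K)
print(packed.shape, packed.dtype)  # int8 buffer holding the packed Q4_K blocks
```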
@@ -6364,6 +6369,81 @@ def get_weight_names(config):

         return weight_names

+class ERNIEMoEConverter(BaseConverter):
+    MODEL_TYPE = ModelType.ERNIE_MoE
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        assert not config.use_bias
+        assert len(config.moe_capacity) == 3
+        if config.rope_scaling is not None:
+            assert config.rope_scaling == 1.0, 'rope_scaling must equal to 1.0'
+
+        dump_llama_like_config(f, config, ggml_type)
+        config_values = [
+            config.num_key_value_heads,
+            1 if config.tie_word_embeddings else 0,
+            config.moe_num_experts,
+            config.moe_num_shared_experts,
+            config.moe_layer_start_index,
+            config.moe_intermediate_size,
+            config.moe_capacity[0],
+            config.moe_capacity[1],
+            config.moe_capacity[2],
+            config.moe_k,
+            config.moe_layer_interval,
+            1 if config.moe_use_aux_free else 0,
+        ]
+        f.write(struct.pack("i" * len(config_values), *config_values))
+        f.write(struct.pack("<f", config.rope_theta))
+
+    @staticmethod
+    def get_weight_names(config):
+        weight_names = ["model.embed_tokens.weight"]
+        for i in range(config.num_hidden_layers):
+            weight_names += [
+                f"model.layers.{i}.input_layernorm.weight",
+                f"model.layers.{i}.post_attention_layernorm.weight",
+                f"model.layers.{i}.self_attn.k_proj.weight",
+                f"model.layers.{i}.self_attn.o_proj.weight",
+                f"model.layers.{i}.self_attn.q_proj.weight",
+                f"model.layers.{i}.self_attn.v_proj.weight",
+            ]
+
+            if (i >= config.moe_layer_start_index) and ((i + 1) % config.moe_layer_interval == 0):
+                weight_names += [
+                    f"model.layers.{i}.mlp.gate.weight",
+                    f"model.layers.{i}.mlp.shared_experts.gate_proj.weight",
+                    f"model.layers.{i}.mlp.shared_experts.up_proj.weight",
+                    f"model.layers.{i}.mlp.shared_experts.down_proj.weight",
+                ]
+                if config.moe_use_aux_free:
+                    weight_names += [
+                        f"model.layers.{i}.mlp.moe_statics.e_score_correction_bias",
+                    ]
+                for j in range(config.moe_num_experts):
+                    weight_names += [
+                        f"model.layers.{i}.mlp.experts.{j}.gate_proj.weight",
+                        f"model.layers.{i}.mlp.experts.{j}.up_proj.weight",
+                        f"model.layers.{i}.mlp.experts.{j}.down_proj.weight",
+                    ]
+            else:
+                weight_names += [
+                    f"model.layers.{i}.mlp.down_proj.weight",
+                    f"model.layers.{i}.mlp.gate_proj.weight",
+                    f"model.layers.{i}.mlp.up_proj.weight",
+                ]
+
+        weight_names += [
+            "model.norm.weight",
+        ]
+
+        if not config.tie_word_embeddings:
+            weight_names += [
+                "lm_head.weight"
+            ]
+        return weight_names
+
 class KimiVLConverter(BaseConverter):
     MODEL_TYPE = ModelType.KimiVL

@@ -7516,6 +7596,8 @@ def main():
         QWen3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'Ernie4_5_ForCausalLM':
         ERNIEDenseConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+    elif arch == 'Ernie4_5_MoeForCausalLM':
+        ERNIEMoEConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'deepseek-r1-distill-qwen3':
         QWen3Converter.MODEL_TYPE = ModelType.DeepSeek_R1_Distill_QWen3
         QWen3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
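For orientation, the twelve integers written by `ERNIEMoEConverter.dump_config` above correspond one-to-one to the `int` fields of the `chatllm::ernie::moe::Config` struct added in `models/ernie.h` further down, followed by a single little-endian `float` for `rope_theta`. A small round-trip sketch of just that tail of the header, using illustrative values rather than a real checkpoint's config:

```python
import struct

# Illustrative values only; a real conversion reads them from config.json.
config_ints = [
    4,        # num_key_value_heads
    1,        # tie_word_embeddings
    64,       # moe_num_experts
    2,        # moe_num_shared_experts
    1,        # moe_layer_start_index
    1536,     # moe_intermediate_size
    8, 8, 8,  # moe_capacity[0..2]
    6,        # moe_k
    1,        # moe_layer_interval
    1,        # moe_use_aux_free -> use_correction_bias on the C++ side
]
rope_theta = 500000.0

# The same two writes performed by ERNIEMoEConverter.dump_config.
blob = struct.pack("i" * len(config_ints), *config_ints) + struct.pack("<f", rope_theta)

# Reading it back in the field order of chatllm::ernie::moe::Config.
ints = struct.unpack("i" * len(config_ints), blob[:4 * len(config_ints)])
(theta,) = struct.unpack("<f", blob[4 * len(config_ints):])
assert list(ints) == config_ints and abs(theta - rope_theta) < 1e-3
```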

docs/models.md

Lines changed: 2 additions & 2 deletions
@@ -58,8 +58,8 @@

 Two optimization modes are defined: speed (default) and memory. See `BaseMLAttention`.

-* ERNIE (`Ernie4_5_ForCausalLM`)
-    * [x] [0.3B](https://huggingface.co/baidu/ERNIE-4.5-0.3B-PT/tree/c163aa422d265f995b024d1322d91c4e3cb52ec8)
+* ERNIE (`Ernie4_5_ForCausalLM`, `Ernie4_5_MoeForCausalLM`)
+    * [x] [0.3B](https://huggingface.co/baidu/ERNIE-4.5-0.3B-PT/tree/c163aa422d265f995b024d1322d91c4e3cb52ec8), [A3B](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-PT/tree/b24b8917f5379129992dad46c279683c7b845c96)

 * EXAONE (`ExaoneForCausalLM`)
     * [x] v3.5: [Instruct-2.4B](https://huggingface.co/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct), [Instruct-7.8B](https://huggingface.co/LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct), [Instruct-32B](https://huggingface.co/LGAI-EXAONE/EXAONE-3.5-32B-Instruct)

models/ernie.cpp

Lines changed: 132 additions & 1 deletion
@@ -13,7 +13,7 @@ namespace chatllm::ernie::dense

     static ChatHistoryEncoder _chat_encoder;

-    Tokenizer::Tokenizer(const Config &config)
+    Tokenizer::Tokenizer(const BaseConfig &config)
         : chatllm::llama::v2::Tokenizer(config, &_chat_encoder)
     {}

@@ -65,4 +65,135 @@ namespace chatllm::ernie::dense
             attention.freq_base = config.rope_theta;
         }
     }
+}
+
+namespace chatllm::ernie::moe
+{
+    template <class ErnieMoEMLP> class ErnieMoEBlock : public LMBlock1<RMSNorm, LlamaSelfAttention, RMSNorm, ErnieMoEMLP>
+    {
+    public:
+        ErnieMoEBlock(InitContext *ctx, int hidden_size, int num_attention_heads, int intermediate_size,
+                      int mlp_intermediate_size1, int mlp_intermediate_size2,
+                      int num_kv_heads, int head_dim, int max_length)
+            : LMBlock1<RMSNorm, LlamaSelfAttention, RMSNorm, ErnieMoEMLP>(ctx, hidden_size, num_attention_heads, intermediate_size, mlp_intermediate_size1, mlp_intermediate_size2,
+                                                                          num_kv_heads, head_dim, max_length)
+        {}
+    };
+
+    template <int NUM_EXPERTS, int EXPERTS_PER_TOK> class ErnieSparseMoE : public BaseSparseMLP
+    {
+    public:
+        ErnieSparseMoE(InitContext *ctx, int hidden_size, int intermediate_size)
+            : BaseSparseMLP(ctx, hidden_size, intermediate_size, NUM_EXPERTS, EXPERTS_PER_TOK, ActFunc::SILU, false)
+        {
+        }
+    };
+
+    template <const int NUM_EXPERTS, const int EXPERTS_PER_TOK, const int EFFECTIVE_EXPERTS_PER_TOK> class GenericConditionalGeneration : public BaseModelForConditionalGeneration
+    {
+    public:
+        typedef CombinedMLP<ErnieSparseMoE<NUM_EXPERTS, EXPERTS_PER_TOK>, SiLUMLP> ErnieMoEMLP;
+        typedef ErnieMoEBlock<ErnieMoEMLP> MoEBlock;
+        typedef BaseModelForConditionalGeneration Base;
+        typedef HeterogeneousModel ModelClass;
+    public:
+        GenericConditionalGeneration() = default;
+
+        GenericConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config)
+            : BaseModelForConditionalGeneration(MODEL_TYPE_ERNIE_MOE, config, runtime_config, 4096 * 4),
+              config(config)
+        {
+            const size_t tensor_ovhd = ggml_tensor_overhead();
+            const size_t moe_layers = get_moe_layer_num();
+            const size_t dense_layers = config.num_hidden_layers - moe_layers;
+            const size_t num_tensors = 2 + dense_layers * (12) + moe_layers * (16 + 0) + (config.tie_word_embeddings ? 0 : 1);
+            const size_t ctx_size = num_tensors * tensor_ovhd;
+            w_ctx_.gctx = GGMLContext({.mem_size = ctx_size, .mem_buffer = nullptr, .no_alloc = true});
+            w_ctx_.dtype = config.dtype;
+
+            if (config.use_correction_bias)
+                ggml::log(GGML_LOG_LEVEL_WARN, "use_correction_bias is ignored, see https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-PT/blob/main/modeling_ernie4_5_moe.py#L369");
+
+            auto create_layer = [&](InitContext *ctx, int layer_index) -> Block * {
+                if (is_layer_moe(layer_index))
+                {
+                    auto layer = new MoEBlock(ctx, config.hidden_size, config.num_attention_heads, config.intermediate_size,
+                                              config.moe_intermediate_size, config.moe_intermediate_size * config.moe_num_shared_experts,
+                                              config.num_key_value_heads, config.hidden_size / config.num_attention_heads,
+                                              config.max_length);
+                    layer->attention.freq_base = config.rope_theta;
+                    layer->mlp.mlp1.norm_topk_prob = true;
+                    return layer;
+                }
+                else
+                {
+                    auto layer = new LlamaBlock(ctx, config.hidden_size, config.num_attention_heads, config.intermediate_size,
+                                                config.num_key_value_heads, config.max_length);
+                    layer->attention.freq_base = config.rope_theta;
+                    return layer;
+                }
+            };
+
+            auto transformer = new ModelClass(&w_ctx_, config.num_hidden_layers, config.hidden_size,
+                                              create_embedding<Embedding>(&w_ctx_, config),
+                                              create_final_norm<RMSNorm>(&w_ctx_, config),
+                                              config.tie_word_embeddings ? nullptr : create_lm_head(&w_ctx_, config, false), create_layer);
+            Base::transformer = transformer;
+
+            w_ctx_.check_used_mem_size(true);
+        }
+
+    protected:
+        int get_moe_layer_num(void)
+        {
+            int r = 0;
+            for (int i = 0; i < config.num_hidden_layers; i++)
+                if (is_layer_moe(i)) r++;
+            return r;
+        }
+
+        bool is_layer_moe(int i)
+        {
+            if (i < config.moe_layer_start_index) return false;
+            return (i % config.moe_layer_interval) == 0;
+        }
+    public:
+        Config config;
+    };
+
+    namespace experts_64
+    {
+        const int NUM_EXPERTS = 64;
+        const int EXPERTS_PER_TOK = 6;
+
+        // make it easy to test with different number of experts.
+        const int EFFECTIVE_EXPERTS_PER_TOK = EXPERTS_PER_TOK;
+
+        typedef GenericConditionalGeneration<NUM_EXPERTS, EXPERTS_PER_TOK, EFFECTIVE_EXPERTS_PER_TOK> ConditionalGeneration;
+    }
+
+    ConditionalGeneration::ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config)
+    {
+        switch (config.moe_num_experts)
+        {
+        case experts_64::NUM_EXPERTS:
+            set_proxy_model(new experts_64::ConditionalGeneration(config, runtime_config));
+            break;
+        default:
+            CHATLLM_CHECK(false) << "unsupported MoE param: num_experts = " << config.moe_num_experts;
+            break;
+        }
+    }
+
+    void ConditionalGeneration::load(ModelLoader &loader)
+    {
+        loader.add_tensor_name_translations({
+            {".mlp2.", ".shared_experts."},
+            {".mlp1.gate.", ".gate."},
+            {".mlp1.experts.", ".experts."},
+            {".mlp1.gate_score_correction_bias", ".moe_statics.e_score_correction_bias"}
+        });
+
+        ModelProxy::load(loader);
+    }
 }
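The constructor above decides per layer whether to build a `MoEBlock` or a plain `LlamaBlock`, then sizes the ggml context from that split: 12 weight tensors per dense layer, 16 per MoE layer, plus the embedding, the final norm, and an optional `lm_head`. A rough sketch of that bookkeeping in Python, with made-up layer counts:

```python
def is_layer_moe(i, moe_layer_start_index, moe_layer_interval):
    # Mirrors GenericConditionalGeneration::is_layer_moe above.
    if i < moe_layer_start_index:
        return False
    return i % moe_layer_interval == 0

# Illustrative values; a real ERNIE-4.5 config supplies the actual numbers.
num_hidden_layers = 28
moe_layer_start_index = 1
moe_layer_interval = 1
tie_word_embeddings = True

moe_layers = sum(is_layer_moe(i, moe_layer_start_index, moe_layer_interval)
                 for i in range(num_hidden_layers))
dense_layers = num_hidden_layers - moe_layers

# Same formula the constructor uses to size the ggml context.
num_tensors = 2 + dense_layers * 12 + moe_layers * 16 + (0 if tie_word_embeddings else 1)
print(moe_layers, dense_layers, num_tensors)  # 27 1 446 with these made-up values
```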

models/ernie.h

Lines changed: 30 additions & 2 deletions
@@ -18,7 +18,7 @@ namespace chatllm::ernie::dense
     class Tokenizer : public chatllm::llama::v2::Tokenizer
     {
     public:
-        Tokenizer(const Config &config);
+        Tokenizer(const BaseConfig &config);
     };

     class ConditionalGeneration : public chatllm::llama::v2::GenericConditionalGeneration<LlamaBlock>
@@ -27,4 +27,32 @@ namespace chatllm::ernie::dense
         ConditionalGeneration() = default;
         ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type = ModelType::MODEL_TYPE_ERNIE_DENSE);
     };
-}
+}
+
+namespace chatllm::ernie::moe
+{
+    struct Config : public chatllm::llama::v2::Config
+    {
+        int num_key_value_heads;
+        int tie_word_embeddings;
+        int moe_num_experts;
+        int moe_num_shared_experts;
+        int moe_layer_start_index;
+        int moe_intermediate_size;
+        int moe_capacity[3];
+        int moe_k;
+        int moe_layer_interval;
+        int use_correction_bias;
+
+        float rope_theta;
+    };
+
+    typedef dense::Tokenizer Tokenizer;
+
+    class ConditionalGeneration : public ModelProxy
+    {
+    public:
+        ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config);
+        void load(ModelLoader &loader);
+    };
+}

scripts/models.json

Lines changed: 9 additions & 0 deletions
@@ -2953,6 +2953,15 @@
                         "url": "chatllm_quantized_ernie/ernie-4.5-0.3b.bin"
                     }
                 }
+            },
+            "a3b": {
+                "default": "q4_1",
+                "quantized": {
+                    "q8": {
+                        "size": 13643262720,
+                        "url": "chatllm_quantized_ernie/ernie-4.5-21b-a3b-q4_1.bin"
+                    }
+                }
             }
         }
     }
