Commit 2dee2b5: support megrez2

1 parent 76de927

10 files changed: +546 -64 lines

README.md (1 addition, 0 deletions)

```diff
@@ -33,6 +33,7 @@ LittleAcademia[<a href="https://github.com/foldl/little-academia" style="text-
 
 **What's New:**
 
+* 2025-10-31: Megrez2-3x7B-A3B
 * 2025-10-25: LLaDA2.0-mini
 * 2025-10-14: Nanonets-OCR2
 * 2025-10-13: dots.ocr
```

convert.py (78 additions, 0 deletions)

```diff
@@ -221,6 +221,8 @@ class ModelType(Enum):
     BailingMoE2 = 0x2E00
     LlaDA2      = 0x2E01
 
+    MegrezMoE   = 0x2F00
+
     BCE_Embedding = 0x10000100
     BCE_ReRanker  = 0x10000101
     BGE_M3        = 0x10000102
```
```diff
@@ -8099,6 +8101,80 @@ def get_weight_names(config):
 
         return weight_names
 
+class MegrezMoEConverter(BaseConverter):
+    MODEL_TYPE = ModelType.MegrezMoE
+
+    @classmethod
+    def pp(cls, config, name: str, tensor):
+        return DeepSeekV1Converter.pp(config, name, tensor)
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        assert config.hidden_act == 'silu', "hidden_act must be silu"
+        assert config.attention_bias == False, "attention_bias must be False"
+        assert config.ep_size == 1, "ep_size must be 1"
+        assert config.rope_scaling is None
+        assert config.scoring_func == 'sigmoid', "scoring_func must be 'sigmoid'"
+        assert config.topk_method == 'noaux_tc', "topk_method must be 'noaux_tc'"
+        assert config.n_routed_experts is not None, "n_routed_experts must not be null"
+        assert config.pre_gate
+
+        config.scoring_func = 'softmax'
+        DeepSeekV1Converter.dump_config(f, config, ggml_type)
+
+        config_values = [
+            config.experts_shared_frequency,
+            config.n_group,
+            config.topk_group,
+            config.routed_scaling_factor,
+        ]
+        f.write(struct.pack("<iiif", *config_values))
+
+    @staticmethod
+    def get_weight_names(config):
+        weight_names = ["model.embed_tokens.weight",
+                        "model.norm.weight",
+                        "lm_head.weight"]
+        for i in range(config.num_hidden_layers):
+
+            weight_names += [
+                f"model.layers.{i}.self_attn.k_proj.weight",
+                f"model.layers.{i}.self_attn.q_proj.weight",
+                f"model.layers.{i}.self_attn.v_proj.weight",
+                f"model.layers.{i}.self_attn.o_proj.weight",
+            ]
+
+            if (config.n_routed_experts is not None
+                and (i >= config.first_k_dense_replace)
+                and (i % config.moe_layer_freq == 0)):
+                weight_names += [
+                    f"model.layers.{i}.mlp.gate.e_score_correction_bias",
+                    f"model.layers.{i}.mlp.gate.weight",
+                    f"model.layers.{i}.mlp.shared_experts.gate_proj.weight",
+                    f"model.layers.{i}.mlp.shared_experts.up_proj.weight",
+                    f"model.layers.{i}.mlp.shared_experts.down_proj.weight",
+                ]
+                if (i - config.first_k_dense_replace) % config.experts_shared_frequency == 0:
+                    for j in range(config.n_routed_experts):
+                        weight_names += [
+                            f"model.layers.{i}.mlp.experts.{j}.gate_proj.weight",
+                            f"model.layers.{i}.mlp.experts.{j}.up_proj.weight",
+                            f"model.layers.{i}.mlp.experts.{j}.down_proj.weight",
+                        ]
+            else:
+                weight_names += [
+                    f"model.layers.{i}.mlp.gate_proj.weight",
+                    f"model.layers.{i}.mlp.up_proj.weight",
+                    f"model.layers.{i}.mlp.down_proj.weight",
+                ]
+
+            weight_names += [
+                f"model.layers.{i}.input_layernorm.weight",
+                f"model.layers.{i}.post_attention_layernorm.weight",
+            ]
+
+        return weight_names
+
 def convert_grok_1_base(args, vocab, ggml_type):
     def ffn_size(emb_size, widening_factor):
         _ffn_size = int(widening_factor * emb_size) * 2 // 3
```
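The interesting part of `get_weight_names` is the nested condition: every MoE layer stores its own gate and shared experts, but routed expert tensors exist only on every `experts_shared_frequency`-th MoE layer; presumably the layers in between reuse the routed experts of the nearest earlier owning layer at inference time (that logic lives on the C++ side, not in this diff). A minimal sketch of the resulting layout, using a hypothetical `layer_kinds` helper that is not part of the commit:

```python
# Hypothetical helper (not in the commit): classify each layer according to
# the same conditions MegrezMoEConverter.get_weight_names uses.
def layer_kinds(config):
    kinds = []
    for i in range(config.num_hidden_layers):
        is_moe = (config.n_routed_experts is not None
                  and i >= config.first_k_dense_replace
                  and i % config.moe_layer_freq == 0)
        if not is_moe:
            kinds.append((i, "dense MLP"))
        elif (i - config.first_k_dense_replace) % config.experts_shared_frequency == 0:
            kinds.append((i, "MoE: gate + shared experts + own routed experts"))
        else:
            kinds.append((i, "MoE: gate + shared experts, routed experts reused"))
    return kinds
```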
```diff
@@ -8719,6 +8795,8 @@ def main():
         JanusConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch.endswith('DotsOCRForCausalLM'):
         DotsOCRConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+    elif arch.endswith('MegrezMoeForCausalLM'):
+        MegrezMoEConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'deepseek-r1-distill-qwen3':
         QWen3Converter.MODEL_TYPE = ModelType.DeepSeek_R1_Distill_QWen3
         QWen3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
```
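Note that `dump_config` appends four Megrez-specific fields, three little-endian `int32`s and one `float32`, after the DeepSeekV1 config block. Here is a sketch of the matching read side, assuming only the `"<iiif"` layout shown above; the function name and returned dict are illustrative, not the loader's actual API:

```python
import struct

def read_megrez_extra_config(f):
    # Mirrors struct.pack("<iiif", ...) in MegrezMoEConverter.dump_config:
    # experts_shared_frequency, n_group, topk_group are int32;
    # routed_scaling_factor is float32.
    esf, n_group, topk_group, routed_scaling_factor = struct.unpack(
        "<iiif", f.read(struct.calcsize("<iiif")))
    return {
        "experts_shared_frequency": esf,
        "n_group": n_group,
        "topk_group": topk_group,
        "routed_scaling_factor": routed_scaling_factor,
    }
```

On the dispatch side, the new `elif` keys off the `architectures` entry in the checkpoint's `config.json`, so converting a Megrez2 model should not need an explicit `-a` override.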

docs/models.md (3 additions, 0 deletions)

```diff
@@ -172,6 +172,9 @@
 
     For other models that use the `LlamaForCausalLM` architecture, for example, [aiXcoder-7B](https://huggingface.co/aiXcoder/aixcoder-7b-base), try `-a Yi`.
 
+* Megrez (`MegrezMoeForCausalLM`)
+    * [x] [3x7B-A3B](https://huggingface.co/Infinigence/Megrez2-3x7B-A3B/tree/3ffc3b7c0ffc0f0b27d71fba2a97dcc14c797bb4)
+
 * MiniCPM (`MiniCPMForCausalLM`, `MiniCPM3ForCausalLM`)
     * [x] [DPO-2B](https://huggingface.co/openbmb/MiniCPM-2B-dpo-fp16), [SFT-2B](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16),
       [SFT-1B](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16)🔥
```

models/bailing.cpp (17 additions, 38 deletions)

```diff
@@ -119,52 +119,31 @@ namespace chatllm::bailing::moe2
     const int NUM_EXPERTS = 256;
     const int EXPERTS_PER_TOK = 8;
 
-    class BailingSparseMoE : public BaseSparseMLP
+    class BailingSparseMoE : public GenericGroupedSparseMoE
     {
     public:
-        BailingSparseMoE(InitContext *ctx, int hidden_size, int intermediate_size, int num_experts = NUM_EXPERTS, int experts_per_tok = EXPERTS_PER_TOK)
-            : BaseSparseMLP(ctx, hidden_size, intermediate_size, num_experts, experts_per_tok, ActFunc::SILU, true),
-              n_group(-1), topk_group(-1)
+        BailingSparseMoE(InitContext *ctx, int hidden_size, int intermediate_size, int num_experts = NUM_EXPERTS, int experts_per_tok = EXPERTS_PER_TOK):
+            GenericGroupedSparseMoE(ctx, hidden_size, num_experts, experts_per_tok, true, false, false, false),
+            experts(ctx, hidden_size, intermediate_size, num_experts, experts_per_tok, ActFunc::SILU, false)
         {
-            score_func = ScoreFunc::Sigmoid;
-            always_scaling = true;
+            set_experts(&experts);
         }
-    protected:
-        ggml::tensor *select_experts(ComputeContext *ctx, ggml::tensor *corrected_score) override;
 
+        int64_t get_param_num(bool effective_only) const override
+        {
+            int64_t r = GenericSparseMLP::get_param_num(effective_only);
+            r += experts.get_param_num(effective_only);
+            return r;
+        }
+        void load(const std::string &path, TensorLoader *loader) override
+        {
+            GenericSparseMLP::load(path, loader);
+            experts.load(path + "experts.", loader);
+        }
     public:
-        int n_group;
-        int topk_group;
+        MultiMLP experts;
     };
 
-    ggml::tensor *BailingSparseMoE::select_experts(ComputeContext *ctx, ggml::tensor *corrected_score)
-    {
-        const int n_expert = num_local_experts;
-        const int experts_per_group = n_expert / n_group;
-        CHATLLM_CHECK(ggml::get_dim(corrected_score, 2) == 1);
-
-        ggml::tensor * selected_experts = nullptr;
-
-        ggml::tensor *grouped_scores = ggml::reshape_4d(ctx, corrected_score, experts_per_group, num_experts_per_tok,
-            ggml::get_dim(corrected_score, 1), ggml::get_dim(corrected_score, 2));
-        selected_experts = ggml::top_k(ctx, grouped_scores, topk_group);
-
-        ggml::tensor *selected_experts_i64 = ggml::cast_int_to_i64(ctx, selected_experts);
-
-        CHATLLM_CHECK(ggml::get_dim(grouped_scores, 3) == 1);
-        grouped_scores = ggml::reshape_4d(ctx, grouped_scores, 1, ggml::get_dim(grouped_scores, 0), ggml::get_dim(grouped_scores, 1), ggml::get_dim(grouped_scores, 2));
-        ggml::tensor *selected_group_scores = ggml::scale(ctx, grouped_scores, 0.0f);
-        grouped_scores = ggml::get_rows(ctx, grouped_scores, selected_experts);
-        selected_group_scores = ggml::set_rows(ctx, selected_group_scores, selected_experts_i64, grouped_scores);
-
-        selected_group_scores = ggml::reshape_3d(ctx, selected_group_scores,
-            ggml::get_dim(corrected_score, 0), ggml::get_dim(corrected_score, 1), ggml::get_dim(corrected_score, 2));
-
-        selected_experts = ggml::top_k(ctx, selected_group_scores, num_experts_per_tok);
-
-        return selected_experts;
-    }
-
     class AttnParams
     {
     public:
```
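This hunk retires Bailing's hand-rolled `select_experts` in favor of the shared `GenericGroupedSparseMoE` base, presumably so the new Megrez2 model can reuse the same grouped routing. For intuition, a rough NumPy sketch of group-limited top-k selection in the spirit of the removed code (scores of non-selected groups are zeroed before the final top-k); how each group is scored varies between models, so treat the `max` below as one common choice rather than this repo's exact rule:

```python
import numpy as np

def grouped_topk(scores: np.ndarray, n_group: int, topk_group: int, top_k: int) -> np.ndarray:
    """Group-limited routing: keep the best `topk_group` expert groups,
    zero everything else, then take the per-token top-k experts."""
    experts_per_group = scores.shape[0] // n_group
    groups = scores.reshape(n_group, experts_per_group)
    # Score each group by its best expert (one common choice; some models
    # sum the top-2 experts per group instead).
    best_groups = np.argsort(groups.max(axis=-1))[-topk_group:]
    # Sigmoid gate scores are non-negative, so zero-masking excludes a group,
    # matching the removed code's scale-by-0 trick.
    masked = np.zeros_like(groups)
    masked[best_groups] = groups[best_groups]
    return np.argsort(masked.reshape(-1))[-top_k:][::-1]
```

For this file's constants, a call would look like `grouped_topk(sigmoid_scores, n_group, topk_group, EXPERTS_PER_TOK)` over 256 experts.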

models/gpt.cpp (2 additions, 1 deletion)

```diff
@@ -164,7 +164,8 @@ Reasoning: medium
         norm_topk_prob = false;
     }
 public:
-    ggml::tensor *forward(ComputeContext *ctx, ggml::tensor *hidden_states)
+    using Block::forward;
+    ggml::tensor *forward(ComputeContext *ctx, ggml::tensor *hidden_states) override
     {
         const int64_t qlen = hidden_states->ne[1];
         const int n_expert = num_local_experts;
```

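A note on the two added tokens: marking the derived `forward` as `override` makes the compiler verify it actually overrides a virtual function in the base, and `using Block::forward;` re-exposes the other `forward` overloads inherited from `Block` that the derived declaration would otherwise hide, since C++ name lookup stops at the first scope that declares the name.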