@@ -221,6 +221,8 @@ class ModelType(Enum):
     BailingMoE2 = 0x2E00
     LlaDA2 = 0x2E01
 
+    MegrezMoE = 0x2F00
+
     BCE_Embedding = 0x10000100
     BCE_ReRanker = 0x10000101
     BGE_M3 = 0x10000102
@@ -8099,6 +8101,80 @@ def get_weight_names(config):
 
         return weight_names
 
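+# Converter for Megrez-MoE checkpoints. Tensor post-processing and the base config
+# layout are delegated to DeepSeekV1Converter; the Megrez-specific MoE fields are
+# appended to the config block afterwards.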
+class MegrezMoEConverter(BaseConverter):
+    MODEL_TYPE = ModelType.MegrezMoE
+
+    @classmethod
+    def pp(cls, config, name: str, tensor):
+        return DeepSeekV1Converter.pp(config, name, tensor)
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        assert config.hidden_act == 'silu', "hidden_act must be silu"
+        assert config.attention_bias == False, "attention_bias must be False"
+        assert config.ep_size == 1, "ep_size must be 1"
+        assert config.rope_scaling is None
+        assert config.scoring_func == 'sigmoid', "scoring_func must be 'sigmoid'"
+        assert config.topk_method == 'noaux_tc', "topk_method must be 'noaux_tc'"
+        assert config.n_routed_experts is not None, "n_routed_experts must not be null"
+        assert config.pre_gate
+
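+        # The checkpoint declares sigmoid scoring with 'noaux_tc' top-k selection;
+        # scoring_func is overridden to 'softmax' before delegating to the
+        # DeepSeekV1 config writer.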
+        config.scoring_func = 'softmax'
+        DeepSeekV1Converter.dump_config(f, config, ggml_type)
+
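+        # Megrez-specific fields appended after the DeepSeekV1 config block:
+        # three little-endian int32 values followed by one float32 ("<iiif").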
+        config_values = [
+            config.experts_shared_frequency,
+            config.n_group,
+            config.topk_group,
+            config.routed_scaling_factor,
+        ]
+        f.write(struct.pack("<iiif", *config_values))
+
+    @staticmethod
+    def get_weight_names(config):
+        weight_names = ["model.embed_tokens.weight",
+                        "model.norm.weight",
+                        "lm_head.weight"]
+        for i in range(config.num_hidden_layers):
+
+            weight_names += [
+                f"model.layers.{i}.self_attn.k_proj.weight",
+                f"model.layers.{i}.self_attn.q_proj.weight",
+                f"model.layers.{i}.self_attn.v_proj.weight",
+                f"model.layers.{i}.self_attn.o_proj.weight",
+            ]
+
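+            # MoE block for layers at index >= first_k_dense_replace that fall on the
+            # moe_layer_freq grid; all other layers use a plain dense MLP (else branch).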
+            if (config.n_routed_experts is not None
+                and (i >= config.first_k_dense_replace)
+                and (i % config.moe_layer_freq == 0)):
+                weight_names += [
+                    f"model.layers.{i}.mlp.gate.e_score_correction_bias",
+                    f"model.layers.{i}.mlp.gate.weight",
+                    f"model.layers.{i}.mlp.shared_experts.gate_proj.weight",
+                    f"model.layers.{i}.mlp.shared_experts.up_proj.weight",
+                    f"model.layers.{i}.mlp.shared_experts.down_proj.weight",
+                ]
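+                # Routed expert tensors are only present on every
+                # experts_shared_frequency-th MoE layer; the remaining MoE layers
+                # carry just the gate and shared-expert weights listed above.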
+                if (i - config.first_k_dense_replace) % config.experts_shared_frequency == 0:
+                    for j in range(config.n_routed_experts):
+                        weight_names += [
+                            f"model.layers.{i}.mlp.experts.{j}.gate_proj.weight",
+                            f"model.layers.{i}.mlp.experts.{j}.up_proj.weight",
+                            f"model.layers.{i}.mlp.experts.{j}.down_proj.weight",
+                        ]
+            else:
+                weight_names += [
+                    f"model.layers.{i}.mlp.gate_proj.weight",
+                    f"model.layers.{i}.mlp.up_proj.weight",
+                    f"model.layers.{i}.mlp.down_proj.weight",
+                ]
+
+            weight_names += [
+                f"model.layers.{i}.input_layernorm.weight",
+                f"model.layers.{i}.post_attention_layernorm.weight",
+            ]
+
+        return weight_names
+
 def convert_grok_1_base(args, vocab, ggml_type):
     def ffn_size(emb_size, widening_factor):
         _ffn_size = int(widening_factor * emb_size) * 2 // 3
@@ -8719,6 +8795,8 @@ def main():
         JanusConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch.endswith('DotsOCRForCausalLM'):
         DotsOCRConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+    elif arch.endswith('MegrezMoeForCausalLM'):
+        MegrezMoEConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'deepseek-r1-distill-qwen3':
         QWen3Converter.MODEL_TYPE = ModelType.DeepSeek_R1_Distill_QWen3
         QWen3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)