@@ -197,6 +197,8 @@ class ModelType(Enum):
 
     Apriel = 0x2400
 
+    ERNIE_MoE = 0x2500
+
     BCE_Embedding = 0x10000100
     BCE_ReRanker = 0x10000101
     BGE_M3 = 0x10000102
@@ -285,6 +287,7 @@ def quantize_q4_1(tensor: torch.Tensor) -> torch.CharTensor:
     tensor = torch.cat((scale.half().view(torch.int8), min_values.half().view(torch.int8), tensor), dim=-1)
     return tensor
 
+@torch.jit.script
 def qkx2_quants(x: torch.Tensor, nmax, rmin, rdelta, nstep: int, use_mad: bool):
     assert x.dim() == 1
     N = x.shape[0]
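
Aside (not part of the patch): `torch.jit.script` compiles a type-stable subset of Python, which is why the hunks below replace plain Python scalars (`0.0`, `0`, `63.0`, ...) with 0-dim tensors: every return path and both arms of a conditional expression must resolve to one static type. A minimal sketch of the pattern, with an illustrative function name:

```python
import torch

@torch.jit.script
def inv_or_zero(max_scale: torch.Tensor) -> torch.Tensor:
    # Both arms must be Tensor under torch.jit.script;
    # `63.0 / max_scale if ... else 0.0` would fail to unify Tensor with float.
    return torch.tensor(63.0) / max_scale if bool(max_scale > 0) else torch.tensor(0.0)
```
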
@@ -297,7 +300,7 @@ def qkx2_quants(x: torch.Tensor, nmax, rmin, rdelta, nstep: int, use_mad: bool):
     if min_x > 0: min_x = torch.tensor(0)
     if min_x == max_x:
         L = torch.zeros(N)
-        return 0.0, -min_x, L
+        return torch.tensor(0.0), -min_x, L
 
     iscale = nmax / (max_x - min_x)
     scale = 1 / iscale
@@ -322,7 +325,7 @@ def qkx2_quants(x: torch.Tensor, nmax, rmin, rdelta, nstep: int, use_mad: bool):
             this_scale = (sum_w * sum_xl - sum_x * sum_l) / D
             this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D
             if this_min > 0:
-                this_min = 0
+                this_min = torch.tensor(0)
                 this_scale = sum_xl / sum_l2
 
             diff = this_scale * l + this_min - x
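
Aside: the `this_scale` / `this_min` update in this hunk is the closed-form solution of a weighted least-squares fit of x ≈ scale·l + min (the same normal equations used by `make_qkx2_quants` in ggml). A self-contained sketch, assuming the `sum_*` names denote the weighted sums the code accumulates:

```python
import torch

def wls_scale_min(x: torch.Tensor, l: torch.Tensor, w: torch.Tensor):
    # Minimize sum(w * (scale * l + mn - x)**2) over (scale, mn) in closed form.
    sum_w  = w.sum()
    sum_x  = (w * x).sum()
    sum_l  = (w * l).sum()
    sum_l2 = (w * l * l).sum()
    sum_xl = (w * x * l).sum()
    D = sum_w * sum_l2 - sum_l * sum_l   # determinant of the 2x2 normal equations
    scale = (sum_w * sum_xl - sum_x * sum_l) / D
    mn    = (sum_l2 * sum_x - sum_l * sum_xl) / D
    return scale, mn
```
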
@@ -335,19 +338,20 @@ def qkx2_quants(x: torch.Tensor, nmax, rmin, rdelta, nstep: int, use_mad: bool):
 
     return scale, -min_x, L
 
-def quantize_q4_k_block(tensor: torch.Tensor) -> torch.CharTensor:
+@torch.jit.script
+def quantize_q4_k_block(tensor: torch.Tensor, GGML_QK_K: int) -> torch.CharTensor:
     assert tensor.shape == (GGML_QK_K,)
     tensor = tensor.view(-1, 32)
 
-    subblocks = [qkx2_quants(tensor[i], 15, -1.0, 0.1, 20, False) for i in range(tensor.shape[0])]
+    subblocks = [qkx2_quants(tensor[i], torch.tensor(15), torch.tensor(-1.0), torch.tensor(0.1), 20, False) for i in range(tensor.shape[0])]
     scale = torch.stack([x[0] for x in subblocks])
     min_x = torch.stack([x[1] for x in subblocks])
 
     max_scale = torch.max(scale)
     max_min = torch.max(min_x)
 
-    inv_scale = 63.0 / max_scale if max_scale > 0 else 0.0
-    inv_min = 64.0 / max_min if max_min > 0 else 0.0
+    inv_scale = torch.tensor(63.0) / max_scale if max_scale > 0 else torch.tensor(0.0)
+    inv_min = torch.tensor(64.0) / max_min if max_min > 0 else torch.tensor(0.0)
 
     ls = (inv_scale * scale).round().clamp(max=63)
     lm = (inv_min * min_x).round().clamp(max=63)
@@ -380,11 +384,12 @@ def quantize_q4_k_block(tensor: torch.Tensor) -> torch.CharTensor:
 
     return r
 
-def quantize_q4_k(tensor: torch.Tensor) -> torch.CharTensor:
+@torch.jit.script
+def quantize_q4_k(tensor: torch.Tensor, GGML_QK_K: int) -> torch.CharTensor:
     # equivalent to quantize_row_q4_K in ggml-quants.c
     assert tensor.shape[tensor.ndim - 1] % GGML_QK_K == 0
     tensor = tensor.view(-1, GGML_QK_K)
-    blocks = [quantize_q4_k_block(tensor[i]) for i in range(tensor.shape[0])]
+    blocks = [quantize_q4_k_block(tensor[i], GGML_QK_K) for i in range(tensor.shape[0])]
     tensor = torch.cat(blocks, dim=-1)
     return tensor
 
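Usage sketch (hedged; `GGML_QK_K` is defined elsewhere in this script, 256 being ggml's usual K-quant super-block size, so the value here is an assumption for illustration):

```python
import torch

GGML_QK_K = 256                             # assumed value, see note above
weights = torch.randn(4, 2 * GGML_QK_K)     # last dim must be a multiple of GGML_QK_K
packed = quantize_q4_k(weights, GGML_QK_K)  # int8 payload, one Q4_K block per super-block
```
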
@@ -411,7 +416,7 @@ def dump_tensor(f, name: str, tensor: torch.Tensor, ggml_type: GGMLType):
         elif ggml_type == GGMLType.Q4_1:
             tensor = quantize_q4_1(tensor)
         elif ggml_type == GGMLType.Q4_K:
-            tensor = quantize_q4_k(tensor)
+            tensor = quantize_q4_k(tensor, GGML_QK_K)
         else:
             raise NotImplementedError(f"Cannot dump tensor of dtype {tensor.dtype}")
     except Exception as e:
@@ -6364,6 +6369,81 @@ def get_weight_names(config):
 
     return weight_names
 
+class ERNIEMoEConverter(BaseConverter):
+    MODEL_TYPE = ModelType.ERNIE_MoE
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        assert not config.use_bias
+        assert len(config.moe_capacity) == 3
+        if config.rope_scaling is not None:
+            assert config.rope_scaling == 1.0, 'rope_scaling must equal 1.0'
+
+        dump_llama_like_config(f, config, ggml_type)
+        config_values = [
+            config.num_key_value_heads,
+            1 if config.tie_word_embeddings else 0,
+            config.moe_num_experts,
+            config.moe_num_shared_experts,
+            config.moe_layer_start_index,
+            config.moe_intermediate_size,
+            config.moe_capacity[0],
+            config.moe_capacity[1],
+            config.moe_capacity[2],
+            config.moe_k,
+            config.moe_layer_interval,
+            1 if config.moe_use_aux_free else 0,
+        ]
+        f.write(struct.pack("i" * len(config_values), *config_values))
+        f.write(struct.pack("<f", config.rope_theta))
+
+    @staticmethod
+    def get_weight_names(config):
+        weight_names = ["model.embed_tokens.weight"]
+        for i in range(config.num_hidden_layers):
+            weight_names += [
+                f"model.layers.{i}.input_layernorm.weight",
+                f"model.layers.{i}.post_attention_layernorm.weight",
+                f"model.layers.{i}.self_attn.k_proj.weight",
+                f"model.layers.{i}.self_attn.o_proj.weight",
+                f"model.layers.{i}.self_attn.q_proj.weight",
+                f"model.layers.{i}.self_attn.v_proj.weight",
+            ]
+
+            if (i >= config.moe_layer_start_index) and ((i + 1) % config.moe_layer_interval == 0):
+                weight_names += [
+                    f"model.layers.{i}.mlp.gate.weight",
+                    f"model.layers.{i}.mlp.shared_experts.gate_proj.weight",
+                    f"model.layers.{i}.mlp.shared_experts.up_proj.weight",
+                    f"model.layers.{i}.mlp.shared_experts.down_proj.weight",
+                ]
+                if config.moe_use_aux_free:
+                    weight_names += [
+                        f"model.layers.{i}.mlp.moe_statics.e_score_correction_bias",
+                    ]
+                for j in range(config.moe_num_experts):
+                    weight_names += [
+                        f"model.layers.{i}.mlp.experts.{j}.gate_proj.weight",
+                        f"model.layers.{i}.mlp.experts.{j}.up_proj.weight",
+                        f"model.layers.{i}.mlp.experts.{j}.down_proj.weight",
+                    ]
+            else:
+                weight_names += [
+                    f"model.layers.{i}.mlp.down_proj.weight",
+                    f"model.layers.{i}.mlp.gate_proj.weight",
+                    f"model.layers.{i}.mlp.up_proj.weight",
+                ]
+
+        weight_names += [
+            "model.norm.weight",
+        ]
+
+        if not config.tie_word_embeddings:
+            weight_names += [
+                "lm_head.weight"
+            ]
+        return weight_names
+
 class KimiVLConverter(BaseConverter):
     MODEL_TYPE = ModelType.KimiVL
 
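For reference, `dump_config` above emits 12 `int32` fields followed by one little-endian `float32` (`rope_theta`) after the common llama-like header. A hedged reader-side sketch (the function name and framing are mine, mirroring only the write order shown in the hunk):

```python
import struct

def read_ernie_moe_config(f):
    # 12 int32 values, in the exact order packed by ERNIEMoEConverter.dump_config
    ints = struct.unpack("i" * 12, f.read(4 * 12))
    (rope_theta,) = struct.unpack("<f", f.read(4))
    return ints, rope_theta
```
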
@@ -7516,6 +7596,8 @@ def main():
         QWen3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'Ernie4_5_ForCausalLM':
         ERNIEDenseConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+    elif arch == 'Ernie4_5_MoeForCausalLM':
+        ERNIEMoEConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'deepseek-r1-distill-qwen3':
         QWen3Converter.MODEL_TYPE = ModelType.DeepSeek_R1_Distill_QWen3
         QWen3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)