@@ -112,14 +112,14 @@ class ModelType(Enum):
112112 Phi4 = 0x531
113113 Phi4_Mini = 0x532
114114
115- Mistral = 0x600
116- Mixtral = 0x601
117- OpenChat = 0x602
118- NeuralBeagle = 0x603
119- Starling = 0x604
120- WizardLMMoE = 0x605
121- Mistral2 = 0x606
122- DeepHermes3Mistral = 0x607
115+ Mistral = 0x600
116+ Mixtral = 0x601
117+ OpenChat = 0x602
118+ NeuralBeagle = 0x603
119+ Starling = 0x604
120+ WizardLMMoE = 0x605
121+ Mistral2 = 0x606
122+ DeepHermes3Mistral = 0x607
123123
124124 QWen = 0x700
125125 QWen2 = 0x710
@@ -243,6 +243,7 @@ class ModelType(Enum):
243243 LlaMA4 = ModelTypeTagChatImageIn + 0x0000001
244244 Gemma3Vis = ModelTypeTagChatImageIn + 0x0000011
245245 DotsOCR = ModelTypeTagChatImageIn + 0x0000020
246+ Mistral3 = ModelTypeTagChatImageIn + 0x0000030
246247
247248 Qwen2Audio = ModelTypeTagChatAudioIn + 0x0000001
248249
@@ -8213,6 +8214,90 @@ def get_weight_names(config):
82138214         ]
82148215         return weight_names
82158216
8217+ class Mistral3Converter(BaseConverter):
8218+     MODEL_TYPE = ModelType.Mistral3
8219+
8220+     @classmethod
8221+     def state_dict_pp(cls, config, state_dict):
8222+         r = {}
8223+         for name in state_dict:
8224+             tensor: torch.Tensor = state_dict[name]
8225+             if name.startswith('language_model.'):
8226+                 name = name.replace('language_model.', '')
8227+                 r[name] = tensor
8228+                 continue
8229+             else:
8230+                 if name.startswith('vision_tower.transformer.'):
8231+                     name = name.replace('vision_tower.transformer.', 'vision_model.')
8232+                     name = name.replace('.attention.', '.attn.')
8233+                     name = name.replace('.feed_forward.', '.mlp.')
8234+                 elif name.startswith('vision_tower.'):
8235+                     name = name.replace('vision_tower.', 'vision_model.')
8236+                 r[name] = tensor
8237+         return r
8238+
8239+     @staticmethod
8240+     def dump_config(f, config, ggml_type):
8241+         vis_config = AttributeDict(config.vision_config)
8242+         assert vis_config.hidden_act == 'silu'
8243+
8244+         txt_config = AttributeDict(config.text_config)
8245+         Mistral3Converter.txt_config = txt_config
8246+
8247+         assert isinstance(txt_config.rope_parameters, dict)
8248+         assert txt_config.rope_parameters['rope_type'] == 'yarn'
8249+
8250+         dump_llama_like_config(f, txt_config, ggml_type)
8251+         config_values = [
8252+             txt_config.num_key_value_heads,
8253+             txt_config.sliding_window if txt_config.sliding_window is not None else -1,
8254+             1 if (txt_config.tie_word_embeddings is not None) and txt_config.tie_word_embeddings else 0,
8255+             txt_config.head_dim or txt_config.hidden_size // txt_config.num_attention_heads
8256+         ]
8257+         f.write(struct.pack("i" * len(config_values), *config_values))
8258+         config_values = [
8259+             txt_config.rope_parameters['beta_fast'],
8260+             txt_config.rope_parameters['beta_slow'],
8261+             txt_config.rope_parameters['factor'],
8262+             txt_config.rope_parameters['llama_4_scaling_beta'],
8263+             txt_config.rope_parameters['mscale'],
8264+             txt_config.rope_parameters['mscale_all_dim'],
8265+             txt_config.rope_parameters['original_max_position_embeddings'],
8266+             txt_config.rope_parameters['rope_theta'],
8267+         ]
8268+         f.write(struct.pack("<ffffffif", *config_values))
8269+
8270+     @staticmethod
8271+     def get_weight_names(config):
8272+         txt_config = AttributeDict(config.text_config)
8273+         vis_config = AttributeDict(config.vision_config)
8274+
8275+         weight_names = Llama32Converter.get_weight_names(txt_config)
8276+
8277+         for i in range(vis_config.num_hidden_layers):
8278+             weight_names += [
8279+                 f"vision_model.layers.{i}.attn.q_proj.weight",
8280+                 f"vision_model.layers.{i}.attn.k_proj.weight",
8281+                 f"vision_model.layers.{i}.attn.v_proj.weight",
8282+                 f"vision_model.layers.{i}.attn.o_proj.weight",
8283+                 f"vision_model.layers.{i}.mlp.up_proj.weight",
8284+                 f"vision_model.layers.{i}.mlp.down_proj.weight",
8285+                 f"vision_model.layers.{i}.mlp.gate_proj.weight",
8286+                 f"vision_model.layers.{i}.attention_norm.weight",
8287+                 f"vision_model.layers.{i}.ffn_norm.weight",
8288+             ]
8289+
8290+         weight_names += [
8291+             "multi_modal_projector.linear_1.weight",
8292+             "multi_modal_projector.linear_2.weight",
8293+             "multi_modal_projector.norm.weight",
8294+             "multi_modal_projector.patch_merger.merging_layer.weight",
8295+             "vision_model.ln_pre.weight",
8296+             "vision_model.patch_conv.weight",
8297+         ]
8298+
8299+         return weight_names
8300+
82168301 def convert_grok_1_base(args, vocab, ggml_type):
82178302     def ffn_size(emb_size, widening_factor):
82188303         _ffn_size = int(widening_factor * emb_size) * 2 // 3
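
The second struct.pack call in Mistral3Converter.dump_config above fixes the on-disk layout of the YaRN rope parameters: six little-endian float32 fields, one int32 (original_max_position_embeddings), then a final float32 (rope_theta). A minimal standalone sketch of that 32-byte record, using made-up parameter values:

import struct

# Illustrative values only; the real ones come from text_config.rope_parameters.
rope = {
    'beta_fast': 32.0, 'beta_slow': 1.0, 'factor': 4.0,
    'llama_4_scaling_beta': 0.0, 'mscale': 1.0, 'mscale_all_dim': 0.0,
    'original_max_position_embeddings': 32768, 'rope_theta': 1000000.0,
}
blob = struct.pack("<ffffffif",
                   rope['beta_fast'], rope['beta_slow'], rope['factor'],
                   rope['llama_4_scaling_beta'], rope['mscale'], rope['mscale_all_dim'],
                   rope['original_max_position_embeddings'], rope['rope_theta'])
assert len(blob) == 6 * 4 + 4 + 4  # 32 bytes: six floats, one int, one float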
@@ -8352,7 +8437,7 @@ def load_config(path: Path, config_fn: str) -> Any:
83528437 def load_some_info(r: dict, path: Path, prefix: str = '') -> None:
83538438     if path.is_dir():
83548439         globs = ["config.json", "configuration.json", "special_tokens_map.json",
8355-                  "tokenizer_config.json", "preprocessor_config.json"]
8440+                  "tokenizer_config.json", "preprocessor_config.json", "processor_config.json"]
83568441         for glob in globs:
83578442             files = list(path.glob(glob))
83588443             for f in files:
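
In load_some_info above, each entry in globs is matched against files directly inside the checkpoint directory, so a processor_config.json shipped with the model is now picked up as well. A minimal sketch of that lookup, assuming a hypothetical local checkpoint directory name:

from pathlib import Path

ckpt = Path("Mistral-Small-3.1-24B-Instruct")  # hypothetical local directory
globs = ["config.json", "configuration.json", "special_tokens_map.json",
         "tokenizer_config.json", "preprocessor_config.json", "processor_config.json"]
for pattern in globs:
    for f in ckpt.glob(pattern):
        print("would record", f.name)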
@@ -8837,6 +8922,8 @@ def main():
88378922         MegrezMoEConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
88388923     elif arch == 'OuroForCausalLM':
88398924         OuroConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
8925+     elif arch == 'Mistral3ForConditionalGeneration':
8926+         Mistral3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
88408927     elif arch == 'deepseek-r1-distill-qwen3':
88418928         QWen3Converter.MODEL_TYPE = ModelType.DeepSeek_R1_Distill_QWen3
88428929         QWen3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)