Skip to content

Commit d21c3bb

Browse files
committed
add ministral-3
1 parent 3c2d40a commit d21c3bb

File tree

10 files changed

+885
-18
lines changed

10 files changed

+885
-18
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ LittleAcademia[<a href="https://github.com/foldl/little-academia" style="text-
3333

3434
**What's New:**
3535

36+
* 2025-12-08: Mistral-3 (a.k.a. Ministral-3)
3637
* 2025-11-06: Maya1
3738
* 2025-11-03: Ouro
3839
* 2025-10-31: Megrez2-3x7B-A3B

convert.py

Lines changed: 96 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -112,14 +112,14 @@ class ModelType(Enum):
112112
Phi4 = 0x531
113113
Phi4_Mini = 0x532
114114

115-
Mistral = 0x600
116-
Mixtral = 0x601
117-
OpenChat = 0x602
118-
NeuralBeagle = 0x603
119-
Starling = 0x604
120-
WizardLMMoE = 0x605
121-
Mistral2 = 0x606
122-
DeepHermes3Mistral = 0x607
115+
Mistral = 0x600
116+
Mixtral = 0x601
117+
OpenChat = 0x602
118+
NeuralBeagle = 0x603
119+
Starling = 0x604
120+
WizardLMMoE = 0x605
121+
Mistral2 = 0x606
122+
DeepHermes3Mistral = 0x607
123123

124124
QWen = 0x700
125125
QWen2 = 0x710
@@ -243,6 +243,7 @@ class ModelType(Enum):
243243
LlaMA4 = ModelTypeTagChatImageIn + 0x0000001
244244
Gemma3Vis = ModelTypeTagChatImageIn + 0x0000011
245245
DotsOCR = ModelTypeTagChatImageIn + 0x0000020
246+
Mistral3 = ModelTypeTagChatImageIn + 0x0000030
246247

247248
Qwen2Audio = ModelTypeTagChatAudioIn + 0x0000001
248249

@@ -8213,6 +8214,90 @@ def get_weight_names(config):
82138214
]
82148215
return weight_names
82158216

8217+
class Mistral3Converter(BaseConverter):
    """Converter for `Mistral3ForConditionalGeneration` checkpoints.

    The checkpoint holds a text model (tensors prefixed `language_model.`),
    a vision tower (`vision_tower.`) and a multi-modal projector. This class
    renames the tensors, dumps the text/RoPE configuration and lists the
    weights to export.
    """
    MODEL_TYPE = ModelType.Mistral3

    @classmethod
    def state_dict_pp(cls, config, state_dict):
        """Return a new dict mapping renamed tensor names to their tensors.

        Renaming rules:
          * `language_model.X`            -> `X`
          * `vision_tower.transformer.X`  -> `vision_model.X`, additionally
            `.attention.` -> `.attn.` and `.feed_forward.` -> `.mlp.`
          * `vision_tower.X`              -> `vision_model.X`
          * any other name is kept unchanged.

        Only the *leading* prefix is stripped/swapped; a later occurrence of
        the same text inside the name is left alone.
        """
        lm_prefix = 'language_model.'
        vit_layers_prefix = 'vision_tower.transformer.'
        vit_prefix = 'vision_tower.'
        vit_target = 'vision_model.'

        r = {}
        for name in state_dict:
            tensor = state_dict[name]
            if name.startswith(lm_prefix):
                name = name[len(lm_prefix):]
            elif name.startswith(vit_layers_prefix):
                # Must be tested before the shorter 'vision_tower.' prefix.
                name = vit_target + name[len(vit_layers_prefix):]
                name = name.replace('.attention.', '.attn.')
                name = name.replace('.feed_forward.', '.mlp.')
            elif name.startswith(vit_prefix):
                name = vit_target + name[len(vit_prefix):]
            r[name] = tensor
        return r

    @staticmethod
    def dump_config(f, config, ggml_type):
        """Write the model configuration to the binary stream `f`.

        Layout, in order:
          1. whatever `dump_llama_like_config` writes for the text config;
          2. four ints packed with format "iiii" (native byte order):
             num_key_value_heads, sliding_window (-1 when None),
             tie_word_embeddings flag (0/1), head_dim;
          3. the RoPE block packed with "<ffffffif" (little-endian):
             beta_fast, beta_slow, factor, llama_4_scaling_beta, mscale,
             mscale_all_dim, original_max_position_embeddings (int),
             rope_theta.

        Raises AssertionError when the vision activation is not 'silu' or
        the text RoPE parameters are not a dict of type 'yarn'.
        """
        vis_config = AttributeDict(config.vision_config)
        assert vis_config.hidden_act == 'silu', \
            f"unsupported vision hidden_act: {vis_config.hidden_act}"

        txt_config = AttributeDict(config.text_config)
        # Stashed on the class; presumably read by other conversion steps
        # outside this block -- NOTE(review): confirm against callers.
        Mistral3Converter.txt_config = txt_config

        rope = txt_config.rope_parameters
        assert isinstance(rope, dict), "text_config.rope_parameters must be a dict"
        assert rope['rope_type'] == 'yarn', \
            f"unsupported rope_type: {rope['rope_type']}"

        dump_llama_like_config(f, txt_config, ggml_type)

        # head_dim falls back to the per-head slice of hidden_size when the
        # config value is missing or zero.
        head_dim = txt_config.head_dim or txt_config.hidden_size // txt_config.num_attention_heads
        config_values = [
            txt_config.num_key_value_heads,
            txt_config.sliding_window if txt_config.sliding_window is not None else -1,
            1 if txt_config.tie_word_embeddings else 0,
            head_dim,
        ]
        f.write(struct.pack("i" * len(config_values), *config_values))

        # Order must match the "<ffffffif" format: six floats, one int, one float.
        config_values = [
            rope['beta_fast'],
            rope['beta_slow'],
            rope['factor'],
            rope['llama_4_scaling_beta'],
            rope['mscale'],
            rope['mscale_all_dim'],
            rope['original_max_position_embeddings'],
            rope['rope_theta'],
        ]
        f.write(struct.pack("<ffffffif", *config_values))

    @staticmethod
    def get_weight_names(config):
        """Return the ordered list of tensor names to export.

        Starts from `Llama32Converter.get_weight_names` for the text config,
        then appends the per-layer vision weights (one group per
        `vision_config.num_hidden_layers`), and finally the multi-modal
        projector and the remaining vision-model weights.
        """
        txt_config = AttributeDict(config.text_config)
        vis_config = AttributeDict(config.vision_config)

        weight_names = Llama32Converter.get_weight_names(txt_config)

        for i in range(vis_config.num_hidden_layers):
            layer = f"vision_model.layers.{i}"
            weight_names += [
                f"{layer}.attn.q_proj.weight",
                f"{layer}.attn.k_proj.weight",
                f"{layer}.attn.v_proj.weight",
                f"{layer}.attn.o_proj.weight",
                f"{layer}.mlp.up_proj.weight",
                f"{layer}.mlp.down_proj.weight",
                f"{layer}.mlp.gate_proj.weight",
                f"{layer}.attention_norm.weight",
                f"{layer}.ffn_norm.weight",
            ]

        weight_names += [
            "multi_modal_projector.linear_1.weight",
            "multi_modal_projector.linear_2.weight",
            "multi_modal_projector.norm.weight",
            "multi_modal_projector.patch_merger.merging_layer.weight",
            "vision_model.ln_pre.weight",
            "vision_model.patch_conv.weight",
        ]

        return weight_names
8300+
82168301
def convert_grok_1_base(args, vocab, ggml_type):
82178302
def ffn_size(emb_size, widening_factor):
82188303
_ffn_size = int(widening_factor * emb_size) * 2 // 3
@@ -8352,7 +8437,7 @@ def load_config(path: Path, config_fn: str) -> Any:
83528437
def load_some_info(r: dict, path: Path, prefix: str = '') -> None:
83538438
if path.is_dir():
83548439
globs = ["config.json", "configuration.json", "special_tokens_map.json",
8355-
"tokenizer_config.json", "preprocessor_config.json"]
8440+
"tokenizer_config.json", "preprocessor_config.json", "processor_config.json"]
83568441
for glob in globs:
83578442
files = list(path.glob(glob))
83588443
for f in files:
@@ -8837,6 +8922,8 @@ def main():
88378922
MegrezMoEConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
88388923
elif arch == 'OuroForCausalLM':
88398924
OuroConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
8925+
elif arch == 'Mistral3ForConditionalGeneration':
8926+
Mistral3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
88408927
elif arch == 'deepseek-r1-distill-qwen3':
88418928
QWen3Converter.MODEL_TYPE = ModelType.DeepSeek_R1_Distill_QWen3
88428929
QWen3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)

docs/models.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,12 @@ Please use `--format completion` for these models.
394394
* `native_resolution`: use native resolution or not, default: `false` (This seems sensitive to quantization, so defaults to `false`).
395395
* `fps`: Default 1.0.
396396

397+
* Mistral (`Mistral3ForConditionalGeneration`)
398+
* [x] Ministral-3: [3B-Instruct-2512](https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512-BF16/tree/e904b5a798e9397c0fd04e063a2aa90355653ffe),
399+
[3B-Reasoning-2512](https://huggingface.co/mistralai/Ministral-3-3B-Reasoning-2512/tree/039f888eb54340b5e9870721f3c249fbc809b8e8),
400+
[8B-Instruct-2512](https://huggingface.co/mistralai/Ministral-3-8B-Instruct-2512-BF16/tree/bde2b3370dbf8ad77ceab25a5a43bc9013cda350),
401+
[8B-Reasoning-2512](https://huggingface.co/mistralai/Ministral-3-8B-Reasoning-2512/tree/f511871f6402ba68dadfb42a94a7a7e13499fd65)
402+
397403
* Qwen (`Qwen2AudioForConditionalGeneration`, `Qwen2_5_VLForConditionalGeneration`)
398404
* [x] Qwen2-Audio: [7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct/tree/0a095220c30b7b31434169c3086508ef3ea5bf0a)
399405
* [x] Qwen2.5-VL: [3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct/tree/66285546d2b821cf421d4f5eb2576359d3770cd3), [7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/tree/cc594898137f460bfe9f0759e9844b3ce807cfb5)

0 commit comments

Comments
 (0)