model: add Mellum architecture (#23966)

Xarbirus · CISC · web-flow · commit 4fb16eccce5e · 2026-06-02T22:11:12.000+03:00
* model: support for Mellum architecture

* model: improve mellum.py formatting

* model: improve mellum.py formatting once again

* deps: downgrade transformers to 4.57.6 (to fix CI)

* deps: remove huggingface_hub dependency

* deps: remove huggingface_hub from test requirements

---------

Co-authored-by: Sigbjørn Skjæret &lt;sigbjorn.skjaeret@scala.com&gt;
diff --git a/README.md b/README.md
@@ -143,6 +143,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
 - [x] [Hunyuan models](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7)
 - [x] [BailingMoeV2 (Ring/Ling 2.0) models](https://huggingface.co/collections/inclusionAI/ling-v2-68bf1dd2fc34c306c1fa6f86)
+- [x] [Mellum models](https://huggingface.co/JetBrains/models?search=mellum)
 
 #### Multimodal
 
diff --git a/conversion/__init__.py b/conversion/__init__.py
@@ -135,6 +135,7 @@
     "Mamba2ForCausalLM": "mamba",
     "MambaForCausalLM": "mamba",
     "MambaLMHeadModel": "mamba",
+    "MellumForCausalLM": "mellum",
     "MiMoV2FlashForCausalLM": "mimo",
     "MiMoV2ForCausalLM": "mimo",
     "MiniCPM3ForCausalLM": "minicpm",
diff --git a/conversion/base.py b/conversion/base.py
@@ -1663,6 +1663,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "789696f5946cc0fc59371f39f6097cafed196b3acded6140432f26bbb1ae1669":
             # ref: https://huggingface.co/ibm-granite/granite-embedding-311m-multilingual-r2
             res = "granite-embed-multi-311m"
+        if chkhsh == "9dcf830ee9990cdbf78cc523a5f7bd9ad8f3f9890c2d3581d2785ad10f07049d":
+            # ref: https://huggingface.co/JetBrains/Mellum2-12B-A2.5B-Base
+            res = "mellum2"
 
         if res is None:
             logger.warning("\n")
diff --git a/conversion/mellum.py b/conversion/mellum.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+from typing import Iterable, TYPE_CHECKING
+
+import torch
+
+if TYPE_CHECKING:
+    from torch import Tensor
+
+from .base import ModelBase, TextModel, gguf, logger
+
+
+@ModelBase.register("MellumForCausalLM")
+class MellumModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.MELLUM
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
+
+        use_sliding_window = self.hparams.get("use_sliding_window")
+        sliding_window = self.hparams.get("sliding_window")
+        if (use_sliding_window is True or use_sliding_window is None) and sliding_window is not None:
+            self.gguf_writer.add_sliding_window(sliding_window)
+            logger.info(f"gguf: sliding window = {sliding_window}")
+            self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in self.hparams["layer_types"]])
+            logger.info(f"gguf: sliding window pattern length = {len(self.hparams['layer_types'])}")
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.find("experts") != -1:
+            n_experts = self.find_hparam(["num_local_experts", "num_experts"])
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    yield from super().modify_tensors(data_torch, merged_name, bid)
+                return
+            else:
+                return
+
+        yield from super().modify_tensors(data_torch, name, bid)
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
@@ -160,6 +160,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "minicpm5",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM5-1B"},
     {"name": "granite-embed-multi-97m", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-embedding-97m-multilingual-r2", },
     {"name": "granite-embed-multi-311m", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-embedding-311m-multilingual-r2", },
+    {"name": "mellum2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum2-12B-A2.5B-Base"},
 ]
 
 # some models are known to be broken upstream, so we will skip them as exceptions
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
@@ -510,6 +510,7 @@ class MODEL_ARCH(IntEnum):
     MAINCODER        = auto()
     KIMI_LINEAR      = auto()
     TALKIE           = auto()
+    MELLUM           = auto()
 
 
 class VISION_PROJECTOR_TYPE(IntEnum):
@@ -1030,6 +1031,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.MAINCODER:        "maincoder",
     MODEL_ARCH.KIMI_LINEAR:      "kimi-linear",
     MODEL_ARCH.TALKIE:           "talkie",
+    MODEL_ARCH.MELLUM:           "mellum",
 }
 
 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -4093,6 +4095,23 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_UP,
         MODEL_TENSOR.LAYER_OUT_SCALE,
     ],
+    MODEL_ARCH.MELLUM: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
     # TODO
 }
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -10,7 +10,7 @@ requires-python = '>=3.10,<3.15'
 dependencies = [
     'numpy (>=1.26.4,<3.0.0)',
     'sentencepiece (>=0.1.98,<0.3.0)',
-    'transformers (==5.5.1)',
+    'transformers (==4.57.6)',
     'protobuf (>=4.21.0,<5.0.0)',
     'torch (>=2.6.0,<3.0.0)',
     'gguf @ ./gguf-py',
diff --git a/requirements/requirements-convert_legacy_llama.txt b/requirements/requirements-convert_legacy_llama.txt
@@ -1,7 +1,7 @@
 numpy~=1.26.4
 sentencepiece>=0.1.98,<0.3.0
 
-transformers==5.5.1
+transformers==4.57.6
 
 gguf>=0.1.0
 protobuf>=4.21.0,<5.0.0
diff --git a/requirements/requirements-tool_bench.txt b/requirements/requirements-tool_bench.txt
@@ -1,6 +1,5 @@
 aiohttp~=3.9.3
 pytest~=8.3.3
-huggingface_hub>=1.5.0,<2.0
 matplotlib~=3.10.0
 numpy~=1.26.4
 openai~=2.14.0
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
@@ -135,6 +135,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_MAINCODER,        "maincoder"        },
     { LLM_ARCH_KIMI_LINEAR,      "kimi-linear"      },
     { LLM_ARCH_TALKIE,           "talkie"           },
+    { LLM_ARCH_MELLUM,           "mellum"           },
     { LLM_ARCH_UNKNOWN,          "(unknown)"        },
 };
 
diff --git a/src/llama-arch.h b/src/llama-arch.h
@@ -139,6 +139,7 @@ enum llm_arch {
     LLM_ARCH_MAINCODER,
     LLM_ARCH_KIMI_LINEAR,
     LLM_ARCH_TALKIE,
+    LLM_ARCH_MELLUM,
     LLM_ARCH_UNKNOWN,
 };
 
diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp
@@ -29,6 +29,7 @@ bool llama_model_saver_supports_arch(llm_arch arch) {
         case LLM_ARCH_APERTUS:
         case LLM_ARCH_MIMO2:
         case LLM_ARCH_STEP35:
+        case LLM_ARCH_MELLUM:
             return false;
         default:
             return true;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
@@ -81,6 +81,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
             return new llama_model_mpt(params);
         case LLM_ARCH_STABLELM:
             return new llama_model_stablelm(params);
+        case LLM_ARCH_MELLUM:
+            return new llama_model_mellum(params);
         case LLM_ARCH_QWEN:
             return new llama_model_qwen(params);
         case LLM_ARCH_QWEN2:
@@ -764,6 +766,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_A13B:          return "A13B";
         case LLM_TYPE_7B_A1B:        return "7B.A1B";
         case LLM_TYPE_8B_A1B:        return "8B.A1B";
+        case LLM_TYPE_12B_A2_5B:     return "12B.A2.5B";
         case LLM_TYPE_16B_A1B:       return "16B.A1B";
         case LLM_TYPE_21B_A3B:       return "21B.A3B";
         case LLM_TYPE_24B_A2B:       return "24B.A2B";
@@ -1816,7 +1819,11 @@ void llama_model::print_info() const {
             LLAMA_LOG_INFO("%s: n_ff_shexp            = %d\n",     __func__, hparams.n_ff_shexp);
         }
 
-        if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
+        if (arch == LLM_ARCH_MELLUM ||
+                arch == LLM_ARCH_QWEN3MOE ||
+                arch == LLM_ARCH_OPENAI_MOE ||
+                arch == LLM_ARCH_QWEN3VLMOE ||
+                arch == LLM_ARCH_RND1) {
             LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
         }
 
@@ -2404,6 +2411,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_MIMO2:
         case LLM_ARCH_STEP35:
         case LLM_ARCH_TALKIE:
+        case LLM_ARCH_MELLUM:
             return LLAMA_ROPE_TYPE_NEOX;
 
         case LLM_ARCH_QWEN2VL:
diff --git a/src/llama-model.h b/src/llama-model.h
@@ -116,6 +116,7 @@ enum llm_type {
     LLM_TYPE_A13B,
     LLM_TYPE_7B_A1B,
     LLM_TYPE_8B_A1B, // lfm2moe
+    LLM_TYPE_12B_A2_5B,
     LLM_TYPE_16B_A1B,
     LLM_TYPE_21B_A3B, // Ernie MoE small
     LLM_TYPE_24B_A2B, // lfm2moe
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
@@ -353,6 +353,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
             case LLAMA_VOCAB_PRE_TYPE_EXAONE:
             case LLAMA_VOCAB_PRE_TYPE_MINERVA:
+            case LLAMA_VOCAB_PRE_TYPE_MELLUM2:
                 regex_exprs = {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -2325,6 +2326,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "solar-open") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
                 clean_spaces = false;
+            } else if (
+                tokenizer_pre == "mellum2") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_MELLUM2;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
@@ -63,6 +63,7 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_MINICPM5          = 52,
     LLAMA_VOCAB_PRE_TYPE_WHITESPACE        = 53,
     LLAMA_VOCAB_PRE_TYPE_GRANITE_EMB_MULTI = 54,
+    LLAMA_VOCAB_PRE_TYPE_MELLUM2           = 55,
 };
 
 struct LLM_KV;
diff --git a/src/models/mellum.cpp b/src/models/mellum.cpp
diff --git a/src/models/models.h b/src/models/models.h
diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp
diff --git a/tools/server/tests/requirements.txt b/tools/server/tests/requirements.txt

Original file line number	Diff line number	Diff line change
`@@ -160,6 +160,7 @@ class TOKENIZER_TYPE(IntEnum):`
`160`	`160`	`{"name": "minicpm5", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM5-1B"},`
`161`	`161`	`{"name": "granite-embed-multi-97m", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-embedding-97m-multilingual-r2", },`
`162`	`162`	`{"name": "granite-embed-multi-311m", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-embedding-311m-multilingual-r2", },`
	`163`	`+ {"name": "mellum2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum2-12B-A2.5B-Base"},`
`163`	`164`	`]`
`164`	`165`
`165`	`166`	`# some models are known to be broken upstream, so we will skip them as exceptions`