@@ -160,8 +160,6 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
                 self.ftype = gguf.LlamaFileType.MOSTLY_F16
                 logger.info("heuristics unable to detect tensor dtype, defaulting to --outtype f16")
 
-        self.dequant_model()
-
         # Configure GGUF Writer
         self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
                                            split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
@@ -527,6 +525,8 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         return ()
 
     def prepare_tensors(self):
+        self.dequant_model()
+
         # Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
         if self.tensor_map.mapping:
             max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
@@ -1815,7 +1815,7 @@ class MmprojModel(ModelBase):
     preprocessor_config: dict[str, Any]
     global_config: dict[str, Any]
 
-    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers"]
+    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers", "vt_num_hidden_layers"]
 
     has_vision_encoder: bool = True  # by default
     has_audio_encoder: bool = False
@@ -1870,7 +1870,15 @@ def __init__(self, *args, **kwargs):
         preprocessor_config_path = self.dir_model / "preprocessor_config.json"
         if preprocessor_config_path.is_file():
             with open(preprocessor_config_path, "r", encoding="utf-8") as f:
-                self.preprocessor_config = json.load(f)
+                cfg = json.load(f)
+                # move media_proc_cfg to root level for compat
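+                # (downstream code reads keys like image_mean / in_patch_limit from the root)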
+                if "media_proc_cfg" in cfg:
+                    cfg = {
+                        **cfg,
+                        **cfg["media_proc_cfg"],
+                    }
+                # merge configs
+                self.preprocessor_config = {**self.preprocessor_config, **cfg}
 
         # prefer processor_config.json if possible
         processor_config_path = self.dir_model / "processor_config.json"
@@ -1919,10 +1927,10 @@ def set_gguf_parameters(self):
             self.image_size = self.find_vparam(["image_size"])
             self.gguf_writer.add_vision_image_size(self.image_size)
             self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
-            self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
-            self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
+            self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "vt_hidden_size"]))
+            self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size", "vt_intermediate_size"]))
             self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
-            self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads"]))
+            self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "vt_num_attention_heads"]))
 
             # preprocessor config
             image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
@@ -7695,6 +7703,7 @@ def prepare_tensors(self):
     "DeepseekV2ForCausalLM",
     "DeepseekV3ForCausalLM",
     "KimiVLForConditionalGeneration",
+    "KimiK25ForConditionalGeneration",
     "YoutuForCausalLM",
     "YoutuVLForConditionalGeneration",
)
@@ -7813,8 +7822,8 @@ def set_gguf_parameters(self):
     _experts: list[dict[str, Tensor]] | None = None
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # skip vision tensors and remove "language_model." for Kimi-VL
-        if "vision_tower" in name or "multi_modal_projector" in name:
+        # skip vision tensors and remove "language_model." for Kimi-VL and Kimi-K2.5
+        if "vision_tower" in name or "multi_modal_projector" in name or "mm_projector" in name:
             return
         if name.startswith("siglip2.") or name.startswith("merger."):
             return
@@ -11176,6 +11185,103 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("KimiK25ForConditionalGeneration")
+class KimiK25Model(MmprojModel):
+    """Kimi-K2.5 with MoonViT3d vision encoder"""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        assert self.hparams_vision is not None, "Kimi-K2.5 requires vision_config in model config"
+
+        self.merge_kernel_size = tuple(self.hparams_vision.get("merge_kernel_size", [2, 2]))
+        self.patch_size = self.hparams_vision.get("patch_size", 14)
+
+        # Set image_size for compatibility with base class
+        # Use position embedding dimensions as image_size reference
+        pos_emb_h = self.hparams_vision.get("init_pos_emb_height", 64)
+        self.hparams_vision["image_size"] = pos_emb_h * self.patch_size
+    def set_gguf_parameters(self):
+        # Base class MmprojModel.set_gguf_parameters() already writes:
+        # - vision_block_count, vision_head_count, vision_embedding_length
+        # - vision_feed_forward_length, vision_patch_size, image_mean, image_std
+        # via find_vparam() which handles the vt_* prefixed keys in Kimi-K2.5's config
+        super().set_gguf_parameters()
+        assert self.hparams_vision is not None
+
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIK25)
+
+        # Position embedding parameters (for interpolation)
+        self.gguf_writer.add_uint32("vision.pos_emb_height", self.hparams_vision.get("init_pos_emb_height", 64))
+        self.gguf_writer.add_uint32("vision.pos_emb_width", self.hparams_vision.get("init_pos_emb_width", 64))
+        self.gguf_writer.add_uint32("vision.pos_emb_time", self.hparams_vision.get("init_pos_emb_time", 4))
+
+        # Projector parameters
+        self.gguf_writer.add_vision_use_gelu(self.hparams_vision.get("projector_hidden_act", "gelu") == "gelu")
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("projector_ln_eps", 1e-5))
+        self.gguf_writer.add_vision_projector_scale_factor(self.merge_kernel_size[0])
+
+        # Image size limits
+        # Note: in_patch_limit is for images, in_patch_limit_each_frame is for video (not supported yet)
+        in_patch_limit = self.preprocessor_config.get("in_patch_limit", 16384)
+        min_patches = 8  # reasonable minimum
+        pixels_per_patch = self.patch_size ** 2
+        self.gguf_writer.add_vision_min_pixels(min_patches * pixels_per_patch)
+        self.gguf_writer.add_vision_max_pixels(in_patch_limit * pixels_per_patch)
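+        # With the defaults above (patch_size=14, in_patch_limit=16384) this yields
+        # min_pixels = 8 * 196 = 1568 and max_pixels = 16384 * 196 = 3211264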
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int) -> Tensor:
+        out_dim, in_dim = weights.shape
+        head_dim = out_dim // n_head
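+        # Within each head, rows come as 2-row pairs alternating between the two RoPE
+        # halves; regroup them so each half is contiguous (interleaved → split layout)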
+        w = weights.reshape(n_head, head_dim // 4, 2, 2, in_dim)
+        w = w.permute(0, 2, 1, 3, 4)
+        return w.reshape(out_dim, in_dim)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Only process vision and projector tensors
+        is_vision = any(x in name for x in ["vision_tower", "mm_projector"])
+
+        if not is_vision:
+            return
+
+        assert self.hparams_vision is not None
+        n_head = self.hparams_vision.get("num_attention_heads", 16)
+
+        # Permute Q/K weights/biases from interleaved to split RoPE format
+        # This allows using build_rope_2d at runtime without post-permutation.
+        if "wqkv" in name:
+            out_dim = data_torch.shape[0]
+            qkv_dim = out_dim // 3
+            head_dim = qkv_dim // n_head
+
+            if "weight" in name:
+                wq, wk, wv = data_torch[:qkv_dim, :], data_torch[qkv_dim:2 * qkv_dim, :], data_torch[2 * qkv_dim:, :]
+                wq = self.permute(wq, n_head)
+                wk = self.permute(wk, n_head)
+                data_torch = torch.cat([wq, wk, wv], dim=0)
+            elif "bias" in name:
+                bq, bk, bv = data_torch[:qkv_dim], data_torch[qkv_dim:2 * qkv_dim], data_torch[2 * qkv_dim:]
+                bq = bq.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1)
+                bk = bk.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1)
+                data_torch = torch.cat([bq, bk, bv], dim=0)
+
+        # Temporal embeddings: (T, 1, C) → (T, C)
+        if "pos_emb.time_weight" in name:
+            T, _, C = data_torch.shape
+            data_torch = data_torch.reshape(T, C)
+
+        # PatchMergerMLP tensor name mapping
+        # proj.0.weight → proj.linear_1.weight
+        # proj.2.weight → proj.linear_2.weight
+        if "mm_projector.proj.0." in name:
+            name = name.replace(".proj.0.", ".proj.linear_1.")
+        elif "mm_projector.proj.2." in name:
+            name = name.replace(".proj.2.", ".proj.linear_2.")
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("CogVLMForCausalLM")
 class CogVLMVisionModel(MmprojModel):
 