@@ -2219,10 +2219,10 @@ def set_gguf_parameters(self):
22192219 self.image_size = self.find_vparam(["image_size"])
22202220 self.gguf_writer.add_vision_image_size(self.image_size)
22212221 self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
2222- self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "vt_hidden_size"]))
2222+ self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "width", " vt_hidden_size"]))
22232223 self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size", "vt_intermediate_size"]))
22242224 self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
2225- self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "vt_num_attention_heads"]))
2225+ self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "heads", " vt_num_attention_heads"]))
22262226
22272227 # preprocessor config
22282228 image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
@@ -4949,6 +4949,73 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
49494949 yield from super().modify_tensors(data_torch, name, bid)
49504950
49514951
@ModelBase.register("StepVLForConditionalGeneration")
class Step3VLVisionModel(MmprojModel):
    """Multimodal-projector (mmproj) converter for Step3-VL checkpoints.

    Fills in vision hyper-parameters the checkpoint config may omit, writes
    the Step3-VL specific GGUF metadata, and renames vision / projector
    tensors.  Tensors whose names start with ``model.`` or ``lm_head.`` are
    skipped by this class.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the converter and backfill missing vision/preprocessor settings.

        Raises:
            ValueError: if ``intermediate_size`` has to be derived but neither
                ``hidden_size`` nor ``width`` holds a positive value.
        """
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None

        # Derive the feed-forward width when the vision config does not provide one.
        if not self.hparams_vision.get("intermediate_size"):
            hidden_size = self.hparams_vision.get("hidden_size") or self.hparams_vision.get("width") or 0
            if hidden_size <= 0:
                # Explicit raise instead of `assert`, so the check survives `python -O`.
                raise ValueError("Step3-VL vision config needs a positive 'hidden_size' or 'width' to derive 'intermediate_size'")
            # Fallback ratio 8960 / 1536 -- presumably the Step3-VL-10B FFN/hidden widths; TODO confirm.
            mlp_ratio = float(self.hparams_vision.get("mlp_ratio", 8960 / 1536))
            self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio))

        # Only fill in normalisation statistics the preprocessor config does not already define.
        self.preprocessor_config.setdefault("image_mean", list(_MISTRAL_COMMON_DATASET_MEAN))
        self.preprocessor_config.setdefault("image_std", list(_MISTRAL_COMMON_DATASET_STD))

    def set_gguf_parameters(self):
        """Write the common vision metadata, then the Step3-VL specific keys.

        Raises:
            ValueError: if the (projector stride, image size, hidden size,
                layer count) tuple differs from ``(2, 728, 1536, 47)``.
        """
        super().set_gguf_parameters()
        assert self.hparams_vision is not None

        projector_stride = int(self.global_config.get("understand_projector_stride", -1))
        image_size = int(self.hparams_vision.get("image_size", -1))
        hidden_size = int(self.hparams_vision.get("hidden_size", self.hparams_vision.get("width", -1)))
        num_layers = int(self.hparams_vision.get("num_hidden_layers", self.hparams_vision.get("layers", -1)))

        # Explicit raise instead of `assert`: the guard must not vanish under `python -O`.
        if (projector_stride, image_size, hidden_size, num_layers) != (2, 728, 1536, 47):
            raise ValueError("current Step3-VL conversion path is only validated for Step3-VL-10B")

        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.STEP3VL)
        self.gguf_writer.add_vision_attention_layernorm_eps(float(self.hparams_vision.get("layer_norm_eps", 1e-5)))
        self.gguf_writer.add_vision_projector_scale_factor(projector_stride ** 2)
        # 3024 max resize comes from step3-vl-10b processing_step3.py.
        self.gguf_writer.add_vision_preproc_image_size(3024)

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        """Force F32 for tensors whose GGUF name contains ``.position_embd.``; otherwise defer to the parent."""
        if ".position_embd." in new_name:
            return gguf.GGMLQuantizationType.F32
        return super().tensor_force_quant(name, new_name, bid, n_dims)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Rename Step3-VL vision/projector tensors and yield ``(new_name, tensor)`` pairs.

        * ``model.*`` / ``lm_head.*`` tensors yield nothing.
        * ``vision_model.vit_downsampler<N>.{weight,bias}`` is emitted as
          ``V_MMPROJ`` with index ``N - 1``.
        * ``vit_large_projector.weight`` is emitted as ``V_MMPROJ_FC``.
        * Remaining ``vision_model.*`` names are normalised and handed to the
          parent implementation.

        Raises:
            ValueError: for a ``vision_model.vit_downsampler*`` name that does
                not have exactly the expected form.
        """
        # Text-model tensors are not part of the mmproj file.
        if name.startswith(("model.", "lm_head.")):
            return

        if name.startswith("vision_model.vit_downsampler"):
            # fullmatch: a name with trailing junk (e.g. "...weight_foo") must be rejected,
            # not silently mapped as if it were ".weight".
            match = re.fullmatch(r"vision_model\.vit_downsampler(\d+)\.(weight|bias)", name)
            if match is None:
                raise ValueError(f"Unexpected Step3-VL projector tensor {name!r}")

            # Checkpoint indices are 1-based in the name; the GGUF index is shifted down by one.
            proj_id = int(match.group(1)) - 1
            suffix = f".{match.group(2)}"
            yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, proj_id, suffix=suffix), data_torch)
            return

        if name == "vit_large_projector.weight":
            yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ_FC), data_torch)
            return

        if name.startswith("vision_model."):
            # Give parameter-style names a ".weight" suffix before handing them to the parent.
            if name == "vision_model.positional_embedding":
                name += ".weight"
            elif name.endswith(".gamma") and ".ls_" in name:
                name = name.removesuffix(".gamma") + ".weight"

            # Fused attention projection: "in_proj_weight" -> "in_proj.weight" (same for bias).
            name = name.replace("attn.in_proj_weight", "attn.in_proj.weight")
            name = name.replace("attn.in_proj_bias", "attn.in_proj.bias")

        yield from super().modify_tensors(data_torch, name, bid)
5018+
49525019@ModelBase.register("Qwen3VLForConditionalGeneration")
49535020class Qwen3VLTextModel(Qwen3Model):
49545021 model_arch = gguf.MODEL_ARCH.QWEN3VL
@@ -4969,6 +5036,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
49695036 yield from super().modify_tensors(data_torch, name, bid)
49705037
49715038
@ModelBase.register("StepVLForConditionalGeneration")
class Step3VLTextModel(Qwen3Model):
    """Text-only converter for Step3-VL checkpoints, written with the QWEN3 architecture.

    Tensors named under ``vision_model.``, ``model.vision_model.`` or
    ``vit_large_projector.`` are left out; everything else is delegated to
    the parent class.
    """

    model_arch = gguf.MODEL_ARCH.QWEN3

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Yield the parent's tensor mapping unless ``name`` carries one of the skipped prefixes."""
        skipped_prefixes = ("vision_model.", "model.vision_model.", "vit_large_projector.")
        if not name.startswith(skipped_prefixes):
            yield from super().modify_tensors(data_torch, name, bid)
5048+
49725049@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
49735050class Qwen3VLMoeTextModel(Qwen3MoeModel):
49745051 model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
@@ -12994,6 +13071,12 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
1299413071 # For non-hf Mamba and Mamba2 models
1299513072 arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
1299613073
13074+ # Step3-VL keeps text config under text_config but uses a custom top-level architecture.
13075+ # For text conversion we route to a dedicated text-only class.
13076+ # TODO: refactor this later to avoid adding exception here
13077+ if model_type == ModelType.TEXT and arch == "StepVLForConditionalGeneration":
13078+ return arch
13079+
1299713080 # if "architectures" is found in the sub-config, use that instead
1299813081 if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
1299913082 arch = text_config["architectures"][0]
0 commit comments