@@ -241,6 +241,7 @@ class ModelType(Enum):
     Qwen2Audio = ModelTypeTagChatAudioIn + 0x0000001

     Qwen2_5VL = ModelTypeTagChatImageVideoIn + 0x0000001
+    Qwen2_VL = ModelTypeTagChatImageVideoIn + 0x0000002
     KimiVL = ModelTypeTagChatImageVideoIn + 0x0000100
     SmolVLM = ModelTypeTagChatImageVideoIn + 0x0000200

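The new enum member gives Qwen2-VL its own model-type ID: the capability tag for chat models with image/video input plus a small per-architecture offset (Qwen2.5-VL already holds offset 0x0000001). A minimal sketch of the composition, using a placeholder tag value since the real `ModelTypeTag*` constants are defined elsewhere in convert.py and are not shown in this diff:

```python
# Placeholder tag value, for illustration only; the real constant is defined
# earlier in convert.py.
ModelTypeTagChatImageVideoIn = 0x10000000

Qwen2_5VL = ModelTypeTagChatImageVideoIn + 0x0000001
Qwen2_VL  = ModelTypeTagChatImageVideoIn + 0x0000002   # the entry added by this diff

# The tag picks the capability family; the low offset keeps each
# architecture's ID unique within that family.
assert Qwen2_VL != Qwen2_5VL
```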
@@ -4479,6 +4480,58 @@ def get_weight_names(config):

         return weight_names

+class QWen2_VLConverter(BaseConverter):
+    MODEL_TYPE = ModelType.Qwen2_VL
+
+    @classmethod
+    def state_dict_pp(cls, config, state_dict):
+        r = QWen2_5VLConverter.state_dict_pp(config, state_dict)
+        return r
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        assert config.vision_config['hidden_act'] == 'quick_gelu'
+        config.vision_config['hidden_act'] = 'silu'
+        config.vision_config['hidden_size'] = config.vision_config['embed_dim']
+        QWen2_5VLConverter.dump_config(f, config, ggml_type)
+
+    @staticmethod
+    def get_weight_names(config):
+        weight_names = QWen2Converter.get_weight_names(config if config.text_config is None else AttributeDict(config.text_config))
+
+        for i in range(config.vision_config['depth']):
+            weight_names += [
+                f"visual.blocks.{i}.attn.proj.bias",
+                f"visual.blocks.{i}.attn.proj.weight",
+                f"visual.blocks.{i}.attn.q_proj.bias",
+                f"visual.blocks.{i}.attn.q_proj.weight",
+                f"visual.blocks.{i}.attn.k_proj.bias",
+                f"visual.blocks.{i}.attn.k_proj.weight",
+                f"visual.blocks.{i}.attn.v_proj.bias",
+                f"visual.blocks.{i}.attn.v_proj.weight",
+                f"visual.blocks.{i}.mlp.fc1.bias",
+                f"visual.blocks.{i}.mlp.fc1.weight",
+                f"visual.blocks.{i}.mlp.fc2.bias",
+                f"visual.blocks.{i}.mlp.fc2.weight",
+                f"visual.blocks.{i}.norm1.bias",
+                f"visual.blocks.{i}.norm1.weight",
+                f"visual.blocks.{i}.norm2.bias",
+                f"visual.blocks.{i}.norm2.weight",
+            ]
+
+        weight_names += [
+            "visual.merger.ln_q.bias",
+            "visual.merger.ln_q.weight",
+            "visual.merger.mlp.0.bias",
+            "visual.merger.mlp.0.weight",
+            "visual.merger.mlp.2.bias",
+            "visual.merger.mlp.2.weight",
+            "visual.patch_embed.proj.0.weight",
+            "visual.patch_embed.proj.1.weight",
+        ]
+
+        return weight_names
+
 class QWen2_5VLConverter(BaseConverter):
     MODEL_TYPE = ModelType.Qwen2_5VL

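The new QWen2_VLConverter is a thin wrapper over QWen2_5VLConverter: it reuses its state-dict post-processing and config dump, and only remaps the vision-config fields whose names differ in the older release (Qwen2-VL stores the vision width as `embed_dim` and declares `quick_gelu`, which dump_config rewrites before delegating). The separate `attn.q_proj`/`k_proj`/`v_proj` entries in get_weight_names suggest the inherited state_dict_pp splits the fused `attn.qkv` tensors found in the HF checkpoint. A hedged sketch of such a split, assuming the HF naming `visual.blocks.{i}.attn.qkv.*`; the actual logic lives in QWen2_5VLConverter.state_dict_pp and may differ:

```python
import torch

def split_fused_qkv(state_dict: dict, depth: int) -> dict:
    """Sketch only: split each vision block's fused qkv projection into the
    separate q/k/v tensors named by get_weight_names() above."""
    out = dict(state_dict)
    for i in range(depth):
        for suffix in ("weight", "bias"):
            key = f"visual.blocks.{i}.attn.qkv.{suffix}"
            if key not in out:
                continue
            fused = out.pop(key)
            # Fused projections are stacked along dim 0 as [q; k; v].
            q, k, v = torch.chunk(fused, 3, dim=0)
            out[f"visual.blocks.{i}.attn.q_proj.{suffix}"] = q
            out[f"visual.blocks.{i}.attn.k_proj.{suffix}"] = k
            out[f"visual.blocks.{i}.attn.v_proj.{suffix}"] = v
    return out
```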
@@ -4507,21 +4560,23 @@ def state_dict_pp(cls, config, state_dict):

     @staticmethod
     def dump_config(f, config, ggml_type):
-        assert config.rope_scaling['type'] == 'mrope', 'rope_scaling must be mrope'
+        # assert config.rope_scaling['type'] == 'mrope', 'rope_scaling must be mrope'
         assert config.vision_config['hidden_act'] == 'silu'

         QWen2Converter.dump_config(f, config, ggml_type)

         MROPE_SECTION_MAX = 4

+        text_config = config if config.text_config is None else AttributeDict(config.text_config)
+
         config_values = [
-            config.tie_word_embeddings if config.tie_word_embeddings is not None else 0
+            text_config.tie_word_embeddings if text_config.tie_word_embeddings is not None else 0
         ] + pad_to_len(config.rope_scaling['mrope_section'], MROPE_SECTION_MAX)
         f.write(struct.pack("<" + "i" * len(config_values), *config_values))

     @staticmethod
     def get_weight_names(config):
-        weight_names = QWen2Converter.get_weight_names(config)
+        weight_names = QWen2Converter.get_weight_names(config if config.text_config is None else AttributeDict(config.text_config))

         for i in range(config.vision_config['depth']):
             weight_names += [
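The `text_config` handling above accommodates two config layouts: newer Qwen2-VL/Qwen2.5-VL exports nest the language-model hyperparameters under `text_config`, while older exports keep them at the top level. A minimal sketch of the selection expression, with a stand-in AttributeDict (the real helper is defined elsewhere in convert.py; returning None for missing keys is an assumption):

```python
class AttributeDict(dict):
    """Stand-in for convert.py's AttributeDict: attribute access over a dict,
    returning None for missing keys (assumed behaviour)."""
    def __getattr__(self, name):
        return self.get(name)

def effective_text_config(config):
    # Same expression used in dump_config() and get_weight_names() above.
    return config if config.text_config is None else AttributeDict(config.text_config)

flat   = AttributeDict({'tie_word_embeddings': True})                    # old layout
nested = AttributeDict({'text_config': {'tie_word_embeddings': True}})   # new layout

assert effective_text_config(flat).tie_word_embeddings is True
assert effective_text_config(nested).tie_word_embeddings is True
```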
@@ -8501,6 +8556,8 @@ def main():
         QWen2Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'Qwen2AudioForConditionalGeneration':
         QWen2AudioConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+    elif arch == 'Qwen2VLForConditionalGeneration':
+        QWen2_VLConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'Qwen2_5_VLForConditionalGeneration':
         QWen2_5VLConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'KimiVLForConditionalGeneration':
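The new elif branch routes Qwen2-VL checkpoints to QWen2_VLConverter; `arch` here is presumably the `architectures[0]` entry from the model's HF config.json, which for these checkpoints is 'Qwen2VLForConditionalGeneration'. A small sketch of how that string can be read (the helper name is illustrative, not part of convert.py):

```python
import json
from pathlib import Path

def detect_architecture(model_dir: str) -> str:
    """Return architectures[0] from a HF checkpoint's config.json."""
    with open(Path(model_dir) / "config.json", encoding="utf-8") as f:
        return json.load(f)["architectures"][0]

# A Qwen2-VL checkpoint yields 'Qwen2VLForConditionalGeneration' and is now
# handled by QWen2_VLConverter instead of falling through unrecognised.
```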