4 | 4 | import argparse |
5 | 5 | from ast import Dict, Tuple |
6 | 6 | from collections import OrderedDict |
| 7 | +import copy |
7 | 8 | import json |
8 | 9 | import struct |
9 | 10 | import time, os |
@@ -48,6 +49,7 @@ class ChatModelAP(IntEnum): |
48 | 49 | VideoOutput = 0x40 |
49 | 50 |
50 | 51 | ModelTypeTagChatImageIn = ((ChatModelAP.Text.value + ChatModelAP.ImageInput.value) >> 1) << 24 |
| 52 | +ModelTypeTagChatAudioIn = ((ChatModelAP.Text.value + ChatModelAP.AudioInput.value) >> 1) << 24 |
51 | 53 | ModelTypeTagChatImageVideoIn = ((ChatModelAP.Text.value + ChatModelAP.ImageInput.value + ChatModelAP.VideoInput.value) >> 1) << 24 |
52 | 54 | ModelTypeTagChatImageVideoAudioInAudioOut = ((ChatModelAP.Text.value + ChatModelAP.ImageInput.value + ChatModelAP.VideoInput.value + ChatModelAP.AudioInput.value + ChatModelAP.AudioOutput.value) >> 1) << 24 |
53 | 55 |
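The new `ModelTypeTagChatAudioIn` follows the existing pattern: the summed capability flags are packed into the upper bits of the model-type value, leaving the low 24 bits for a per-model ordinal. A minimal sketch of that layout, using hypothetical flag values for the `ChatModelAP` members not visible in this hunk (only `VideoOutput = 0x40` is shown):

```python
from enum import IntEnum

# Hypothetical flag values for illustration only; the real enum is defined
# earlier in convert.py and only VideoOutput = 0x40 appears in this diff.
class ChatModelAP(IntEnum):
    Text = 0x01
    AudioInput = 0x08
    VideoOutput = 0x40

# Same construction as ModelTypeTagChatAudioIn above: sum the capability
# flags, halve, then shift the result into bits 24 and up.
ModelTypeTagChatAudioIn = ((ChatModelAP.Text + ChatModelAP.AudioInput) >> 1) << 24
Qwen2Audio = ModelTypeTagChatAudioIn + 0x0000001   # mirrors ModelType.Qwen2Audio

assert Qwen2Audio >> 24 == (ChatModelAP.Text + ChatModelAP.AudioInput) >> 1
assert Qwen2Audio & 0xFFFFFF == 1                  # low 24 bits: model ordinal
```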
@@ -210,6 +212,8 @@ class ModelType(Enum): |
210 | 212 | LlaMA4 = ModelTypeTagChatImageIn + 0x0000001 |
211 | 213 | Gemma3Vis = ModelTypeTagChatImageIn + 0x0000011 |
212 | 214 |
| 215 | + Qwen2Audio = ModelTypeTagChatAudioIn + 0x0000001 |
| 216 | + |
213 | 217 | Qwen2_5VL = ModelTypeTagChatImageVideoIn + 0x0000001 |
214 | 218 | KimiVL = ModelTypeTagChatImageVideoIn + 0x0000100 |
215 | 219 | SmolVLM = ModelTypeTagChatImageVideoIn + 0x0000200 |
@@ -4214,6 +4218,89 @@ def get_weight_names(config): |
4214 | 4218 |
4215 | 4219 | return weight_names |
4216 | 4220 |
| 4221 | +class QWen2AudioConverter(BaseConverter): |
| 4222 | + MODEL_TYPE = ModelType.Qwen2Audio |
| 4223 | + |
| 4224 | + txt_config = {} |
| 4225 | + |
| 4226 | + @classmethod |
| 4227 | + def state_dict_pp(cls, config, state_dict): |
| 4228 | + r = {} |
| 4229 | + for name in state_dict: |
| 4230 | + tensor: torch.Tensor = state_dict[name] |
| 4231 | + new_name = name |
| 4232 | + if new_name.startswith('audio_tower.'): |
| 4233 | + new_name = new_name.replace('audio_tower.', 'audio.') |
| 4234 | + if '.out_proj.' in new_name: |
| 4235 | + new_name = new_name.replace('.out_proj.', '.o_proj.') |
| 4236 | + elif '.fc1.' in new_name: |
| 4237 | + new_name = new_name.replace('.fc1.', '.mlp.fc1.') |
| 4238 | + elif '.fc2.' in new_name: |
| 4239 | + new_name = new_name.replace('.fc2.', '.mlp.fc2.') |
| 4240 | + new_name = new_name.replace('.self_attn_layer_norm.', '.input_layernorm.') |
| 4241 | + new_name = new_name.replace('.final_layer_norm.', '.post_attention_layernorm.') |
| 4242 | + elif new_name.startswith('language_model.'): |
| 4243 | + new_name = new_name.replace('language_model.', '') |
| 4244 | + |
| 4245 | + r[new_name] = tensor |
| 4246 | + return r |
| 4247 | + |
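For reference, `state_dict_pp` above maps the Hugging Face checkpoint names into the flat `audio.*` namespace and strips the `language_model.` prefix from the text weights. A quick sanity check of those substitution rules with a few representative names (illustrative only, not the full tensor list):

```python
import torch

# Representative HF names -> converter names implied by state_dict_pp above.
expected = {
    "audio_tower.layers.0.self_attn.out_proj.weight": "audio.layers.0.self_attn.o_proj.weight",
    "audio_tower.layers.0.fc1.bias": "audio.layers.0.mlp.fc1.bias",
    "audio_tower.layers.0.self_attn_layer_norm.weight": "audio.layers.0.input_layernorm.weight",
    "audio_tower.layers.0.final_layer_norm.weight": "audio.layers.0.post_attention_layernorm.weight",
    "language_model.model.embed_tokens.weight": "model.embed_tokens.weight",
    "multi_modal_projector.linear.weight": "multi_modal_projector.linear.weight",  # unchanged
}

dummy = {name: torch.zeros(1) for name in expected}
renamed = QWen2AudioConverter.state_dict_pp(config=None, state_dict=dummy)
assert set(renamed) == set(expected.values())
```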
| 4248 | + @staticmethod |
| 4249 | + def dump_config(f, config, ggml_type): |
| 4250 | + txt_config = copy.deepcopy(config.text_config) |
| 4251 | + |
| 4252 | + default = { |
| 4253 | + 'hidden_act': 'silu', |
| 4254 | + 'hidden_size': 4096, |
| 4255 | + 'num_hidden_layers': 32, |
| 4256 | + 'num_attention_heads': 32, |
| 4257 | + 'num_key_value_heads': 32, |
| 4258 | + 'use_sliding_window': False |
| 4259 | + } |
| 4260 | + for k, v in default.items(): |
| 4261 | + if k not in txt_config: |
| 4262 | + txt_config[k] = v |
| 4263 | + |
| 4264 | + QWen2AudioConverter.txt_config = AttributeDict(txt_config) |
| 4265 | + QWen2Converter.dump_config(f, QWen2AudioConverter.txt_config, ggml_type) |
| 4266 | + |
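`dump_config` above backfills any missing text-model fields with Qwen2-style defaults and then delegates to `QWen2Converter.dump_config`, so the serialized header is a plain Qwen2 one. A minimal sketch of the merge, with hypothetical checkpoint values:

```python
# Hypothetical text_config from a checkpoint; keys present here always win,
# anything missing falls back to the defaults used in dump_config above.
text_config = {"hidden_size": 4096, "num_hidden_layers": 32}
defaults = {"hidden_act": "silu", "num_attention_heads": 32, "use_sliding_window": False}

merged = dict(text_config)
for k, v in defaults.items():
    if k not in merged:
        merged[k] = v

assert merged["hidden_size"] == 4096          # kept from the checkpoint
assert merged["hidden_act"] == "silu"         # filled in from the defaults
```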
| 4267 | + @staticmethod |
| 4268 | + def get_weight_names(config): |
| 4269 | + weight_names = QWen2Converter.get_weight_names(QWen2AudioConverter.txt_config) |
| 4270 | + |
| 4271 | + for i in range(config.audio_config['encoder_layers']): |
| 4272 | + weight_names += [ |
| 4273 | + f"audio.layers.{i}.mlp.fc1.bias", |
| 4274 | + f"audio.layers.{i}.mlp.fc1.weight", |
| 4275 | + f"audio.layers.{i}.mlp.fc2.bias", |
| 4276 | + f"audio.layers.{i}.mlp.fc2.weight", |
| 4277 | + f"audio.layers.{i}.post_attention_layernorm.bias", |
| 4278 | + f"audio.layers.{i}.post_attention_layernorm.weight", |
| 4279 | + f"audio.layers.{i}.self_attn.k_proj.weight", |
| 4280 | + f"audio.layers.{i}.self_attn.o_proj.bias", |
| 4281 | + f"audio.layers.{i}.self_attn.o_proj.weight", |
| 4282 | + f"audio.layers.{i}.self_attn.q_proj.bias", |
| 4283 | + f"audio.layers.{i}.self_attn.q_proj.weight", |
| 4284 | + f"audio.layers.{i}.self_attn.v_proj.bias", |
| 4285 | + f"audio.layers.{i}.self_attn.v_proj.weight", |
| 4286 | + f"audio.layers.{i}.input_layernorm.bias", |
| 4287 | + f"audio.layers.{i}.input_layernorm.weight", |
| 4288 | + ] |
| 4289 | + |
| 4290 | + weight_names += [ |
| 4291 | + "audio.conv1.bias", |
| 4292 | + "audio.conv1.weight", |
| 4293 | + "audio.conv2.bias", |
| 4294 | + "audio.conv2.weight", |
| 4295 | + "audio.embed_positions.weight", |
| 4296 | + "audio.layer_norm.bias", |
| 4297 | + "audio.layer_norm.weight", |
| 4298 | + "multi_modal_projector.linear.bias", |
| 4299 | + "multi_modal_projector.linear.weight", |
| 4300 | + ] |
| 4301 | + |
| 4302 | + return weight_names |
| 4303 | + |
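Note that `get_weight_names` reads `QWen2AudioConverter.txt_config`, which is only populated inside `dump_config`, so this relies on the base converter dumping the config before collecting weight names (the same ordering the other converters in this file assume). For a sense of scale, a rough count of the audio-side tensors for a hypothetical 2-layer encoder:

```python
# Hypothetical audio_config; the real value comes from config.audio_config.
audio_config = {"encoder_layers": 2}

per_layer = 15   # tensors listed per encoder layer in the loop above
extras = 9       # conv1/conv2, embed_positions, layer_norm, projector
total_audio = per_layer * audio_config["encoder_layers"] + extras
print(total_audio)   # 39 audio tensors on top of the Qwen2 text weights
```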
4217 | 4304 | class QWen2_5VLConverter(BaseConverter): |
4218 | 4305 | MODEL_TYPE = ModelType.Qwen2_5VL |
4219 | 4306 |
@@ -7185,6 +7272,8 @@ def main(): |
7185 | 7272 | QWen2Converter.MODEL_TYPE = ModelType.ReaderLM2 |
7186 | 7273 | assert config.tie_word_embeddings |
7187 | 7274 | QWen2Converter.convert(config, model_files, vocab, ggml_type, args.save_path) |
| 7275 | + elif arch == 'Qwen2AudioForConditionalGeneration': |
| 7276 | + QWen2AudioConverter.convert(config, model_files, vocab, ggml_type, args.save_path) |
7188 | 7277 | elif arch == 'Qwen2_5_VLForConditionalGeneration': |
7189 | 7278 | QWen2_5VLConverter.convert(config, model_files, vocab, ggml_type, args.save_path) |
7190 | 7279 | elif arch == 'KimiVLForConditionalGeneration': |