Commit ff5228e

support Qwen2-Audio

1 parent: 389571c
17 files changed: 1210 additions, 36 deletions

CMakeLists.txt (24 additions, 2 deletions)

````diff
@@ -43,7 +43,18 @@ if (GGML_CLBLAST)
     add_compile_definitions(GGML_USE_CLBLAST)
 endif ()
 
-add_library(libchatllm SHARED EXCLUDE_FROM_ALL src/main.cpp src/backend.cpp src/chat.cpp src/vectorstore.cpp src/layers.cpp src/tokenizer.cpp src/models.cpp src/unicode.cpp src/unicode-data.cpp src/vision_process.cpp)
+add_library(libchatllm SHARED EXCLUDE_FROM_ALL
+    src/main.cpp
+    src/backend.cpp
+    src/chat.cpp
+    src/vectorstore.cpp
+    src/layers.cpp
+    src/tokenizer.cpp
+    src/models.cpp
+    src/unicode.cpp
+    src/unicode-data.cpp
+    src/vision_process.cpp
+    src/audio_process.cpp)
 target_link_libraries(libchatllm PRIVATE ggml)
 target_compile_definitions(libchatllm PUBLIC CHATLLM_SHARED_LIB)
 SET_TARGET_PROPERTIES(libchatllm PROPERTIES PREFIX "")
@@ -52,5 +63,16 @@ set_target_properties(libchatllm
     LIBRARY_OUTPUT_DIRECTORY "../bindings"
 )
 
-add_executable(main src/main.cpp src/chat.cpp src/backend.cpp src/vectorstore.cpp src/layers.cpp src/tokenizer.cpp src/models.cpp src/unicode.cpp src/unicode-data.cpp src/vision_process.cpp)
+add_executable(main
+    src/main.cpp
+    src/chat.cpp
+    src/backend.cpp
+    src/vectorstore.cpp
+    src/layers.cpp
+    src/tokenizer.cpp
+    src/models.cpp
+    src/unicode.cpp
+    src/unicode-data.cpp
+    src/vision_process.cpp
+    src/audio_process.cpp)
 target_link_libraries(main PRIVATE ggml)
````

README.md (7 additions, 0 deletions)

````diff
@@ -13,6 +13,7 @@ pure C++ implementation based on [@ggerganov](https://github.com/ggerganov)'s [g
 
 **What's New:**
 
+* 2025-06-21: [I can hear](./docs/multimodal.md): Qwen2-Audio
 * 2025-06-10: SmolVLM2
 * 2025-06-07: MiniCPM4
 * 2025-06-06: Qwen-3 Embedding & Reranker
@@ -121,6 +122,12 @@ In order to build this project you have several different options.
 
 The executable is `./build/bin/main`.
 
+There are lots of `GGML_...` options to play with. Example: Vulkan acceleration together with RPC and backend dynamic loading:
+
+```sh
+cmake -B build -DGGML_VULKAN=1 -DGGML_RPC=1 -DGGML_CPU_ALL_VARIANTS=1 -DGGML_BACKEND_DL=1
+```
+
 ### Run
 
 Now you may chat with a quantized model by running:
````

convert.py (89 additions, 0 deletions)

````diff
@@ -4,6 +4,7 @@
 import argparse
 from ast import Dict, Tuple
 from collections import OrderedDict
+import copy
 import json
 import struct
 import time, os
@@ -48,6 +49,7 @@ class ChatModelAP(IntEnum):
     VideoOutput = 0x40
 
 ModelTypeTagChatImageIn = ((ChatModelAP.Text.value + ChatModelAP.ImageInput.value) >> 1) << 24
+ModelTypeTagChatAudioIn = ((ChatModelAP.Text.value + ChatModelAP.AudioInput.value) >> 1) << 24
 ModelTypeTagChatImageVideoIn = ((ChatModelAP.Text.value + ChatModelAP.ImageInput.value + ChatModelAP.VideoInput.value) >> 1) << 24
 ModelTypeTagChatImageVideoAudioInAudioOut = ((ChatModelAP.Text.value + ChatModelAP.ImageInput.value + ChatModelAP.VideoInput.value + ChatModelAP.AudioInput.value + ChatModelAP.AudioOutput.value) >> 1) << 24
 
@@ -210,6 +212,8 @@ class ModelType(Enum):
     LlaMA4 = ModelTypeTagChatImageIn + 0x0000001
     Gemma3Vis = ModelTypeTagChatImageIn + 0x0000011
 
+    Qwen2Audio = ModelTypeTagChatAudioIn + 0x0000001
+
     Qwen2_5VL = ModelTypeTagChatImageVideoIn + 0x0000001
     KimiVL = ModelTypeTagChatImageVideoIn + 0x0000100
     SmolVLM = ModelTypeTagChatImageVideoIn + 0x0000200
@@ -4214,6 +4218,89 @@ def get_weight_names(config):
 
         return weight_names
 
+class QWen2AudioConverter(BaseConverter):
+    MODEL_TYPE = ModelType.Qwen2Audio
+
+    txt_config = {}
+
+    @classmethod
+    def state_dict_pp(cls, config, state_dict):
+        r = {}
+        for name in state_dict:
+            tensor: torch.Tensor = state_dict[name]
+            new_name = name
+            if new_name.startswith('audio_tower.'):
+                new_name = new_name.replace('audio_tower.', 'audio.')
+                if '.out_proj.' in new_name:
+                    new_name = new_name.replace('.out_proj.', '.o_proj.')
+                elif '.fc1.' in new_name:
+                    new_name = new_name.replace('.fc1.', '.mlp.fc1.')
+                elif '.fc2.' in new_name:
+                    new_name = new_name.replace('.fc2.', '.mlp.fc2.')
+                new_name = new_name.replace('.self_attn_layer_norm.', '.input_layernorm.')
+                new_name = new_name.replace('.final_layer_norm.', '.post_attention_layernorm.')
+            elif new_name.startswith('language_model.'):
+                new_name = new_name.replace('language_model.', '')
+
+            r[new_name] = tensor
+        return r
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        txt_config = copy.deepcopy(config.text_config)
+
+        default = {
+            'hidden_act': 'silu',
+            'hidden_size': 4096,
+            'num_hidden_layers': 32,
+            'num_attention_heads': 32,
+            'num_key_value_heads': 32,
+            'use_sliding_window': False
+        }
+        for k, v in default.items():
+            if k not in txt_config:
+                txt_config[k] = v
+
+        QWen2AudioConverter.txt_config = AttributeDict(txt_config)
+        QWen2Converter.dump_config(f, QWen2AudioConverter.txt_config, ggml_type)
+
+    @staticmethod
+    def get_weight_names(config):
+        weight_names = QWen2Converter.get_weight_names(QWen2AudioConverter.txt_config)
+
+        for i in range(config.audio_config['encoder_layers']):
+            weight_names += [
+                f"audio.layers.{i}.mlp.fc1.bias",
+                f"audio.layers.{i}.mlp.fc1.weight",
+                f"audio.layers.{i}.mlp.fc2.bias",
+                f"audio.layers.{i}.mlp.fc2.weight",
+                f"audio.layers.{i}.post_attention_layernorm.bias",
+                f"audio.layers.{i}.post_attention_layernorm.weight",
+                f"audio.layers.{i}.self_attn.k_proj.weight",
+                f"audio.layers.{i}.self_attn.o_proj.bias",
+                f"audio.layers.{i}.self_attn.o_proj.weight",
+                f"audio.layers.{i}.self_attn.q_proj.bias",
+                f"audio.layers.{i}.self_attn.q_proj.weight",
+                f"audio.layers.{i}.self_attn.v_proj.bias",
+                f"audio.layers.{i}.self_attn.v_proj.weight",
+                f"audio.layers.{i}.input_layernorm.bias",
+                f"audio.layers.{i}.input_layernorm.weight",
+            ]
+
+        weight_names += [
+            "audio.conv1.bias",
+            "audio.conv1.weight",
+            "audio.conv2.bias",
+            "audio.conv2.weight",
+            "audio.embed_positions.weight",
+            "audio.layer_norm.bias",
+            "audio.layer_norm.weight",
+            "multi_modal_projector.linear.bias",
+            "multi_modal_projector.linear.weight",
+        ]
+
+        return weight_names
+
 class QWen2_5VLConverter(BaseConverter):
     MODEL_TYPE = ModelType.Qwen2_5VL
 
@@ -7185,6 +7272,8 @@ def main():
             QWen2Converter.MODEL_TYPE = ModelType.ReaderLM2
             assert config.tie_word_embeddings
         QWen2Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
+    elif arch == 'Qwen2AudioForConditionalGeneration':
+        QWen2AudioConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'Qwen2_5_VLForConditionalGeneration':
         QWen2_5VLConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'KimiVLForConditionalGeneration':
````
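To make the tensor renaming in `QWen2AudioConverter.state_dict_pp` easier to follow, here is a minimal standalone sketch (not part of the commit) of the same mapping rules; the sample tensor names are hypothetical and chosen only to exercise each rule:

```python
# Sketch of the renaming rules used by QWen2AudioConverter.state_dict_pp (illustrative only).
# Audio-tower keys are moved under "audio." and normalized; language-model keys
# simply lose their "language_model." prefix.
def remap_name(name: str) -> str:
    if name.startswith('audio_tower.'):
        name = name.replace('audio_tower.', 'audio.')
        if '.out_proj.' in name:
            name = name.replace('.out_proj.', '.o_proj.')
        elif '.fc1.' in name:
            name = name.replace('.fc1.', '.mlp.fc1.')
        elif '.fc2.' in name:
            name = name.replace('.fc2.', '.mlp.fc2.')
        name = name.replace('.self_attn_layer_norm.', '.input_layernorm.')
        name = name.replace('.final_layer_norm.', '.post_attention_layernorm.')
    elif name.startswith('language_model.'):
        name = name.replace('language_model.', '')
    return name

# Hypothetical sample keys, one per rule:
for s in ['audio_tower.layers.0.self_attn.out_proj.weight',
          'audio_tower.layers.0.fc1.weight',
          'audio_tower.layers.0.self_attn_layer_norm.bias',
          'language_model.model.embed_tokens.weight']:
    print(s, '->', remap_name(s))
# audio_tower.layers.0.self_attn.out_proj.weight -> audio.layers.0.self_attn.o_proj.weight
# audio_tower.layers.0.fc1.weight -> audio.layers.0.mlp.fc1.weight
# audio_tower.layers.0.self_attn_layer_norm.bias -> audio.layers.0.input_layernorm.bias
# language_model.model.embed_tokens.weight -> model.embed_tokens.weight
```

The resulting names line up with the `audio.layers.{i}.*`, `audio.conv*`, and `multi_modal_projector.linear.*` entries emitted by `get_weight_names` above.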

docs/models.md (3 additions, 1 deletion)

````diff
@@ -302,10 +302,12 @@ Please use `--format completion` for these models.
 
 Note: Only download `tokenizer.model` and DO NOT download `tokenizer.json` when converting. Use `--set do-pan-and-scan 1` to enable _Pan and Scan_.
 
-
 * Kimi (`KimiVLForConditionalGeneration`)
     * [x] VL: [A3B-Instruct](https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/tree/7a3c132a7b0f1f1677f5a72f258bd3afded7d357), [A3B-Thinking](https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking/commit/16681d8ac24e505088698e4e34ea494dd6e24400)
 
+* Qwen (`Qwen2AudioForConditionalGeneration`)
+    * [x] Qwen2-Audio: [7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct/tree/0a095220c30b7b31434169c3086508ef3ea5bf0a)
+
 * SmolVLM2 (`SmolVLMForConditionalGeneration`)
     * [x] [2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct/tree/482adb537c021c86670beed01cd58990d01e72e4)
````

docs/multimodal.md (4 additions, 2 deletions)

````diff
@@ -21,12 +21,14 @@ Use `--multimedia_file_tags` to specify a pair of tags, for example:
 --multimedia_file_tags {{ }}
 ```
 
-Then, an `image` can be embedded in prompt like this:
+Then an `image`, `audio`, or `video` can be embedded in prompt like this:
 
 ```
-{{image:/path/to/an/image/file}}
+{{tag:/path/to/an/image/file}}
 ```
 
+where `tag` is `image`, `audio`, or `video` respectively.
+
 Take Fuyu model as an example:
 
 ```
````
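For illustration only (not part of the diff): with `--multimedia_file_tags {{ }}` as in the hunk above, a prompt that passes an audio clip to a model such as Qwen2-Audio might look like this (the file path is a placeholder):

```
Please describe what you hear. {{audio:/path/to/some/recording.wav}}
```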
