Commit ff5228e

support Qwen2-Audio

1 parent: 389571c
17 files changed: 1210 additions, 36 deletions

CMakeLists.txt (24 additions, 2 deletions)

````diff
@@ -43,7 +43,18 @@ if (GGML_CLBLAST)
     add_compile_definitions(GGML_USE_CLBLAST)
 endif ()
 
-add_library(libchatllm SHARED EXCLUDE_FROM_ALL src/main.cpp src/backend.cpp src/chat.cpp src/vectorstore.cpp src/layers.cpp src/tokenizer.cpp src/models.cpp src/unicode.cpp src/unicode-data.cpp src/vision_process.cpp)
+add_library(libchatllm SHARED EXCLUDE_FROM_ALL
+    src/main.cpp
+    src/backend.cpp
+    src/chat.cpp
+    src/vectorstore.cpp
+    src/layers.cpp
+    src/tokenizer.cpp
+    src/models.cpp
+    src/unicode.cpp
+    src/unicode-data.cpp
+    src/vision_process.cpp
+    src/audio_process.cpp)
 target_link_libraries(libchatllm PRIVATE ggml)
 target_compile_definitions(libchatllm PUBLIC CHATLLM_SHARED_LIB)
 SET_TARGET_PROPERTIES(libchatllm PROPERTIES PREFIX "")
@@ -52,5 +63,16 @@ set_target_properties(libchatllm
     LIBRARY_OUTPUT_DIRECTORY "../bindings"
 )
 
-add_executable(main src/main.cpp src/chat.cpp src/backend.cpp src/vectorstore.cpp src/layers.cpp src/tokenizer.cpp src/models.cpp src/unicode.cpp src/unicode-data.cpp src/vision_process.cpp)
+add_executable(main
+    src/main.cpp
+    src/chat.cpp
+    src/backend.cpp
+    src/vectorstore.cpp
+    src/layers.cpp
+    src/tokenizer.cpp
+    src/models.cpp
+    src/unicode.cpp
+    src/unicode-data.cpp
+    src/vision_process.cpp
+    src/audio_process.cpp)
 target_link_libraries(main PRIVATE ggml)
````

README.md (7 additions, 0 deletions)

````diff
@@ -13,6 +13,7 @@ pure C++ implementation based on [@ggerganov](https://github.com/ggerganov)'s [g
 
 **What's New:**
 
+* 2025-06-21: [I can hear](./docs/multimodal.md): Qwen2-Audio
 * 2025-06-10: SmolVLM2
 * 2025-06-07: MiniCPM4
 * 2025-06-06: Qwen-3 Embedding & Reranker
@@ -121,6 +122,12 @@ In order to build this project you have several different options.
 
 The executable is `./build/bin/main`.
 
+There are lots of `GGML_...` options to play with. Example: Vulkan acceleration together with RPC and backend dynamic loading:
+
+```sh
+cmake -B build -DGGML_VULKAN=1 -DGGML_RPC=1 -DGGML_CPU_ALL_VARIANTS=1 -DGGML_BACKEND_DL=1
+```
+
 ### Run
 
 Now you may chat with a quantized model by running:
````

convert.py (89 additions, 0 deletions)

````diff
@@ -4,6 +4,7 @@
 import argparse
 from ast import Dict, Tuple
 from collections import OrderedDict
+import copy
 import json
 import struct
 import time, os
@@ -48,6 +49,7 @@ class ChatModelAP(IntEnum):
     VideoOutput = 0x40
 
 ModelTypeTagChatImageIn = ((ChatModelAP.Text.value + ChatModelAP.ImageInput.value) >> 1) << 24
+ModelTypeTagChatAudioIn = ((ChatModelAP.Text.value + ChatModelAP.AudioInput.value) >> 1) << 24
 ModelTypeTagChatImageVideoIn = ((ChatModelAP.Text.value + ChatModelAP.ImageInput.value + ChatModelAP.VideoInput.value) >> 1) << 24
 ModelTypeTagChatImageVideoAudioInAudioOut = ((ChatModelAP.Text.value + ChatModelAP.ImageInput.value + ChatModelAP.VideoInput.value + ChatModelAP.AudioInput.value + ChatModelAP.AudioOutput.value) >> 1) << 24
 
@@ -210,6 +212,8 @@ class ModelType(Enum):
     LlaMA4 = ModelTypeTagChatImageIn + 0x0000001
     Gemma3Vis = ModelTypeTagChatImageIn + 0x0000011
 
+    Qwen2Audio = ModelTypeTagChatAudioIn + 0x0000001
+
     Qwen2_5VL = ModelTypeTagChatImageVideoIn + 0x0000001
     KimiVL = ModelTypeTagChatImageVideoIn + 0x0000100
     SmolVLM = ModelTypeTagChatImageVideoIn + 0x0000200
@@ -4214,6 +4218,89 @@ def get_weight_names(config):
 
         return weight_names
 
+class QWen2AudioConverter(BaseConverter):
+    MODEL_TYPE = ModelType.Qwen2Audio
+
+    txt_config = {}
+
+    @classmethod
+    def state_dict_pp(cls, config, state_dict):
+        r = {}
+        for name in state_dict:
+            tensor: torch.Tensor = state_dict[name]
+            new_name = name
+            if new_name.startswith('audio_tower.'):
+                new_name = new_name.replace('audio_tower.', 'audio.')
+                if '.out_proj.' in new_name:
+                    new_name = new_name.replace('.out_proj.', '.o_proj.')
+                elif '.fc1.' in new_name:
+                    new_name = new_name.replace('.fc1.', '.mlp.fc1.')
+                elif '.fc2.' in new_name:
+                    new_name = new_name.replace('.fc2.', '.mlp.fc2.')
+                new_name = new_name.replace('.self_attn_layer_norm.', '.input_layernorm.')
+                new_name = new_name.replace('.final_layer_norm.', '.post_attention_layernorm.')
+            elif new_name.startswith('language_model.'):
+                new_name = new_name.replace('language_model.', '')
+
+            r[new_name] = tensor
+        return r
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        txt_config = copy.deepcopy(config.text_config)
+
+        default = {
+            'hidden_act': 'silu',
+            'hidden_size': 4096,
+            'num_hidden_layers': 32,
+            'num_attention_heads': 32,
+            'num_key_value_heads': 32,
+            'use_sliding_window': False
+        }
+        for k, v in default.items():
+            if k not in txt_config:
+                txt_config[k] = v
+
+        QWen2AudioConverter.txt_config = AttributeDict(txt_config)
+        QWen2Converter.dump_config(f, QWen2AudioConverter.txt_config, ggml_type)
+
+    @staticmethod
+    def get_weight_names(config):
+        weight_names = QWen2Converter.get_weight_names(QWen2AudioConverter.txt_config)
+
+        for i in range(config.audio_config['encoder_layers']):
+            weight_names += [
+                f"audio.layers.{i}.mlp.fc1.bias",
+                f"audio.layers.{i}.mlp.fc1.weight",
+                f"audio.layers.{i}.mlp.fc2.bias",
+                f"audio.layers.{i}.mlp.fc2.weight",
+                f"audio.layers.{i}.post_attention_layernorm.bias",
+                f"audio.layers.{i}.post_attention_layernorm.weight",
+                f"audio.layers.{i}.self_attn.k_proj.weight",
+                f"audio.layers.{i}.self_attn.o_proj.bias",
+                f"audio.layers.{i}.self_attn.o_proj.weight",
+                f"audio.layers.{i}.self_attn.q_proj.bias",
+                f"audio.layers.{i}.self_attn.q_proj.weight",
+                f"audio.layers.{i}.self_attn.v_proj.bias",
+                f"audio.layers.{i}.self_attn.v_proj.weight",
+                f"audio.layers.{i}.input_layernorm.bias",
+                f"audio.layers.{i}.input_layernorm.weight",
+            ]
+
+        weight_names += [
+            "audio.conv1.bias",
+            "audio.conv1.weight",
+            "audio.conv2.bias",
+            "audio.conv2.weight",
+            "audio.embed_positions.weight",
+            "audio.layer_norm.bias",
+            "audio.layer_norm.weight",
+            "multi_modal_projector.linear.bias",
+            "multi_modal_projector.linear.weight",
+        ]
+
+        return weight_names
+
 class QWen2_5VLConverter(BaseConverter):
     MODEL_TYPE = ModelType.Qwen2_5VL
 
@@ -7185,6 +7272,8 @@ def main():
             QWen2Converter.MODEL_TYPE = ModelType.ReaderLM2
             assert config.tie_word_embeddings
         QWen2Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
+    elif arch == 'Qwen2AudioForConditionalGeneration':
+        QWen2AudioConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'Qwen2_5_VLForConditionalGeneration':
         QWen2_5VLConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'KimiVLForConditionalGeneration':
````
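To make the tensor renaming in `QWen2AudioConverter.state_dict_pp` easier to follow, here is a minimal standalone sketch (not part of the commit) of the same mapping rules; the sample tensor names are hypothetical and chosen only to exercise each rule:

```python
# Sketch of the renaming rules used by QWen2AudioConverter.state_dict_pp (illustrative only).
# Audio-tower keys are moved under "audio." and normalized; language-model keys
# simply lose their "language_model." prefix.
def remap_name(name: str) -> str:
    if name.startswith('audio_tower.'):
        name = name.replace('audio_tower.', 'audio.')
        if '.out_proj.' in name:
            name = name.replace('.out_proj.', '.o_proj.')
        elif '.fc1.' in name:
            name = name.replace('.fc1.', '.mlp.fc1.')
        elif '.fc2.' in name:
            name = name.replace('.fc2.', '.mlp.fc2.')
        name = name.replace('.self_attn_layer_norm.', '.input_layernorm.')
        name = name.replace('.final_layer_norm.', '.post_attention_layernorm.')
    elif name.startswith('language_model.'):
        name = name.replace('language_model.', '')
    return name

# Hypothetical sample keys, one per rule:
for s in ['audio_tower.layers.0.self_attn.out_proj.weight',
          'audio_tower.layers.0.fc1.weight',
          'audio_tower.layers.0.self_attn_layer_norm.bias',
          'language_model.model.embed_tokens.weight']:
    print(s, '->', remap_name(s))
# audio_tower.layers.0.self_attn.out_proj.weight -> audio.layers.0.self_attn.o_proj.weight
# audio_tower.layers.0.fc1.weight -> audio.layers.0.mlp.fc1.weight
# audio_tower.layers.0.self_attn_layer_norm.bias -> audio.layers.0.input_layernorm.bias
# language_model.model.embed_tokens.weight -> model.embed_tokens.weight
```

The resulting names line up with the `audio.layers.{i}.*`, `audio.conv*`, and `multi_modal_projector.linear.*` entries emitted by `get_weight_names` above.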

docs/models.md (3 additions, 1 deletion)

````diff
@@ -302,10 +302,12 @@ Please use `--format completion` for these models.
 
 Note: Only download `tokenizer.model` and DO NOT download `tokenizer.json` when converting. Use `--set do-pan-and-scan 1` to enable _Pan and Scan_.
 
-
 * Kimi (`KimiVLForConditionalGeneration`)
     * [x] VL: [A3B-Instruct](https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/tree/7a3c132a7b0f1f1677f5a72f258bd3afded7d357), [A3B-Thinking](https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking/commit/16681d8ac24e505088698e4e34ea494dd6e24400)
 
+* Qwen (`Qwen2AudioForConditionalGeneration`)
+    * [x] Qwen2-Audio: [7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct/tree/0a095220c30b7b31434169c3086508ef3ea5bf0a)
+
 * SmolVLM2 (`SmolVLMForConditionalGeneration`)
     * [x] [2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct/tree/482adb537c021c86670beed01cd58990d01e72e4)
````

docs/multimodal.md (4 additions, 2 deletions)

````diff
@@ -21,12 +21,14 @@ Use `--multimedia_file_tags` to specify a pair of tags, for example:
 --multimedia_file_tags {{ }}
 ```
 
-Then, an `image` can be embedded in prompt like this:
+Then an `image`, `audio`, or `video` can be embedded in prompt like this:
 
 ```
-{{image:/path/to/an/image/file}}
+{{tag:/path/to/an/image/file}}
 ```
 
+where `tag` is `image`, `audio`, or `video` respectively.
+
 Take Fuyu model as an example:
 
 ```
````
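For illustration only (not part of the diff): with `--multimedia_file_tags {{ }}` as in the hunk above, a prompt that passes an audio clip to a model such as Qwen2-Audio might look like this (the file path is a placeholder):

```
Please describe what you hear. {{audio:/path/to/some/recording.wav}}
```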
