feat: support audio modality input & refactor media decoder.

xanecdotex · xanecdotex · commit 0bbd49866ce8 · 2026-01-08T20:56:53.000+08:00
diff --git a/xllm/core/framework/request/mm_codec.cpp b/xllm/core/framework/request/mm_codec.cpp
diff --git a/xllm/core/framework/request/mm_codec.h b/xllm/core/framework/request/mm_codec.h
@@ -52,4 +52,14 @@ class OpenCVVideoDecoder {
               torch::Tensor& t,
               VideoMetadata& meta);
 };
+
+// Audio counterpart of OpenCVVideoDecoder; it declares no data members, so
+// the implicitly generated special member functions are sufficient.
+class FFmpegAudioDecoder {
+ public:
+  // Call sites pass raw bytes and treat a true result as a decode success.
+  bool decode(const std::string& raw_data,
+              torch::Tensor& t,
+              AudioMetadata& meta);
+};
 }  // namespace xllm
diff --git a/xllm/core/framework/request/mm_handler.cpp b/xllm/core/framework/request/mm_handler.cpp
@@ -107,7 +107,7 @@ bool ImageHandler::load(const MMContent& content,
 
 bool ImageHandler::decode(MMInputItem& input) {
   OpenCVImageDecoder decoder;
-  return decoder.decode(input.raw_data_, input.decode_data_);
+  return decoder.decode(input.raw_data_, input.decode_image_);  // rgb [c,h,w]
 }
 
 bool VideoHandler::load(const MMContent& content,
@@ -135,14 +135,51 @@ bool VideoHandler::load(const MMContent& content,
 }
 
 bool VideoHandler::decode(MMInputItem& input) {
+  // Frames first: a failed video decode must not leave the AUDIO flag set.
   OpenCVVideoDecoder decoder;
-  return decoder.decode(input.raw_data_, input.decode_data_, input.video_meta_);
+  if (!decoder.decode(input.raw_data_, input.decode_video_, input.video_meta_))
+    return false;
+  FFmpegAudioDecoder audio_decoder;  // audio track is optional
+  const bool has_audio = audio_decoder.decode(
+      input.raw_data_, input.decode_audio_, input.audio_meta_);
+  if (has_audio) input.type_ |= MMType::AUDIO;  // advertise the extra modality
+  return true;
+}
+
+// Loads the raw bytes behind content.audio_url into input.raw_data_. The URL
+// must start with dataurl_prefix_ or httpurl_prefix_, else it is refused.
+bool AudioHandler::load(const MMContent& content,
+                        MMInputItem& input,
+                        MMPayload& payload) {
+  input.clear();
+
+  const auto& url = content.audio_url.url;
+  const bool is_data_url =
+      url.compare(0, dataurl_prefix_.size(), dataurl_prefix_) == 0;
+  const bool is_http_url =
+      !is_data_url &&
+      url.compare(0, httpurl_prefix_.size(), httpurl_prefix_) == 0;
+  if (!is_data_url && !is_http_url) {
+    LOG(ERROR) << " audio url is invalid, url is " << url;
+    return false;
+  }
+
+  // The item is tagged as AUDIO only once the URL prefix is recognised.
+  input.type_ = MMType::AUDIO;
+  return is_data_url ? this->load_from_dataurl(url, input.raw_data_, payload)
+                     : this->load_from_http(url, input.raw_data_);
+}
+
+// Runs FFmpegAudioDecoder on raw_data_ and forwards its success flag.
+bool AudioHandler::decode(MMInputItem& input) {
+  return FFmpegAudioDecoder().decode(
+      input.raw_data_, input.decode_audio_, input.audio_meta_);
 }
 
 MMHandlerSet::MMHandlerSet() {
   handlers_["image_url"] = std::make_unique<ImageHandler>();
   handlers_["video_url"] = std::make_unique<VideoHandler>();
-  // handlers_["audio_url"] = std::make_unique<AudioHandler>();
+  handlers_["audio_url"] = std::make_unique<AudioHandler>();
   handlers_["image_embedding"] =
       std::make_unique<MMEmbeddingHandler>(MMType::IMAGE);
   handlers_["video_embedding"] =
diff --git a/xllm/core/framework/request/mm_handler.h b/xllm/core/framework/request/mm_handler.h
@@ -83,6 +83,20 @@ class VideoHandler : public MMHandlerBase {
   std::string dataurl_prefix_{"data:video"};
 };
 
+// Handles "audio_url" content: load() fetches the bytes, decode() decodes.
+class AudioHandler : public MMHandlerBase {
+ public:
+  AudioHandler() = default;
+  ~AudioHandler() = default;
+  bool load(const MMContent& content,
+            MMInputItem& input,
+            MMPayload& payload) override;
+  bool decode(MMInputItem& input) override;
+
+ private:
+  std::string dataurl_prefix_{"data:audio"};  // prefix marking a data URL
+};
+
 class MMHandlerSet {
  public:
   MMHandlerSet();
diff --git a/xllm/core/framework/request/mm_input.h b/xllm/core/framework/request/mm_input.h
@@ -31,13 +31,27 @@ struct MMInputItem {
     raw_data_.clear();
   }
 
-  MMType type_ = MMType::NONE;
+  // Maps a single modality to its decoded tensor; any other `type` value
+  // yields nullopt. The optional is engaged even when the selected tensor
+  // is undefined, so callers still need defined() to detect that case.
+  std::optional<torch::Tensor> get_decode_data(MMType type) const {
+    if (type == MMType::IMAGE) return decode_image_;
+    if (type == MMType::VIDEO) return decode_video_;
+    if (type == MMType::AUDIO) return decode_audio_;
+
+    return std::nullopt;
+  }
+
+  uint32_t type_ = MMType::NONE;  // MMType bit flags; video may add AUDIO
 
   std::string raw_data_;  // binary
 
-  torch::Tensor decode_data_;  // image: rgb, [c,h,w], uint8
+  torch::Tensor decode_image_;  // image: rgb, [c,h,w], uint8
+  torch::Tensor decode_video_;  // video: rgb, [t,c,h,w], uint8
+  torch::Tensor decode_audio_;  // audio: mono, [t], float32
 
   VideoMetadata video_meta_;
+  AudioMetadata audio_meta_;
 
   EmbeddingOutput embedding_;
 };
@@ -95,8 +109,9 @@ struct MMInput {
     std::vector<torch::Tensor> vec;
 
     for (const auto& item : items_) {
-      if (item.type_ == type) {
-        vec.emplace_back(item.decode_data_);
+      if (item.type_ & type) {
+        auto t = item.get_decode_data(type);
+        if (t.has_value()) vec.emplace_back(*t);
       }
     }
     return std::move(vec);
@@ -106,7 +121,7 @@ struct MMInput {
     std::vector<VideoMetadata> metas;
     metas.reserve(items_.size());
     for (auto& item : items_) {
-      if (item.type_ == MMType::VIDEO) {
+      if (item.type_ & MMType::VIDEO) {
         metas.push_back(item.video_meta_);
       }
     }
diff --git a/xllm/models/vlm/npu/glm4v.h b/xllm/models/vlm/npu/glm4v.h
@@ -952,6 +952,7 @@ class Glm4vForConditionalGenerationImpl : public torch::nn::Module {
       auto t = video_input->video_grid_thw.index({torch::indexing::Slice(), 0});
       auto video_tokens =
           ((video_input->video_grid_thw.prod(-1) / merge_size / merge_size) / t)
+              .cpu()
               .contiguous()
               .to(torch::kLong);
       std::vector<int64_t> video_tokens_vec(
diff --git a/xllm/models/vlm/npu/glm4v_moe.h b/xllm/models/vlm/npu/glm4v_moe.h
@@ -121,6 +121,7 @@ class Glm4vMoeForConditionalGenerationImpl : public torch::nn::Module {
       auto t = video_input->video_grid_thw.index({torch::indexing::Slice(), 0});
       auto video_tokens =
           ((video_input->video_grid_thw.prod(-1) / merge_size / merge_size) / t)
+              .cpu()
               .contiguous()
               .to(torch::kLong);
       std::vector<int64_t> video_tokens_vec(
diff --git a/xllm/models/vlm/npu/qwen2_5_vl.h b/xllm/models/vlm/npu/qwen2_5_vl.h
@@ -783,6 +783,7 @@ class Qwen2_5_VLForConditionalGenerationImpl : public torch::nn::Module {
                                   input_params);
       auto video_tokens =
           (video_input->video_grid_thw.prod(-1) / merge_size / merge_size)
+              .cpu()
               .contiguous()
               .to(torch::kLong);
       std::vector<int64_t> video_tokens_vec(
diff --git a/xllm/models/vlm/qwen2_5_vl.h b/xllm/models/vlm/qwen2_5_vl.h
@@ -738,6 +738,7 @@ class Qwen2_5_VLForConditionalGenerationImpl : public torch::nn::Module {
                                   input_params);
       auto video_tokens =
           (video_input->video_grid_thw.prod(-1) / merge_size / merge_size)
+              .cpu()
               .contiguous()
               .to(torch::kLong);
       std::vector<int64_t> video_tokens_vec(
diff --git a/xllm/processors/qwen2_vl_image_processor.cpp b/xllm/processors/qwen2_vl_image_processor.cpp
@@ -163,15 +163,15 @@ bool Qwen2VLImageProcessor::process(const MMInput& inputs, MMData& datas) {
     std::vector<torch::Tensor> videos;
     std::vector<VideoMetadata> video_meta_list;
 
-    if (input_item.type_ == MMType::IMAGE) {
-      if (input_item.decode_data_.defined()) {
-        images.push_back(input_item.decode_data_);
+    if (input_item.type_ & MMType::IMAGE) {
+      if (input_item.decode_image_.defined()) {
+        images.push_back(input_item.decode_image_);
       } else if (input_item.embedding_.embedding.defined()) {
         images_embedding.push_back(input_item.embedding_);
       }
-    } else if (input_item.type_ == MMType::VIDEO) {
-      if (input_item.decode_data_.defined()) {
-        videos.push_back(input_item.decode_data_);
+    } else if (input_item.type_ & MMType::VIDEO) {
+      if (input_item.decode_video_.defined()) {
+        videos.push_back(input_item.decode_video_);
       }
       video_meta_list.push_back(input_item.video_meta_);
     }