Skip to content

Commit 0bbd498

Browse files
committed
feat: support audio modal input & refactor media decoder.
1 parent 3e4008b commit 0bbd498

File tree

10 files changed

+480
-206
lines changed

10 files changed

+480
-206
lines changed

xllm/core/framework/request/mm_codec.cpp

Lines changed: 386 additions & 192 deletions
Large diffs are not rendered by default.

xllm/core/framework/request/mm_codec.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,4 +52,14 @@ class OpenCVVideoDecoder {
5252
torch::Tensor& t,
5353
VideoMetadata& meta);
5454
};
55+
56+
class FFmpegAudioDecoder {
57+
public:
58+
FFmpegAudioDecoder() = default;
59+
~FFmpegAudioDecoder() = default;
60+
61+
bool decode(const std::string& raw_data,
62+
torch::Tensor& t,
63+
AudioMetadata& meta);
64+
};
5565
} // namespace xllm

xllm/core/framework/request/mm_handler.cpp

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ bool ImageHandler::load(const MMContent& content,
107107

108108
bool ImageHandler::decode(MMInputItem& input) {
109109
OpenCVImageDecoder decoder;
110-
return decoder.decode(input.raw_data_, input.decode_data_);
110+
return decoder.decode(input.raw_data_, input.decode_image_);
111111
}
112112

113113
bool VideoHandler::load(const MMContent& content,
@@ -135,14 +135,51 @@ bool VideoHandler::load(const MMContent& content,
135135
}
136136

137137
bool VideoHandler::decode(MMInputItem& input) {
138+
FFmpegAudioDecoder audio_decoder;
139+
if (audio_decoder.decode(
140+
input.raw_data_, input.decode_audio_, input.audio_meta_)) {
141+
input.type_ |= MMType::AUDIO;
142+
}
143+
138144
OpenCVVideoDecoder decoder;
139-
return decoder.decode(input.raw_data_, input.decode_data_, input.video_meta_);
145+
return decoder.decode(
146+
input.raw_data_, input.decode_video_, input.video_meta_);
147+
}
148+
149+
bool AudioHandler::load(const MMContent& content,
150+
MMInputItem& input,
151+
MMPayload& payload) {
152+
input.clear();
153+
154+
const auto& audio_url = content.audio_url;
155+
const auto& url = audio_url.url;
156+
157+
if (url.compare(0, dataurl_prefix_.size(), dataurl_prefix_) ==
158+
0) { // data url
159+
160+
input.type_ = MMType::AUDIO;
161+
return this->load_from_dataurl(url, input.raw_data_, payload);
162+
} else if (url.compare(0, httpurl_prefix_.size(), httpurl_prefix_) ==
163+
0) { // http url
164+
165+
input.type_ = MMType::AUDIO;
166+
return this->load_from_http(url, input.raw_data_);
167+
} else {
168+
LOG(ERROR) << " audio url is invalid, url is " << url;
169+
return false;
170+
}
171+
}
172+
173+
// Decodes `input.raw_data_` with a function-local FFmpegAudioDecoder, writing
// to `input.decode_audio_` and `input.audio_meta_`. Returns the decoder's
// result unchanged.
bool AudioHandler::decode(MMInputItem& input) {
  FFmpegAudioDecoder ffmpeg;
  const bool decoded =
      ffmpeg.decode(input.raw_data_, input.decode_audio_, input.audio_meta_);
  return decoded;
}
141178

142179
MMHandlerSet::MMHandlerSet() {
143180
handlers_["image_url"] = std::make_unique<ImageHandler>();
144181
handlers_["video_url"] = std::make_unique<VideoHandler>();
145-
// handlers_["audio_url"] = std::make_unique<AudioHandler>();
182+
handlers_["audio_url"] = std::make_unique<AudioHandler>();
146183
handlers_["image_embedding"] =
147184
std::make_unique<MMEmbeddingHandler>(MMType::IMAGE);
148185
handlers_["video_embedding"] =

xllm/core/framework/request/mm_handler.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,20 @@ class VideoHandler : public MMHandlerBase {
8383
std::string dataurl_prefix_{"data:video"};
8484
};
8585

86+
class AudioHandler : public MMHandlerBase {
87+
public:
88+
AudioHandler() = default;
89+
~AudioHandler() = default;
90+
91+
virtual bool load(const MMContent& content,
92+
MMInputItem& input,
93+
MMPayload& payload) override;
94+
virtual bool decode(MMInputItem& input) override;
95+
96+
private:
97+
std::string dataurl_prefix_{"data:audio"};
98+
};
99+
86100
class MMHandlerSet {
87101
public:
88102
MMHandlerSet();

xllm/core/framework/request/mm_input.h

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,27 @@ struct MMInputItem {
3131
raw_data_.clear();
3232
}
3333

34-
MMType type_ = MMType::NONE;
34+
// Returns the decoded tensor slot that corresponds to `type`:
// IMAGE -> decode_image_, VIDEO -> decode_video_, AUDIO -> decode_audio_.
// Any other value yields std::nullopt.
//
// `type` is compared with ==, so only an exact single-modality value selects a
// slot. The slot is returned as-is; this function does not check whether the
// tensor has been filled in.
std::optional<torch::Tensor> get_decode_data(MMType type) const {
  if (type == MMType::IMAGE) {
    return decode_image_;
  }
  if (type == MMType::VIDEO) {
    return decode_video_;
  }
  if (type == MMType::AUDIO) {
    return decode_audio_;
  }
  return std::nullopt;
}
44+
45+
uint32_t type_ = MMType::NONE;
3546

3647
std::string raw_data_; // binary
3748

38-
torch::Tensor decode_data_; // image: rgb, [c,h,w], uint8
49+
torch::Tensor decode_image_; // image: rgb, [c,h,w], uint8
50+
torch::Tensor decode_video_; // video: rgb, [t,c,h,w], uint8
51+
torch::Tensor decode_audio_; // audio: mono, [t], float32
3952

4053
VideoMetadata video_meta_;
54+
AudioMetadata audio_meta_;
4155

4256
EmbeddingOutput embedding_;
4357
};
@@ -95,8 +109,9 @@ struct MMInput {
95109
std::vector<torch::Tensor> vec;
96110

97111
for (const auto& item : items_) {
98-
if (item.type_ == type) {
99-
vec.emplace_back(item.decode_data_);
112+
if (item.type_ & type) {
113+
auto t = item.get_decode_data(type);
114+
if (t.has_value()) vec.emplace_back(*t);
100115
}
101116
}
102117
return std::move(vec);
@@ -106,7 +121,7 @@ struct MMInput {
106121
std::vector<VideoMetadata> metas;
107122
metas.reserve(items_.size());
108123
for (auto& item : items_) {
109-
if (item.type_ == MMType::VIDEO) {
124+
if (item.type_ & MMType::VIDEO) {
110125
metas.push_back(item.video_meta_);
111126
}
112127
}

xllm/models/vlm/npu/glm4v.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -952,6 +952,7 @@ class Glm4vForConditionalGenerationImpl : public torch::nn::Module {
952952
auto t = video_input->video_grid_thw.index({torch::indexing::Slice(), 0});
953953
auto video_tokens =
954954
((video_input->video_grid_thw.prod(-1) / merge_size / merge_size) / t)
955+
.cpu()
955956
.contiguous()
956957
.to(torch::kLong);
957958
std::vector<int64_t> video_tokens_vec(

xllm/models/vlm/npu/glm4v_moe.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ class Glm4vMoeForConditionalGenerationImpl : public torch::nn::Module {
121121
auto t = video_input->video_grid_thw.index({torch::indexing::Slice(), 0});
122122
auto video_tokens =
123123
((video_input->video_grid_thw.prod(-1) / merge_size / merge_size) / t)
124+
.cpu()
124125
.contiguous()
125126
.to(torch::kLong);
126127
std::vector<int64_t> video_tokens_vec(

xllm/models/vlm/npu/qwen2_5_vl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -783,6 +783,7 @@ class Qwen2_5_VLForConditionalGenerationImpl : public torch::nn::Module {
783783
input_params);
784784
auto video_tokens =
785785
(video_input->video_grid_thw.prod(-1) / merge_size / merge_size)
786+
.cpu()
786787
.contiguous()
787788
.to(torch::kLong);
788789
std::vector<int64_t> video_tokens_vec(

xllm/models/vlm/qwen2_5_vl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -738,6 +738,7 @@ class Qwen2_5_VLForConditionalGenerationImpl : public torch::nn::Module {
738738
input_params);
739739
auto video_tokens =
740740
(video_input->video_grid_thw.prod(-1) / merge_size / merge_size)
741+
.cpu()
741742
.contiguous()
742743
.to(torch::kLong);
743744
std::vector<int64_t> video_tokens_vec(

xllm/processors/qwen2_vl_image_processor.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -163,15 +163,15 @@ bool Qwen2VLImageProcessor::process(const MMInput& inputs, MMData& datas) {
163163
std::vector<torch::Tensor> videos;
164164
std::vector<VideoMetadata> video_meta_list;
165165

166-
if (input_item.type_ == MMType::IMAGE) {
167-
if (input_item.decode_data_.defined()) {
168-
images.push_back(input_item.decode_data_);
166+
if (input_item.type_ & MMType::IMAGE) {
167+
if (input_item.decode_image_.defined()) {
168+
images.push_back(input_item.decode_image_);
169169
} else if (input_item.embedding_.embedding.defined()) {
170170
images_embedding.push_back(input_item.embedding_);
171171
}
172-
} else if (input_item.type_ == MMType::VIDEO) {
173-
if (input_item.decode_data_.defined()) {
174-
videos.push_back(input_item.decode_data_);
172+
} else if (input_item.type_ & MMType::VIDEO) {
173+
if (input_item.decode_video_.defined()) {
174+
videos.push_back(input_item.decode_video_);
175175
}
176176
video_meta_list.push_back(input_item.video_meta_);
177177
}

0 commit comments

Comments
 (0)