-
Notifications
You must be signed in to change notification settings - Fork 113
feat: support audio modal input & refactor media decoder. #682
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -107,7 +107,7 @@ bool ImageHandler::load(const MMContent& content, | |
|
|
||
| bool ImageHandler::decode(MMInputItem& input) { | ||
| OpenCVImageDecoder decoder; | ||
| return decoder.decode(input.raw_data_, input.decode_data_); | ||
| return decoder.decode(input.raw_data_, input.decode_image_); | ||
| } | ||
|
|
||
| bool VideoHandler::load(const MMContent& content, | ||
|
|
@@ -135,14 +135,51 @@ bool VideoHandler::load(const MMContent& content, | |
| } | ||
|
|
||
| bool VideoHandler::decode(MMInputItem& input) { | ||
| FFmpegAudioDecoder audio_decoder; | ||
| if (audio_decoder.decode( | ||
| input.raw_data_, input.decode_audio_, input.audio_meta_)) { | ||
| input.type_ |= MMType::AUDIO; | ||
| } | ||
|
|
||
| OpenCVVideoDecoder decoder; | ||
| return decoder.decode(input.raw_data_, input.decode_data_, input.video_meta_); | ||
| return decoder.decode( | ||
| input.raw_data_, input.decode_video_, input.video_meta_); | ||
| } | ||
|
Comment on lines
137
to
+147
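There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
The return value of `VideoHandler::decode` reflects only the video decoder's result; the outcome of audio decoding is ignored. Suggested change:
bool VideoHandler::decode(MMInputItem& input) {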
bool audio_decoded = false;
FFmpegAudioDecoder audio_decoder;
if (audio_decoder.decode(
input.raw_data_, input.decode_audio_, input.audio_meta_)) {
input.type_ |= MMType::AUDIO;
audio_decoded = true;
}
OpenCVVideoDecoder video_decoder;
bool video_decoded = video_decoder.decode(
input.raw_data_, input.decode_video_, input.video_meta_);
return audio_decoded || video_decoded;
} |
||
|
|
||
| bool AudioHandler::load(const MMContent& content, | ||
| MMInputItem& input, | ||
| MMPayload& payload) { | ||
| input.clear(); | ||
|
|
||
| const auto& audio_url = content.audio_url; | ||
| const auto& url = audio_url.url; | ||
|
|
||
| if (url.compare(0, dataurl_prefix_.size(), dataurl_prefix_) == | ||
| 0) { // data url | ||
|
|
||
| input.type_ = MMType::AUDIO; | ||
| return this->load_from_dataurl(url, input.raw_data_, payload); | ||
| } else if (url.compare(0, httpurl_prefix_.size(), httpurl_prefix_) == | ||
| 0) { // http url | ||
|
|
||
| input.type_ = MMType::AUDIO; | ||
| return this->load_from_http(url, input.raw_data_); | ||
| } else { | ||
| LOG(ERROR) << " audio url is invalid, url is " << url; | ||
| return false; | ||
| } | ||
| } | ||
|
Comment on lines
+149
to
+171
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The Remediation: |
||
|
|
||
| bool AudioHandler::decode(MMInputItem& input) { | ||
| FFmpegAudioDecoder decoder; | ||
| return decoder.decode( | ||
| input.raw_data_, input.decode_audio_, input.audio_meta_); | ||
| } | ||
|
|
||
| MMHandlerSet::MMHandlerSet() { | ||
| handlers_["image_url"] = std::make_unique<ImageHandler>(); | ||
| handlers_["video_url"] = std::make_unique<VideoHandler>(); | ||
| // handlers_["audio_url"] = std::make_unique<AudioHandler>(); | ||
| handlers_["audio_url"] = std::make_unique<AudioHandler>(); | ||
| handlers_["image_embedding"] = | ||
| std::make_unique<MMEmbeddingHandler>(MMType::IMAGE); | ||
| handlers_["video_embedding"] = | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -162,23 +162,31 @@ bool Qwen2VLImageProcessor::process(const MMInput& inputs, MMData& datas) { | |
| std::vector<EmbeddingOutput> images_embedding; | ||
| std::vector<torch::Tensor> videos; | ||
| std::vector<VideoMetadata> video_meta_list; | ||
| std::vector<torch::Tensor> audios; | ||
| std::vector<AudioMetadata> audio_meta_list; | ||
|
|
||
| if (input_item.type_ == MMType::IMAGE) { | ||
| if (input_item.decode_data_.defined()) { | ||
| images.push_back(input_item.decode_data_); | ||
| if (input_item.has_type(MMType::IMAGE)) { | ||
| if (input_item.decode_image_.defined()) { | ||
| images.push_back(input_item.decode_image_); | ||
| } else if (input_item.embedding_.embedding.defined()) { | ||
| images_embedding.push_back(input_item.embedding_); | ||
| } | ||
| } else if (input_item.type_ == MMType::VIDEO) { | ||
| if (input_item.decode_data_.defined()) { | ||
| videos.push_back(input_item.decode_data_); | ||
| } else if (input_item.has_type(MMType::VIDEO)) { | ||
| if (input_item.decode_video_.defined()) { | ||
| videos.push_back(input_item.decode_video_); | ||
| } | ||
| video_meta_list.push_back(input_item.video_meta_); | ||
| } else if (input_item.has_type(MMType::AUDIO)) { | ||
| if (input_item.decode_audio_.defined()) { | ||
| audios.push_back(input_item.decode_audio_); | ||
| } | ||
| audio_meta_list.push_back(input_item.audio_meta_); | ||
| } | ||
|
Comment on lines
+168
to
184
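There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
The `if` / `else if` chain collects at most one modality per input item, so an item that carries both VIDEO and AUDIO never has its audio collected. Suggested change:
if (input_item.has_type(MMType::IMAGE)) {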
if (input_item.decode_image_.defined()) {
images.push_back(input_item.decode_image_);
} else if (input_item.embedding_.embedding.defined()) {
images_embedding.push_back(input_item.embedding_);
}
}
if (input_item.has_type(MMType::VIDEO)) {
if (input_item.decode_video_.defined()) {
videos.push_back(input_item.decode_video_);
}
video_meta_list.push_back(input_item.video_meta_);
}
if (input_item.has_type(MMType::AUDIO)) {
if (input_item.decode_audio_.defined()) {
audios.push_back(input_item.decode_audio_);
}
audio_meta_list.push_back(input_item.audio_meta_);
} |
||
|
|
||
| if (images_embedding.empty() && images.empty() && | ||
| (videos.empty() || video_meta_list.empty())) { | ||
| LOG(ERROR) << "no image/video tensor or embedding found."; | ||
| (videos.empty() || video_meta_list.empty()) && | ||
| (audios.empty() || audio_meta_list.empty())) { | ||
| LOG(ERROR) << "no image/video/audio tensor or embedding found."; | ||
| return false; | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you should use AI to refactor all classes and functions in this file and add comments.