Skip to content

Commit 3da395e

Browse files
committed
Improve usability when mixing codecs with differing codec delay.
- Background:
  - Most users should use `samples_to_trim_at_start_includes_codec_delay == false` as documented in audio_frame.proto.
  - Previously, the implementation inserted codec delay on a per-codec basis.
- Tweak logic to insert the worst-case delay among all codecs.
- Rationale:
  - Useful (e.g.) when mixing LPCM and AAC/Opus.
  - Under the old behavior, the user would have to compute the difference in codec delay, and insert artificial delay themselves.
  - This new behavior is seamless and easy to use correctly. Just continue using the recommended settings. The post-trim domain will automatically be aligned.
- b/462405451: Demonstrate this via new test vectors which exercise LPCM x {Opus, AAC}.

PiperOrigin-RevId: 836221159
1 parent 7653efa commit 3da395e

File tree

5 files changed

+584
-33
lines changed

5 files changed

+584
-33
lines changed

iamf/cli/proto_conversion/proto_to_obu/audio_frame_generator.cc

Lines changed: 43 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -184,9 +184,8 @@ absl::Status GetNumSamplesToPadAtEndAndValidate(
184184
}
185185

186186
absl::Status InitializeSubstreamData(
187+
uint32_t required_samples_to_delay,
187188
const SubstreamIdLabelsMap& substream_id_to_labels,
188-
const absl::flat_hash_map<uint32_t, std::unique_ptr<EncoderBase>>&
189-
substream_id_to_encoder,
190189
const size_t num_samples_per_frame,
191190
bool user_samples_to_trim_at_start_includes_codec_delay,
192191
const uint32_t user_samples_to_trim_at_start,
@@ -198,17 +197,9 @@ absl::Status InitializeSubstreamData(
198197
// samples will occur later to keep trimming logic in one place as much as
199198
// possible.
200199
for (const auto& [substream_id, labels] : substream_id_to_labels) {
201-
const auto encoder_iter = substream_id_to_encoder.find(substream_id);
202-
if (encoder_iter == substream_id_to_encoder.end()) {
203-
return absl::InvalidArgumentError(absl::StrCat(
204-
"Failed to find encoder for substream ID= ", substream_id));
205-
}
206-
207-
uint32_t encoder_required_samples_to_delay =
208-
encoder_iter->second->GetNumberOfSamplesToDelayAtStart();
209200
if (user_samples_to_trim_at_start_includes_codec_delay) {
210201
MAYBE_RETURN_IF_NOT_OK(ValidateUserStartTrimIncludesCodecDelay(
211-
user_samples_to_trim_at_start, encoder_required_samples_to_delay));
202+
user_samples_to_trim_at_start, required_samples_to_delay));
212203
}
213204

214205
// Initialize a `SubstreamData` with virtual samples for any delay
@@ -217,17 +208,17 @@ absl::Status InitializeSubstreamData(
217208
const auto& [substream_data_iter, inserted] =
218209
substream_id_to_substream_data.emplace(
219210
substream_id,
220-
SubstreamData{.substream_id = substream_id,
221-
.frames_in_obu = SubstreamFrames<InternalSampleType>(
222-
num_channels, num_samples_per_frame),
223-
.frames_to_encode = SubstreamFrames<int32_t>(
224-
num_channels, num_samples_per_frame),
225-
.output_gains_linear = {},
226-
.num_samples_to_trim_at_end = 0,
227-
.num_samples_to_trim_at_start =
228-
encoder_required_samples_to_delay});
211+
SubstreamData{
212+
.substream_id = substream_id,
213+
.frames_in_obu = SubstreamFrames<InternalSampleType>(
214+
num_channels, num_samples_per_frame),
215+
.frames_to_encode = SubstreamFrames<int32_t>(
216+
num_channels, num_samples_per_frame),
217+
.output_gains_linear = {},
218+
.num_samples_to_trim_at_end = 0,
219+
.num_samples_to_trim_at_start = required_samples_to_delay});
229220
substream_data_iter->second.frames_in_obu.PadZeros(
230-
encoder_required_samples_to_delay);
221+
required_samples_to_delay);
231222
}
232223

233224
return absl::OkStatus();
@@ -571,7 +562,7 @@ absl::Status ApplyUserTrimForFrame(const bool from_start,
571562
if (num_samples_trimmed_in_obu > frame_samples_to_trim) {
572563
return absl::InvalidArgumentError(
573564
absl::StrCat("More samples were trimmed from the ", start_or_end_string,
574-
"than expected: (", num_samples_trimmed_in_obu, " vs ",
565+
" than expected: (", num_samples_trimmed_in_obu, " vs ",
575566
frame_samples_to_trim, ")"));
576567
}
577568

@@ -654,6 +645,34 @@ AudioFrameGenerator::Create(
654645
codec_config_obu_metadata.codec_config();
655646
}
656647

648+
// Initialize all of the encoders.
649+
absl::flat_hash_map<uint32_t, std::unique_ptr<EncoderBase>>
650+
substream_id_to_encoder;
651+
for (const auto& audio_frame_metadata : audio_frame_metadatas) {
652+
const DecodedUleb128 audio_element_id =
653+
audio_frame_metadata.audio_element_id();
654+
const auto audio_elements_iter = audio_elements.find(audio_element_id);
655+
if (audio_elements_iter == audio_elements.end()) {
656+
return absl::InvalidArgumentError(absl::StrCat(
657+
"Audio Element with ID= ", audio_element_id, " not found"));
658+
}
659+
660+
// Create an encoder for each substream.
661+
RETURN_IF_NOT_OK(GetEncodingDataAndInitializeEncoders(
662+
codec_config_metadata, audio_elements_iter->second,
663+
substream_id_to_encoder));
664+
}
665+
666+
// Get the global maximum delay among all encoders. IAMF requires that all
667+
// substreams have the same number of samples trimmed at the start. When
668+
// mixing multiple codec config OBUs, codecs that do not traditionally have
669+
// delay may need delay added for alignment.
670+
uint32_t max_codec_delay = 0;
671+
for (const auto& [substream_id, encoder] : substream_id_to_encoder) {
672+
max_codec_delay =
673+
std::max(max_codec_delay, encoder->GetNumberOfSamplesToDelayAtStart());
674+
}
675+
657676
const auto& first_audio_frame_metadata = *audio_frame_metadatas.begin();
658677
const int64_t common_samples_to_trim_at_start = static_cast<int64_t>(
659678
first_audio_frame_metadata.samples_to_trim_at_start());
@@ -664,11 +683,8 @@ AudioFrameGenerator::Create(
664683
const bool common_samples_to_trim_at_start_includes_codec_delay =
665684
first_audio_frame_metadata
666685
.samples_to_trim_at_start_includes_codec_delay();
667-
668686
absl::flat_hash_map<DecodedUleb128, absl::flat_hash_set<ChannelLabel::Label>>
669687
audio_element_id_to_labels;
670-
absl::flat_hash_map<uint32_t, std::unique_ptr<EncoderBase>>
671-
substream_id_to_encoder;
672688
absl::flat_hash_map<uint32_t, SubstreamData> substream_id_to_substream_data;
673689
absl::flat_hash_map<uint32_t, TrimmingState> substream_id_to_trimming_state;
674690
for (const auto& audio_frame_metadata : audio_frame_metadatas) {
@@ -687,7 +703,6 @@ AudioFrameGenerator::Create(
687703
"Audio Element with ID= ", audio_element_id, " not found"));
688704
}
689705

690-
// Create an encoder for each substream.
691706
const AudioElementWithData& audio_element_with_data =
692707
audio_elements_iter->second;
693708
const auto num_samples_per_frame =
@@ -696,13 +711,9 @@ AudioFrameGenerator::Create(
696711
return absl::InvalidArgumentError(
697712
"The spec disallows trimming multiple frames from the end.");
698713
}
699-
RETURN_IF_NOT_OK(GetEncodingDataAndInitializeEncoders(
700-
codec_config_metadata, audio_element_with_data,
701-
substream_id_to_encoder));
702-
703714
// Intermediate data for all substreams belonging to an Audio Element.
704715
RETURN_IF_NOT_OK(InitializeSubstreamData(
705-
audio_element_with_data.substream_id_to_labels, substream_id_to_encoder,
716+
max_codec_delay, audio_element_with_data.substream_id_to_labels,
706717
num_samples_per_frame,
707718
audio_frame_metadata.samples_to_trim_at_start_includes_codec_delay(),
708719
audio_frame_metadata.samples_to_trim_at_start(),
@@ -736,8 +747,7 @@ AudioFrameGenerator::Create(
736747
const int64_t additional_samples_to_trim_at_start =
737748
common_samples_to_trim_at_start_includes_codec_delay
738749
? 0
739-
: substream_id_to_encoder[substream_id]
740-
->GetNumberOfSamplesToDelayAtStart();
750+
: max_codec_delay;
741751
substream_id_to_trimming_state[substream_id] = {
742752
.increment_samples_to_trim_at_end_by_padding =
743753
!audio_frame_metadata.samples_to_trim_at_end_includes_padding(),

iamf/cli/proto_conversion/proto_to_obu/tests/audio_frame_generator_test.cc

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,10 +55,14 @@ namespace {
5555

5656
using ::absl_testing::IsOk;
5757
using ::absl_testing::IsOkAndHolds;
58+
using ::testing::AllOf;
59+
using ::testing::Each;
5860
using ::testing::ElementsAre;
5961
using ::testing::Not;
6062
using ::testing::NotNull;
6163

64+
using absl::MakeConstSpan;
65+
6266
constexpr DecodedUleb128 kCodecConfigId = 99;
6367
constexpr uint32_t kSampleRate = 48000;
6468

@@ -93,6 +97,11 @@ MATCHER_P(NumSamplesToTrimAtStartIs, expected_samples_to_trim_at_start, "") {
9397
expected_samples_to_trim_at_start;
9498
}
9599

100+
MATCHER_P(NumSamplesToTrimAtEndIs, expected_samples_to_trim_at_end, "") {
101+
return arg.obu.header_.num_samples_to_trim_at_end ==
102+
expected_samples_to_trim_at_end;
103+
}
104+
96105
constexpr std::array<InternalSampleType, 0> kEmptyFrame = {};
97106

98107
// TODO(b/301490667): Add more tests. Include tests with multiple substreams.
@@ -710,6 +719,79 @@ TEST(AudioFrameGenerator, AllAudioElementsHaveMatchingTrimmingInformation) {
710719
}
711720
}
712721

722+
TEST(AudioFrameGenerator, AllAudioElementsHaveSameCodecDelay) {
723+
// Configure two audio elements: one with LPCM (0 delay) and one with AAC
724+
// (2048 delay). Typically, the frame size and sample rate still must agree.
725+
iamf_tools_cli_proto::UserMetadata user_metadata;
726+
ConfigureOneStereoSubstreamLittleEndian(user_metadata);
727+
user_metadata.mutable_codec_config_metadata(0)
728+
->mutable_codec_config()
729+
->set_num_samples_per_frame(kAacNumSamplesPerFrame);
730+
user_metadata.mutable_audio_frame_metadata(0)
731+
->set_samples_to_trim_at_start_includes_codec_delay(false);
732+
user_metadata.mutable_audio_frame_metadata(0)
733+
->set_samples_to_trim_at_end_includes_padding(false);
734+
// Add AAC codec config and associated audio element.
735+
const DecodedUleb128 kSecondCodecConfigId = 100;
736+
auto* aac_codec_config_metadata = user_metadata.add_codec_config_metadata();
737+
ConfigureAacCodecConfigMetadata(*aac_codec_config_metadata);
738+
aac_codec_config_metadata->set_codec_config_id(kSecondCodecConfigId);
739+
AddStereoAudioElementAndAudioFrameMetadata(
740+
user_metadata, kSecondAudioElementId, kSecondSubstreamId);
741+
user_metadata.mutable_audio_element_metadata(1)->set_codec_config_id(
742+
kSecondCodecConfigId);
743+
user_metadata.mutable_audio_frame_metadata(1)
744+
->set_samples_to_trim_at_start_includes_codec_delay(false);
745+
user_metadata.mutable_audio_frame_metadata(1)
746+
->set_samples_to_trim_at_end_includes_padding(false);
747+
const absl::flat_hash_map<uint32_t, ParamDefinitionVariant> param_definitions;
748+
absl::flat_hash_map<uint32_t, CodecConfigObu> codec_config_obus;
749+
absl::flat_hash_map<uint32_t, AudioElementWithData> audio_elements;
750+
std::unique_ptr<GlobalTimingModule> global_timing_module;
751+
std::unique_ptr<ParametersManager> parameters_manager;
752+
std::unique_ptr<AudioFrameGenerator> audio_frame_generator;
753+
InitializeAudioFrameGenerator(
754+
user_metadata, param_definitions, codec_config_obus, audio_elements,
755+
global_timing_module, parameters_manager, audio_frame_generator);
756+
757+
// Encode the same eight samples for each channel.
758+
const auto kEightSamples = MakeConstSpan(kFrame0R2EightSamples);
759+
EXPECT_THAT(audio_frame_generator->AddSamples(
760+
kFirstAudioElementId, ChannelLabel::kL2, kEightSamples),
761+
IsOk());
762+
EXPECT_THAT(audio_frame_generator->AddSamples(
763+
kFirstAudioElementId, ChannelLabel::kR2, kEightSamples),
764+
IsOk());
765+
EXPECT_THAT(audio_frame_generator->AddSamples(
766+
kSecondAudioElementId, ChannelLabel::kL2, kEightSamples),
767+
IsOk());
768+
EXPECT_THAT(audio_frame_generator->AddSamples(
769+
kSecondAudioElementId, ChannelLabel::kR2, kEightSamples),
770+
IsOk());
771+
EXPECT_THAT(audio_frame_generator->Finalize(), IsOk());
772+
EXPECT_FALSE(audio_frame_generator->TakingSamples());
773+
774+
// AAC has a delay of 2048 samples, with 1024 samples per frame. Only eight
775+
// real samples were encoded. The first two frames are fully trimmed, the
776+
// third frame is partially trimmed from the end.
777+
std::list<AudioFrameWithData> first_temporal_unit;
778+
EXPECT_THAT(audio_frame_generator->OutputFrames(first_temporal_unit), IsOk());
779+
EXPECT_THAT(first_temporal_unit, Each(AllOf(NumSamplesToTrimAtStartIs(1024),
780+
NumSamplesToTrimAtEndIs(0))));
781+
782+
std::list<AudioFrameWithData> second_temporal_unit;
783+
EXPECT_THAT(audio_frame_generator->OutputFrames(second_temporal_unit),
784+
IsOk());
785+
EXPECT_THAT(second_temporal_unit, Each(AllOf(NumSamplesToTrimAtStartIs(1024),
786+
NumSamplesToTrimAtEndIs(0))));
787+
788+
std::list<AudioFrameWithData> third_temporal_unit;
789+
EXPECT_THAT(audio_frame_generator->OutputFrames(third_temporal_unit), IsOk());
790+
EXPECT_THAT(third_temporal_unit, Each(AllOf(NumSamplesToTrimAtStartIs(0),
791+
NumSamplesToTrimAtEndIs(1016))));
792+
EXPECT_FALSE(audio_frame_generator->GeneratingFrames());
793+
}
794+
713795
TEST(AudioFrameGenerator,
714796
ErrorAudioElementsMustHaveSameTrimmingInformationAtEnd) {
715797
iamf_tools_cli_proto::UserMetadata user_metadata = {};

0 commit comments

Comments
 (0)