AOMediaCodec
diff --git a/iamf/cli/proto_conversion/proto_to_obu/audio_frame_generator.cc b/iamf/cli/proto_conversion/proto_to_obu/audio_frame_generator.cc
Lines changed: 43 additions & 33 deletions
diff --git a/iamf/cli/proto_conversion/proto_to_obu/tests/audio_frame_generator_test.cc b/iamf/cli/proto_conversion/proto_to_obu/tests/audio_frame_generator_test.cc
Lines changed: 82 additions & 0 deletions
@@ -184,9 +184,8 @@ absl::Status GetNumSamplesToPadAtEndAndValidate(
 }
 
 absl::Status InitializeSubstreamData(
+    uint32_t required_samples_to_delay,
     const SubstreamIdLabelsMap& substream_id_to_labels,
-    const absl::flat_hash_map<uint32_t, std::unique_ptr<EncoderBase>>&
-        substream_id_to_encoder,
     const size_t num_samples_per_frame,
     bool user_samples_to_trim_at_start_includes_codec_delay,
     const uint32_t user_samples_to_trim_at_start,
@@ -198,17 +197,9 @@ absl::Status InitializeSubstreamData(
   // samples will occur later to keep trimming logic in one place as much as
   // possible.
   for (const auto& [substream_id, labels] : substream_id_to_labels) {
-    const auto encoder_iter = substream_id_to_encoder.find(substream_id);
-    if (encoder_iter == substream_id_to_encoder.end()) {
-      return absl::InvalidArgumentError(absl::StrCat(
-          "Failed to find encoder for substream ID= ", substream_id));
-    }
-
-    uint32_t encoder_required_samples_to_delay =
-        encoder_iter->second->GetNumberOfSamplesToDelayAtStart();
     if (user_samples_to_trim_at_start_includes_codec_delay) {
       MAYBE_RETURN_IF_NOT_OK(ValidateUserStartTrimIncludesCodecDelay(
-          user_samples_to_trim_at_start, encoder_required_samples_to_delay));
+          user_samples_to_trim_at_start, required_samples_to_delay));
     }
 
     // Initialize a `SubstreamData` with virtual samples for any delay
@@ -217,17 +208,17 @@ absl::Status InitializeSubstreamData(
     const auto& [substream_data_iter, inserted] =
         substream_id_to_substream_data.emplace(
             substream_id,
-            SubstreamData{.substream_id = substream_id,
-                          .frames_in_obu = SubstreamFrames<InternalSampleType>(
-                              num_channels, num_samples_per_frame),
-                          .frames_to_encode = SubstreamFrames<int32_t>(
-                              num_channels, num_samples_per_frame),
-                          .output_gains_linear = {},
-                          .num_samples_to_trim_at_end = 0,
-                          .num_samples_to_trim_at_start =
-                              encoder_required_samples_to_delay});
+            SubstreamData{
+                .substream_id = substream_id,
+                .frames_in_obu = SubstreamFrames<InternalSampleType>(
+                    num_channels, num_samples_per_frame),
+                .frames_to_encode = SubstreamFrames<int32_t>(
+                    num_channels, num_samples_per_frame),
+                .output_gains_linear = {},
+                .num_samples_to_trim_at_end = 0,
+                .num_samples_to_trim_at_start = required_samples_to_delay});
     substream_data_iter->second.frames_in_obu.PadZeros(
-        encoder_required_samples_to_delay);
+        required_samples_to_delay);
   }
 
   return absl::OkStatus();
@@ -571,7 +562,7 @@ absl::Status ApplyUserTrimForFrame(const bool from_start,
   if (num_samples_trimmed_in_obu > frame_samples_to_trim) {
     return absl::InvalidArgumentError(
         absl::StrCat("More samples were trimmed from the ", start_or_end_string,
-                     "than expected: (", num_samples_trimmed_in_obu, " vs ",
+                     " than expected: (", num_samples_trimmed_in_obu, " vs ",
                      frame_samples_to_trim, ")"));
   }
 
@@ -654,6 +645,34 @@ AudioFrameGenerator::Create(
         codec_config_obu_metadata.codec_config();
   }
 
+  // Initialize all of the encoders.
+  absl::flat_hash_map<uint32_t, std::unique_ptr<EncoderBase>>
+      substream_id_to_encoder;
+  for (const auto& audio_frame_metadata : audio_frame_metadatas) {
+    const DecodedUleb128 audio_element_id =
+        audio_frame_metadata.audio_element_id();
+    const auto audio_elements_iter = audio_elements.find(audio_element_id);
+    if (audio_elements_iter == audio_elements.end()) {
+      return absl::InvalidArgumentError(absl::StrCat(
+          "Audio Element with ID= ", audio_element_id, " not found"));
+    }
+
+    // Create an encoder for each substream.
+    RETURN_IF_NOT_OK(GetEncodingDataAndInitializeEncoders(
+        codec_config_metadata, audio_elements_iter->second,
+        substream_id_to_encoder));
+  }
+
+  // Get the global maximum delay among all encoders. IAMF requires that all
+  // substreams have the same number of samples trimmed at the start. When
+  // mixing multiple codec config OBUs, codecs that do not traditionally have
+  // delay may need delay added for alignment.
+  uint32_t max_codec_delay = 0;
+  for (const auto& [substream_id, encoder] : substream_id_to_encoder) {
+    max_codec_delay =
+        std::max(max_codec_delay, encoder->GetNumberOfSamplesToDelayAtStart());
+  }
+
   const auto& first_audio_frame_metadata = *audio_frame_metadatas.begin();
   const int64_t common_samples_to_trim_at_start = static_cast<int64_t>(
       first_audio_frame_metadata.samples_to_trim_at_start());
@@ -664,11 +683,8 @@ AudioFrameGenerator::Create(
   const bool common_samples_to_trim_at_start_includes_codec_delay =
       first_audio_frame_metadata
           .samples_to_trim_at_start_includes_codec_delay();
-
   absl::flat_hash_map<DecodedUleb128, absl::flat_hash_set<ChannelLabel::Label>>
       audio_element_id_to_labels;
-  absl::flat_hash_map<uint32_t, std::unique_ptr<EncoderBase>>
-      substream_id_to_encoder;
   absl::flat_hash_map<uint32_t, SubstreamData> substream_id_to_substream_data;
   absl::flat_hash_map<uint32_t, TrimmingState> substream_id_to_trimming_state;
   for (const auto& audio_frame_metadata : audio_frame_metadatas) {
@@ -687,7 +703,6 @@ AudioFrameGenerator::Create(
           "Audio Element with ID= ", audio_element_id, " not found"));
     }
 
-    // Create an encoder for each substream.
     const AudioElementWithData& audio_element_with_data =
         audio_elements_iter->second;
     const auto num_samples_per_frame =
@@ -696,13 +711,9 @@ AudioFrameGenerator::Create(
       return absl::InvalidArgumentError(
           "The spec disallows trimming multiple frames from the end.");
     }
-    RETURN_IF_NOT_OK(GetEncodingDataAndInitializeEncoders(
-        codec_config_metadata, audio_element_with_data,
-        substream_id_to_encoder));
-
     // Intermediate data for all substreams belonging to an Audio Element.
     RETURN_IF_NOT_OK(InitializeSubstreamData(
-        audio_element_with_data.substream_id_to_labels, substream_id_to_encoder,
+        max_codec_delay, audio_element_with_data.substream_id_to_labels,
         num_samples_per_frame,
         audio_frame_metadata.samples_to_trim_at_start_includes_codec_delay(),
         audio_frame_metadata.samples_to_trim_at_start(),
@@ -736,8 +747,7 @@ AudioFrameGenerator::Create(
       const int64_t additional_samples_to_trim_at_start =
           common_samples_to_trim_at_start_includes_codec_delay
               ? 0
-              : substream_id_to_encoder[substream_id]
-                    ->GetNumberOfSamplesToDelayAtStart();
+              : max_codec_delay;
       substream_id_to_trimming_state[substream_id] = {
           .increment_samples_to_trim_at_end_by_padding =
               !audio_frame_metadata.samples_to_trim_at_end_includes_padding(),
 
@@ -55,10 +55,14 @@ namespace {
 
 using ::absl_testing::IsOk;
 using ::absl_testing::IsOkAndHolds;
+using ::testing::AllOf;
+using ::testing::Each;
 using ::testing::ElementsAre;
 using ::testing::Not;
 using ::testing::NotNull;
 
+using absl::MakeConstSpan;
+
 constexpr DecodedUleb128 kCodecConfigId = 99;
 constexpr uint32_t kSampleRate = 48000;
 
@@ -93,6 +97,11 @@ MATCHER_P(NumSamplesToTrimAtStartIs, expected_samples_to_trim_at_start, "") {
          expected_samples_to_trim_at_start;
 }
 
+MATCHER_P(NumSamplesToTrimAtEndIs, expected_samples_to_trim_at_end, "") {
+  const auto& header = arg.obu.header_;  // Header of the frame under test.
+  return header.num_samples_to_trim_at_end == expected_samples_to_trim_at_end;
+}
+
 constexpr std::array<InternalSampleType, 0> kEmptyFrame = {};
 
 // TODO(b/301490667): Add more tests. Include tests with multiple substreams.
@@ -710,6 +719,79 @@ TEST(AudioFrameGenerator, AllAudioElementsHaveMatchingTrimmingInformation) {
   }
 }
 
+TEST(AudioFrameGenerator, AllAudioElementsHaveSameCodecDelay) {
+  // Two audio elements, LPCM (no codec delay) and AAC (2048-sample delay),
+  // must report identical trimming. LPCM uses the AAC frame size to match.
+  iamf_tools_cli_proto::UserMetadata user_metadata;
+  ConfigureOneStereoSubstreamLittleEndian(user_metadata);
+  user_metadata.mutable_codec_config_metadata(0)
+      ->mutable_codec_config()
+      ->set_num_samples_per_frame(kAacNumSamplesPerFrame);
+  user_metadata.mutable_audio_frame_metadata(0)
+      ->set_samples_to_trim_at_start_includes_codec_delay(false);
+  user_metadata.mutable_audio_frame_metadata(0)
+      ->set_samples_to_trim_at_end_includes_padding(false);
+  // Add a second codec config (AAC) and a stereo audio element that uses it.
+  const DecodedUleb128 kSecondCodecConfigId = 100;
+  auto* aac_codec_config_metadata = user_metadata.add_codec_config_metadata();
+  ConfigureAacCodecConfigMetadata(*aac_codec_config_metadata);
+  aac_codec_config_metadata->set_codec_config_id(kSecondCodecConfigId);
+  AddStereoAudioElementAndAudioFrameMetadata(
+      user_metadata, kSecondAudioElementId, kSecondSubstreamId);
+  user_metadata.mutable_audio_element_metadata(1)->set_codec_config_id(
+      kSecondCodecConfigId);
+  user_metadata.mutable_audio_frame_metadata(1)
+      ->set_samples_to_trim_at_start_includes_codec_delay(false);
+  user_metadata.mutable_audio_frame_metadata(1)
+      ->set_samples_to_trim_at_end_includes_padding(false);
+  const absl::flat_hash_map<uint32_t, ParamDefinitionVariant> param_definitions;
+  absl::flat_hash_map<uint32_t, CodecConfigObu> codec_config_obus;
+  absl::flat_hash_map<uint32_t, AudioElementWithData> audio_elements;
+  std::unique_ptr<GlobalTimingModule> global_timing_module;
+  std::unique_ptr<ParametersManager> parameters_manager;
+  std::unique_ptr<AudioFrameGenerator> audio_frame_generator;
+  InitializeAudioFrameGenerator(
+      user_metadata, param_definitions, codec_config_obus, audio_elements,
+      global_timing_module, parameters_manager, audio_frame_generator);
+
+  // Add the same eight samples to L2 and R2 of both audio elements.
+  const auto kEightSamples = MakeConstSpan(kFrame0R2EightSamples);
+  EXPECT_THAT(audio_frame_generator->AddSamples(
+                  kFirstAudioElementId, ChannelLabel::kL2, kEightSamples),
+              IsOk());
+  EXPECT_THAT(audio_frame_generator->AddSamples(
+                  kFirstAudioElementId, ChannelLabel::kR2, kEightSamples),
+              IsOk());
+  EXPECT_THAT(audio_frame_generator->AddSamples(
+                  kSecondAudioElementId, ChannelLabel::kL2, kEightSamples),
+              IsOk());
+  EXPECT_THAT(audio_frame_generator->AddSamples(
+                  kSecondAudioElementId, ChannelLabel::kR2, kEightSamples),
+              IsOk());
+  EXPECT_THAT(audio_frame_generator->Finalize(), IsOk());
+  EXPECT_FALSE(audio_frame_generator->TakingSamples());
+
+  // AAC's 2048-sample delay fills two 1024-sample frames, which are fully
+  // trimmed for every substream (LPCM included). The third frame holds the
+  // eight real samples; its other 1016 samples are trimmed from the end.
+  std::list<AudioFrameWithData> first_temporal_unit;
+  EXPECT_THAT(audio_frame_generator->OutputFrames(first_temporal_unit), IsOk());
+  EXPECT_THAT(first_temporal_unit, Each(AllOf(NumSamplesToTrimAtStartIs(1024),
+                                              NumSamplesToTrimAtEndIs(0))));
+
+  std::list<AudioFrameWithData> second_temporal_unit;
+  EXPECT_THAT(audio_frame_generator->OutputFrames(second_temporal_unit),
+              IsOk());
+  EXPECT_THAT(second_temporal_unit, Each(AllOf(NumSamplesToTrimAtStartIs(1024),
+                                               NumSamplesToTrimAtEndIs(0))));
+
+  std::list<AudioFrameWithData> third_temporal_unit;
+  EXPECT_THAT(audio_frame_generator->OutputFrames(third_temporal_unit), IsOk());
+  EXPECT_THAT(third_temporal_unit, Each(AllOf(NumSamplesToTrimAtStartIs(0),
+                                              NumSamplesToTrimAtEndIs(1016))));
+  EXPECT_FALSE(audio_frame_generator->GeneratingFrames());
+}
+
 TEST(AudioFrameGenerator,
      ErrorAudioElementsMustHaveSameTrimmingInformationAtEnd) {
   iamf_tools_cli_proto::UserMetadata user_metadata = {};