@@ -241,22 +241,36 @@ FLACDecoderResult FLACDecoder::decode_frame(const uint8_t *buffer, size_t buffer
241241 }
242242 }
243243
244- uint32_t bytes_per_sample = (this ->curr_frame_sample_depth_ + 7 ) / 8 ;
245- uint32_t shift_amount = 0 ;
246- if (this ->curr_frame_sample_depth_ % 8 != 0 ) {
247- shift_amount = 8 - (this ->curr_frame_sample_depth_ % 8 );
248- }
249-
250244 // Write decoded samples to output buffer using optimized fast paths
251- if (this ->curr_frame_sample_depth_ == 16 && shift_amount == 0 && this ->num_channels_ == 2 ) {
252- this ->write_samples_16bit_stereo (output_buffer, this ->curr_frame_block_size_ );
253- } else if (this ->curr_frame_sample_depth_ == 16 && shift_amount == 0 && this ->num_channels_ == 1 ) {
254- this ->write_samples_16bit_mono (output_buffer, this ->curr_frame_block_size_ );
255- } else if (this ->curr_frame_sample_depth_ == 24 && shift_amount == 0 && this ->num_channels_ == 2 ) {
256- this ->write_samples_24bit_stereo (output_buffer, this ->curr_frame_block_size_ );
245+ if (this ->output_32bit_samples_ ) {
246+ // 32-bit output mode: all samples output as 4 bytes, left-justified (MSB-aligned)
247+ uint32_t shift_amount = 32 - this ->curr_frame_sample_depth_ ;
248+
249+ if (this ->num_channels_ == 2 ) {
250+ this ->write_samples_32bit_stereo (output_buffer, this ->curr_frame_block_size_ , shift_amount);
251+ } else if (this ->num_channels_ == 1 ) {
252+ this ->write_samples_32bit_mono (output_buffer, this ->curr_frame_block_size_ , shift_amount);
253+ } else {
254+ this ->write_samples_32bit_general (output_buffer, this ->curr_frame_block_size_ , shift_amount);
255+ }
257256 } else {
258- this ->write_samples_general (output_buffer, this ->curr_frame_block_size_ , bytes_per_sample, shift_amount,
259- this ->curr_frame_sample_depth_ );
257+ // Native output mode: pack to nearest byte boundary
258+ uint32_t bytes_per_sample = (this ->curr_frame_sample_depth_ + 7 ) / 8 ;
259+ uint32_t shift_amount = 0 ;
260+ if (this ->curr_frame_sample_depth_ % 8 != 0 ) {
261+ shift_amount = 8 - (this ->curr_frame_sample_depth_ % 8 );
262+ }
263+
264+ if (this ->curr_frame_sample_depth_ == 16 && shift_amount == 0 && this ->num_channels_ == 2 ) {
265+ this ->write_samples_16bit_stereo (output_buffer, this ->curr_frame_block_size_ );
266+ } else if (this ->curr_frame_sample_depth_ == 16 && shift_amount == 0 && this ->num_channels_ == 1 ) {
267+ this ->write_samples_16bit_mono (output_buffer, this ->curr_frame_block_size_ );
268+ } else if (this ->curr_frame_sample_depth_ == 24 && shift_amount == 0 && this ->num_channels_ == 2 ) {
269+ this ->write_samples_24bit_stereo (output_buffer, this ->curr_frame_block_size_ );
270+ } else {
271+ this ->write_samples_general (output_buffer, this ->curr_frame_block_size_ , bytes_per_sample, shift_amount,
272+ this ->curr_frame_sample_depth_ );
273+ }
260274 }
261275
262276 this ->reset_bit_buffer ();
@@ -265,90 +279,22 @@ FLACDecoderResult FLACDecoder::decode_frame(const uint8_t *buffer, size_t buffer
265279
266280FLAC_OPTIMIZE_O3
267281void FLACDecoder::write_samples_16bit_stereo (uint8_t *output_buffer, uint32_t block_size) {
268- // 16-bit stereo fast path with 4-sample unrolling
269- std::size_t output_index = 0 ;
270- uint32_t i = 0 ;
271- const uint32_t unroll_limit = block_size & ~3U ; // Round down to multiple of 4
282+ // 16-bit stereo fast path
283+ int16_t *output_samples = reinterpret_cast <int16_t *>(output_buffer);
272284
273- // Process 4 samples at a time
274- for (; i < unroll_limit; i += 4 ) {
275- // Sample 0 - Left and Right channels
276- int32_t sample_0_l = this ->block_samples_ [i];
277- int32_t sample_0_r = this ->block_samples_ [block_size + i];
278- // Sample 1 - Left and Right channels
279- int32_t sample_1_l = this ->block_samples_ [i + 1 ];
280- int32_t sample_1_r = this ->block_samples_ [block_size + i + 1 ];
281- // Sample 2 - Left and Right channels
282- int32_t sample_2_l = this ->block_samples_ [i + 2 ];
283- int32_t sample_2_r = this ->block_samples_ [block_size + i + 2 ];
284- // Sample 3 - Left and Right channels
285- int32_t sample_3_l = this ->block_samples_ [i + 3 ];
286- int32_t sample_3_r = this ->block_samples_ [block_size + i + 3 ];
287-
288- // Direct 16-bit writes (little-endian)
289- output_buffer[output_index++] = sample_0_l & 0xFF ;
290- output_buffer[output_index++] = (sample_0_l >> 8 ) & 0xFF ;
291- output_buffer[output_index++] = sample_0_r & 0xFF ;
292- output_buffer[output_index++] = (sample_0_r >> 8 ) & 0xFF ;
293-
294- output_buffer[output_index++] = sample_1_l & 0xFF ;
295- output_buffer[output_index++] = (sample_1_l >> 8 ) & 0xFF ;
296- output_buffer[output_index++] = sample_1_r & 0xFF ;
297- output_buffer[output_index++] = (sample_1_r >> 8 ) & 0xFF ;
298-
299- output_buffer[output_index++] = sample_2_l & 0xFF ;
300- output_buffer[output_index++] = (sample_2_l >> 8 ) & 0xFF ;
301- output_buffer[output_index++] = sample_2_r & 0xFF ;
302- output_buffer[output_index++] = (sample_2_r >> 8 ) & 0xFF ;
303-
304- output_buffer[output_index++] = sample_3_l & 0xFF ;
305- output_buffer[output_index++] = (sample_3_l >> 8 ) & 0xFF ;
306- output_buffer[output_index++] = sample_3_r & 0xFF ;
307- output_buffer[output_index++] = (sample_3_r >> 8 ) & 0xFF ;
308- }
309-
310- // Handle remaining samples
311- for (; i < block_size; i++) {
312- int32_t sample_l = this ->block_samples_ [i];
313- int32_t sample_r = this ->block_samples_ [block_size + i];
314-
315- output_buffer[output_index++] = sample_l & 0xFF ;
316- output_buffer[output_index++] = (sample_l >> 8 ) & 0xFF ;
317- output_buffer[output_index++] = sample_r & 0xFF ;
318- output_buffer[output_index++] = (sample_r >> 8 ) & 0xFF ;
285+ for (uint32_t i = 0 ; i < block_size; ++i) {
286+ output_samples[2 * i] = this ->block_samples_ [i];
287+ output_samples[2 * i + 1 ] = this ->block_samples_ [block_size + i];
319288 }
320289}
321290
322291FLAC_OPTIMIZE_O3
323292void FLACDecoder::write_samples_16bit_mono (uint8_t *output_buffer, uint32_t block_size) {
324- // 16-bit mono fast path with 4-sample unrolling
325- std::size_t output_index = 0 ;
326- uint32_t i = 0 ;
327- const uint32_t unroll_limit = block_size & ~3U ; // Round down to multiple of 4
328-
329- // Process 4 samples at a time
330- for (; i < unroll_limit; i += 4 ) {
331- int32_t sample_0 = this ->block_samples_ [i];
332- int32_t sample_1 = this ->block_samples_ [i + 1 ];
333- int32_t sample_2 = this ->block_samples_ [i + 2 ];
334- int32_t sample_3 = this ->block_samples_ [i + 3 ];
335-
336- // Direct 16-bit writes (little-endian)
337- output_buffer[output_index++] = sample_0 & 0xFF ;
338- output_buffer[output_index++] = (sample_0 >> 8 ) & 0xFF ;
339- output_buffer[output_index++] = sample_1 & 0xFF ;
340- output_buffer[output_index++] = (sample_1 >> 8 ) & 0xFF ;
341- output_buffer[output_index++] = sample_2 & 0xFF ;
342- output_buffer[output_index++] = (sample_2 >> 8 ) & 0xFF ;
343- output_buffer[output_index++] = sample_3 & 0xFF ;
344- output_buffer[output_index++] = (sample_3 >> 8 ) & 0xFF ;
345- }
293+ // 16-bit mono fast path
294+ int16_t *output_samples = reinterpret_cast <int16_t *>(output_buffer);
346295
347- // Handle remaining samples
348- for (; i < block_size; i++) {
349- int32_t sample = this ->block_samples_ [i];
350- output_buffer[output_index++] = sample & 0xFF ;
351- output_buffer[output_index++] = (sample >> 8 ) & 0xFF ;
296+ for (uint32_t i = 0 ; i < block_size; ++i) {
297+ output_samples[i] = this ->block_samples_ [i];
352298 }
353299}
354300
@@ -448,6 +394,40 @@ void FLACDecoder::write_samples_general(uint8_t *output_buffer, uint32_t block_s
448394 }
449395}
450396
397+ FLAC_OPTIMIZE_O3
398+ void FLACDecoder::write_samples_32bit_stereo (uint8_t *output_buffer, uint32_t block_size, uint32_t shift_amount) { // Writes block_size interleaved (ch0, ch1) int32_t frames, each sample shifted left by shift_amount bits
399+ int32_t *output_samples = reinterpret_cast <int32_t *>(output_buffer); // NOTE(review): assumes output_buffer is suitably aligned for int32_t stores - confirm with callers
400+ const int32_t *left = this ->block_samples_ ; // first channel: entries [0, block_size)
401+ const int32_t *right = this ->block_samples_ + block_size; // second channel: entries [block_size, 2 * block_size)
402+
403+ for (uint32_t i = 0 ; i < block_size; ++i) {
404+ output_samples[i * 2 ] = static_cast<int32_t>(static_cast<uint32_t>(left[i]) << shift_amount); // shift in the unsigned domain: << on a negative int32_t is UB before C++20
405+ output_samples[i * 2 + 1 ] = static_cast<int32_t>(static_cast<uint32_t>(right[i]) << shift_amount); // same bit pattern as the signed shift; shift_amount is expected to be < 32
406+ }
407+ }
408+
409+ FLAC_OPTIMIZE_O3
410+ void FLACDecoder::write_samples_32bit_mono (uint8_t *output_buffer, uint32_t block_size, uint32_t shift_amount) { // Writes block_size int32_t samples, each shifted left by shift_amount bits
411+ int32_t *output_samples = reinterpret_cast <int32_t *>(output_buffer); // NOTE(review): assumes output_buffer is suitably aligned for int32_t stores - confirm with callers
412+ const int32_t *samples = this ->block_samples_ ; // single channel: entries [0, block_size)
413+
414+ for (uint32_t i = 0 ; i < block_size; ++i) {
415+ output_samples[i] = static_cast<int32_t>(static_cast<uint32_t>(samples[i]) << shift_amount); // shift in the unsigned domain: << on a negative int32_t is UB before C++20; shift_amount is expected to be < 32
416+ }
417+ }
418+
419+ FLAC_OPTIMIZE_O3
420+ void FLACDecoder::write_samples_32bit_general (uint8_t *output_buffer, uint32_t block_size, uint32_t shift_amount) { // Writes block_size frames of num_channels_ interleaved int32_t samples, each shifted left by shift_amount bits
421+ int32_t *output_samples = reinterpret_cast <int32_t *>(output_buffer); // NOTE(review): assumes output_buffer is suitably aligned for int32_t stores - confirm with callers
422+ uint32_t output_index = 0 ; // running position in the interleaved output
423+
424+ for (uint32_t i = 0 ; i < block_size; ++i) {
425+ for (uint32_t ch = 0 ; ch < this ->num_channels_ ; ++ch) { // channel ch is stored at block_samples_[ch * block_size ...]
426+ output_samples[output_index++] = static_cast<int32_t>(static_cast<uint32_t>(this ->block_samples_ [ch * block_size + i]) << shift_amount); // shift in the unsigned domain: << on a negative int32_t is UB before C++20; shift_amount is expected to be < 32
427+ }
428+ }
429+ }
430+
451431FLACDecoderResult FLACDecoder::find_frame_sync (uint8_t &sync_byte_0, uint8_t &sync_byte_1) {
452432 this ->frame_start_index_ = 0 ;
453433
@@ -627,9 +607,8 @@ FLACDecoderResult FLACDecoder::decode_frame_header() {
627607 frame_sample_rate = this ->sample_rate_ ;
628608 } else {
629609 // Standard sample rate codes (1-11)
630- static const uint32_t sample_rate_table[] = {
631- 88200 , 176400 , 192000 , 8000 , 16000 , 22050 , 24000 , 32000 , 44100 , 48000 , 96000
632- };
610+ static const uint32_t sample_rate_table[] = {88200 , 176400 , 192000 , 8000 , 16000 , 22050 ,
611+ 24000 , 32000 , 44100 , 48000 , 96000 };
633612 if (sample_rate_code >= 1 && sample_rate_code <= 11 ) {
634613 frame_sample_rate = sample_rate_table[sample_rate_code - 1 ];
635614 } else {
0 commit comments