@@ -252,7 +252,6 @@ class llm_task {
252
252
}
253
253
return false ;
254
254
}
255
- SLOGI (" Processing text: %s" , msg_str.c_str ());
256
255
257
256
// Convert text to phonemes and tones
258
257
std::vector<int > phones_bef, tones_bef;
@@ -262,8 +261,6 @@ class llm_task {
262
261
int phone_len = phones.size ();
263
262
std::vector<int > langids (phone_len, 3 );
264
263
265
- SLOGI (" Phoneme conversion completed, length: %d" , phone_len);
266
-
267
264
// Run the encoder to generate hidden representations
268
265
auto encoder_output =
269
266
encoder_->Run (phones, tones, langids, g_matrix, mode_config_.noise_scale , mode_config_.noise_scale_w ,
@@ -273,27 +270,19 @@ class llm_task {
273
270
auto zp_info = encoder_output.at (0 ).GetTensorTypeAndShapeInfo ();
274
271
auto zp_shape = zp_info.GetShape ();
275
272
276
- SLOGI (" Encoder output completed, shape: [%ld, %ld, %ld], expected audio length: %d" , zp_shape[0 ],
277
- zp_shape[1 ], zp_shape[2 ], audio_len);
278
-
279
273
// Calculate decoder parameters
280
274
int zp_size = decoder_->GetInputSize (0 ) / sizeof (float );
281
275
int dec_len = zp_size / zp_shape[1 ];
282
276
int audio_slice_len = decoder_->GetOutputSize (0 ) / sizeof (float );
283
277
284
- const int pad_frames = 16 ;
278
+ const int pad_frames = 24 ;
285
279
const int samples_per_frame = 512 ;
286
280
287
- SLOGI (" Decoder configuration: frame length=%d, audio slice length=%d, pad length=%d, samples per frame=%d" ,
288
- dec_len, audio_slice_len, pad_frames, samples_per_frame);
289
-
290
281
const int effective_frames = dec_len - 2 * pad_frames;
291
282
292
283
int dec_slice_num =
293
284
static_cast <int >(std::ceil (static_cast <double >(zp_shape[2 ]) / static_cast <double >(effective_frames)));
294
285
295
- SLOGI (" Will perform %d inferences, each with effective frames: %d" , dec_slice_num, effective_frames);
296
-
297
286
// SOLA parameters setup
298
287
const int sola_buffer_frame = pad_frames * samples_per_frame; // Overlap buffer length
299
288
const int sola_search_frame = pad_frames * samples_per_frame; // Search window length
@@ -344,10 +333,6 @@ class llm_task {
344
333
output_start_frame = i * effective_frames;
345
334
output_end_frame = (i + 1 ) * effective_frames - 1 ;
346
335
}
347
-
348
- SLOGI (" Inference #%d: input frame range=[%d-%d], actual length=%d, output frame range=[%d-%d]" , i + 1 ,
349
- input_start, input_start + actual_len - 1 , actual_len, output_start_frame, output_end_frame);
350
-
351
336
// Prepare decoder input, initialize all to zero
352
337
std::vector<float > zp (zp_size, 0 );
353
338
@@ -365,8 +350,6 @@ class llm_task {
365
350
decoder_->SetInput (zp.data (), 0 );
366
351
decoder_->SetInput (g_matrix.data (), 1 );
367
352
368
- SLOGI (" Inference #%d: starting decoding..." , i + 1 );
369
-
370
353
if (0 != decoder_->Run ()) {
371
354
SLOGI (" Inference #%d: decoding failed" , i + 1 );
372
355
throw std::string (" decoder_ RunSync error" );
@@ -416,10 +399,6 @@ class llm_task {
416
399
417
400
first_frame = false ;
418
401
419
- SLOGI (
420
- " Inference #%d: First frame processing, added %d samples from position %d to output, saved %d "
421
- " samples to SOLA buffer" ,
422
- i + 1 , audio_len, audio_start, sola_buffer_frame);
423
402
} else {
424
403
// Non-first frame: SOLA alignment required
425
404
int audio_start = pad_frames * samples_per_frame;
@@ -451,9 +430,6 @@ class llm_task {
451
430
}
452
431
}
453
432
454
- SLOGI (" Inference #%d: SOLA found best alignment offset %d with correlation coefficient %f" , i + 1 ,
455
- best_offset, best_correlation);
456
-
457
433
// 3. Apply alignment offset
458
434
int aligned_start = audio_start + best_offset;
459
435
@@ -482,9 +458,6 @@ class llm_task {
482
458
int remaining_len =
483
459
std::min (remaining_needed, static_cast <int >(decoder_output.size () - remaining_start));
484
460
485
- SLOGI (" Inference #%d (final): Expected total=%d, processed=%d, needed=%d, available=%d" , i + 1 ,
486
- total_expected_samples, processed_samples, remaining_needed, remaining_len);
487
-
488
461
if (remaining_len > 0 ) {
489
462
pcmlist.insert (pcmlist.end (), decoder_output.begin () + remaining_start,
490
463
decoder_output.begin () + remaining_start + remaining_len);
@@ -514,50 +487,34 @@ class llm_task {
514
487
}
515
488
std::fill (sola_buffer.begin () + avail, sola_buffer.end (), 0 .0f );
516
489
}
517
-
518
- SLOGI (" Inference #%d: Added %d + %d samples to output, cumulative length: %zu" , i + 1 ,
519
- sola_buffer_frame, remaining_len, pcmlist.size ());
520
490
}
521
491
}
522
492
}
523
493
524
- SLOGI (" All inference completed, raw generated PCM length: %zu" , pcmlist.size ());
525
-
526
494
if (pcmlist.size () > audio_len) {
527
- SLOGI (" Truncating output from %zu to %d samples as per encoder prediction" , pcmlist.size (), audio_len);
528
495
pcmlist.resize (audio_len);
529
496
}
530
497
531
- SLOGI (" Final PCM length after truncation: %zu" , pcmlist.size ());
532
-
533
498
// Post-processing: resample and convert to int16
534
499
double src_ratio =
535
500
static_cast <double >(mode_config_.audio_rate ) / static_cast <double >(mode_config_.mode_rate );
536
501
std::vector<float > tmp_pcm ((pcmlist.size () * src_ratio + 1 ));
537
502
int len;
538
503
539
- SLOGI (" Starting audio resampling, source rate: %f, target rate: %f, ratio: %f" ,
540
- static_cast <float >(mode_config_.mode_rate ), static_cast <float >(mode_config_.audio_rate ), src_ratio);
541
-
542
504
resample_audio (pcmlist.data (), pcmlist.size (), tmp_pcm.data (), &len, src_ratio);
543
505
544
- SLOGI (" Resampling completed, length after resampling: %d" , len);
545
-
546
506
// Convert to 16-bit PCM
547
507
wav_pcm_data.reserve (len);
548
508
std::transform (tmp_pcm.begin (), tmp_pcm.begin () + len, std::back_inserter (wav_pcm_data),
549
509
[](const auto val) { return static_cast <int16_t >(val * INT16_MAX); });
550
510
551
- SLOGI (" Final audio length: %zu samples" , wav_pcm_data.size ());
552
-
553
511
// Call the output callback function with the result
554
512
if (out_callback_) {
555
513
out_callback_ (
556
514
std::string (reinterpret_cast <char *>(wav_pcm_data.data ()), wav_pcm_data.size () * sizeof (int16_t )),
557
515
finish);
558
516
}
559
517
560
- SLOGI (" TTS processing completed, output callback invoked" );
561
518
} catch (const std::exception &e) {
562
519
SLOGI (" TTS processing exception: %s" , e.what ());
563
520
return true ;
0 commit comments