@@ -753,15 +753,26 @@ def speech_synthesis_worker(request_queue, response_queue, audio_api_key, voice_
class Callback(ResultCallback):
    """Streaming TTS callback that re-chunks, resamples and forwards audio.

    Incoming byte buffers are interpreted as int16 samples, accumulated in
    ``self.cache`` as float32 values in [-1, 1), and forwarded to
    ``response_queue`` in fixed-size pieces.  Each piece is resampled from
    24 kHz to 48 kHz and converted back to int16 bytes before being queued.
    Whatever is left in the cache when synthesis completes is flushed as a
    final (shorter) piece.

    NOTE(review): presumably the stream is mono PCM at 24 kHz -- confirm
    against the synthesizer configuration.
    """

    # Number of *input* samples resampled and queued per chunk
    # (8000 samples is one third of a second at the 24 kHz input rate).
    _CHUNK_SAMPLES = 8000
    _SOURCE_SR = 24000
    _TARGET_SR = 48000

    def __init__(self, response_queue):
        """Store the output queue and start with an empty sample cache."""
        self.response_queue = response_queue
        self.cache = np.zeros(0, dtype=np.float32)

    def _emit(self, samples) -> None:
        """Resample *samples* to the target rate and queue them as int16 bytes."""
        upsampled = resample(
            samples, orig_sr=self._SOURCE_SR, target_sr=self._TARGET_SR
        )
        # Scale back to the int16 range; clip first because resampling can
        # overshoot slightly beyond [-1, 1).
        pcm = (upsampled * 32768.0).clip(-32768, 32767).astype(np.int16)
        self.response_queue.put(pcm.tobytes())

    def on_open(self):
        pass

    def on_complete(self):
        """Flush any samples still buffered that never filled a whole chunk."""
        if len(self.cache) > 0:
            self._emit(self.cache)
            self.cache = np.zeros(0, dtype=np.float32)

    def on_error(self, message: str):
        # NOTE(review): the cache is not cleared here; if this instance is
        # reused for a later request, leftover samples would be prepended to
        # that stream -- confirm whether that can happen.
        print(f"TTS Error: { message } ")

    def on_close(self):
        pass

    def on_event(self, message):
        pass

    def on_data(self, data: bytes) -> None:
        """Buffer incoming audio and forward every complete chunk.

        Args:
            data: Raw audio bytes, read as int16 samples.
        """
        audio = np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768.0
        self.cache = np.concatenate([self.cache, audio])
        # Drain *all* complete chunks, not just the first: a single callback
        # may deliver more than one chunk's worth of samples, and leaving the
        # surplus buffered would delay it until on_complete.
        while len(self.cache) >= self._CHUNK_SAMPLES:
            self._emit(self.cache[: self._CHUNK_SAMPLES])
            self.cache = self.cache[self._CHUNK_SAMPLES :]
774+
775+
765776 callback = Callback (response_queue )
766777 current_speech_id = None
767778 synthesizer = None
0 commit comments