Try to fix realtime TTS: Model.TTS now picks the output file and returns its path

richiejp · richiejp · commit 915824d53e0e · 2026-01-07T08:42:15.000Z
diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go
@@ -161,7 +161,7 @@ type Model interface {
 	Transcribe(ctx context.Context, in *proto.TranscriptRequest, opts ...grpc.CallOption) (*proto.TranscriptResult, error)
 	Predict(ctx context.Context, in *proto.PredictOptions, opts ...grpc.CallOption) (*proto.Reply, error)
 	PredictStream(ctx context.Context, in *proto.PredictOptions, f func(*proto.Reply), opts ...grpc.CallOption) error
-	TTS(ctx context.Context, in *proto.TTSRequest, opts ...grpc.CallOption) (*proto.Result, error)
+	TTS(ctx context.Context, in *proto.TTSRequest, opts ...grpc.CallOption) (*proto.Result, string, error)
 }
 
 var upgrader = websocket.Upgrader{
@@ -581,7 +581,7 @@ func updateSession(session *Session, update *types.ClientSession, cl *config.Mod
 	if update.Model != "" || update.Voice != "" || update.InputAudioTranscription != nil {
 		pipeline := config.Pipeline{
 			VAD:           defaultVADModel,
-			LLM:           update.Model,
+			LLM:           session.Model,
 			Transcription: session.InputAudioTranscription.Model,
 			TTS:           session.Voice,
 		}
@@ -923,29 +923,12 @@ func generateResponse(config *config.ModelConfig, evaluator *templates.Evaluator
 	}
 	conv.Lock.Unlock()
 
-	f, err := os.CreateTemp("", "realtime-tts-*.wav")
-	if err != nil {
-		xlog.Error("failed to create temp file for TTS", "error", err)
-		sendError(c, "tts_error", "Failed to create temp file for TTS", "", item.ID)
-		return
-	}
-	defer os.Remove(f.Name())
-
-	modelWrapped, ok := session.ModelInterface.(*wrappedModel)
-	if !ok {
-		xlog.Error("model is not wrappedModel")
-		sendError(c, "model_error", "Model is not wrappedModel", "", item.ID)
-		return
-	}
-
 	ttsReq := &proto.TTSRequest{
 		Text:  response,
 		Voice: session.Voice,
-		Model: modelWrapped.TTSConfig.Model,
-		Dst:   f.Name(),
 	}
 
-	res, err := session.ModelInterface.TTS(context.TODO(), ttsReq)
+	res, audioFilePath, err := session.ModelInterface.TTS(context.TODO(), ttsReq)
 	if err != nil {
 		xlog.Error("TTS failed", "error", err)
 		sendError(c, "tts_error", fmt.Sprintf("TTS generation failed: %v", err), "", item.ID)
@@ -956,8 +939,9 @@ func generateResponse(config *config.ModelConfig, evaluator *templates.Evaluator
 		sendError(c, "tts_error", fmt.Sprintf("TTS generation failed: %s", res.Message), "", item.ID)
 		return
 	}
+	defer os.Remove(audioFilePath)
 
-	audioBytes, err := os.ReadFile(f.Name())
+	audioBytes, err := os.ReadFile(audioFilePath)
 	if err != nil {
 		xlog.Error("failed to read TTS file", "error", err)
 		sendError(c, "tts_error", fmt.Sprintf("Failed to read TTS audio: %v", err), "", item.ID)
diff --git a/core/http/endpoints/openai/realtime_model.go b/core/http/endpoints/openai/realtime_model.go
@@ -3,12 +3,15 @@ package openai
 import (
 	"context"
 	"fmt"
+	"os"
+	"path/filepath"
 
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	grpcClient "github.com/mudler/LocalAI/pkg/grpc"
 	"github.com/mudler/LocalAI/pkg/grpc/proto"
 	model "github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/mudler/xlog"
 	"google.golang.org/grpc"
 )
@@ -31,6 +34,7 @@ type wrappedModel struct {
 
 	VADConfig *config.ModelConfig
 	VADClient grpcClient.Backend
+	appConfig *config.ApplicationConfig
 }
 
 // anyToAnyModel represent a model which supports Any-to-Any operations
@@ -49,6 +53,7 @@ type transcriptOnlyModel struct {
 	TranscriptionClient grpcClient.Backend
 	VADConfig           *config.ModelConfig
 	VADClient           grpcClient.Backend
+	appConfig           *config.ApplicationConfig
 }
 
 func (m *transcriptOnlyModel) VAD(ctx context.Context, in *proto.VADRequest, opts ...grpc.CallOption) (*proto.VADResponse, error) {
@@ -67,8 +72,8 @@ func (m *transcriptOnlyModel) PredictStream(ctx context.Context, in *proto.Predi
 	return fmt.Errorf("predict stream operation not supported in transcript-only mode")
 }
 
-func (m *transcriptOnlyModel) TTS(ctx context.Context, in *proto.TTSRequest, opts ...grpc.CallOption) (*proto.Result, error) {
-	return nil, fmt.Errorf("TTS not supported in transcript-only mode")
+func (m *transcriptOnlyModel) TTS(ctx context.Context, in *proto.TTSRequest, opts ...grpc.CallOption) (*proto.Result, string, error) {
+	return nil, "", fmt.Errorf("TTS not supported in transcript-only mode") // always fails: nil result, empty output path
 }
 
 func (m *wrappedModel) VAD(ctx context.Context, in *proto.VADRequest, opts ...grpc.CallOption) (*proto.VADResponse, error) {
@@ -101,8 +106,28 @@ func (m *wrappedModel) PredictStream(ctx context.Context, in *proto.PredictOptio
 	return m.LLMClient.PredictStream(ctx, in, f)
 }
 
-func (m *wrappedModel) TTS(ctx context.Context, in *proto.TTSRequest, opts ...grpc.CallOption) (*proto.Result, error) {
-	return m.TTSClient.TTS(ctx, in, opts...)
+// TTS forwards in to the TTS backend client and returns its result together with in.Dst, the file the backend was asked to write.
+// in is modified: Model is defaulted from TTSConfig, and an empty Dst is set to a utils.GenerateUniqueFileName .wav under <GeneratedContentDir>/audio.
+func (m *wrappedModel) TTS(ctx context.Context, in *proto.TTSRequest, opts ...grpc.CallOption) (*proto.Result, string, error) {
+	if in.Model == "" && m.TTSConfig != nil {
+		in.Model = m.TTSConfig.Model // callers no longer set Model; keep it populated when it is not a file under ModelsPath
+	}
+	if m.appConfig != nil && m.appConfig.SystemState != nil {
+		mp := filepath.Join(m.appConfig.SystemState.Model.ModelsPath, m.TTSConfig.Model)
+		if _, err := os.Stat(mp); err == nil && utils.VerifyPath(mp, m.appConfig.SystemState.Model.ModelsPath) == nil {
+			in.Model = mp
+		}
+	}
+	if in.Dst == "" && m.appConfig != nil {
+		audioDir := filepath.Join(m.appConfig.GeneratedContentDir, "audio")
+		if err := os.MkdirAll(audioDir, 0750); err != nil {
+			return nil, "", fmt.Errorf("failed creating audio directory: %w", err)
+		}
+		fileName := utils.GenerateUniqueFileName(audioDir, "tts", ".wav")
+		in.Dst = filepath.Join(audioDir, fileName)
+	}
+	res, err := m.TTSClient.TTS(ctx, in, opts...)
+	return res, in.Dst, err
 }
 
 func (m *anyToAnyModel) Predict(ctx context.Context, in *proto.PredictOptions, opts ...grpc.CallOption) (*proto.Reply, error) {
@@ -113,8 +138,10 @@ func (m *anyToAnyModel) PredictStream(ctx context.Context, in *proto.PredictOpti
 	return m.LLMClient.PredictStream(ctx, in, f)
 }
 
-func (m *anyToAnyModel) TTS(ctx context.Context, in *proto.TTSRequest, opts ...grpc.CallOption) (*proto.Result, error) {
-	return m.LLMClient.TTS(ctx, in, opts...)
+func (m *anyToAnyModel) TTS(ctx context.Context, in *proto.TTSRequest, opts ...grpc.CallOption) (*proto.Result, string, error) {
+	// TODO: Handle file generation if needed for anyToAnyModel. No output path is generated here: in.Dst is forwarded and returned as given, so it may be empty.
+	res, err := m.LLMClient.TTS(ctx, in, opts...)
+	return res, in.Dst, err
 }
 
 func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, *config.ModelConfig, error) {
@@ -155,6 +182,7 @@ func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfig
 		VADClient:           VADClient,
 		TranscriptionConfig: cfgSST,
 		TranscriptionClient: transcriptionClient,
+		appConfig:           appConfig,
 	}, cfgSST, nil
 }
 
@@ -266,5 +294,6 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 
 		VADConfig: cfgVAD,
 		VADClient: VADClient,
+		appConfig: appConfig,
 	}, nil
 }