fix(realtime): always strip reasoning from spoken output

mudler · mudler · commit e14e7d7e1d60 · 2026-06-04T23:18:37.000Z
disable_thinking maps to ReasoningConfig.DisableReasoning=true on the LLM
config, which the backend reads as enable_thinking=false. But the realtime
handler reads that SAME config to drive reasoning extraction, and there
DisableReasoning=true means "skip stripping". PredictConfig() returns this
LLM config, so both the streamed (speechStreamer) and buffered realtime
paths stopped stripping <think>…</think> exactly when disable_thinking was
on — leaking raw reasoning to the client whenever the model ignored the
enable_thinking hint (e.g. lfm2.5).

Add spokenReasoningConfig() which clears DisableReasoning for extraction
(keeping custom tokens/tag pairs) and route both realtime paths through it.
Spoken output now always strips reasoning, independent of the backend
suppression hint.

Assisted-by: Claude:claude-opus-4-8 go test, golangci-lint
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go
@@ -1590,15 +1590,15 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 		// ExtractReasoningWithConfig is a no-op when no tag pair matches,
 		// so it's safe to apply unconditionally in the no-reasoning branch.
 		if deltaReasoning == "" && deltaContent != "" {
-			deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, config.ReasoningConfig)
+			deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, spokenReasoningConfig(config.ReasoningConfig))
 		}
 		reasoningText = deltaReasoning
 		responseWithoutReasoning = deltaContent
 		textContent = deltaContent
 		cleanedResponse = deltaContent
 		toolCalls = deltaToolCalls
 	} else {
-		reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, config.ReasoningConfig)
+		reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, spokenReasoningConfig(config.ReasoningConfig))
 		textContent = functions.ParseTextContent(responseWithoutReasoning, config.FunctionsConfig)
 		cleanedResponse = functions.CleanupLLMResult(responseWithoutReasoning, config.FunctionsConfig)
 		toolCalls = functions.ParseFunctionCall(cleanedResponse, config.FunctionsConfig)
diff --git a/core/http/endpoints/openai/realtime_doubles_test.go b/core/http/endpoints/openai/realtime_doubles_test.go
@@ -2,6 +2,7 @@ package openai
 
 import (
 	"context"
+	"strings"
 
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
@@ -48,6 +49,18 @@ func (f *fakeTransport) countEvents(et types.ServerEventType) int {
 	return n
 }
 
+// transcriptDeltaText concatenates the Delta of every recorded transcript
+// delta event — i.e. the text streamed to the client as it is generated.
+func (f *fakeTransport) transcriptDeltaText() string {
+	var b strings.Builder
+	for _, e := range f.events {
+		if d, ok := e.(types.ResponseOutputAudioTranscriptDeltaEvent); ok {
+			b.WriteString(d.Delta)
+		}
+	}
+	return b.String()
+}
+
 // fakeModel is a configurable Model double. TTSStream replays ttsStreamChunks
 // and TranscribeStream replays transcribeDeltas, so the handler's streaming
 // paths can be driven deterministically.
diff --git a/core/http/endpoints/openai/realtime_stream.go b/core/http/endpoints/openai/realtime_stream.go
@@ -35,6 +35,9 @@ type speechStreamer struct {
 }
 
 func newSpeechStreamer(ctx context.Context, t Transport, session *Session, responseID, itemID, thinkingStartToken string, reasoningCfg reasoning.Config) *speechStreamer {
+	// Spoken output must never contain reasoning, even when disable_thinking set
+	// DisableReasoning (which would otherwise turn the extractor's stripping off).
+	reasoningCfg = spokenReasoningConfig(reasoningCfg)
 	return &speechStreamer{
 		ctx:        ctx,
 		t:          t,
diff --git a/core/http/endpoints/openai/realtime_stream_test.go b/core/http/endpoints/openai/realtime_stream_test.go
@@ -43,6 +43,32 @@ var _ = Describe("speechStreamer", func() {
 		Expect(audio).To(Equal([]byte{7, 7}))
 	})
 
+	It("strips leaked reasoning even when reasoning is disabled (disable_thinking safety net)", func() {
+		// disable_thinking maps to ReasoningConfig.DisableReasoning=true (it tells
+		// the backend enable_thinking=false). When the model ignores that and emits
+		// thinking anyway, the spoken stream must still not leak it: the streamer is
+		// the last line of defence and always strips reasoning from spoken content.
+		disable := true
+		session := &Session{
+			OutputSampleRate: 24000,
+			ModelInterface:   &fakeModel{},
+			ModelConfig:      &config.ModelConfig{}, // streaming.tts off
+		}
+		t := &fakeTransport{}
+		s := newSpeechStreamer(context.Background(), t, session, "resp1", "item1", "",
+			reasoning.Config{DisableReasoning: &disable})
+
+		s.onToken("<think>secret plan</think>")
+		s.onToken("The answer is 42.")
+		content, _, err := s.finish()
+
+		Expect(err).ToNot(HaveOccurred())
+		Expect(content).To(Equal("The answer is 42."))
+		Expect(content).ToNot(ContainSubstring("secret plan"))
+		// The text streamed to the client must not carry the reasoning either.
+		Expect(t.transcriptDeltaText()).ToNot(ContainSubstring("secret plan"))
+	})
+
 	It("does not synthesize audio when TTS streaming is disabled", func() {
 		m := &fakeModel{ttsStreamChunks: [][]byte{{7}}, ttsStreamRate: 24000}
 		session := &Session{
diff --git a/core/http/endpoints/openai/realtime_thinking.go b/core/http/endpoints/openai/realtime_thinking.go
@@ -1,6 +1,9 @@
 package openai
 
-import "github.com/mudler/LocalAI/core/config"
+import (
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/pkg/reasoning"
+)
 
 // applyPipelineThinking forces the LLM's reasoning/thinking off when the realtime
 // pipeline sets disable_thinking, mapping to the enable_thinking=false backend
@@ -15,3 +18,16 @@ func applyPipelineThinking(llm *config.ModelConfig, pipeline config.Pipeline) {
 	disable := true
 	llm.ReasoningConfig.DisableReasoning = &disable
 }
+
+// spokenReasoningConfig adapts a model's reasoning config for stripping reasoning
+// OUT of realtime spoken output. ReasoningConfig.DisableReasoning is overloaded:
+// the backend reads it as the "enable_thinking=false" hint (which pipeline
+// disable_thinking sets via applyPipelineThinking), but the reasoning extractor
+// reads it as "skip stripping, assume there is no reasoning". Honouring the latter
+// when extracting for speech would leak raw <think>…</think> whenever the model
+// ignores the suppression hint. Spoken output must never contain reasoning, so we
+// always strip: clear DisableReasoning while keeping custom tokens/tag pairs.
+func spokenReasoningConfig(cfg reasoning.Config) reasoning.Config {
+	cfg.DisableReasoning = nil
+	return cfg
+}
diff --git a/core/http/endpoints/openai/realtime_thinking_test.go b/core/http/endpoints/openai/realtime_thinking_test.go
@@ -5,6 +5,7 @@ import (
 	. "github.com/onsi/gomega"
 
 	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/pkg/reasoning"
 )
 
 // applyPipelineThinking lets a realtime pipeline force the LLM's thinking off
@@ -24,3 +25,26 @@ var _ = Describe("applyPipelineThinking", func() {
 		Expect(llm.ReasoningConfig.DisableReasoning).To(BeNil())
 	})
 })
+
+// spokenReasoningConfig clears DisableReasoning so realtime spoken output always
+// strips reasoning, even though disable_thinking sets DisableReasoning=true on the
+// LLM config (which the backend reads as enable_thinking=false).
+var _ = Describe("spokenReasoningConfig", func() {
+	It("clears DisableReasoning so the extractor still strips leaked reasoning", func() {
+		disable := true
+		out := spokenReasoningConfig(reasoning.Config{DisableReasoning: &disable})
+		Expect(out.DisableReasoning).To(BeNil())
+	})
+
+	It("preserves the other reasoning settings", func() {
+		disable := true
+		out := spokenReasoningConfig(reasoning.Config{
+			DisableReasoning:    &disable,
+			ThinkingStartTokens: []string{"<reason>"},
+			TagPairs:            []reasoning.TagPair{{Start: "<reason>", End: "</reason>"}},
+		})
+		Expect(out.ThinkingStartTokens).To(Equal([]string{"<reason>"}))
+		Expect(out.TagPairs).To(HaveLen(1))
+		Expect(out.TagPairs[0].Start).To(Equal("<reason>"))
+	})
+})

Original file line number	Diff line number	Diff line change
`@@ -35,6 +35,9 @@ type speechStreamer struct {`
`35`	`35`	`}`
`36`	`36`
`37`	`37`	`func newSpeechStreamer(ctx context.Context, t Transport, session *Session, responseID, itemID, thinkingStartToken string, reasoningCfg reasoning.Config) *speechStreamer {`
	`38`	`+ // Spoken output must never contain reasoning, even when disable_thinking set`
	`39`	`+ // DisableReasoning (which would otherwise turn the extractor's stripping off).`
	`40`	`+ reasoningCfg = spokenReasoningConfig(reasoningCfg)`
`38`	`41`	`return &speechStreamer{`
`39`	`42`	`ctx: ctx,`
`40`	`43`	`t: t,`