Skip to content

Commit e14e7d7

Browse files
committed
fix(realtime): always strip reasoning from spoken output
disable_thinking maps to ReasoningConfig.DisableReasoning=true on the LLM config, which the backend reads as enable_thinking=false. But the realtime handler reads that SAME config to drive reasoning extraction, and there DisableReasoning=true means "skip stripping". PredictConfig() returns this LLM config, so both the streamed (speechStreamer) and buffered realtime paths stopped stripping <think>…</think> exactly when disable_thinking was on — leaking raw reasoning to the client whenever the model ignored the enable_thinking hint (e.g. lfm2.5).

Add spokenReasoningConfig(), which clears DisableReasoning for extraction (keeping custom tokens/tag pairs), and route both realtime paths through it. Spoken output now always strips reasoning, independent of the backend suppression hint.

Assisted-by: Claude:claude-opus-4-8 go test, golangci-lint
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
1 parent fbbc4d7 commit e14e7d7

6 files changed

Lines changed: 85 additions & 3 deletions

File tree

core/http/endpoints/openai/realtime.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1590,15 +1590,15 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
15901590
// ExtractReasoningWithConfig is a no-op when no tag pair matches,
15911591
// so it's safe to apply unconditionally in the no-reasoning branch.
15921592
if deltaReasoning == "" && deltaContent != "" {
1593-
deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, config.ReasoningConfig)
1593+
deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, spokenReasoningConfig(config.ReasoningConfig))
15941594
}
15951595
reasoningText = deltaReasoning
15961596
responseWithoutReasoning = deltaContent
15971597
textContent = deltaContent
15981598
cleanedResponse = deltaContent
15991599
toolCalls = deltaToolCalls
16001600
} else {
1601-
reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, config.ReasoningConfig)
1601+
reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, spokenReasoningConfig(config.ReasoningConfig))
16021602
textContent = functions.ParseTextContent(responseWithoutReasoning, config.FunctionsConfig)
16031603
cleanedResponse = functions.CleanupLLMResult(responseWithoutReasoning, config.FunctionsConfig)
16041604
toolCalls = functions.ParseFunctionCall(cleanedResponse, config.FunctionsConfig)

core/http/endpoints/openai/realtime_doubles_test.go

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@ package openai
22

33
import (
44
"context"
5+
"strings"
56

67
"github.com/mudler/LocalAI/core/backend"
78
"github.com/mudler/LocalAI/core/config"
@@ -48,6 +49,18 @@ func (f *fakeTransport) countEvents(et types.ServerEventType) int {
4849
return n
4950
}
5051

52+
// transcriptDeltaText concatenates the Delta of every recorded transcript
53+
// delta event — i.e. the text streamed to the client as it is generated.
54+
func (f *fakeTransport) transcriptDeltaText() string {
55+
var b strings.Builder
56+
for _, e := range f.events {
57+
if d, ok := e.(types.ResponseOutputAudioTranscriptDeltaEvent); ok {
58+
b.WriteString(d.Delta)
59+
}
60+
}
61+
return b.String()
62+
}
63+
5164
// fakeModel is a configurable Model double. TTSStream replays ttsStreamChunks
5265
// and TranscribeStream replays transcribeDeltas, so the handler's streaming
5366
// paths can be driven deterministically.

core/http/endpoints/openai/realtime_stream.go

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,9 @@ type speechStreamer struct {
3535
}
3636

3737
func newSpeechStreamer(ctx context.Context, t Transport, session *Session, responseID, itemID, thinkingStartToken string, reasoningCfg reasoning.Config) *speechStreamer {
38+
// Spoken output must never contain reasoning, even when disable_thinking set
39+
// DisableReasoning (which would otherwise turn the extractor's stripping off).
40+
reasoningCfg = spokenReasoningConfig(reasoningCfg)
3841
return &speechStreamer{
3942
ctx: ctx,
4043
t: t,

core/http/endpoints/openai/realtime_stream_test.go

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,32 @@ var _ = Describe("speechStreamer", func() {
4343
Expect(audio).To(Equal([]byte{7, 7}))
4444
})
4545

46+
It("strips leaked reasoning even when reasoning is disabled (disable_thinking safety net)", func() {
47+
// disable_thinking maps to ReasoningConfig.DisableReasoning=true (it tells
48+
// the backend enable_thinking=false). When the model ignores that and emits
49+
// thinking anyway, the spoken stream must still not leak it: the streamer is
50+
// the last line of defence and always strips reasoning from spoken content.
51+
disable := true
52+
session := &Session{
53+
OutputSampleRate: 24000,
54+
ModelInterface: &fakeModel{},
55+
ModelConfig: &config.ModelConfig{}, // streaming.tts off
56+
}
57+
t := &fakeTransport{}
58+
s := newSpeechStreamer(context.Background(), t, session, "resp1", "item1", "",
59+
reasoning.Config{DisableReasoning: &disable})
60+
61+
s.onToken("<think>secret plan</think>")
62+
s.onToken("The answer is 42.")
63+
content, _, err := s.finish()
64+
65+
Expect(err).ToNot(HaveOccurred())
66+
Expect(content).To(Equal("The answer is 42."))
67+
Expect(content).ToNot(ContainSubstring("secret plan"))
68+
// The text streamed to the client must not carry the reasoning either.
69+
Expect(t.transcriptDeltaText()).ToNot(ContainSubstring("secret plan"))
70+
})
71+
4672
It("does not synthesize audio when TTS streaming is disabled", func() {
4773
m := &fakeModel{ttsStreamChunks: [][]byte{{7}}, ttsStreamRate: 24000}
4874
session := &Session{

core/http/endpoints/openai/realtime_thinking.go

Lines changed: 17 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,9 @@
11
package openai
22

3-
import "github.com/mudler/LocalAI/core/config"
3+
import (
4+
"github.com/mudler/LocalAI/core/config"
5+
"github.com/mudler/LocalAI/pkg/reasoning"
6+
)
47

58
// applyPipelineThinking forces the LLM's reasoning/thinking off when the realtime
69
// pipeline sets disable_thinking, mapping to the enable_thinking=false backend
@@ -15,3 +18,16 @@ func applyPipelineThinking(llm *config.ModelConfig, pipeline config.Pipeline) {
1518
disable := true
1619
llm.ReasoningConfig.DisableReasoning = &disable
1720
}
21+
22+
// spokenReasoningConfig adapts a model's reasoning config for stripping reasoning
23+
// OUT of realtime spoken output. ReasoningConfig.DisableReasoning is overloaded:
24+
// the backend reads it as the "enable_thinking=false" hint (which pipeline
25+
// disable_thinking sets via applyPipelineThinking), but the reasoning extractor
26+
// reads it as "skip stripping, assume there is no reasoning". Honouring the latter
27+
// when extracting for speech would leak raw <think>…</think> whenever the model
28+
// ignores the suppression hint. Spoken output must never contain reasoning, so we
29+
// always strip: clear DisableReasoning while keeping custom tokens/tag pairs.
30+
func spokenReasoningConfig(cfg reasoning.Config) reasoning.Config {
31+
cfg.DisableReasoning = nil
32+
return cfg
33+
}

core/http/endpoints/openai/realtime_thinking_test.go

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@ import (
55
. "github.com/onsi/gomega"
66

77
"github.com/mudler/LocalAI/core/config"
8+
"github.com/mudler/LocalAI/pkg/reasoning"
89
)
910

1011
// applyPipelineThinking lets a realtime pipeline force the LLM's thinking off
@@ -24,3 +25,26 @@ var _ = Describe("applyPipelineThinking", func() {
2425
Expect(llm.ReasoningConfig.DisableReasoning).To(BeNil())
2526
})
2627
})
28+
29+
// spokenReasoningConfig clears DisableReasoning so realtime spoken output always
30+
// strips reasoning, even though disable_thinking sets DisableReasoning=true on the
31+
// LLM config (which the backend reads as enable_thinking=false).
32+
var _ = Describe("spokenReasoningConfig", func() {
33+
It("clears DisableReasoning so the extractor still strips leaked reasoning", func() {
34+
disable := true
35+
out := spokenReasoningConfig(reasoning.Config{DisableReasoning: &disable})
36+
Expect(out.DisableReasoning).To(BeNil())
37+
})
38+
39+
It("preserves the other reasoning settings", func() {
40+
disable := true
41+
out := spokenReasoningConfig(reasoning.Config{
42+
DisableReasoning: &disable,
43+
ThinkingStartTokens: []string{"<reason>"},
44+
TagPairs: []reasoning.TagPair{{Start: "<reason>", End: "</reason>"}},
45+
})
46+
Expect(out.ThinkingStartTokens).To(Equal([]string{"<reason>"}))
47+
Expect(out.TagPairs).To(HaveLen(1))
48+
Expect(out.TagPairs[0].Start).To(Equal("<reason>"))
49+
})
50+
})

0 commit comments

Comments
 (0)