Skip to content

Commit 2aeebf5

Browse files
committed
fix: tighten title language detection
1 parent 5528e2c commit 2aeebf5

3 files changed

Lines changed: 29 additions & 3 deletions

File tree

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@
33

44
## [Unreleased]
55

6+
### Fixed
7+
8+
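- Title-language detection no longer classifies English tech/jargon text (e.g. "session die", or DAS/DER acronyms) as German merely because it contains tokens that are also German words: the ambiguous markers `session`, `die`, `der`, `das`, and `den` were removed, and three marker hits (previously two) are now required when the text contains no umlaut or ß. (Refs #3040)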
9+
610
## [v0.51.152] — 2026-05-28 — Release DX (stage-batch34 — single-PR optional gateway-backed browser chat)
711

812
### Added

api/streaming.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1385,12 +1385,12 @@ def _detect_title_language(text: str) -> str:
13851385
return ''
13861386
german_markers = {
13871387
'warum', 'werden', 'wird', 'wurde', 'hier', 'nicht', 'mehr', 'alte', 'alten',
1388-
'bilder', 'angezeigt', 'session', 'prüfe', 'ich', 'die', 'der', 'das', 'den',
1389-
'und', 'oder', 'mit', 'für', 'von', 'zu', 'ist', 'sind', 'bitte', 'kannst',
1388+
'bilder', 'angezeigt', 'prüfe', 'ich', 'und', 'oder', 'mit', 'für', 'von',
1389+
'zu', 'ist', 'sind', 'bitte', 'kannst',
13901390
}
13911391
tokens = re.findall(r'[A-Za-zÀ-ÖØ-öø-ÿ]+', s)
13921392
german_hits = sum(1 for tok in tokens if tok in german_markers)
1393-
if re.search(r'[äöüß]', s) or german_hits >= 2:
1393+
if re.search(r'[äöüß]', s) or german_hits >= 3:
13941394
return 'de'
13951395
return ''
13961396

tests/test_title_aux_routing.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,28 @@ def fake_call_llm(**kwargs):
229229
self.assertIn('Match the language of the user question', messages[0]['content'])
230230
self.assertIn('If the user writes German, output a German title', messages[0]['content'])
231231

232+
def test_title_language_detection_avoids_english_tech_false_positives(self):
233+
"""English tech/jargon text must not be classified as German by shared tokens."""
234+
from api.streaming import _detect_title_language
235+
236+
examples = [
237+
'Why did the session die after the DAS storage failover?',
238+
'The session can die when DAS storage disconnects.',
239+
'Debug the session and DER certificate import failure.',
240+
]
241+
for text in examples:
242+
with self.subTest(text=text):
243+
self.assertEqual(_detect_title_language(text), '')
244+
245+
def test_title_language_detection_keeps_german_without_umlaut(self):
246+
"""German without umlauts still needs a language hint when evidence is specific."""
247+
from api.streaming import _detect_title_language
248+
249+
self.assertEqual(
250+
_detect_title_language('Warum werden hier die Bilder der alten Session nicht angezeigt?'),
251+
'de',
252+
)
253+
232254
def test_german_source_rejects_english_aux_title(self):
233255
"""Regression: an English aux title must not overwrite a German conversation."""
234256
from api.streaming import _generate_llm_session_title_via_aux

0 commit comments

Comments
 (0)