feat(machinery): more robust response handling for LLMs

nijel · nijel · commit c5a3871971e1 · 2026-06-11T12:05:04.000+02:00
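Gracefully handle cases where metadata leaks into the response: trailing empty strings or non-string items (such as leaked check metadata objects) beyond the expected number of translations are discarded, the prompt explicitly forbids emitting objects, diagnostics, or metadata, and the resolved model name is recorded as a breadcrumb for tracing.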
diff --git a/weblate/machinery/anthropic.py b/weblate/machinery/anthropic.py
@@ -50,8 +50,9 @@ def check_failure(self, response) -> None:
     def fetch_llm_translations(
         self, prompt: str, content: str, previous_content: str, previous_response: str
     ) -> str | None:
+        model = self.get_traced_model()
         payload = {
-            "model": self.get_model(),
+            "model": model,
             "max_tokens": self.settings.get("max_tokens", 4096),
             "system": prompt,
             "messages": [
diff --git a/weblate/machinery/llm.py b/weblate/machinery/llm.py
@@ -121,17 +121,17 @@
 7. Do not transliterate or explain translations.
 8.  Output must be entirely in the target_language except preserved placeholders.
 9. Output must be valid JSON.
-10. Output must be a single JSON array of strings.
+10. Output must be a single JSON array containing only JSON strings.
 11. Do not include markdown code fences or any additional text.
-12. The number of output elements must exactly match the number of input strings. Do not emit empty extra strings.
+12. The number of output elements must exactly match the number of input strings. Do not emit empty extra strings, objects, diagnostics, explanations, or metadata.
 13. Ensure all output strings are properly JSON-escaped.
 14. Internally verify placeholder integrity and JSON validity before responding.
 15. Placeholder contract: Tokens like @@PH44@@ are opaque atoms. Never translate, inflect, split, rename, reorder characters inside, wrap, or escape them. Never convert them to another syntax.
 16. Markup contract: Preserve markup, tags, attributes, entities, and similar control sequences exactly. Translate only human-readable text outside markup and outside placeholder tokens.
 17. Output contract: Return exactly one JSON array of strings, with no characters before `[` or after `]`.
-18. Treat context, key, explanation, secondary, plural, failing_checks, and placeholders fields as reference material only. Do not translate them directly and do not add their contents unless they are present in source.
+18. Treat context, key, explanation, secondary, plural, failing_checks, and placeholders fields as reference material only. Do not translate them directly and do not add, copy, or emit their contents unless they are present in source.
 19. Placeholder mappings explain what opaque placeholder tokens represent. This information may guide wording, but the output must still contain the exact placeholder tokens, not the mapped content.
-20. Failing checks describe issues to avoid or fix when improving an existing translation.
+20. Failing checks describe issues to avoid or fix when improving an existing translation. They are context only; do not include their check_id, name, description, or generated diagnostics in output.
 21. Target-language project instructions, when present above, contain additional requirements for the target language. Follow them unless they conflict with preserving the source meaning, placeholders, markup, or output contract.
 22. For translatable markup placeholders that wrap text, translate the whole text between the placeholders. Example: @@PH1@@Reset and reapply@@PH2@@ can become @@PH1@@Zurucksetzen und erneut anwenden@@PH2@@, never @@PH1@@Zurucksetzen und @@PH2@@erneut anwenden@@PH2@@.
 
@@ -141,7 +141,7 @@
 Invalid placeholder handling:
 ["Click <a href=\"/x\">log out</a> and use \\@\\@PH195\\@\\@."]
 
-Respond ONLY with a valid JSON array of strings, one per input string, in the same order:
+Respond ONLY with a valid JSON array of strings, one per input string, in the same order. Do not include JSON objects or any values other than strings:
 
 ["translation 1", "translation 2", ...]
 """
@@ -248,6 +248,14 @@ def fetch_llm_translations(
     ) -> str | None:
         raise NotImplementedError
 
+    def get_model(self) -> str:
+        raise NotImplementedError
+
+    def get_traced_model(self) -> str:
+        model = self.get_model()
+        add_breadcrumb(self.name, "model", model=model)
+        return model
+
     @staticmethod
     def _normalize_context_text(text: str | None) -> str:
         if text is None:
@@ -1515,8 +1523,11 @@ def _normalize_translations(
         if (
             isinstance(translations, list)
             and len(translations) > expected_length
-            and all(isinstance(item, str) for item in translations)
-            and not any(translations[expected_length:])
+            and all(isinstance(item, str) for item in translations[:expected_length])
+            and not any(
+                isinstance(item, str) and item
+                for item in translations[expected_length:]
+            )
         ):
             return translations[:expected_length]
         return translations
diff --git a/weblate/machinery/ollama.py b/weblate/machinery/ollama.py
@@ -32,8 +32,9 @@ def get_model(self) -> str:
     def fetch_llm_translations(
         self, prompt: str, content: str, previous_content: str, previous_response: str
     ) -> str | None:
+        model = self.get_traced_model()
         payload = {
-            "model": self.get_model(),
+            "model": model,
             "messages": [
                 {"role": "system", "content": prompt},
                 {"role": "user", "content": previous_content},
diff --git a/weblate/machinery/openai.py b/weblate/machinery/openai.py
@@ -44,11 +44,12 @@ def fetch_llm_translations(
             {"role": "user", "content": content},
         ]
         self.validate_runtime_url(self.get_runtime_base_url())
+        model = self.get_traced_model()
         response = self.request(
             "post",
             self.get_chat_completions_url(),
             json={
-                "model": self.get_model(),
+                "model": model,
                 "messages": messages,
             },
         )
diff --git a/weblate/machinery/tests.py b/weblate/machinery/tests.py
@@ -69,6 +69,7 @@
 from weblate.machinery.llm import (
     LLM_CURATED_PREVIOUS_EXAMPLE_SOURCES,
     LLM_NEUTRAL_PREVIOUS_EXAMPLE_SOURCES,
+    PROMPT,
 )
 from weblate.machinery.microsoft import MicrosoftCognitiveTranslation
 from weblate.machinery.modernmt import ModernMTTranslation
@@ -3220,6 +3221,7 @@ class OpenAITranslationTest(BaseMachineTranslationTest):
         "persona": "",
         "style": "",
     }
+    TRACE_MODEL: ClassVar[str] = "gpt-5-nano"
 
     def mock_empty(self) -> NoReturn:
         self.skipTest("Not tested")
@@ -3274,6 +3276,39 @@ def mock_response(self, content: str = '["Ahoj světe"]') -> None:
             },
         )
 
+    def test_prompt_forbids_metadata_output(self) -> None:
+        self.assertIn("only JSON strings", PROMPT)
+        self.assertIn(
+            "Do not emit empty extra strings, objects, diagnostics, explanations, "
+            "or metadata.",
+            PROMPT,
+        )
+        self.assertIn(
+            "do not include their check_id, name, description, or generated "
+            "diagnostics in output",
+            PROMPT,
+        )
+        self.assertIn(
+            "Do not include JSON objects or any values other than strings",
+            PROMPT,
+        )
+
+    @responses.activate
+    def test_translate_traces_resolved_model_breadcrumb(self) -> None:
+        self.mock_response()
+        machine = self.get_machine()
+
+        with patch("weblate.machinery.llm.add_breadcrumb") as mock_add_breadcrumb:
+            machine.download_multiple_translations("en", "fr", [("Hello", None)])
+
+        model_call = next(
+            call
+            for call in mock_add_breadcrumb.call_args_list
+            if call.args[:2] == (machine.name, "model")
+        )
+        self.assertEqual(model_call.kwargs["model"], self.TRACE_MODEL)
+        self.assertNotIn("key", model_call.kwargs)
+
     def test_translate_sends_unit_context(self) -> None:
         machine = self.get_machine()
         unit = make_unit(
@@ -4830,6 +4865,28 @@ def test_translate_rejects_non_empty_extra_reply(self) -> None:
                 [("One", None)],
             )
 
+    @responses.activate
+    def test_translate_ignores_trailing_metadata_reply(self) -> None:
+        self.mock_response(
+            json.dumps(
+                [
+                    "Premier",
+                    {
+                        "description": "The following markup is missing.",
+                        "name": "Inconsistent markup",
+                    },
+                ]
+            )
+        )
+
+        translation = self.get_machine().download_multiple_translations(
+            "en",
+            "fr",
+            [("One", None)],
+        )
+
+        self.assertEqual(translation["One"][0]["text"], "Premier")
+
     def test_translate_rejects_ambiguous_rst_duplicate_placeholders(self) -> None:
         machine = self.get_machine()
 
@@ -5252,6 +5309,7 @@ class AzureOpenAITranslationTest(OpenAITranslationTest):
         "style": "",
         "azure_endpoint": "https://my-instance.openai.azure.com",
     }
+    TRACE_MODEL: ClassVar[str] = "my-deployment"
 
     def mock_response(self, content: str = '["Ahoj světe"]') -> None:
         responses.add(

Original file line number	Diff line number	Diff line change
`@@ -44,11 +44,12 @@ def fetch_llm_translations(`
`44`	`44`	`{"role": "user", "content": content},`
`45`	`45`	`]`
`46`	`46`	`self.validate_runtime_url(self.get_runtime_base_url())`
	`47`	`+ model = self.get_traced_model()`
`47`	`48`	`response = self.request(`
`48`	`49`	`"post",`
`49`	`50`	`self.get_chat_completions_url(),`
`50`	`51`	`json={`
`51`		`- "model": self.get_model(),`
	`52`	`+ "model": model,`
`52`	`53`	`"messages": messages,`
`53`	`54`	`},`
`54`	`55`	`)`