Skip to content

Commit 2b00efe

Browse files
committed
DAGE-90: Support structured output in LLM service
1 parent c9b2cdb commit 2b00efe

File tree

7 files changed

+168
-96
lines changed

7 files changed

+168
-96
lines changed
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from typing import Type
2+
3+
from pydantic import BaseModel, Field, create_model, conlist, constr
4+
5+
6+
def create_queries_schema(num_queries_generate: int) -> Type[BaseModel]:
    """
    Build a dynamic Pydantic model for validating LLM query output.

    The returned model exposes a single ``queries`` field: a list of exactly
    `num_queries_generate` whitespace-stripped, non-empty strings.
    """
    # Each query must be non-empty once surrounding whitespace is stripped.
    query_str = constr(strip_whitespace=True, min_length=1)

    # Pin the list length to exactly the requested number of queries.
    fixed_len_list = conlist(
        query_str,
        min_length=num_queries_generate,
        max_length=num_queries_generate,
    )

    queries_field = Field(
        ...,
        description=f"Return exactly {num_queries_generate} "
                    f"distinct queries as plain strings.",
    )
    return create_model("LLMQueries", queries=(fixed_len_list, queries_field))
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
from typing import Optional, Literal
2+
from pydantic import BaseModel, Field
3+
4+
5+
class BinaryScore(BaseModel):
    """Structured LLM output for a binary (0/1) relevance judgement."""

    # Literal restricts accepted values to exactly 0 or 1 at validation time.
    score: Literal[0, 1] = Field(
        ..., description="0 = not relevant, 1 = relevant"
    )
    # Populated only when the caller asked the LLM for a rationale.
    explanation: Optional[str] = Field(
        None, description="Explanation for why this score"
    )
9+
10+
11+
class GradedScore(BaseModel):
    """Structured LLM output for a graded (0/1/2) relevance judgement."""

    # Literal restricts accepted values to exactly 0, 1 or 2 at validation time.
    score: Literal[0, 1, 2] = Field(
        ..., description="0 = not relevant, 1 = maybe, 2 = is the answer"
    )
    # Populated only when the caller asked the LLM for a rationale.
    explanation: Optional[str] = Field(
        None, description="Explanation for why this score"
    )
Lines changed: 45 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
import json
2-
from json import JSONDecodeError
2+
import logging
33

44
from langchain_core.language_models import BaseChatModel
55
from langchain_core.messages import HumanMessage, SystemMessage
6+
from pydantic import BaseModel, ValidationError
7+
68
from commons.model import LLMQueryResponse, LLMScoreResponse, Document
7-
import logging
9+
from commons.model.query_schema import create_queries_schema
10+
from commons.model.score_schema import BinaryScore, GradedScore
811

912
log = logging.getLogger(__name__)
1013

@@ -16,13 +19,14 @@ def __init__(self, chat_model: BaseChatModel):
1619
def generate_queries(self, document: Document, num_queries_generate_per_doc: int) -> LLMQueryResponse:
    """
    Generate queries for the given document using LangChain structured output.

    :param document: source Document the queries must be based on
    :param num_queries_generate_per_doc: exact number of queries the schema enforces
    :return: LLMQueryResponse whose content is a JSON array of unique query strings
    :raises ValueError: if the LLM response does not validate against the schema
    """
    schema: type[BaseModel] = create_queries_schema(num_queries_generate_per_doc)

    # FIX: the previous concatenation was missing a separating space, producing
    # "...given document.Avoid duplicates..." in the prompt sent to the model.
    system_prompt = (
        f"You are a helpful assistant! Generate {num_queries_generate_per_doc} "
        "natural language search queries based strictly on the given document. "
        "Avoid duplicates. Return a structured object matching the provided schema."
    )

    # `is_used_to_generate_queries` is internal bookkeeping, not document content.
    doc_json = document.model_dump_json(exclude={"is_used_to_generate_queries"})
    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=f"Document:\n{doc_json}")
    ]

    # Use LangChain structured output: the model must return an object matching `schema`.
    structured_llm = self.chat_model.with_structured_output(schema)
    try:
        model_response = structured_llm.invoke(messages)
    except (ValidationError, KeyError) as e:
        log.debug("Invalid LLM response: %s", e)
        # Chain the original exception so the root cause is preserved for callers.
        raise ValueError(f"Invalid LLM response: {e}") from e

    # Deduplicate while preserving order; dict.fromkeys keeps the first occurrence.
    unique_queries = list(dict.fromkeys(model_response.queries))
    if len(unique_queries) != num_queries_generate_per_doc:
        # The schema pins the raw count, so any mismatch here means duplicates were dropped.
        log.info("Expected %d unique queries, got %d",
                 num_queries_generate_per_doc, len(unique_queries))

    return LLMQueryResponse(response_content=json.dumps(unique_queries))
4859

4960
def generate_score(self, document: Document, query: str, relevance_scale: str,
5061
explanation: bool = False) -> LLMScoreResponse:
5162
"""
5263
Generates a relevance score for a given document-query pair using a specified relevance scale.
5364
If explanation flag is set to true, score explanation is generated as well.
5465
"""
55-
if relevance_scale == "binary":
56-
description = (" - 0: the query is NOT relevant to the given document\n"
57-
" - 1: the query is relevant to the given document")
58-
elif relevance_scale == "graded":
59-
description = (" - 0: the query is NOT relevant to the given document\n"
60-
" - 1: the query may be relevant to the given document\n"
61-
" - 2: the document proposed is the answer to the query")
62-
else:
63-
msg = f"Invalid relevance scale: {relevance_scale}"
64-
log.error(msg)
65-
raise ValueError(msg)
66+
if relevance_scale not in {"binary", "graded"}:
67+
raise ValueError(f"Invalid relevance scale: {relevance_scale}")
68+
69+
schema: type[BaseModel] = BinaryScore if relevance_scale == "binary" else GradedScore
6670

6771
system_prompt = (f"You are a professional data labeler and, given a document with a set of fields and a query "
6872
f"and you need to return the relevance score in a scale called {relevance_scale.upper()}. "
69-
f"The scores of this scale are built as follows:\n{description}\n")
70-
73+
" Return a structured object matching the provided schema.")
7174
if explanation:
7275
system_prompt += (
73-
"Return ONLY a **valid JSON** object with two keys:"
74-
" `score`: the related score as an integer value\n"
75-
" `explanation`: your concise explanation for that score\n"
76-
"As an example, I expect a JSON response like the following: "
77-
"{\"score\": \"integer value\",\"explanation\": \"I rated this score because...\" }"
76+
" Include a clear explanation justifying your score "
77+
"in the `explanation` field based on the provided schema."
7878
)
7979
else:
8080
system_prompt += (
81-
"Return ONLY a **valid JSON** object with key 'score' and the related score as an integer value."
82-
"I expect a JSON response like the following: {\"score\": \"integer value\"}"
81+
" Do not include any explanation."
8382
)
8483

8584
messages = [
@@ -92,24 +91,16 @@ def generate_score(self, document: Document, query: str, relevance_scale: str,
9291
)
9392
]
9493

95-
response_content = self.chat_model.invoke(messages).content
96-
if isinstance(response_content, str):
97-
raw = response_content.strip()
98-
else:
99-
raw = json.dumps(response_content)
100-
94+
# Use LangChain structured output
95+
structured_llm = self.chat_model.with_structured_output(schema)
10196
try:
102-
score = json.loads(raw)['score']
103-
score_explanation = None
104-
if explanation:
105-
score_explanation = json.loads(raw)['explanation']
106-
except (JSONDecodeError, KeyError) as e:
107-
log.debug(f"LLM unexpected response. Raw output: {raw}")
97+
model_response = structured_llm.invoke(messages)
98+
except (ValidationError, KeyError) as e:
99+
log.debug("Invalid LLM response.")
108100
raise ValueError(f"Invalid LLM response: {e}")
109101

110-
try:
111-
parsed = LLMScoreResponse(score=score, scale=relevance_scale, explanation=score_explanation)
112-
return parsed
113-
except ValueError as e:
114-
log.warning(f"Validation error for score '{score}' on scale '{relevance_scale}': {e}")
115-
raise e
102+
return LLMScoreResponse(
103+
score=model_response.score,
104+
scale=relevance_scale,
105+
explanation=(model_response.explanation if explanation else None)
106+
)
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
from typing import List, Optional
2+
3+
from langchain_core.language_models import BaseChatModel
4+
from langchain_core.outputs import ChatResult
5+
from pydantic import BaseModel
6+
7+
8+
class _StructuredOutputMockLLM:
    """Test double returned by ChatModelAdapter.with_structured_output.

    Pops canned payloads off the wrapped fake chat model and coerces each
    one into the requested schema, mimicking a structured-output runnable.
    """

    def __init__(self, fake_chat_model, schema: type[BaseModel]):
        self._fake_chat_model = fake_chat_model
        self._schema = schema

    def invoke(self, messages):
        # Consume the next canned response, in FIFO order.
        payload = self._fake_chat_model.responses.pop(0)

        if isinstance(payload, self._schema):
            # Already a schema instance: pass it straight through.
            result = payload
        elif isinstance(payload, dict):
            result = self._schema.model_validate(payload)
        elif isinstance(payload, str):
            result = self._schema.model_validate_json(payload)
        else:
            raise TypeError(f"Unexpected fake payload type: {type(payload)}")
        return result
26+
27+
28+
class ChatModelAdapter(BaseChatModel):
    """Wraps a FakeListChatModel to provide the `with_structured_output` hook,
    which the stock fake chat model does not implement."""

    def __init__(self, fake_chat_model):
        super().__init__()
        self._fake_chat_model = fake_chat_model

    @property
    def _llm_type(self) -> str:
        # Identifier LangChain uses to tag this (fake) model type.
        return "fake_adapter"

    def _generate(self, messages: List, stop: Optional[List[str]] = None, **kwargs) -> ChatResult:
        # The structured-output path never reaches _generate in these tests.
        raise NotImplementedError("_generate is not used in the test")

    def with_structured_output(self, schema: type[BaseModel]):
        # Route structured calls to the mock that replays canned payloads.
        return _StructuredOutputMockLLM(self._fake_chat_model, schema)

rre-tools/dataset-generator/tests/unit/test_llm_service.py renamed to rre-tools/dataset-generator/tests/unit/llm/test_llm_service.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1+
import pytest
12
from langchain_core.language_models.fake_chat_models import FakeListChatModel
2-
from dataset_generator.llm import LLMService
3+
34
from commons.model import Document, LLMQueryResponse, LLMScoreResponse
4-
import pytest
5+
from dataset_generator.llm import LLMService
6+
from llm_mock import ChatModelAdapter
57

68

79
@pytest.fixture
@@ -18,18 +20,18 @@ def example_doc():
1820

1921
def test_llm_service_generate_queries__expects__response(example_doc):
    # The adapter replays a canned structured payload matching the queries schema.
    canned = '{"queries": ["Car","Auto","Vehicle","Sedan","Toyota"]}'
    service = LLMService(chat_model=ChatModelAdapter(FakeListChatModel(responses=[canned])))

    response = service.generate_queries(example_doc, 5)

    assert isinstance(response, LLMQueryResponse)
    assert response.get_queries() == ["Car", "Auto", "Vehicle", "Sedan", "Toyota"]
2830

2931

3032
def test_llm_service_generate_score__expects__response(example_doc):
31-
fake_llm = FakeListChatModel(responses=["{\"score\": 1}"])
32-
service = LLMService(chat_model=fake_llm)
33+
fake_llm = FakeListChatModel(responses=['{"score": 1}'])
34+
service = LLMService(chat_model=ChatModelAdapter(fake_llm))
3335

3436
query = "Is a Toyota the car of the year?"
3537

@@ -45,7 +47,7 @@ def test_llm_service_generate_score__expects__response(example_doc):
4547
])
4648
def test_llm_service_generate_score_with_invalid_responses__expects__raises_value_error(example_doc, invalid_response):
4749
fake_llm = FakeListChatModel(responses=[invalid_response])
48-
service = LLMService(chat_model=fake_llm)
50+
service = LLMService(chat_model=ChatModelAdapter(fake_llm))
4951

5052
query = "Is a Toyota the car of the year?"
5153
with pytest.raises(ValueError):

rre-tools/dataset-generator/tests/unit/llm/test_llm_service_queries.py

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from langchain_core.language_models.fake_chat_models import FakeListChatModel
33
from dataset_generator.llm import LLMService
44
from commons.model import Document, LLMQueryResponse
5+
from llm_mock import ChatModelAdapter
56

67

78
@pytest.fixture
@@ -16,44 +17,42 @@ def example_doc():
1617

1718

1819
def test_llm_service_generate_queries__expects__valid(example_doc):
    # Happy path: two queries requested, two returned in the structured payload.
    payload = '{"queries": ["Toyota", "Best Car"]}'
    service = LLMService(chat_model=ChatModelAdapter(FakeListChatModel(responses=[payload])))

    response = service.generate_queries(example_doc, 2)

    assert isinstance(response, LLMQueryResponse)
    assert response.get_queries() == ["Toyota", "Best Car"]
2526

2627

2728
def test_llm_service_generate_queries__expects__empty_list(example_doc):
    # Requesting zero queries is valid: the schema pins the list length to 0.
    payload = '{"queries":[]}'
    service = LLMService(chat_model=ChatModelAdapter(FakeListChatModel(responses=[payload])))

    response = service.generate_queries(example_doc, 0)

    assert response.get_queries() == []
3233

3334

3435
@pytest.mark.parametrize("invalid_response, expected_error", [
    # Not JSON at all -> pydantic JSON parse failure.
    ('not a json', r"Invalid JSON"),
    # Empty / whitespace-only queries violate the min_length constraint.
    ('{"queries":["", " ", "Valid"]}', r"(at least 1 character|min_length|String should have at least 1)"),
    # Non-string items violate the string type constraint.
    ('{"queries":["Good", 123, null]}', r"(valid string|string_type)"),
])
def test_llm_service_generate_queries_with_invalid_responses__expects__error(invalid_response, expected_error, example_doc):
    # Each malformed payload must surface as ValueError with a matching message.
    service = LLMService(chat_model=ChatModelAdapter(FakeListChatModel(responses=[invalid_response])))

    with pytest.raises(ValueError, match=expected_error):
        service.generate_queries(example_doc, 3)
4445

4546

4647
def test_generate_queries_with_unicode_strings__expects__list_of_unicode_strings(example_doc):
    # Non-ASCII queries must round-trip through validation untouched.
    payload = '{"queries":["こんにちは", "你好", "¡Hola!"]}'
    service = LLMService(chat_model=ChatModelAdapter(FakeListChatModel(responses=[payload])))

    response = service.generate_queries(example_doc, 3)

    assert response.get_queries() == ["こんにちは", "你好", "¡Hola!"]
5252

5353

54-
def test_generate_queries_with_leading_trailing_whitespace__expects__whitespace_stripped(example_doc):
    # constr(strip_whitespace=True) in the queries schema trims each string.
    payload = '{"queries":[" hello ", " world "]}'
    service = LLMService(chat_model=ChatModelAdapter(FakeListChatModel(responses=[payload])))

    response = service.generate_queries(example_doc, 2)

    assert response.get_queries() == ["hello", "world"]

0 commit comments

Comments
 (0)