Skip to content

Commit 3c3a231

Browse files
Enable structured output for SimpleKGPipeline (#481)
* Add support for Structured Output in SimpleKGPipeline
* Update unit test
* Update CHANGELOG
* Update docs
* Update examples
* Fix llm fixture
* Fix llm fixture in e2e tests
* Fix invalid json files
1 parent d1b8f0f commit 3c3a231

File tree

13 files changed

+76
-56
lines changed

13 files changed

+76
-56
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
### Changed
66

77
- Updated examples, default values, and documentation to use `gpt-4.1` / `gpt-4.1-mini` instead of deprecated GPT-4* models (e.g. `gpt-4o`, `gpt-4`).
8+
- **Breaking**: `SimpleKGPipeline` now automatically enables structured output when the `LLMInterface` supports structured output (so far, `OpenAILLM`, `VertexAILLM`). This takes precedence over any `response_format` configured in `model_params` (e.g., `{"type": "json_object"}`), which will be ignored.
89

910
## 1.13.1
1011

docs/source/user_guide_kg_builder.rst

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,24 @@ To write to a non-default Neo4j database, specify the database name using this p
190190
# ...
191191
)
192192
193+
Structured Output
194+
-----------------
195+
196+
When the configured LLM declares support for structured output (i.e., ``supports_structured_output = True``,
197+
which is the case for :ref:`OpenAILLM <openaillm>` and :ref:`VertexAILLM <vertexaillm>`),
198+
``SimpleKGPipeline`` automatically enables structured output for both entity extraction and
199+
(when auto-extracting) schema generation. This enforces schema conformance at the API level,
200+
improving reliability over prompt-based JSON parsing.
201+
202+
.. note::
203+
204+
Structured output takes precedence over any ``response_format`` set in ``model_params``
205+
when instantiating the LLM. For example, ``{"type": "json_object"}`` will be ignored
206+
in favour of structured output for supported LLMs.
207+
208+
For more details on how structured output works at the component level, see
209+
:ref:`Using Structured Output <using-structured-output>` in the Entity and Relation Extractor section.
210+
193211
Using Custom Components
194212
-----------------------
195213

@@ -354,8 +372,7 @@ Below is an example of configuring an LLM in a JSON configuration file:
354372
},
355373
"model_params": {
356374
"temperature": 0,
357-
"max_tokens": 2000,
358-
"response_format": {"type": "json_object"}
375+
"max_tokens": 2000
359376
}
360377
}
361378
}
@@ -375,8 +392,6 @@ And the equivalent YAML:
375392
model_params:
376393
temperature: 0
377394
max_tokens: 2000
378-
response_format:
379-
type: json_object
380395
381396
- The `class_` key specifies the path to the class to be instantiated.
382397
- The `params_` key contains the parameters to be passed to the class constructor.
@@ -968,13 +983,15 @@ It can be used in this way:
968983

969984
Using `OpenAILLM` requires the `openai` Python client. You can install it with `pip install "neo4j_graphrag[openai]"`.
970985

971-
.. warning::
986+
.. note::
972987

973-
The `LLMEntityRelationExtractor` works better if `"response_format": {"type": "json_object"}` is in the model parameters.
988+
For :ref:`OpenAILLM <openaillm>` and :ref:`VertexAILLM <vertexaillm>`, structured output is recommended over ``"response_format": {"type": "json_object"}`` for improved reliability. See :ref:`Using Structured Output <using-structured-output>` below.
974989

975990
The LLM to use can be customized, the only constraint is that it obeys the :ref:`LLMInterface <llminterface>`.
976991

977992

993+
.. _using-structured-output:
994+
978995
Using Structured Output
979996
-----------------------
980997

examples/build_graph/automatic_schema_extraction/simple_kg_builder_schema_from_pdf.py

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -38,21 +38,11 @@ async def run_kg_pipeline_with_auto_schema() -> None:
3838
user = os.getenv("NEO4J_USER", "neo4j")
3939
password = os.getenv("NEO4J_PASSWORD", "password")
4040

41-
# Define LLM parameters
42-
llm_model_params = {
43-
"max_tokens": 2000,
44-
"response_format": {"type": "json_object"},
45-
"temperature": 0, # Lower temperature for more consistent output
46-
}
47-
4841
# Initialize the Neo4j driver
4942
driver = neo4j.GraphDatabase.driver(uri, auth=(user, password))
5043

5144
# Create the LLM instance
52-
llm = OpenAILLM(
53-
model_name="gpt-5",
54-
model_params=llm_model_params,
55-
)
45+
llm = OpenAILLM(model_name="gpt-5")
5646

5747
# Create the embedder instance
5848
embedder = OpenAIEmbeddings()

examples/build_graph/automatic_schema_extraction/simple_kg_builder_schema_from_text.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -55,20 +55,12 @@ async def run_kg_pipeline_with_auto_schema() -> None:
5555
user = os.getenv("NEO4J_USER", "neo4j")
5656
password = os.getenv("NEO4J_PASSWORD", "password")
5757

58-
# Define LLM parameters
59-
llm_model_params = {
60-
"max_tokens": 2000,
61-
"response_format": {"type": "json_object"},
62-
"temperature": 0, # Lower temperature for more consistent output
63-
}
64-
6558
# Initialize the Neo4j driver
6659
driver = neo4j.GraphDatabase.driver(uri, auth=(user, password))
6760

6861
# Create the LLM instance
6962
llm = OpenAILLM(
7063
model_name="gpt-5",
71-
model_params=llm_model_params,
7264
)
7365

7466
# Create the embedder instance

examples/build_graph/from_config_files/simple_kg_pipeline_config.json

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,7 @@
2424
"resolver_": "ENV",
2525
"var_": "OPENAI_API_KEY"
2626
},
27-
"model_name": "gpt-5",
28-
"model_params": {
29-
"temperature": 0,
30-
"max_tokens": 2000,
31-
"response_format": {"type": "json_object"}
32-
}
27+
"model_name": "gpt-5"
3328
}
3429
},
3530
"embedder_config": {

examples/build_graph/from_config_files/simple_kg_pipeline_config.yaml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,6 @@ llm_config:
1818
resolver_: ENV
1919
var_: OPENAI_API_KEY
2020
model_name: gpt-5
21-
model_params:
22-
temperature: 0
23-
max_tokens: 2000
24-
response_format:
25-
type: json_object
2621
embedder_config:
2722
class_: OpenAIEmbeddings
2823
params_:

examples/build_graph/from_config_files/simple_kg_pipeline_config_url.json

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,7 @@
2424
"resolver_": "ENV",
2525
"var_": "OPENAI_API_KEY"
2626
},
27-
"model_name": "gpt-5",
28-
"model_params": {
29-
"temperature": 0,
30-
"max_tokens": 2000,
31-
"response_format": {"type": "json_object"}
32-
}
27+
"model_name": "gpt-5"
3328
}
3429
},
3530
"embedder_config": {

examples/build_graph/simple_kg_builder_from_pdf.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,10 +65,6 @@ async def define_and_run_pipeline(
6565
async def main() -> PipelineResult:
6666
llm = OpenAILLM(
6767
model_name="gpt-5",
68-
model_params={
69-
"max_tokens": 2000,
70-
"response_format": {"type": "json_object"},
71-
},
7268
)
7369
with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver:
7470
res = await define_and_run_pipeline(driver, llm)

examples/build_graph/simple_kg_builder_from_text.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
logging.getLogger("neo4j_graphrag").setLevel(logging.DEBUG)
2626
# logging.getLogger("neo4j_graphrag").setLevel(logging.INFO)
2727

28-
2928
# Neo4j db infos
3029
URI = "neo4j://localhost:7687"
3130
AUTH = ("neo4j", "password")
@@ -42,7 +41,11 @@
4241
"Person",
4342
# ... or with a dict if more details are needed,
4443
# such as a description:
45-
{"label": "House", "description": "Family the person belongs to"},
44+
{
45+
"label": "House",
46+
"description": "Family the person belongs to",
47+
"properties": [{"name": "name", "type": "STRING"}],
48+
},
4649
# or a list of properties the LLM will try to attach to the entity:
4750
{"label": "Planet", "properties": [{"name": "weather", "type": "STRING"}]},
4851
]
@@ -93,10 +96,6 @@ async def define_and_run_pipeline(
9396
async def main() -> PipelineResult:
9497
llm = OpenAILLM(
9598
model_name="gpt-5",
96-
model_params={
97-
"max_tokens": 2000,
98-
"response_format": {"type": "json_object"},
99-
},
10099
)
101100
with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver:
102101
res = await define_and_run_pipeline(driver, llm)

src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,11 @@ def _get_schema(self) -> BaseSchemaBuilder:
186186
Return SchemaFromTextExtractor for automatic extraction or SchemaBuilder for manual schema.
187187
"""
188188
if not self.has_user_provided_schema():
189-
return SchemaFromTextExtractor(llm=self.get_default_llm())
189+
llm = self.get_default_llm()
190+
return SchemaFromTextExtractor(
191+
llm=llm,
192+
use_structured_output=llm.supports_structured_output,
193+
)
190194
return SchemaBuilder()
191195

192196
def _process_schema_with_precedence(self) -> dict[str, Any]:
@@ -218,10 +222,12 @@ def _get_run_params_for_schema(self) -> dict[str, Any]:
218222
return schema_dict
219223

220224
def _get_extractor(self) -> EntityRelationExtractor:
225+
llm = self.get_default_llm()
221226
return LLMEntityRelationExtractor(
222-
llm=self.get_default_llm(),
227+
llm=llm,
223228
prompt_template=self.prompt_template,
224229
on_error=self.on_error,
230+
use_structured_output=llm.supports_structured_output,
225231
)
226232

227233
def _get_pruner(self) -> GraphPruning:

0 commit comments

Comments (0)