Skip to content

Commit 3c3a231

Browse files
Enable structured output for SimpleKGPipeline (#481)
* Add support for Structured Output in SimpleKGPipeline
* Update unit test
* Update CHANGELOG
* Update docs
* Update examples
* Fix llm fixture
* Fix llm fixture in e2e tests
* Fix invalid json files
1 parent d1b8f0f commit 3c3a231

File tree

13 files changed

+76
-56
lines changed

13 files changed

+76
-56
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
### Changed
66

77
- Updated examples, default values, and documentation to use `gpt-4.1` / `gpt-4.1-mini` instead of deprecated GPT-4* models (e.g. `gpt-4o`, `gpt-4`).
8+
- **Breaking**: `SimpleKGPipeline` now automatically enables structured output when the `LLMInterface` supports structured output (so far, `OpenAILLM`, `VertexAILLM`). This takes precedence over any `response_format` configured in `model_params` (e.g., `{"type": "json_object"}`), which will be ignored.
89

910
## 1.13.1
1011

docs/source/user_guide_kg_builder.rst

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,24 @@ To write to a non-default Neo4j database, specify the database name using this p
190190
# ...
191191
)
192192
193+
Structured Output
194+
-----------------
195+
196+
When the configured LLM declares support for structured output (i.e., ``supports_structured_output = True``,
197+
which is the case for :ref:`OpenAILLM <openaillm>` and :ref:`VertexAILLM <vertexaillm>`),
198+
``SimpleKGPipeline`` automatically enables structured output for both entity extraction and
199+
(when auto-extracting) schema generation. This enforces schema conformance at the API level,
200+
improving reliability over prompt-based JSON parsing.
201+
202+
.. note::
203+
204+
Structured output takes precedence over any ``response_format`` set in ``model_params``
205+
when instantiating the LLM. For example, ``{"type": "json_object"}`` will be ignored
206+
in favour of structured output for supported LLMs.
207+
208+
For more details on how structured output works at the component level, see
209+
:ref:`Using Structured Output <using-structured-output>` in the Entity and Relation Extractor section.
210+
193211
Using Custom Components
194212
-----------------------
195213

@@ -354,8 +372,7 @@ Below is an example of configuring an LLM in a JSON configuration file:
354372
},
355373
"model_params": {
356374
"temperature": 0,
357-
"max_tokens": 2000,
358-
"response_format": {"type": "json_object"}
375+
"max_tokens": 2000
359376
}
360377
}
361378
}
@@ -375,8 +392,6 @@ And the equivalent YAML:
375392
model_params:
376393
temperature: 0
377394
max_tokens: 2000
378-
response_format:
379-
type: json_object
380395
381396
- The `class_` key specifies the path to the class to be instantiated.
382397
- The `params_` key contains the parameters to be passed to the class constructor.
@@ -968,13 +983,15 @@ It can be used in this way:
968983

969984
Using `OpenAILLM` requires the `openai` Python client. You can install it with `pip install "neo4j_graphrag[openai]"`.
970985

971-
.. warning::
986+
.. note::
972987

973-
The `LLMEntityRelationExtractor` works better if `"response_format": {"type": "json_object"}` is in the model parameters.
988+
For :ref:`OpenAILLM <openaillm>` and :ref:`VertexAILLM <vertexaillm>`, structured output is recommended over ``"response_format": {"type": "json_object"}`` for improved reliability. See :ref:`Using Structured Output <using-structured-output>` below.
974989

975990
The LLM to use can be customized, the only constraint is that it obeys the :ref:`LLMInterface <llminterface>`.
976991

977992

993+
.. _using-structured-output:
994+
978995
Using Structured Output
979996
-----------------------
980997

examples/build_graph/automatic_schema_extraction/simple_kg_builder_schema_from_pdf.py

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -38,21 +38,11 @@ async def run_kg_pipeline_with_auto_schema() -> None:
3838
user = os.getenv("NEO4J_USER", "neo4j")
3939
password = os.getenv("NEO4J_PASSWORD", "password")
4040

41-
# Define LLM parameters
42-
llm_model_params = {
43-
"max_tokens": 2000,
44-
"response_format": {"type": "json_object"},
45-
"temperature": 0, # Lower temperature for more consistent output
46-
}
47-
4841
# Initialize the Neo4j driver
4942
driver = neo4j.GraphDatabase.driver(uri, auth=(user, password))
5043

5144
# Create the LLM instance
52-
llm = OpenAILLM(
53-
model_name="gpt-5",
54-
model_params=llm_model_params,
55-
)
45+
llm = OpenAILLM(model_name="gpt-5")
5646

5747
# Create the embedder instance
5848
embedder = OpenAIEmbeddings()

examples/build_graph/automatic_schema_extraction/simple_kg_builder_schema_from_text.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -55,20 +55,12 @@ async def run_kg_pipeline_with_auto_schema() -> None:
5555
user = os.getenv("NEO4J_USER", "neo4j")
5656
password = os.getenv("NEO4J_PASSWORD", "password")
5757

58-
# Define LLM parameters
59-
llm_model_params = {
60-
"max_tokens": 2000,
61-
"response_format": {"type": "json_object"},
62-
"temperature": 0, # Lower temperature for more consistent output
63-
}
64-
6558
# Initialize the Neo4j driver
6659
driver = neo4j.GraphDatabase.driver(uri, auth=(user, password))
6760

6861
# Create the LLM instance
6962
llm = OpenAILLM(
7063
model_name="gpt-5",
71-
model_params=llm_model_params,
7264
)
7365

7466
# Create the embedder instance

examples/build_graph/from_config_files/simple_kg_pipeline_config.json

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,7 @@
2424
"resolver_": "ENV",
2525
"var_": "OPENAI_API_KEY"
2626
},
27-
"model_name": "gpt-5",
28-
"model_params": {
29-
"temperature": 0,
30-
"max_tokens": 2000,
31-
"response_format": {"type": "json_object"}
32-
}
27+
"model_name": "gpt-5"
3328
}
3429
},
3530
"embedder_config": {

examples/build_graph/from_config_files/simple_kg_pipeline_config.yaml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,6 @@ llm_config:
1818
resolver_: ENV
1919
var_: OPENAI_API_KEY
2020
model_name: gpt-5
21-
model_params:
22-
temperature: 0
23-
max_tokens: 2000
24-
response_format:
25-
type: json_object
2621
embedder_config:
2722
class_: OpenAIEmbeddings
2823
params_:

examples/build_graph/from_config_files/simple_kg_pipeline_config_url.json

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,7 @@
2424
"resolver_": "ENV",
2525
"var_": "OPENAI_API_KEY"
2626
},
27-
"model_name": "gpt-5",
28-
"model_params": {
29-
"temperature": 0,
30-
"max_tokens": 2000,
31-
"response_format": {"type": "json_object"}
32-
}
27+
"model_name": "gpt-5"
3328
}
3429
},
3530
"embedder_config": {

examples/build_graph/simple_kg_builder_from_pdf.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,10 +65,6 @@ async def define_and_run_pipeline(
6565
async def main() -> PipelineResult:
6666
llm = OpenAILLM(
6767
model_name="gpt-5",
68-
model_params={
69-
"max_tokens": 2000,
70-
"response_format": {"type": "json_object"},
71-
},
7268
)
7369
with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver:
7470
res = await define_and_run_pipeline(driver, llm)

examples/build_graph/simple_kg_builder_from_text.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
logging.getLogger("neo4j_graphrag").setLevel(logging.DEBUG)
2626
# logging.getLogger("neo4j_graphrag").setLevel(logging.INFO)
2727

28-
2928
# Neo4j db infos
3029
URI = "neo4j://localhost:7687"
3130
AUTH = ("neo4j", "password")
@@ -42,7 +41,11 @@
4241
"Person",
4342
# ... or with a dict if more details are needed,
4443
# such as a description:
45-
{"label": "House", "description": "Family the person belongs to"},
44+
{
45+
"label": "House",
46+
"description": "Family the person belongs to",
47+
"properties": [{"name": "name", "type": "STRING"}],
48+
},
4649
# or a list of properties the LLM will try to attach to the entity:
4750
{"label": "Planet", "properties": [{"name": "weather", "type": "STRING"}]},
4851
]
@@ -93,10 +96,6 @@ async def define_and_run_pipeline(
9396
async def main() -> PipelineResult:
9497
llm = OpenAILLM(
9598
model_name="gpt-5",
96-
model_params={
97-
"max_tokens": 2000,
98-
"response_format": {"type": "json_object"},
99-
},
10099
)
101100
with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver:
102101
res = await define_and_run_pipeline(driver, llm)

src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,11 @@ def _get_schema(self) -> BaseSchemaBuilder:
186186
Return SchemaFromTextExtractor for automatic extraction or SchemaBuilder for manual schema.
187187
"""
188188
if not self.has_user_provided_schema():
189-
return SchemaFromTextExtractor(llm=self.get_default_llm())
189+
llm = self.get_default_llm()
190+
return SchemaFromTextExtractor(
191+
llm=llm,
192+
use_structured_output=llm.supports_structured_output,
193+
)
190194
return SchemaBuilder()
191195

192196
def _process_schema_with_precedence(self) -> dict[str, Any]:
@@ -218,10 +222,12 @@ def _get_run_params_for_schema(self) -> dict[str, Any]:
218222
return schema_dict
219223

220224
def _get_extractor(self) -> EntityRelationExtractor:
225+
llm = self.get_default_llm()
221226
return LLMEntityRelationExtractor(
222-
llm=self.get_default_llm(),
227+
llm=llm,
223228
prompt_template=self.prompt_template,
224229
on_error=self.on_error,
230+
use_structured_output=llm.supports_structured_output,
225231
)
226232

227233
def _get_pruner(self) -> GraphPruning:

0 commit comments

Comments (0)