We will constrain the LLM output to this schema so that it can be parsed reliably.
This is a first draft that has not been discussed with anyone yet, so it probably still has issues:
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "title": "DocNode",
  "type": "object",
  "additionalProperties": false,
  "required": ["label", "type"],
  "properties": {
    "label": {
      "description": "Label of the node, e.g. '1.2' or '1bis' or 'a)' etc. May be empty.",
      "type": "string"
    },
    "type": {
      "description": "Type of the node.",
      "type": "string",
      "enum": [
        "document",
        "heading",
        "content",
        "list",
        "list_item",
        "footnote",
        "image"
      ]
    },
    "content": {
      "description": "Array of content strings. Usually only one string, but may be more in case of multiple paragraphs.",
      "type": "array",
      "items": { "type": "string" }
    },
    "children": {
      "description": "Array of nested nodes.",
      "type": "array",
      "items": { "$ref": "#" }
    }
  }
}
And here is an example document that conforms to it:
{
  "label": "",
  "type": "document",
  "content": [],
  "children": [
    {
      "label": "1",
      "type": "heading",
      "content": ["Introduction"],
      "children": [
        {
          "label": "",
          "type": "content",
          "content": ["This is the first paragraph under the introduction."],
          "children": []
        },
        {
          "label": "",
          "type": "list",
          "content": [],
          "children": [
            {
              "label": "a)",
              "type": "list_item",
              "content": ["This is a point in a list under the first paragraph."],
              "children": []
            },
            {
              "label": "b)",
              "type": "list_item",
              "content": ["This is another point in the list."],
              "children": []
            }
          ]
        },
        {
          "label": "1.1",
          "type": "heading",
          "content": ["Subsection"],
          "children": [
            {
              "label": "",
              "type": "content",
              "content": ["Some subsection content."],
              "children": []
            }
          ]
        }
      ]
    }
  ]
}
We will constrain the LLM output to this schema so that it can be parsed reliably.
This is a first draft that has not been discussed with anyone yet, so it probably still has issues:
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "title": "DocNode",
  "type": "object",
  "properties": {
    "label": {
      "type": "string",
      "description": "Label of the node, e.g. '1.2' or '1bis' or 'a)' etc. May be empty."
    },
    "type": {
      "type": "string",
      "enum": ["document", "heading", "content", "list", "list_item", "footnote", "image"],
      "description": "Type of the node."
    },
    "content": {
      "type": "array",
      "items": {
        "type": "string"
      },
      "description": "Array of content strings. Usually only one string, but may be more in case of multiple paragraphs."
    },
    "children": {
      "type": "array",
      "items": {
        "$ref": "#"
      },
      "description": "Array of nested nodes."
    }
  },
  "required": ["label", "type"],
  "additionalProperties": false
}
And an example document for that:
{
  "label": "",
  "type": "document",
  "content": [],
  "children": [
    {
      "label": "1",
      "type": "heading",
      "content": ["Introduction"],
      "children": [
        {
          "label": "",
          "type": "content",
          "content": ["This is the first paragraph under the introduction."],
          "children": []
        },
        {
          "label": "",
          "type": "list",
          "content": [],
          "children": [
            {
              "label": "a)",
              "type": "list_item",
              "content": ["This is a point in a list under the first paragraph."],
              "children": []
            },
            {
              "label": "b)",
              "type": "list_item",
              "content": ["This is another point in the list."],
              "children": []
            }
          ]
        },
        {
          "label": "1.1",
          "type": "heading",
          "content": ["Subsection"],
          "children": [
            {
              "label": "",
              "type": "content",
              "content": ["Some subsection content."],
              "children": []
            }
          ]
        }
      ]
    }
  ]
}