Skip to content

Commit 36c6432

Browse files
committed
first draft toc-based pdf header hierarchy
1 parent 4852d8b commit 36c6432

File tree

7 files changed

+350
-10
lines changed

7 files changed

+350
-10
lines changed

docling/models/header_hierarchy/__init__.py

Whitespace-only changes.
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
import re
2+
from collections.abc import Generator
3+
from contextlib import contextmanager
4+
from functools import cached_property
5+
from io import BytesIO
6+
from logging import Logger
7+
from pathlib import Path, PurePath
8+
from typing import Optional, Union
9+
10+
from docling_core.types.doc import BoundingBox, DocItemLabel, ListItem, TextItem
11+
12+
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
13+
from docling.datamodel.base_models import DocumentStream, PageElement
14+
from docling.datamodel.document import ConversionResult
15+
from docling.models.readingorder_model import ReadingOrderPageElement
16+
17+
from .types.hierarchical_header import HierarchicalHeader
18+
19+
import logging

# Obtain the module logger through the logging registry. Instantiating
# ``Logger`` directly creates a logger that is detached from the logger
# hierarchy, so handlers and levels configured by the application would
# never apply to it.
logger = logging.getLogger(__name__)
20+
21+
22+
class HeaderNotFoundException(Exception):
    """Signal that a heading could not be located in the document."""

    def __init__(self, heading: str):
        # Compose the message first so the base-class call stays short.
        message = f"Following heading was not found in the document: {heading}"
        super().__init__(message)
25+
26+
27+
class ImplausibleHeadingStructureException(Exception):
    """Signal that an equal-level heading has no parent to attach to."""

    def __init__(self) -> None:
        message = (
            "Hierarchy demands equal level heading, but no common parent was found!"
        )
        super().__init__(message)
32+
33+
34+
class PdfBackendIncompatible(Exception):
    """Signal that the given backend is not a ``DoclingParseV4DocumentBackend``."""

    def __init__(self, backend) -> None:
        # The message reports the runtime type of the offending backend object.
        message = f"The selected backend is '{type(backend)}' instead of 'DoclingParseV4DocumentBackend'."
        super().__init__(message)
39+
40+
41+
class InvalidSourceTypeException(Exception):
    """Exception type for an invalid source type; adds nothing to ``Exception``."""
43+
44+
45+
class HierarchyBuilderMetadata:
    """Build a heading tree from the table of contents reported by the PDF parser.

    Each table-of-contents entry is matched against the text elements of the
    converted document (in reading order). Matched elements are relabelled as
    section headers and arranged into a ``HierarchicalHeader`` tree according
    to the level given by the table of contents.
    """

    # Everything except ASCII letters and digits is ignored when a TOC title is
    # compared with the text of a page element.
    # NOTE(review): titles without any ASCII letters/digits normalise to the
    # empty string and therefore match any element that normalises to the
    # empty string as well - confirm whether non-Latin documents matter here.
    _NON_ALNUM = re.compile(r"[^A-Za-z0-9]")

    def __init__(
        self,
        conv_res: ConversionResult,
        sorted_elements: list[ReadingOrderPageElement],
        raise_on_error: bool = False,
    ):
        """Collect the table of contents and the element lookup tables.

        Args:
            conv_res (ConversionResult): Conversion result whose input backend
                must be a ``DoclingParseV4DocumentBackend``.
            sorted_elements (list[ReadingOrderPageElement]): Elements in the
                order in which they are searched for headings.
            raise_on_error (bool, optional): Raise ``HeaderNotFoundException``
                for a TOC title without a matching element instead of logging
                a warning. Defaults to False.

        Raises:
            PdfBackendIncompatible: If the input backend has another type.
        """
        from docling_parse.pdf_parser import DoclingPdfParser

        backend = conv_res.input._backend
        if not isinstance(backend, DoclingParseV4DocumentBackend):
            raise PdfBackendIncompatible(backend)
        pdf_parser: DoclingPdfParser = backend.parser
        # NOTE(review): presumably the parser yields an empty/None value for
        # PDFs without an outline - confirm. Normalising to a list keeps the
        # iteration in ``_iterate_toc`` safe either way.
        self.toc_meta: list[dict] = (
            pdf_parser.parser.get_table_of_contents(pdf_parser.list_loaded_keys()[0])
            or []
        )
        self.conv_res: ConversionResult = conv_res
        self.all_elements: list[PageElement] = conv_res.assembled.elements
        # Reference strings of the form "#/<page_no>/<cluster id>", one per
        # assembled element and in the same order as ``all_elements``.
        self.all_cids: list[str] = [
            f"#/{element.page_no}/{element.cluster.id}" for element in self.all_elements
        ]
        self.sorted_ro_elements: list[ReadingOrderPageElement] = sorted_elements
        self.raise_on_error: bool = raise_on_error
        self.cid_to_page_element: dict[str, PageElement] = dict(
            zip(self.all_cids, self.all_elements)
        )

    def _iterate_toc(self, toc_list: Optional[list[dict]] = None) -> Generator:
        """Yield ``(level, title)`` for every TOC entry, depth first.

        Args:
            toc_list: Entries to walk; defaults to ``self.toc_meta``. Each
                entry is read through its "level" and "title" keys and an
                optional "children" list of further entries.
        """
        if toc_list is None:
            toc_list = self.toc_meta
        # ``or ()`` tolerates a missing table of contents (None).
        for toc_el in toc_list or ():
            yield toc_el["level"], toc_el["title"]
            # A missing, None or empty "children" entry means: no sub-entries.
            children = toc_el.get("children")
            if children:
                yield from self._iterate_toc(children)

    def _find_heading(self, title: str, start: int):
        """Locate the text element whose text equals ``title`` after normalisation.

        The scan starts at position ``start`` of ``self.sorted_ro_elements``
        and wraps around to the beginning, so every element is inspected at
        most once while elements at or after ``start`` are preferred.

        Returns:
            ``(index, element, original text, cref)`` of the first match, or
            ``None`` when no text element matches.
        """
        from docling.datamodel.base_models import TextElement

        wanted = self._NON_ALNUM.sub("", title)
        total = len(self.sorted_ro_elements)
        for offset in range(total):
            index = (start + offset) % total
            ro_element = self.sorted_ro_elements[index]
            element = self.cid_to_page_element[ro_element.ref.cref]
            # Future to do: better to look for an overlap with the 'to' pointer
            # of the TOC entry if possible.
            if not isinstance(element, TextElement):
                continue
            orig_text = "".join(c.orig for c in element.cluster.cells)
            if self._NON_ALNUM.sub("", orig_text) == wanted:
                return index, element, orig_text, ro_element.ref.cref
        return None

    @staticmethod
    def _select_parent(current, level):
        """Return the node under which a heading of ``level`` must be inserted.

        Args:
            current: The most recently inserted node (or the root).
            level: TOC level of the heading that is about to be inserted.

        Raises:
            ImplausibleHeadingStructureException: If ``current`` has the same
                level but no parent.
        """
        if current.level_toc is None or level > current.level_toc:
            # Deeper than the current node (or current is the root): nest.
            return current
        if level == current.level_toc:
            # Same level: becomes a sibling of the current node.
            if current.parent is None:
                raise ImplausibleHeadingStructureException()
            return current.parent
        # Shallower: climb until an ancestor with a smaller level (or the
        # root, which has no parent) is reached.
        candidate = current
        while candidate.parent is not None and level <= candidate.level_toc:
            candidate = candidate.parent
        return candidate

    def infer(self) -> HierarchicalHeader:
        """Match all TOC entries to document elements and build the heading tree.

        Matched elements get the label ``DocItemLabel.SECTION_HEADER``.

        Returns:
            The root node (without text) of the resulting tree.

        Raises:
            HeaderNotFoundException: If a title has no matching element and
                ``raise_on_error`` is set; otherwise the title is skipped with
                a warning.
            ImplausibleHeadingStructureException: See ``_select_parent``.
        """
        root = HierarchicalHeader()
        current = root

        # NOTE: the font information of PdfTextCell would be needed here as
        # well, together with the ordered text elements (ideally with self
        # refs).

        # Position right after the previously matched heading. Keeping it
        # across TOC entries lets repeated titles resolve to successive
        # occurrences instead of always hitting the first one.
        cursor = 0
        for level, title in self._iterate_toc():
            match = self._find_heading(title, cursor)
            if match is None:
                error = HeaderNotFoundException(title)
                if self.raise_on_error:
                    raise error
                logger.warning(error)
                continue
            index, this_element, orig_text, ref = match
            cursor = index + 1

            if this_element.label != DocItemLabel.SECTION_HEADER:
                this_element.label = DocItemLabel.SECTION_HEADER

            new_parent = self._select_parent(current, level)
            new_obj = HierarchicalHeader(
                text=orig_text,
                parent=new_parent,
                level_toc=level,
                doc_ref=ref,
            )
            new_parent.children.append(new_obj)
            current = new_obj

        return root

docling/models/header_hierarchy/style_based_hierarchy.py

Whitespace-only changes.

docling/models/header_hierarchy/types/__init__.py

Whitespace-only changes.
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
from enum import Enum
2+
3+
4+
class NumberingLevel(Enum):
    """Kinds of numbering level; each value equals its member name."""

    # The values double as attribute names (they are looked up with getattr
    # on header objects), so they must stay in sync with those attributes.
    level_latin = "level_latin"
    level_alpha = "level_alpha"
    level_numerical = "level_numerical"
8+
9+
10+
class StyleAttributes(Enum):
    """Style attribute identifiers; each value equals its member name."""

    font_size = "font_size"
    bold = "bold"
    italic = "italic"
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
from dataclasses import dataclass, field
2+
from typing import Optional, Union
3+
4+
from .enums import NumberingLevel, StyleAttributes
5+
6+
7+
class UnkownNumberingLevel(Exception):
    """Signal that a value is not one of the ``NumberingLevel`` members."""

    def __init__(self, level_name: NumberingLevel):
        # The message lists the accepted members next to the rejected value.
        message = f"Level kind must be one of {NumberingLevel.__members__.values()}, not '{level_name}'."
        super().__init__(message)
12+
13+
14+
@dataclass
class HierarchicalHeader:
    """Node of a heading tree, linked to its parent and its children.

    A node created without arguments serves as the root of a tree: it has no
    text, no parent and no document reference.
    """

    index: Optional[int] = None
    # Level of the heading as given by a table of contents.
    level_toc: Optional[int] = None
    level_fontsize: Optional[int] = None
    style_attrs: list[StyleAttributes] = field(default_factory=list)
    # Numbering levels; the attribute names match the NumberingLevel values
    # because ``last_level_of_kind`` reads them with getattr.
    level_latin: list[int] = field(default_factory=list)
    level_alpha: list[int] = field(default_factory=list)
    level_numerical: list[int] = field(default_factory=list)
    parent: Optional["HierarchicalHeader"] = None
    children: list["HierarchicalHeader"] = field(default_factory=list)
    # Reference string of the document element this heading belongs to.
    doc_ref: Optional[str] = None
    text: Optional[str] = None

    def __post_init__(self):
        # Cache mapping doc_ref -> parent doc_ref for this subtree; it is
        # built on the first call of ``get_parent_cid_of``.
        self._doc_ref_to_parent_doc_ref: Optional[
            dict[Union[str, None], Union[str, None]]
        ] = None

    def any_level(self) -> bool:
        """Return True if any of the three numbering-level lists is non-empty."""
        return bool(self.level_latin or self.level_alpha or self.level_numerical)

    def last_level_of_kind(
        self, kind: NumberingLevel
    ) -> tuple[list[int], Union["HierarchicalHeader", None]]:
        """Find the nearest ancestor with a non-empty numbering level of ``kind``.

        Returns:
            The level list and the ancestor holding it, or ``([], None)`` when
            no ancestor has a level of this kind.

        Raises:
            UnkownNumberingLevel: If ``kind`` is not a ``NumberingLevel`` member.
        """
        if kind not in NumberingLevel.__members__.values():
            raise UnkownNumberingLevel(kind)
        if self.parent:
            if last := getattr(self.parent, kind.value):
                return last, self.parent
            return self.parent.last_level_of_kind(kind)
        return [], None

    def string_repr(self, prefix: str = "") -> str:
        """Render this subtree, one heading per line.

        Each level of depth adds one space to ``prefix``; nodes without text
        contribute no line of their own.
        """
        own_line = prefix + self.text + "\n" if self.text else ""
        child_prefix = prefix + " "
        return own_line + "".join(
            child.string_repr(child_prefix) for child in self.children
        )

    def __str__(self) -> str:
        return self.string_repr()

    def _build_doc_ref_to_parent_doc_ref(
        self,
    ) -> dict[Union[str, None], Union[str, None]]:
        """(Re)build and return the doc_ref -> parent doc_ref map of this subtree.

        As a side effect every descendant rebuilds its own map as well. A
        node whose parent is missing or has no ``doc_ref`` maps to ``None``.
        """
        self._doc_ref_to_parent_doc_ref = {}
        for child in self.children:
            self._doc_ref_to_parent_doc_ref.update(
                child._build_doc_ref_to_parent_doc_ref()
            )
        if self.parent is not None and self.parent.doc_ref is not None:
            self._doc_ref_to_parent_doc_ref[self.doc_ref] = self.parent.doc_ref
        else:
            self._doc_ref_to_parent_doc_ref[self.doc_ref] = None
        return self._doc_ref_to_parent_doc_ref

    def get_parent_cid_of(self, doc_ref: str) -> Union[str, None]:
        """Return the parent's ``doc_ref`` for the heading with ``doc_ref``.

        The lookup table is built on first use and then reused, so changes
        made to the tree afterwards are not reflected.

        Raises:
            KeyError: If ``doc_ref`` does not occur in this subtree.
        """
        if self._doc_ref_to_parent_doc_ref is None:
            self._build_doc_ref_to_parent_doc_ref()
        return self._doc_ref_to_parent_doc_ref[doc_ref]

0 commit comments

Comments
 (0)