Skip to content

Commit 12a1881

Browse files
committed
feat: add basic document description fields
Signed-off-by: Cesar Berrospi Ramis <[email protected]>
1 parent be2e4cb commit 12a1881

File tree

2 files changed

+67
-3
lines changed

2 files changed

+67
-3
lines changed

docling_core/types/doc/document.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import mimetypes
55
import re
66
import typing
7+
from datetime import datetime
78
from io import BytesIO
89
from typing import Any, Dict, Final, List, Optional, Tuple, Union
910

@@ -23,7 +24,7 @@
2324
from typing_extensions import Annotated, Self
2425

2526
from docling_core.search.package import VERSION_PATTERN
26-
from docling_core.types.base import _JSON_POINTER_REGEX
27+
from docling_core.types.base import _JSON_POINTER_REGEX, UniqueList
2728
from docling_core.types.doc import BoundingBox, Size
2829
from docling_core.types.doc.labels import DocItemLabel, GroupLabel
2930
from docling_core.types.legacy_doc.tokens import DocumentToken
@@ -757,7 +758,16 @@ class PageItem(BaseModel):
757758

758759

759760
class DescriptionItem(BaseModel):
760-
"""DescriptionItem."""
761+
"""Metadata fields describing a document."""
762+
763+
title: Optional[str] = None
764+
author: Optional[str] = None
765+
company: Optional[str] = None
766+
category: Optional[str] = None
767+
keywords: Optional[UniqueList[str]] = None
768+
comment: Optional[str] = None
769+
created: Optional[datetime] = None
770+
modified: Optional[datetime] = None
761771

762772

763773
class DoclingDocument(BaseModel):
@@ -767,7 +777,7 @@ class DoclingDocument(BaseModel):
767777
version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
768778
CURRENT_VERSION
769779
)
770-
description: DescriptionItem
780+
description: Optional[DescriptionItem] = None
771781
name: str # The working name of this document, without extensions
772782
# (could be taken from originating doc, or just "Untitled 1")
773783
origin: Optional[DocumentOrigin] = (

test/test_docling_doc.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from collections import deque
2+
from datetime import datetime, timezone
23

34
import pytest
45
import yaml
@@ -410,3 +411,56 @@ def test_version_doc():
410411
description=DescriptionItem(), name="Untitled 1", version=comp_version
411412
)
412413
assert doc.version == CURRENT_VERSION
414+
415+
416+
def test_description_item():
417+
# no description
418+
doc = DoclingDocument(name="Test document")
419+
assert doc.description is None
420+
421+
# set description
422+
desc = DescriptionItem(
423+
title=(
424+
"DocLayNet: A Large Human-Annotated Dataset for Document-Layout " "Analysis"
425+
),
426+
author=(
427+
"Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and "
428+
"Peter Staar"
429+
),
430+
company="IBM Research",
431+
category=(
432+
"- Information systems -> Document structure; - Applied computing "
433+
"-> Document analysis; - Computing methodologies -> Machine learning; "
434+
"Computer vision; Object detection;"
435+
),
436+
keywords=[
437+
"PDF document conversion",
438+
"layout segmentation",
439+
"object-detection",
440+
"data set",
441+
"Machine Learning",
442+
],
443+
created=datetime(
444+
year=2022,
445+
month=6,
446+
day=15,
447+
hour=12,
448+
minute=57,
449+
second=56,
450+
tzinfo=timezone.utc,
451+
),
452+
modified=datetime.now(tz=timezone.utc),
453+
)
454+
doc = DoclingDocument(description=desc, name="Test document")
455+
assert doc.description is not None
456+
457+
# duplicate keywords are rejected (keywords must be unique)
458+
with pytest.raises(ValidationError, match="unique"):
459+
desc = DescriptionItem(
460+
keywords=["Machine Learning", "Machine Learning", "layout segmentation"]
461+
)
462+
463+
# valid date coercion (ISO date string, POSIX timestamp) followed by an invalid one
464+
desc = DescriptionItem(created="2024-10-16", modified=1729077211.437628)
465+
with pytest.raises(ValidationError, match="valid datetime"):
466+
desc = DescriptionItem(created=False)

0 commit comments

Comments
 (0)