Skip to content

Commit 37982d9

Browse files
committed
add DocTags serialization, revert smiles to smi to prevent confusion with plural
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 5aa768f commit 37982d9

File tree

7 files changed

+45
-30
lines changed

7 files changed

+45
-30
lines changed

docling_core/transforms/serializer/doctags.py

Lines changed: 35 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
PictureTabularChartData,
4545
ProvenanceItem,
4646
SectionHeaderItem,
47+
TableData,
4748
TableItem,
4849
TextItem,
4950
)
@@ -233,13 +234,22 @@ def serialize(
233234
ysize=params.ysize,
234235
)
235236

236-
classifications = [
237-
ann
238-
for ann in item.annotations
239-
if isinstance(ann, PictureClassificationData)
240-
]
241-
if len(classifications) > 0:
237+
# handle classification data
238+
predicted_class: Optional[str] = None
239+
if item.meta and item.meta.classification:
240+
predicted_class = (
241+
item.meta.classification.get_main_prediction().class_name
242+
)
243+
elif (
244+
classifications := [
245+
ann
246+
for ann in item.annotations
247+
if isinstance(ann, PictureClassificationData)
248+
]
249+
) and classifications[0].predicted_classes:
242250
predicted_class = classifications[0].predicted_classes[0].class_name
251+
if predicted_class:
252+
body += DocumentToken.get_picture_classification_token(predicted_class)
243253
if predicted_class in [
244254
PictureClassificationLabel.PIE_CHART,
245255
PictureClassificationLabel.BAR_CHART,
@@ -250,26 +260,31 @@ def serialize(
250260
PictureClassificationLabel.HEATMAP,
251261
]:
252262
is_chart = True
253-
body += DocumentToken.get_picture_classification_token(predicted_class)
254263

255-
smiles_annotations = [
264+
# handle molecule data
265+
smi: Optional[str] = None
266+
if item.meta and item.meta.molecule:
267+
smi = item.meta.molecule.smi
268+
elif smiles_annotations := [
256269
ann for ann in item.annotations if isinstance(ann, PictureMoleculeData)
257-
]
258-
if len(smiles_annotations) > 0:
259-
body += _wrap(
260-
text=smiles_annotations[0].smi, wrap_tag=DocumentToken.SMILES.value
261-
)
262-
263-
tabular_chart_annotations = [
270+
]:
271+
smi = smiles_annotations[0].smi
272+
if smi:
273+
body += _wrap(text=smi, wrap_tag=DocumentToken.SMILES.value)
274+
275+
# handle tabular chart data
276+
chart_data: Optional[TableData] = None
277+
if item.meta and item.meta.tabular_chart:
278+
chart_data = item.meta.tabular_chart.chart_data
279+
elif tabular_chart_annotations := [
264280
ann
265281
for ann in item.annotations
266282
if isinstance(ann, PictureTabularChartData)
267-
]
268-
if len(tabular_chart_annotations) > 0:
283+
]:
284+
chart_data = tabular_chart_annotations[0].chart_data
285+
if chart_data and chart_data.table_cells:
269286
temp_doc = DoclingDocument(name="temp")
270-
temp_table = temp_doc.add_table(
271-
data=tabular_chart_annotations[0].chart_data
272-
)
287+
temp_table = temp_doc.add_table(data=chart_data)
273288
otsl_content = temp_table.export_to_otsl(
274289
temp_doc, add_cell_location=False
275290
)

docling_core/transforms/serializer/html.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -865,7 +865,7 @@ def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]:
865865
elif isinstance(field_val, PictureClassificationMetaField):
866866
txt = self._humanize_text(field_val.get_main_prediction().class_name)
867867
elif isinstance(field_val, MoleculeMetaField):
868-
txt = field_val.smiles
868+
txt = field_val.smi
869869
elif isinstance(field_val, TabularChartMetaField):
870870
# suppressing tabular chart serialization
871871
return None

docling_core/transforms/serializer/markdown.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,7 @@ def _serialize_meta_field(
313313
elif isinstance(field_val, PictureClassificationMetaField):
314314
txt = self._humanize_text(field_val.get_main_prediction().class_name)
315315
elif isinstance(field_val, MoleculeMetaField):
316-
txt = field_val.smiles
316+
txt = field_val.smi
317317
elif isinstance(field_val, TabularChartMetaField):
318318
# suppressing tabular chart serialization
319319
return None

docling_core/types/doc/document.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1017,7 +1017,7 @@ class SummaryMetaField(BasePrediction):
10171017
text: str
10181018

10191019

1020-
# NOTE: should be manually kept in sync with top-level BaseMeta hierarchy fields
1020+
# NOTE: must be manually kept in sync with top-level BaseMeta hierarchy fields
10211021
class MetaFieldName(str, Enum):
10221022
"""Standard meta field names."""
10231023

@@ -1069,7 +1069,7 @@ def get_main_prediction(self) -> PictureClassificationPrediction:
10691069
class MoleculeMetaField(BasePrediction):
10701070
"""Molecule metadata field."""
10711071

1072-
smiles: str = Field(description="The SMILES representation of the molecule.")
1072+
smi: str = Field(description="The SMILES representation of the molecule.")
10731073

10741074

10751075
class TabularChartMetaField(BasePrediction):
@@ -1643,7 +1643,7 @@ def _migrate_annotations_to_meta(cls, data: Any) -> Any:
16431643
data["meta"].setdefault(
16441644
MetaFieldName.MOLECULE.value,
16451645
MoleculeMetaField(
1646-
smiles=ann.smi,
1646+
smi=ann.smi,
16471647
confidence=ann.confidence,
16481648
created_by=ann.provenance,
16491649
**{

docs/DoclingDocument.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1421,14 +1421,14 @@
14211421
],
14221422
"title": "Created By"
14231423
},
1424-
"smiles": {
1424+
"smi": {
14251425
"description": "The SMILES representation of the molecule.",
1426-
"title": "Smiles",
1426+
"title": "Smi",
14271427
"type": "string"
14281428
}
14291429
},
14301430
"required": [
1431-
"smiles"
1431+
"smi"
14321432
],
14331433
"title": "MoleculeMetaField",
14341434
"type": "object"

test/data/doc/dummy_doc_2_prec.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ pictures:
107107
- 1.0
108108
- - 1.0
109109
- 1.0
110-
smiles: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1
110+
smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1
111111
parent:
112112
$ref: '#/body'
113113
prov:

test/data/doc/dummy_doc_with_meta_modified.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ pictures:
107107
- 1.0
108108
- - 1.0
109109
- 1.0
110-
smiles: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1
110+
smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1
111111
parent:
112112
$ref: '#/body'
113113
prov:

0 commit comments

Comments
 (0)