Skip to content

Docling >=0.50 vs == 0.49 breaks for images in tables #2223

@geoHeil

Description

@geoHeil

When upgrading from 0.49 to 0.50, I observe a change in Docling's behaviour.

This change seems to be much more impactful than just switching to the new Heron model. See below.

Input is (preview here - full document below)

Image

For 0.49 the result is

With Heron

Image detected but no longer correctly referenced.

No longer parsable with heuristics afterwards.

Image

Without Heron

Image correctly detected and classified and referenced.

However, the table structure is slightly distorted (but still parsable afterwards with sufficient heuristics).

Image

For 0.50 the result is

With Heron

Image is not there anymore, even the image classification is missing; no hotfix afterwards/heuristics possible

Image

Without Heron

Image is not there anymore, even the image classification is missing; no hotfix afterwards/heuristics possible

Image

NOTICE: This is independent of Heron (the major change that happened).

So what is going on?

See the code to reproduce (plus attached dat file).

You have to install/change the library versions accordingly as needed.

Model download and imports

from docling_core.types.doc.document import DoclingDocument
from docling_core.types.io import DocumentStream
from pypdf import PdfReader

import psutil
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
    RapidOcrOptions,
    TableFormerMode,
    ThreadedPdfPipelineOptions
)
from docling.document_converter import DocumentConverter, PdfFormatOption

import os
from pathlib import Path

import requests
import torch
from docling.utils.model_downloader import download_models
from huggingface_hub import hf_hub_download

from modelscope import snapshot_download
from tqdm import tqdm
# from wand.image import Image as WImage
from IPython.display import display, HTML

from docling.datamodel.pipeline_options import LayoutOptions
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_HERON, DOCLING_LAYOUT_V2
from docling.models.layout_model import LayoutModel
from docling_core.types.doc import ImageRefMode

# Base directory for all model caches; the MODEL_CACHE_DIR environment
# variable overrides the per-user default of ~/.cache.
DEFAULT_CACHE_DIR = Path.home().joinpath(".cache")
CACHE_DIR = Path(os.environ.get("MODEL_CACHE_DIR", DEFAULT_CACHE_DIR))
PADDLEOCR_MODELS_PATH = CACHE_DIR.joinpath("paddleocr_v4_native")

# Separate hub directories for Hugging Face and ModelScope downloads.
hf_model_path = CACHE_DIR.joinpath("huggingface/hub")
modelscope_model_path = CACHE_DIR.joinpath("modelscope/hub")

# Create both hub directories up front (no error if they already exist).
for _hub_dir in (hf_model_path, modelscope_model_path):
    _hub_dir.mkdir(parents=True, exist_ok=True)

# String path of the "onnx" folder inside the RapidOCR ModelScope snapshot.
rapidocr_models_root = os.fspath(
    modelscope_model_path.joinpath("RapidAI/RapidOCR/onnx")
)

def download_modelscope(model_id, model_revision, cache_dir, allow_patterns):
    """Download a filtered ModelScope snapshot and log what was fetched.

    Prints the request parameters, calls ``snapshot_download`` with them,
    and then prints the location that call returned. Returns ``None``.

    Args:
        model_id: Model identifier, forwarded as ``model_id``.
        model_revision: Revision to fetch, forwarded as ``revision``.
        cache_dir: Target directory, forwarded as ``cache_dir``.
        allow_patterns: File patterns limiting the download, forwarded as
            ``allow_patterns``.
    """
    request_summary = (
        "--- Starting Filtered Model Download ---",
        f"Model ID: {model_id}",
        f"Revision: {model_revision}",
        f"Target Directory: {cache_dir}",
        f"Allow Patterns: {allow_patterns}",
        "--------------------------------------",
    )
    for summary_line in request_summary:
        print(summary_line)

    snapshot_location = snapshot_download(
        model_id=model_id,
        revision=model_revision,
        cache_dir=cache_dir,
        allow_patterns=allow_patterns,
    )

    for result_line in (
        "\n✅ Model downloaded successfully!",
        f"Model files are located at: {snapshot_location}",
    ):
        print(result_line)


# Files requested from the RapidAI/RapidOCR ModelScope repository: the
# configuration, the detection / recognition / classification ONNX models,
# and one font file.
rapidocr_model_files = [
    "configuration.json",
    "onnx/PP-OCRv5/det/ch_PP-OCRv5_server_det.onnx",
    "onnx/PP-OCRv5/rec/latin_PP-OCRv5_rec_mobile_infer.onnx",
    "onnx/PP-OCRv4/cls/ch_ppocr_mobile_v2.0_cls_infer.onnx",
    "resources/fonts/FZYTK.TTF",
]
download_modelscope(
    "RapidAI/RapidOCR",
    "v3.4.0",
    modelscope_model_path,
    rapidocr_model_files,
)

# Run docling's model downloader with a progress display and EasyOCR disabled.
download_models(with_easyocr=False, progress=True)

# Download the V2 layout model into its repo folder under the Hugging Face
# hub directory; force=False is passed through unchanged.
_layout_v2_dir = hf_model_path / DOCLING_LAYOUT_V2.model_repo_folder
LayoutModel.download_models(
    local_dir=_layout_v2_dir,
    progress=True,
    force=False,
)

document parser

def setup_ocr(model_spec=None):
    """Build a ``DocumentConverter`` for PDFs with RapidOCR and table parsing.

    The pipeline enables OCR (RapidOCR with the locally downloaded ONNX
    models), table structure recognition in ACCURATE mode with cell
    matching, picture classification and picture image generation.

    Args:
        model_spec: Optional layout model specification (for example
            ``DOCLING_LAYOUT_HERON`` or ``DOCLING_LAYOUT_V2``). When given it
            is applied through ``pipeline_options.layout_options``; when
            ``None`` the layout options are left at their defaults.

    Returns:
        A ``DocumentConverter`` whose PDF format option carries the pipeline
        options configured here, whether or not ``model_spec`` was given.
    """
    # psutil documents that cpu_count() may return None when the count cannot
    # be determined; fall back to 0 so the minimum of two threads applies.
    physical_cores = psutil.cpu_count(logical=False) or 0
    usable_cores = max(2, physical_cores)

    IMAGE_RESOLUTION_SCALE = 4

    pipeline_options = ThreadedPdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = False
    pipeline_options.do_picture_classification = True
    pipeline_options.generate_picture_images = True

    det_model_path = os.path.join(
        rapidocr_models_root, "PP-OCRv5/det/ch_PP-OCRv5_server_det.onnx"
    )
    rec_model_path = os.path.join(
        rapidocr_models_root, "PP-OCRv5/rec/latin_PP-OCRv5_rec_mobile_infer.onnx"
    )
    cls_model_path = os.path.join(
        rapidocr_models_root, "PP-OCRv4/cls/ch_ppocr_mobile_v2.0_cls_infer.onnx"
    )
    # The download allow-list requests the font as "resources/fonts/FZYTK.TTF"
    # relative to the repository root, i.e. beside the "onnx" folder rather
    # than inside it, so resolve it against the parent of the ONNX root.
    rec_font_path = os.path.join(
        os.path.dirname(rapidocr_models_root), "resources/fonts/FZYTK.TTF"
    )

    pipeline_options.ocr_options = RapidOcrOptions(
        det_model_path=det_model_path,
        rec_model_path=rec_model_path,
        cls_model_path=cls_model_path,
        rec_font_path=rec_font_path,
    )

    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
    pipeline_options.ocr_options.lang = ["en"]

    pipeline_options.accelerator_options = AcceleratorOptions(
        num_threads=usable_cores, device=AcceleratorDevice.AUTO
    )

    if model_spec is not None:
        # NOTE(review): the layout model is selected via the pipeline options'
        # ``layout_options`` field. The previous code passed ``layout_options``
        # to ``PdfFormatOption``, which is presumably not a field of that
        # class and would be ignored -- confirm against the installed docling.
        pipeline_options.layout_options = LayoutOptions(model_spec=model_spec)

    # Always hand the configured options to the converter. Previously the
    # no-model_spec path built a bare PdfFormatOption() and dropped them all.
    return DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            ),
        }
    )

With Heron

# Convert the sample PDF using the Heron layout model spec, write the result
# as HTML next to the input, and render the HTML export inline.
input_path = "WO2024040109-eval-small"

doc_converter = setup_ocr(model_spec=DOCLING_LAYOUT_HERON)
conv_res = doc_converter.convert(input_path + ".pdf")
heron_document = conv_res.document
heron_document.save_as_html(input_path + ".html")
display(HTML(heron_document.export_to_html()))

Without Heron

# Convert the same sample PDF using the V2 layout model spec, write the result
# to a separate "_old" HTML file, and render the HTML export inline.
input_path = "WO2024040109-eval-small"

doc_converter = setup_ocr(model_spec=DOCLING_LAYOUT_V2)
conv_res = doc_converter.convert(input_path + ".pdf")
v2_document = conv_res.document
v2_document.save_as_html(input_path + "_old.html")
display(HTML(v2_document.export_to_html()))

The document which was used for this testing is

WO2024040109-eval-small.pdf

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions