-
Notifications
You must be signed in to change notification settings - Fork 3k
Description
When upgrading from 0.49 to 0.50, I observe a change in docling's behaviour.
This change seems to be much more impactful than just switching to the new Heron model; see below.
Input is (preview here - full document below)

For 0.49 the result is
With Heron
Image detected but no longer correctly referenced.
No longer parsable with heuristics afterwards.

Without Heron
Image correctly detected and classified and referenced.
However, the table structure is slightly distorted (but still parsable afterwards with sufficient heuristics).

For 0.50 the result is
With Heron
The image is no longer there, and even the image classification is missing; no after-the-fact hotfix or heuristic is possible.

Without Heron
The image is no longer there, and even the image classification is missing; no after-the-fact hotfix or heuristic is possible.

NOTICE: This is independent of Heron (the major change that happened).
So what is going on?
See the code to reproduce (plus attached dat file).
You have to install/change the library versions accordingly as needed.
Model download and imports
from docling_core.types.doc.document import DoclingDocument
from docling_core.types.io import DocumentStream
from pypdf import PdfReader
import psutil
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
RapidOcrOptions,
TableFormerMode,
ThreadedPdfPipelineOptions
)
from docling.document_converter import DocumentConverter, PdfFormatOption
import os
from pathlib import Path
import requests
import torch
from docling.utils.model_downloader import download_models
from huggingface_hub import hf_hub_download
from modelscope import snapshot_download
from tqdm import tqdm
# from wand.image import Image as WImage
from IPython.display import display, HTML
from docling.datamodel.pipeline_options import LayoutOptions
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_HERON, DOCLING_LAYOUT_V2
from docling.models.layout_model import LayoutModel
from docling_core.types.doc import ImageRefMode
# Cache root: ~/.cache unless the MODEL_CACHE_DIR environment variable is set.
DEFAULT_CACHE_DIR = Path.home() / ".cache"
CACHE_DIR = Path(os.environ.get("MODEL_CACHE_DIR", DEFAULT_CACHE_DIR))

# Sub-directories of the cache root used by the individual model sources.
PADDLEOCR_MODELS_PATH = CACHE_DIR / "paddleocr_v4_native"
hf_model_path = CACHE_DIR / "huggingface/hub"
modelscope_model_path = CACHE_DIR / "modelscope/hub"

# Create both hub directories up front; already-existing ones are accepted.
for _cache_subdir in (hf_model_path, modelscope_model_path):
    _cache_subdir.mkdir(parents=True, exist_ok=True)

# Kept as a plain string because it is later combined via os.path.join.
rapidocr_models_root = str(modelscope_model_path / "RapidAI/RapidOCR/onnx")
def download_modelscope(model_id, model_revision, cache_dir, allow_patterns):
    """Download a filtered ModelScope snapshot and report progress on stdout.

    Args:
        model_id: Identifier of the ModelScope model to download.
        model_revision: Revision passed to ``snapshot_download`` as ``revision``.
        cache_dir: Directory passed to ``snapshot_download`` as ``cache_dir``.
        allow_patterns: Patterns passed to ``snapshot_download`` as
            ``allow_patterns``.
    """
    banner = (
        "--- Starting Filtered Model Download ---",
        f"Model ID: {model_id}",
        f"Revision: {model_revision}",
        f"Target Directory: {cache_dir}",
        f"Allow Patterns: {allow_patterns}",
        "--------------------------------------",
    )
    for banner_line in banner:
        print(banner_line)

    snapshot_dir = snapshot_download(
        model_id=model_id,
        revision=model_revision,
        cache_dir=cache_dir,
        allow_patterns=allow_patterns,
    )

    print("\n✅ Model downloaded successfully!")
    print(f"Model files are located at: {snapshot_dir}")
# Files of the RapidAI/RapidOCR repository that are requested from ModelScope.
rapidocr_model_files = [
    "configuration.json",
    "onnx/PP-OCRv5/det/ch_PP-OCRv5_server_det.onnx",
    "onnx/PP-OCRv5/rec/latin_PP-OCRv5_rec_mobile_infer.onnx",
    "onnx/PP-OCRv4/cls/ch_ppocr_mobile_v2.0_cls_infer.onnx",
    "resources/fonts/FZYTK.TTF",
]

# RapidOCR files go into the ModelScope hub directory.
download_modelscope(
    "RapidAI/RapidOCR",
    "v3.4.0",
    modelscope_model_path,
    rapidocr_model_files,
)

# docling's own model download, with EasyOCR switched off.
download_models(with_easyocr=False, progress=True)

# Layout model download for the DOCLING_LAYOUT_V2 spec into the Hugging Face
# hub directory; force=False is passed through unchanged.
LayoutModel.download_models(
    progress=True,
    force=False,
    local_dir=hf_model_path / DOCLING_LAYOUT_V2.model_repo_folder,
)
Document parser
def setup_ocr(model_spec=None):
    """Build a PDF ``DocumentConverter`` with RapidOCR, table structure and picture export.

    The pipeline options configured here (OCR, image scale, picture
    classification and image generation, table structure, accelerator) are
    always handed to the converter, regardless of ``model_spec``.

    Args:
        model_spec: Layout model specification, e.g. ``DOCLING_LAYOUT_HERON``
            or ``DOCLING_LAYOUT_V2``. When falsy, the layout options of the
            pipeline are left untouched.

    Returns:
        A ``DocumentConverter`` configured for ``InputFormat.PDF``.
    """
    # psutil documents that cpu_count() may return None when the count cannot
    # be determined; fall back to 0 so the minimum of two threads applies.
    physical_cores = psutil.cpu_count(logical=False) or 0
    usable_cores = max(2, physical_cores)
    IMAGE_RESOLUTION_SCALE = 4

    pipeline_options = ThreadedPdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = False
    pipeline_options.do_picture_classification = True
    pipeline_options.generate_picture_images = True

    # Locally downloaded RapidOCR model files (detector, recognizer,
    # classifier) and the font file, all below rapidocr_models_root.
    det_model_path = os.path.join(
        rapidocr_models_root, "PP-OCRv5/det/ch_PP-OCRv5_server_det.onnx"
    )
    rec_model_path = os.path.join(
        rapidocr_models_root, "PP-OCRv5/rec/latin_PP-OCRv5_rec_mobile_infer.onnx"
    )
    cls_model_path = os.path.join(
        rapidocr_models_root, "PP-OCRv4/cls/ch_ppocr_mobile_v2.0_cls_infer.onnx"
    )
    rec_font_path = os.path.join(rapidocr_models_root, "resources/fonts/FZYTK.TTF")
    pipeline_options.ocr_options = RapidOcrOptions(
        det_model_path=det_model_path,
        rec_model_path=rec_model_path,
        cls_model_path=cls_model_path,
        rec_font_path=rec_font_path,
    )
    pipeline_options.ocr_options.lang = ["en"]

    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE

    pipeline_options.accelerator_options = AcceleratorOptions(
        num_threads=usable_cores, device=AcceleratorDevice.AUTO
    )

    if model_spec:
        # The layout model is selected on the pipeline options. Previously it
        # was passed as a ``layout_options`` keyword to PdfFormatOption.
        # NOTE(review): PdfFormatOption does not appear to declare that field,
        # so the requested model was presumably ignored -- confirm against the
        # installed docling version.
        pipeline_options.layout_options = LayoutOptions(model_spec=model_spec)

    # Always pass the configured options; the former no-model_spec branch built
    # PdfFormatOption() without them and so discarded every setting above.
    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
        }
    )
    return doc_converter
With Heron
# Convert the sample PDF with DOCLING_LAYOUT_HERON passed as the layout model
# spec, write the HTML export to disk and render it inline.
input_path = 'WO2024040109-eval-small'
doc_converter = setup_ocr(model_spec=DOCLING_LAYOUT_HERON)
conv_res = doc_converter.convert(f'{input_path}.pdf')
heron_document = conv_res.document
heron_document.save_as_html(f"{input_path}.html")
display(HTML(heron_document.export_to_html()))
Without Heron
# Convert the same PDF with DOCLING_LAYOUT_V2 passed as the layout model spec;
# the HTML export goes to a separate "_old" file and is rendered inline.
input_path = 'WO2024040109-eval-small'
doc_converter = setup_ocr(model_spec=DOCLING_LAYOUT_V2)
conv_res = doc_converter.convert(f'{input_path}.pdf')
v2_document = conv_res.document
v2_document.save_as_html(f"{input_path}_old.html")
display(HTML(v2_document.export_to_html()))
The document which was used for this testing is