Open
Description
Hi team,
Appreciate the work done in this repo.
I was curious how I can use a different backend (PyPdfiumDocumentBackend) other than the default backend (docling-parse).
I am trying it as follows, but I don't see much difference in timing or output.
import spacy
from spacy_layout import spaCyLayout
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
# Path to the PDF document to process.
source = "path/to/document.pdf"
nlp = spacy.blank("en")

# PDF pipeline settings: OCR off, table-structure recognition on,
# cell matching off.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = False

# `docling_options` must be the format-options mapping itself, keyed by
# InputFormat. Wrapping it in an extra {"format_options": ...} layer leaves no
# InputFormat.PDF key at the top level, so the custom backend is never picked
# up and the default PDF backend is used instead.
# NOTE(review): spaCyLayout appears to forward this dict unchanged as
# DocumentConverter(format_options=...) -- confirm against the installed
# spacy-layout version.
layout = spaCyLayout(
    nlp,
    docling_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
            backend=PyPdfiumDocumentBackend,
        ),
    },
)
doc = layout(source)