@@ -121,16 +121,17 @@ def create_pdf_splits(
121121@dsl .component (
122122 base_image = PYTORCH_CUDA_IMAGE ,
123123 packages_to_install = [
124- "docling" ,
124+ "docling>=2.43.0 " ,
125125 "transformers" ,
126126 "sentence-transformers" ,
127127 "llama-stack" ,
128128 "llama-stack-client" ,
129129 "pymilvus" ,
130130 "fire" ,
131+ "rapidocr-onnxruntime" ,
131132 ],
132133)
133- def docling_convert_and_ingest (
134+ def docling_convert (
134135 input_path : dsl .InputPath ("input-pdfs" ),
135136 pdf_split : List [str ],
136137 output_path : dsl .OutputPath ("output-md" ),
@@ -142,14 +143,15 @@ def docling_convert_and_ingest(
142143 import pathlib
143144
144145 from docling .datamodel .base_models import InputFormat , ConversionStatus
145- from docling .datamodel .pipeline_options import PdfPipelineOptions
146+ from docling .datamodel .pipeline_options import PdfPipelineOptions , RapidOcrOptions
146147 from docling .document_converter import DocumentConverter , PdfFormatOption
147148 from transformers import AutoTokenizer
148149 from sentence_transformers import SentenceTransformer
149150 from docling .chunking import HybridChunker
150151 import logging
151152 from llama_stack_client import LlamaStackClient
152153 import uuid
154+
153155 import json
154156
155157 _log = logging .getLogger (__name__ )
@@ -239,6 +241,7 @@ def process_and_insert_embeddings(conv_results):
239241 pipeline_options = PdfPipelineOptions ()
240242 pipeline_options .do_ocr = True
241243 pipeline_options .generate_page_images = True
244+ pipeline_options .ocr_options = RapidOcrOptions ()
242245
243246 doc_converter = DocumentConverter (
244247 format_options = {
@@ -304,7 +307,7 @@ def docling_convert_pipeline(
304307
305308 with dsl .ParallelFor (pdf_splits .output ) as pdf_split :
306309 with dsl .If (use_gpu == True ):
307- convert_task = docling_convert_and_ingest (
310+ convert_task = docling_convert (
308311 input_path = import_task .output ,
309312 pdf_split = pdf_split ,
310313 embed_model_id = embed_model_id ,
@@ -331,7 +334,7 @@ def docling_convert_pipeline(
331334 )
332335 add_node_selector_json (convert_task , {})
333336 with dsl .Else ():
334- convert_task = docling_convert_and_ingest (
337+ convert_task = docling_convert (
335338 input_path = import_task .output ,
336339 pdf_split = pdf_split ,
337340 embed_model_id = embed_model_id ,
0 commit comments