Skip to content

Commit d5586ff

Browse files
authored
Reset OCR engine to tesseract (#78)
##
1 parent 37b97a0 commit d5586ff

7 files changed

Lines changed: 31 additions & 31 deletions

File tree

kubeflow-pipelines/common/components.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ def download_docling_models(
175175
Download Docling models based on pipeline type and configuration.
176176
177177
This unified component handles model downloading for different pipeline types:
178-
- standard : Download traditional Docling models (layout, tableformer, easyocr)
178+
- standard : Download traditional Docling models (layout, tableformer)
179179
- vlm : Download Docling VLM models (smolvlm, smoldocling) for local inference
180180
When remote_model_endpoint_enabled=True, downloads minimal models for remote inference
181181
@@ -200,7 +200,7 @@ def download_docling_models(
200200
progress=True,
201201
with_layout=True,
202202
with_tableformer=True,
203-
with_easyocr=True,
203+
with_easyocr=False,
204204
)
205205
elif pipeline_type == "vlm" and remote_model_endpoint_enabled:
206206
# VLM pipeline with remote model endpoint: Download minimal required models

kubeflow-pipelines/common/constants.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22

33
# Base container images used across all Docling Kubeflow Pipelines
44
PYTHON_BASE_IMAGE = os.getenv(
5-
"PYTHON_BASE_IMAGE", "quay.io/amaredia/aipcc-docling-image"
5+
"PYTHON_BASE_IMAGE", "registry.access.redhat.com/ubi9/python-311:9.6-1755074620"
66
)
77
DOCLING_BASE_IMAGE = os.getenv(
8-
"DOCLING_BASE_IMAGE", "quay.io/amaredia/aipcc-docling-image"
8+
"DOCLING_BASE_IMAGE", "quay.io/fabianofranz/docling-ubi9:2.54.0"
99
)

kubeflow-pipelines/docling-standard/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ The following configuration options are available as KFP parameters when you _Cr
1818
- `docling_image_export_mode`: Image export mode for the document. In `embedded` mode, the image is embedded as a base64-encoded string. In `placeholder` mode, only the position of the image is marked in the output. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.
1919
- `docling_num_threads`: Number of threads to be used internally by the Docling engine.
2020
- `docling_ocr`: If enabled, the bitmap content will be processed using OCR.
21-
- `docling_ocr_engine`: The OCR engine to use. Current values are: `easyocr`.
21+
- `docling_ocr_engine`: The OCR engine to use. Currently supported value: `tesseract_cli`.
2222
- `docling_pdf_backend`: The PDF backend to use: `pypdfium2`, `dlparse_v1`, `dlparse_v2`, or `dlparse_v4`.
2323
- `docling_table_mode`: The mode to use in the table structure model: `accurate` or `fast`.
2424
- `docling_timeout_per_document`: Timeout for each single document conversion.

kubeflow-pipelines/docling-standard/standard_components.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def docling_convert_standard(
2424
timeout_per_document: int = 300,
2525
ocr: bool = True,
2626
force_ocr: bool = False,
27-
ocr_engine: str = "easyocr",
27+
ocr_engine: str = "tesseract_cli",
2828
allow_external_plugins: bool = False,
2929
enrich_code: bool = False,
3030
enrich_formula: bool = False,

kubeflow-pipelines/docling-standard/standard_convert_pipeline.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def convert_pipeline(
3333
docling_timeout_per_document: int = 300,
3434
docling_ocr: bool = True,
3535
docling_force_ocr: bool = False,
36-
docling_ocr_engine: str = "easyocr",
36+
docling_ocr_engine: str = "tesseract_cli",
3737
docling_allow_external_plugins: bool = False,
3838
docling_enrich_code: bool = False,
3939
docling_enrich_formula: bool = False,

kubeflow-pipelines/docling-standard/standard_convert_pipeline_compiled.yaml

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
# docling_image_export_mode: str [Default: 'embedded']
1515
# docling_num_threads: int [Default: 4.0]
1616
# docling_ocr: bool [Default: True]
17-
# docling_ocr_engine: str [Default: 'easyocr']
17+
# docling_ocr_engine: str [Default: 'tesseract_cli']
1818
# docling_pdf_backend: str [Default: 'dlparse_v4']
1919
# docling_table_mode: str [Default: 'accurate']
2020
# docling_timeout_per_document: int [Default: 300.0]
@@ -158,7 +158,7 @@ components:
158158
isOptional: true
159159
parameterType: BOOLEAN
160160
ocr_engine:
161-
defaultValue: easyocr
161+
defaultValue: tesseract_cli
162162
description: Engine to use for OCR.
163163
isOptional: true
164164
parameterType: STRING
@@ -383,7 +383,7 @@ deploymentSpec:
383383
\ for path in input_path_p.glob(\"*.pdf\")]\n all_splits = [all_pdfs[i::num_splits]\
384384
\ for i in range(num_splits)]\n filled_splits = list(filter(None, all_splits))\n\
385385
\ return filled_splits\n\n"
386-
image: quay.io/amaredia/aipcc-docling-image
386+
image: registry.access.redhat.com/ubi9/python-311:9.6-1755074620
387387
exec-docling-chunk:
388388
container:
389389
args:
@@ -497,7 +497,7 @@ deploymentSpec:
497497
\ f\"docling-chunk: skipped {len(skipped_files)} invalid files:\"\
498498
,\n flush=True,\n )\n for filename, reason in skipped_files:\n\
499499
\ print(f\" - {filename}: {reason}\", flush=True)\n\n"
500-
image: quay.io/amaredia/aipcc-docling-image
500+
image: quay.io/fabianofranz/docling-ubi9:2.54.0
501501
resources:
502502
cpuLimit: 2.0
503503
cpuRequest: 0.25
@@ -539,7 +539,7 @@ deploymentSpec:
539539
\ image_export_mode: str = \"embedded\",\n table_mode: str = \"accurate\"\
540540
,\n num_threads: int = 4,\n timeout_per_document: int = 300,\n \
541541
\ ocr: bool = True,\n force_ocr: bool = False,\n ocr_engine: str =\
542-
\ \"easyocr\",\n allow_external_plugins: bool = False,\n enrich_code:\
542+
\ \"tesseract_cli\",\n allow_external_plugins: bool = False,\n enrich_code:\
543543
\ bool = False,\n enrich_formula: bool = False,\n enrich_picture_classes:\
544544
\ bool = False,\n enrich_picture_description: bool = False,\n):\n \
545545
\ \"\"\"\n Convert a list of PDF files to JSON and Markdown using Docling\
@@ -634,7 +634,7 @@ deploymentSpec:
634634
\ flush=True)\n result.document.save_as_markdown(\n output_md_path,\
635635
\ image_mode=ImageRefMode(image_export_mode)\n )\n\n print(\"\
636636
docling-standard-convert: done\", flush=True)\n\n"
637-
image: quay.io/amaredia/aipcc-docling-image
637+
image: quay.io/fabianofranz/docling-ubi9:2.54.0
638638
resources:
639639
cpuLimit: 4.0
640640
cpuRequest: 0.5
@@ -675,8 +675,8 @@ deploymentSpec:
675675
\ bool = False,\n):\n \"\"\"\n Download Docling models based on pipeline\
676676
\ type and configuration.\n\n This unified component handles model downloading\
677677
\ for different pipeline types:\n - standard : Download traditional Docling\
678-
\ models (layout, tableformer, easyocr)\n - vlm : Download Docling VLM\
679-
\ models (smolvlm, smoldocling) for local inference\n When remote_model_endpoint_enabled=True,\
678+
\ models (layout, tableformer)\n - vlm : Download Docling VLM models\
679+
\ (smolvlm, smoldocling) for local inference\n When remote_model_endpoint_enabled=True,\
680680
\ downloads minimal models for remote inference\n\n Args:\n output_path:\
681681
\ Path to the output directory for Docling models\n pipeline_type:\
682682
\ Type of pipeline (standard, vlm)\n remote_model_endpoint_enabled:\
@@ -688,9 +688,9 @@ deploymentSpec:
688688
\n if pipeline_type == \"standard\":\n # Standard pipeline: download\
689689
\ traditional models\n download_models(\n output_dir=output_path_p,\n\
690690
\ progress=True,\n with_layout=True,\n \
691-
\ with_tableformer=True,\n with_easyocr=True,\n )\n \
692-
\ elif pipeline_type == \"vlm\" and remote_model_endpoint_enabled:\n \
693-
\ # VLM pipeline with remote model endpoint: Download minimal required\
691+
\ with_tableformer=True,\n with_easyocr=False,\n )\n \
692+
\ elif pipeline_type == \"vlm\" and remote_model_endpoint_enabled:\n \
693+
\ # VLM pipeline with remote model endpoint: Download minimal required\
694694
\ models\n # Only models set are what lives in fabianofranz repo\n\
695695
\ # TODO: figure out what needs to be downloaded or removed\n \
696696
\ download_models(\n output_dir=output_path_p,\n \
@@ -711,7 +711,7 @@ deploymentSpec:
711711
\ )\n else:\n raise ValueError(\n f\"Invalid\
712712
\ pipeline_type: {pipeline_type}. Must be 'standard' or 'vlm'\"\n \
713713
\ )\n\n"
714-
image: quay.io/amaredia/aipcc-docling-image
714+
image: quay.io/fabianofranz/docling-ubi9:2.54.0
715715
exec-import-pdfs:
716716
container:
717717
args:
@@ -805,7 +805,7 @@ deploymentSpec:
805805
\ in resp.iter_content(chunk_size=8192):\n if chunk:\n\
806806
\ f.write(chunk)\n\n print(\"import-test-pdfs:\
807807
\ done\", flush=True)\n\n"
808-
image: quay.io/amaredia/aipcc-docling-image
808+
image: registry.access.redhat.com/ubi9/python-311:9.6-1755074620
809809
pipelineInfo:
810810
description: Docling standard convert pipeline by the Data Processing Team
811811
name: data-processing-docling-standard-pipeline
@@ -972,7 +972,7 @@ root:
972972
isOptional: true
973973
parameterType: BOOLEAN
974974
docling_ocr_engine:
975-
defaultValue: easyocr
975+
defaultValue: tesseract_cli
976976
isOptional: true
977977
parameterType: STRING
978978
docling_pdf_backend:

kubeflow-pipelines/docling-vlm/vlm_convert_pipeline_compiled.yaml

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,7 @@ deploymentSpec:
301301
\ for path in input_path_p.glob(\"*.pdf\")]\n all_splits = [all_pdfs[i::num_splits]\
302302
\ for i in range(num_splits)]\n filled_splits = list(filter(None, all_splits))\n\
303303
\ return filled_splits\n\n"
304-
image: quay.io/amaredia/aipcc-docling-image
304+
image: registry.access.redhat.com/ubi9/python-311:9.6-1755074620
305305
exec-docling-chunk:
306306
container:
307307
args:
@@ -415,7 +415,7 @@ deploymentSpec:
415415
\ f\"docling-chunk: skipped {len(skipped_files)} invalid files:\"\
416416
,\n flush=True,\n )\n for filename, reason in skipped_files:\n\
417417
\ print(f\" - {filename}: {reason}\", flush=True)\n\n"
418-
image: quay.io/amaredia/aipcc-docling-image
418+
image: quay.io/fabianofranz/docling-ubi9:2.54.0
419419
resources:
420420
cpuLimit: 2.0
421421
cpuRequest: 0.25
@@ -547,7 +547,7 @@ deploymentSpec:
547547
\ result.document.save_as_markdown(\n output_md_path,\
548548
\ image_mode=ImageRefMode(image_export_mode)\n )\n\n print(\"\
549549
docling-vlm-convert: done\", flush=True)\n\n"
550-
image: quay.io/amaredia/aipcc-docling-image
550+
image: quay.io/fabianofranz/docling-ubi9:2.54.0
551551
resources:
552552
cpuLimit: 4.0
553553
cpuRequest: 0.5
@@ -588,8 +588,8 @@ deploymentSpec:
588588
\ bool = False,\n):\n \"\"\"\n Download Docling models based on pipeline\
589589
\ type and configuration.\n\n This unified component handles model downloading\
590590
\ for different pipeline types:\n - standard : Download traditional Docling\
591-
\ models (layout, tableformer, easyocr)\n - vlm : Download Docling VLM\
592-
\ models (smolvlm, smoldocling) for local inference\n When remote_model_endpoint_enabled=True,\
591+
\ models (layout, tableformer)\n - vlm : Download Docling VLM models\
592+
\ (smolvlm, smoldocling) for local inference\n When remote_model_endpoint_enabled=True,\
593593
\ downloads minimal models for remote inference\n\n Args:\n output_path:\
594594
\ Path to the output directory for Docling models\n pipeline_type:\
595595
\ Type of pipeline (standard, vlm)\n remote_model_endpoint_enabled:\
@@ -601,9 +601,9 @@ deploymentSpec:
601601
\n if pipeline_type == \"standard\":\n # Standard pipeline: download\
602602
\ traditional models\n download_models(\n output_dir=output_path_p,\n\
603603
\ progress=True,\n with_layout=True,\n \
604-
\ with_tableformer=True,\n with_easyocr=True,\n )\n \
605-
\ elif pipeline_type == \"vlm\" and remote_model_endpoint_enabled:\n \
606-
\ # VLM pipeline with remote model endpoint: Download minimal required\
604+
\ with_tableformer=True,\n with_easyocr=False,\n )\n \
605+
\ elif pipeline_type == \"vlm\" and remote_model_endpoint_enabled:\n \
606+
\ # VLM pipeline with remote model endpoint: Download minimal required\
607607
\ models\n # Only models set are what lives in fabianofranz repo\n\
608608
\ # TODO: figure out what needs to be downloaded or removed\n \
609609
\ download_models(\n output_dir=output_path_p,\n \
@@ -624,7 +624,7 @@ deploymentSpec:
624624
\ )\n else:\n raise ValueError(\n f\"Invalid\
625625
\ pipeline_type: {pipeline_type}. Must be 'standard' or 'vlm'\"\n \
626626
\ )\n\n"
627-
image: quay.io/amaredia/aipcc-docling-image
627+
image: quay.io/fabianofranz/docling-ubi9:2.54.0
628628
exec-import-pdfs:
629629
container:
630630
args:
@@ -718,7 +718,7 @@ deploymentSpec:
718718
\ in resp.iter_content(chunk_size=8192):\n if chunk:\n\
719719
\ f.write(chunk)\n\n print(\"import-test-pdfs:\
720720
\ done\", flush=True)\n\n"
721-
image: quay.io/amaredia/aipcc-docling-image
721+
image: registry.access.redhat.com/ubi9/python-311:9.6-1755074620
722722
pipelineInfo:
723723
description: Docling VLM convert pipeline by the Data Processing Team
724724
name: data-processing-docling-vlm-pipeline

0 commit comments

Comments
 (0)