diff --git a/docling/models/stages/ocr/auto_ocr_model.py b/docling/models/stages/ocr/auto_ocr_model.py index ef2dcbb151..90bfe3830c 100644 --- a/docling/models/stages/ocr/auto_ocr_model.py +++ b/docling/models/stages/ocr/auto_ocr_model.py @@ -95,27 +95,6 @@ def __init__( except ImportError: _log.info("easyocr cannot be used because it is not installed.") - if self._engine is None: - try: - import torch - from rapidocr import EngineType, RapidOCR # type: ignore - - self._engine = RapidOcrModel( - enabled=self.enabled, - artifacts_path=artifacts_path, - options=RapidOcrOptions( - backend="torch", - bitmap_area_threshold=self.options.bitmap_area_threshold, - force_full_page_ocr=self.options.force_full_page_ocr, - ), - accelerator_options=accelerator_options, - ) - _log.info("Auto OCR model selected rapidocr with torch.") - except ImportError: - _log.info( - "rapidocr cannot be used because rapidocr or torch is not installed." - ) - if self._engine is None: _log.warning("No OCR engine found. Please review the install details.") diff --git a/pyproject.toml b/pyproject.toml index a53af5f652..54a6608137 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -243,7 +243,7 @@ cli = [ # CONVENIENCE BUNDLES # ============================================================================ standard = [ - 'docling-slim[format-pdf,models-local,feat-ocr-rapidocr,format-office,format-web,format-latex,format-email,feat-chunking,extract-core,service-client,cli]', + 'docling-slim[format-pdf,models-local,feat-ocr-rapidocr-onnx,format-office,format-web,format-latex,format-email,feat-chunking,extract-core,service-client,cli]', ] all = [ diff --git a/tests/test_options.py b/tests/test_options.py index ce3d8aa5aa..bf23399484 100644 --- a/tests/test_options.py +++ b/tests/test_options.py @@ -123,6 +123,13 @@ def test_kserve_v2_binary_data_deprecated_alias(): assert options.use_binary_data is True +def test_standard_extra_installs_rapidocr_onnx(): + pyproject = Path("pyproject.toml").read_text() + + assert "feat-ocr-rapidocr-onnx" in pyproject + assert "models-local,feat-ocr-rapidocr," not in pyproject + + def test_e2e_conversions(test_doc_path): for converter in get_converters_with_table_options(): print(f"converting {test_doc_path}") diff --git a/uv.lock b/uv.lock index c624d50609..a99ca1a0cf 100644 --- a/uv.lock +++ b/uv.lock @@ -1476,7 +1476,7 @@ all = [ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.4.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "ocrmac", marker = "sys_platform == 'darwin'" }, - { name = "onnxruntime", marker = "python_full_version < '3.14' and sys_platform == 'darwin'" }, + { name = "onnxruntime", marker = "python_full_version < '3.14'" }, { name = "onnxruntime-gpu", marker = "(python_full_version < '3.14' and sys_platform == 'linux') or (python_full_version < '3.14' and sys_platform == 'win32')" }, { name = "openai-whisper" }, { name = "openpyxl" }, @@ -1646,6 +1646,7 @@ standard = [ { name = "marko" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.4.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "onnxruntime", marker = "python_full_version < '3.14'" }, { name = "openpyxl" }, { name = "pillow" }, { name = "polyfactory" }, @@ -1757,7 +1758,7 @@ requires-dist = [ { name = "docling-slim", extras = ["format-docx", "format-pptx", "format-xlsx"], marker = "extra == 'format-office'" }, { name = "docling-slim", extras = ["format-html"], marker = "extra == 'format-email'" }, { name = "docling-slim", extras = ["format-html", "format-markdown"], marker = "extra == 'format-web'" }, - { name = "docling-slim", extras = ["format-pdf", "models-local", "feat-ocr-rapidocr", "format-office", "format-web", "format-latex", "format-email", "feat-chunking", "extract-core", "service-client", "cli"], marker = "extra == 'standard'" }, + { name = "docling-slim", extras = ["format-pdf", "models-local", "feat-ocr-rapidocr-onnx", "format-office", "format-web", "format-latex", "format-email", "feat-chunking", "extract-core", "service-client", "cli"], marker = "extra == 'standard'" }, { name = "docling-slim", extras = ["format-pdf-pypdfium2", "format-pdf-docling"], marker = "extra == 'format-pdf'" }, { name = "docling-slim", extras = ["standard", "models-vlm-inline", "format-audio", "format-html-render", "format-xml-xbrl", "models-remote", "models-onnxruntime", "feat-ocr-easyocr", "feat-ocr-tesserocr", "feat-ocr-mac"], marker = "extra == 'all'" }, { name = "easyocr", marker = "extra == 'feat-ocr-easyocr'", specifier = ">=1.7,<2.0" },