
Commit 2cc9889

Merge pull request #574 from eshwarprasadS/docling-version-bump
Update Docling version and improve OCR options handling with new docling ver.
2 parents 7150fde + e0c469d commit 2cc9889

7 files changed: 161 additions & 93 deletions


pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 [build-system]
-requires = ["setuptools>=64", "setuptools_scm>=8"]
+requires = ["setuptools>=78.1.0", "setuptools_scm>=8"]
 build-backend = "setuptools.build_meta"
 
 [project]

requirements.txt

Lines changed: 2 additions & 3 deletions
@@ -1,9 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 click>=8.1.7,<9.0.0
 datasets>=2.18.0
-docling[tesserocr]>=2.4.2,<=2.8.3; sys_platform != 'darwin'
-docling>=2.4.2,<=2.8.3; sys_platform == 'darwin'
-docling-parse>=2.0.0,<3.0.0
+docling[tesserocr]>=2.18.0; sys_platform != 'darwin'
+docling>=2.18.0; sys_platform == 'darwin'
 GitPython>=3.1.42,<4.0.0
 gguf>=0.6.0
 httpx>=0.25.0,<1.0.0
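Note that the docling-parse pin disappears entirely: taxonomy.py (below) no longer imports pdf_parser_v1 directly, and the docling floor rises to 2.18.0 with the old <=2.8.3 ceiling dropped on both platforms.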

src/instructlab/sdg/utils/chunkers.py

Lines changed: 38 additions & 8 deletions
@@ -4,13 +4,17 @@
 from typing import Dict, Iterable, List, Optional
 import json
 import logging
+import os
 import re
+import sys
 
 # Third Party
 from datasets import Dataset
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
+    AcceleratorOptions,
     EasyOcrOptions,
     OcrOptions,
     PdfPipelineOptions,
@@ -35,29 +39,50 @@ def _num_chars_from_tokens(num_tokens) -> int:
     return int(num_tokens * 4)  # 1 token ~ 4 English character
 
 
-def resolve_ocr_options() -> OcrOptions:
+def resolve_ocr_options(
+    docling_model_path: Optional[Path] = None,
+) -> Optional[OcrOptions]:
+    # Declare ocr_options explicitly as Optional[OcrOptions]
+    ocr_options: Optional[OcrOptions] = None
+
     # First, attempt to use tesserocr
     try:
         ocr_options = TesseractOcrOptions()
         # pylint: disable=import-outside-toplevel
         # Third Party
         from docling.models.tesseract_ocr_model import TesseractOcrModel
 
-        _ = TesseractOcrModel(True, ocr_options)
+        _ = TesseractOcrModel(
+            enabled=True,
+            artifacts_path=docling_model_path,
+            options=ocr_options,
+            accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU),
+        )
         return ocr_options
     except ImportError:
         # No tesserocr, so try something else
-        pass
+        logger.warning("Tesseract not found, falling back to EasyOCR.")
+
     try:
-        ocr_options = EasyOcrOptions()
-        # Keep easyocr models on the CPU instead of GPU
-        ocr_options.use_gpu = False
+        ocr_options = EasyOcrOptions(
+            lang=["en"],
+            use_gpu=None,
+            confidence_threshold=0.5,
+            model_storage_directory=str(docling_model_path),
+            recog_network="standard",
+            download_enabled=True,
+        )
         # triggers torch loading, import lazily
         # pylint: disable=import-outside-toplevel
         # Third Party
         from docling.models.easyocr_model import EasyOcrModel
 
-        _ = EasyOcrModel(True, ocr_options)
+        _ = EasyOcrModel(
+            enabled=True,
+            artifacts_path=None,
+            options=ocr_options,
+            accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU),
+        )
         return ocr_options
     except ImportError:
         # no easyocr either, so don't use any OCR
@@ -127,7 +152,12 @@ def _init_docling_converter(self):
             do_ocr=False,
         )
 
-        ocr_options = resolve_ocr_options()
+        # deactivate MPS acceleration on Github CI
+        if os.getenv("CI") and sys.platform == "darwin":
+            pipeline_options.accelerator_options = AcceleratorOptions(
+                device=AcceleratorDevice.CPU
+            )
+        ocr_options = resolve_ocr_options(docling_model_path=self.docling_model_path)
         if ocr_options is not None:
             pipeline_options.do_ocr = True
             pipeline_options.ocr_options = ocr_options
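With the newer docling release, the OCR model probes take keyword arguments and an explicit AcceleratorOptions, and resolve_ocr_options now returns None when no OCR backend is importable. A minimal sketch of exercising the updated helper (the model directory below is a hypothetical placeholder, not part of the commit):

# Hypothetical usage sketch of the updated helper.
from pathlib import Path

from instructlab.sdg.utils.chunkers import resolve_ocr_options

docling_models = Path("/tmp/docling-models")  # placeholder for a real docling model cache
ocr_options = resolve_ocr_options(docling_model_path=docling_models)
if ocr_options is None:
    print("no OCR backend importable; PDFs will be converted without OCR")
else:
    print(f"selected OCR options: {type(ocr_options).__name__}")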

src/instructlab/sdg/utils/taxonomy.py

Lines changed: 15 additions & 79 deletions
@@ -3,17 +3,14 @@
 # Standard
 from pathlib import Path
 from tempfile import mkdtemp
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Union
 import glob
 import logging
 import os
 import re
 
 # Third Party
 from datasets import Dataset
-
-# pylint: disable=no-name-in-module
-from docling_parse.docling_parse import pdf_parser_v1
 from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS
 from instructlab.schema.taxonomy import (
     TaxonomyMessageFormat,
@@ -27,9 +24,6 @@
 # Local
 from .chunkers import DocumentChunker
 
-# Initialize the pdf parser
-PDFParser = pdf_parser_v1()
-
 logger = logging.getLogger(__name__)
 
 
@@ -126,9 +120,9 @@ def _get_documents(
     source: Dict[str, Union[str, List[str]]],
     skip_checkout: bool = False,
     document_output_dir: Path = None,
-) -> Tuple[List[str], List[Path]]:
+) -> List[Path]:
     """
-    Retrieve the content of files (Markdown and PDF) from a Git repository.
+    Retrieve the file paths of files (Markdown and PDF) from a Git repository.
 
     Args:
         source (dict): Source info containing repository URL, commit hash, and list of file patterns.
@@ -147,14 +141,13 @@ def _get_documents(
     repo_url = source.get("repo")
     commit_hash = source.get("commit")
     file_patterns = source.get("patterns", [])
-
-    try:  # pylint: disable=too-many-nested-blocks
+    # pylint: disable=too-many-nested-blocks
+    try:
         repo = git.Repo.clone_from(repo_url, document_output_dir)
 
         if not skip_checkout and commit_hash:
             repo.git.checkout(commit_hash)
 
-        file_contents = []
         filepaths = []
 
         logger.info("Processing files...")
@@ -170,7 +163,6 @@ def _get_documents(
                     logger.info(f"Processing file: {file_path}")
                     try:
                         if file_path.lower().endswith(".md"):
-                            # Process Markdown files
                             with open(file_path, "r", encoding="utf-8") as file:
                                 content = file.read()
                             if _string_contains_html(content):
@@ -179,75 +171,19 @@
                                     "NOTE: Continuing this might affect your data generation quality."
                                     "To get best results please format your markdown documents without the use of HTML or use a different document filetype."
                                 )
-                            file_contents.append(content)
-                            filepaths.append(Path(file_path))
-                            logger.info(
-                                f"Appended Markdown content from {file_path}"
-                            )
-
-                        elif file_path.lower().endswith(".pdf"):
-                            # Process PDF files using docling_parse's pdf_parser_v1
-                            doc_key = f"key_{os.path.basename(file_path)}"  # Unique document key
-                            logger.info(f"Loading PDF document from {file_path}")
-
-                            success = PDFParser.load_document(doc_key, file_path)
-                            if not success:
-                                logger.warning(
-                                    f"Failed to load PDF document: {file_path}"
-                                )
-                                continue
-
-                            num_pages = PDFParser.number_of_pages(doc_key)
-                            logger.info(f"PDF '{file_path}' has {num_pages} pages.")
-
-                            pdf_text = ""
-
-                            for page in range(num_pages):
-                                try:
-                                    json_doc = PDFParser.parse_pdf_from_key_on_page(
-                                        doc_key, page
-                                    )
-                                    if "pages" not in json_doc or not json_doc["pages"]:
-                                        logger.warning(
-                                            f"Page {page + 1} could not be parsed in '{file_path}'"
-                                        )
-                                        continue
-
-                                    json_page = json_doc["pages"][0]
-
-                                    # Extract text from cells
-                                    for cell in json_page.get("cells", []):
-                                        text = cell.get("content", {}).get(
-                                            "rnormalized", ""
-                                        )
-                                        if text.strip():  # Only append non-empty text
-                                            pdf_text += text.strip() + "\n"
-                                except Exception as page_error:  # pylint: disable=broad-exception-caught
-                                    logger.warning(
-                                        f"Error parsing page {page + 1} of '{file_path}': {page_error}"
-                                    )
-                                    continue
-
-                            if pdf_text:
-                                file_contents.append(pdf_text)
-                                filepaths.append(Path(file_path))
-
-                            # Unload the document to free memory
-                            PDFParser.unload_document(doc_key)
-                            logger.info(f"Unloaded PDF document: {file_path}")
-
-                        else:
-                            logger.info(f"Skipping unsupported file type: {file_path}")
-                    except Exception as file_error:  # pylint: disable=broad-exception-caught
+                            filepaths.append(Path(file_path))
+                            logger.info(f"Collected filepath: {file_path}")
+                    # pylint: disable=broad-exception-caught
+                    except Exception as file_error:
                         logger.error(
                             f"Error processing file '{file_path}': {file_error}"
                         )
                         continue
                 else:
                     logger.info(f"Skipping non-file path: {file_path}")
 
-        if file_contents:
-            return file_contents, filepaths
+        if filepaths:
+            return filepaths
         raise SystemExit("Couldn't find knowledge documents")
 
     except (OSError, git.exc.GitCommandError, FileNotFoundError) as e:
@@ -281,13 +217,13 @@ def _read_taxonomy_file(
         task_description = contents.get("task_description", None)
         domain = contents.get("domain")
         documents = contents.get("document")
-        document_contents, doc_filepaths = None, None
+        doc_filepaths = None
         if documents:
            os.makedirs(document_output_dir, exist_ok=True)
            unique_output_dir = mkdtemp(
                prefix=f"{leaf_node_path}_", dir=document_output_dir
            )
-            document_contents, doc_filepaths = _get_documents(
+            doc_filepaths = _get_documents(
                source=documents,
                document_output_dir=unique_output_dir,
            )
@@ -302,7 +238,6 @@
                         "questions_and_answers": question_answer_list,
                         "context": context,
                         "taxonomy_path": tax_path,
-                        "documents": document_contents,
                         "filepaths": doc_filepaths,
                         "domain": domain,
                         "document_outline": contents.get("document_outline"),
@@ -493,7 +428,8 @@ def leaf_node_to_samples(
     docling_model_path=None,
 ):
     samples = []
-    if leaf_node and leaf_node[0].get("documents"):
+    # check if the leaf node has document filepaths, if so, it's a knowledge leaf node
+    if leaf_node and (leaf_node[0].get("filepaths")):
         samples = _knowledge_leaf_node_to_samples(
             leaf_node,
             server_ctx_size,
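The taxonomy changes drop the eager docling-parse text extraction: _get_documents now returns only the collected file paths, and a knowledge leaf node is recognized by its "filepaths" entry rather than the removed "documents" field. A small sketch of that new contract, using a hypothetical leaf-node entry (not real output):

# Illustrative only; the leaf-node dict below is a hypothetical minimal entry.
from pathlib import Path

leaf_node = [
    {
        "taxonomy_path": "knowledge->example",  # hypothetical value
        "filepaths": [Path("/tmp/docs/guide.md")],  # presence marks a knowledge node
    }
]

if leaf_node and leaf_node[0].get("filepaths"):
    print("knowledge leaf node: documents are chunked later from file paths")
else:
    print("skill leaf node: no documents to process")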

tests/functional/test_chunkers.py

Lines changed: 4 additions & 1 deletion
@@ -55,6 +55,9 @@ def test_chunk_documents(
     chunks = chunker.chunk_documents()
     assert len(chunks) > expected_chunks
     if contains_text:
-        assert contains_text in chunks[0]
+        # Normalize spaces and remove newlines for more flexible text comparison
+        normalized_chunk = " ".join(chunks[0].replace("\n", " ").split())
+        normalized_text = " ".join(contains_text.split())
+        assert normalized_text in normalized_chunk
     for chunk in chunks:
         assert len(chunk) < 2500
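The relaxed assertion compares whitespace-normalized strings, so a match no longer depends on where docling happens to break lines within a chunk. A standalone illustration with made-up strings:

# Made-up strings demonstrating the normalization used in the updated test.
chunk = "Phoenix is\nthe capital  of Arizona."
contains_text = "the capital of Arizona"

normalized_chunk = " ".join(chunk.replace("\n", " ").split())
normalized_text = " ".join(contains_text.split())

assert normalized_text in normalized_chunk  # passes despite the newline and double space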
