This repository was archived by the owner on Apr 30, 2026. It is now read-only.

Commit c7563b7

Merge remote-tracking branch 'upstream/main' into hybrid-chunker

2 parents: a8273f4 + 2cc9889

11 files changed: 144 additions & 26 deletions

.github/mergify.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -67,7 +67,7 @@ pull_request_rules:
       - or:
           - files~=\.py$
           - files=pyproject.toml
-          - files=^requirements.*\.txt$
+          - files~=^requirements.*\.txt$
           - files=.github/workflows/functional-gpu-nvidia-t4-x1.yml
       - and:
           - -files~=\.py$
```
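The fix above swaps Mergify's exact-match `files=` operator for the regex operator `files~=`: under `files=`, the anchored pattern was compared as a literal string and could never match a real path. A minimal sketch of the difference, using plain Python `re` as a stand-in for Mergify's matcher:

```python
import re

# The pattern from the mergify.yml rule. It only behaves as intended when
# treated as a regular expression ("files~="); under literal comparison
# ("files="), a path would need to contain "^" and "$" verbatim to match.
PATTERN = r"^requirements.*\.txt$"

def regex_match(path: str) -> bool:
    """How "files~=" evaluates the condition: regex search."""
    return re.search(PATTERN, path) is not None

def exact_match(path: str) -> bool:
    """How "files=" evaluates the condition: literal string equality."""
    return path == PATTERN
```

With regex matching, `requirements.txt` and `requirements-dev.txt` both match, while `docs/requirements.txt` does not (the `^` anchor rejects it); literal comparison matches nothing.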

.github/workflows/lint.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -74,7 +74,7 @@ jobs:
           fetch-depth: 0
 
       - name: Setup Python 3.11
-        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+        uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0
         with:
           python-version: 3.11
           cache: pip
```

.github/workflows/pypi.yaml

Lines changed: 2 additions & 2 deletions
```diff
@@ -72,7 +72,7 @@ jobs:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 
       - name: "Download build artifacts"
-        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
+        uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4.2.1
         with:
           name: Packages
           path: dist
@@ -104,7 +104,7 @@ jobs:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 
       - name: "Download build artifacts"
-        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
+        uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4.2.1
         with:
           name: Packages
           path: dist
```

.github/workflows/test.yml

Lines changed: 2 additions & 2 deletions
```diff
@@ -80,7 +80,7 @@ jobs:
           brew install expect coreutils bash
 
       - name: Setup Python ${{ matrix.python }}
-        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+        uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0
         with:
           python-version: ${{ matrix.python }}
           cache: pip
@@ -93,7 +93,7 @@ jobs:
           pip cache remove llama_cpp_python
 
       - name: Cache huggingface
-        uses: actions/cache@0c907a75c2c80ebcb7f088228285e798b750cf8f # v4.2.1
+        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
         with:
           path: ~/.cache/huggingface
           # config contains DEFAULT_MODEL
```

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
```diff
@@ -10,6 +10,18 @@ Each `LLMBlock` in a `Pipeline` can now specify `model_family` or `model_id`
 
 The parameters `model_family`, `model_id`, and `num_instructions_to_generate` are no longer required in `PipelineContext` objects. They used to be required, and if passed in will still get used as before. However, they can now be omitted if your `Pipeline` contains no `LLMBlock` entries or if your `LLMBlock` config specifies these values in the `Pipeline` yaml.
 
+## v0.7.2
+
+### Fixes
+
+* When chunking knowledge documents, PDF or Markdown documents containing a table would often result in a "list index out of range" error. The cases of that error caused by chunking table content are now fixed. Users have also reported other cases where a "list index out of range" error can show up in the version of Docling we rely on; those specific cases won't be fixed until we upgrade the Docling version.
+
+## v0.7.1
+
+### Fixes
+
+* When mixing datasets, we were not always properly plumbing the user's expected system prompt through into the samples of the mixed dataset. Specifically, for the new `mix_datasets` API added in v0.7.0, we were never setting the system prompt at all. This release adds a system prompt parameter to that API and ensures it is used when creating mixed datasets.
+
 ## v0.7.0
 
 ### Features
```
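The v0.7.1 fix above is about attaching the user's expected system prompt to every sample when datasets are mixed. A hedged sketch of the idea, with illustrative names only (`mix_samples` and its parameters are not the actual SDG API):

```python
from typing import Dict, List

def mix_samples(
    datasets: List[List[Dict]], system_prompt: str
) -> List[Dict]:
    """Interleave samples from several datasets, stamping each sample
    with the caller-supplied system prompt (the piece that was missing
    before the fix). Hypothetical stand-in for the real mixing code."""
    mixed = []
    for ds in datasets:
        for sample in ds:
            # Copy the sample and attach the system prompt explicitly,
            # rather than relying on a default that was never set.
            mixed.append({**sample, "system": system_prompt})
    return mixed
```

The design point is that the prompt is a required parameter of the mixing step itself, so no sample can slip through without it.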

pyproject.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 [build-system]
-requires = ["setuptools>=64", "setuptools_scm>=8"]
+requires = ["setuptools>=78.1.0", "setuptools_scm>=8"]
 build-backend = "setuptools.build_meta"
 
 [project]
```

requirements.txt

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 click>=8.1.7,<9.0.0
-datasets>=2.18.0,<3.0.0
+datasets>=2.18.0
 docling-core[chunking]>=2.9.0
 docling[tesserocr]>=2.9.0; sys_platform != 'darwin'
 docling>=2.9.0; sys_platform == 'darwin'
```
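Dropping the `<3.0.0` cap on `datasets` lets pip resolve any 3.x release, not just the 2.x line. A simplified sketch of what the two specifier sets accept, using naive tuple comparison (real pip follows PEP 440 via the `packaging` library, which also handles pre-releases and local versions):

```python
def parse(version: str) -> tuple:
    """Naive version parse: '2.18.0' -> (2, 18, 0)."""
    return tuple(int(part) for part in version.split("."))

def allowed_old(version: str) -> bool:
    """Old pin: datasets>=2.18.0,<3.0.0 — caps at the 2.x line."""
    return parse("2.18.0") <= parse(version) < parse("3.0.0")

def allowed_new(version: str) -> bool:
    """New pin: datasets>=2.18.0 — no upper bound."""
    return parse("2.18.0") <= parse(version)
```

Under the old pin a 3.x release was rejected; under the new one it is accepted, which matters once the dependency's major-version API is compatible.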

src/instructlab/sdg/utils/chunkers.py

Lines changed: 12 additions & 3 deletions
```diff
@@ -5,14 +5,18 @@
 import json
 import logging
+import os
 import re
+import sys
 
 # Third Party
 from datasets import Dataset
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
+    AcceleratorOptions,
     EasyOcrOptions,
@@ -51,7 +55,12 @@ def resolve_ocr_options(
         # Third Party
         from docling.models.tesseract_ocr_model import TesseractOcrModel
 
-        _ = TesseractOcrModel(True, ocr_options)
+        _ = TesseractOcrModel(
+            enabled=True,
+            artifacts_path=docling_model_path,
+            options=ocr_options,
+            accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU),
+        )
         return ocr_options
     except ImportError:
         # No tesserocr, so try something else
@@ -66,7 +75,6 @@ def resolve_ocr_options(
             recog_network="standard",
             download_enabled=True,
         )
-        accelerator_options = AcceleratorOptions(device="cpu")
         # triggers torch loading, import lazily
         # pylint: disable=import-outside-toplevel
         # Third Party
@@ -76,7 +84,7 @@ def resolve_ocr_options(
             enabled=True,
             artifacts_path=None,
             options=ocr_options,
-            accelerator_options=accelerator_options,
+            accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU),
         )
         return ocr_options
     except ImportError:
@@ -146,6 +154,7 @@ def _init_docling_converter(self):
             artifacts_path=self.docling_model_path,
             do_ocr=False,
         )
+
        # deactivate MPS acceleration on Github CI
        if os.getenv("CI") and sys.platform == "darwin":
            pipeline_options.accelerator_options = AcceleratorOptions(
```
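The last hunk's guard disables MPS acceleration on macOS GitHub runners by forcing a CPU device. The decision logic can be sketched as a pure function (parameterized for testability; `select_device` and its string return values are illustrative stand-ins for Docling's `AcceleratorOptions`/`AcceleratorDevice`):

```python
def select_device(ci: bool, platform: str) -> str:
    """Pick the accelerator device the way the chunker's CI guard does.

    ci: whether we are running under CI (the code checks os.getenv("CI")).
    platform: sys.platform value, e.g. "darwin" or "linux".
    """
    # Deactivate MPS acceleration on GitHub CI macOS runners, where the
    # Metal backend is unavailable or unreliable; elsewhere let the
    # library auto-detect the best device.
    if ci and platform == "darwin":
        return "cpu"
    return "auto"
```

Keeping the condition in one small function makes the CI-only special case easy to unit-test without spinning up a converter.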

src/instructlab/sdg/utils/taxonomy.py

Lines changed: 7 additions & 7 deletions
```diff
@@ -3,7 +3,7 @@
 # Standard
 from pathlib import Path
 from tempfile import mkdtemp
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Union
 import glob
 import logging
 import os
@@ -122,7 +122,7 @@ def _get_documents(
     source: Dict[str, Union[str, List[str]]],
     skip_checkout: bool = False,
     document_output_dir: Path = None,
-) -> Tuple[List[Path], List[Path]]:
+) -> List[Path]:
     """
     Retrieve file paths (Markdown and PDFs) from a Git repository.
 
@@ -143,8 +143,8 @@ def _get_documents(
     repo_url = source.get("repo")
     commit_hash = source.get("commit")
     file_patterns = source.get("patterns", [])
-
-    try:  # pylint: disable=too-many-nested-blocks
+    # pylint: disable=too-many-nested-blocks
+    try:
         repo = git.Repo.clone_from(repo_url, document_output_dir)
 
         if not skip_checkout and commit_hash:
@@ -178,7 +178,7 @@ def _get_documents(
                 logger.info(f"Skipping non-file path: {file_path}")
 
         if filepaths:
-            return filepaths, filepaths
+            return filepaths
         raise SystemExit("Couldn't find knowledge documents")
 
     except (OSError, git.exc.GitCommandError, FileNotFoundError) as e:
@@ -212,13 +212,13 @@ def _read_taxonomy_file(
     task_description = contents.get("task_description", None)
     domain = contents.get("domain")
     documents = contents.get("document")
-    doc_filepaths, _ = None, None
+    doc_filepaths = None
     if documents:
         os.makedirs(document_output_dir, exist_ok=True)
         unique_output_dir = mkdtemp(
             prefix=f"{leaf_node_path}_", dir=document_output_dir
        )
-        doc_filepaths, _ = _get_documents(
+        doc_filepaths = _get_documents(
             source=documents,
             document_output_dir=unique_output_dir,
         )
```
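This refactor removes a redundant return shape: `_get_documents` used to return `(filepaths, filepaths)`, forcing every caller to unpack and discard a duplicate (`doc_filepaths, _ = ...`). A minimal sketch of the simplified contract (`get_documents` here is an illustrative reduction, not the real function, which clones a Git repository):

```python
from pathlib import Path
from typing import List

def get_documents(paths: List[str]) -> List[Path]:
    """Return the Markdown/PDF document paths from a candidate list.

    Returns a single list (the old code returned the same list twice
    in a tuple) and exits if no knowledge documents are found, mirroring
    the SystemExit behavior in the diff above.
    """
    filepaths = [Path(p) for p in paths if p.endswith((".md", ".pdf"))]
    if filepaths:
        return filepaths
    raise SystemExit("Couldn't find knowledge documents")
```

Call sites become `doc_filepaths = get_documents(...)`, and the `Tuple` import can be dropped, which is exactly what the hunks above do.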

tests/functional/test_chunkers.py

Lines changed: 6 additions & 7 deletions
```diff
@@ -90,13 +90,12 @@ def test_chunk_documents(
         chunk_word_count=500,
     )
     chunks = chunker.chunk_documents()
-
-    # Check that we have more chunks than expected.
-    assert (
-        len(chunks) > expected_chunks
-    ), f"Expected more than {expected_chunks} chunks, got {len(chunks)}"
-
-    # Check that no chunk is empty and each chunk's length is within the allowed limit.
+    assert len(chunks) > expected_chunks
+    if contains_text:
+        # Normalize spaces and remove newlines for more flexible text comparison
+        normalized_chunk = " ".join(chunks[0].replace("\n", " ").split())
+        normalized_text = " ".join(contains_text.split())
+        assert normalized_text in normalized_chunk
     for chunk in chunks:
         assert chunk, "Chunk should not be empty"
         assert len(chunk) < 2500, f"Chunk length {len(chunk)} exceeds maximum allowed"
```
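The whitespace normalization used in the updated test is worth pulling out: it collapses newlines and runs of spaces so expected text can be found in a chunk regardless of how the chunker wrapped its lines. As a standalone helper:

```python
def normalize(text: str) -> str:
    """Collapse newlines and repeated whitespace into single spaces,
    matching the normalization in the test above."""
    # str.split() with no argument splits on any whitespace run and
    # drops empties, so joining with a single space canonicalizes spacing.
    return " ".join(text.replace("\n", " ").split())
```

Two strings that differ only in line wrapping or spacing normalize to the same value, which is what makes the `in` containment check in the test robust.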
