diff --git a/Dockerfile b/Dockerfile index da2b125..fa7d589 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,7 +22,7 @@ RUN curl -LsSf https://astral.sh/uv/install.sh | sh \ && uv python install 3.11 \ && uv venv \ && cd /tmp/build \ - && uv --native-tls pip install --no-cache . \ + && uv --native-tls pip install --no-cache --override /tmp/build/etc/uv-overrides.txt . \ && uv --native-tls run --script /tmp/build/etc/fetchDefaultModels.py \ && rm -rf /tmp/build diff --git a/etc/uv-overrides.txt b/etc/uv-overrides.txt new file mode 100644 index 0000000..4a3a3a9 --- /dev/null +++ b/etc/uv-overrides.txt @@ -0,0 +1,2 @@ +numba>=0.60.0 +llvmlite>=0.43.0 diff --git a/pyproject.toml b/pyproject.toml index 7a66d85..89046a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ dependencies = [ "python-docx>=1.1.2", "python-dotenv>=1.0.1", "python-pptx>=1.0.2", + "numba>=0.60.0", "unstructured>=0.14.8", "faiss-cpu>=1.11.0", "langchain_community>=0.3.18", diff --git a/src/docs2vecs/subcommands/indexer/skills/recursive_character_splitter_skill.py b/src/docs2vecs/subcommands/indexer/skills/recursive_character_splitter_skill.py index 6600ee3..853d66b 100644 --- a/src/docs2vecs/subcommands/indexer/skills/recursive_character_splitter_skill.py +++ b/src/docs2vecs/subcommands/indexer/skills/recursive_character_splitter_skill.py @@ -1,9 +1,10 @@ import hashlib from pathlib import Path -from typing import List -from typing import Optional +from typing import List, Optional -from langchain.text_splitter import RecursiveCharacterTextSplitter as LCRecursiveCharacterTextSplitter +from langchain_text_splitters import ( + RecursiveCharacterTextSplitter as LCRecursiveCharacterTextSplitter, +) from docs2vecs.subcommands.indexer.config.config import Config from docs2vecs.subcommands.indexer.document.chunk import Chunk @@ -14,22 +15,29 @@ class RecursiveCharacterTextSplitter(IndexerSkill): DEFAULT_CHUNK_SIZE = 1000 DEFAULT_CHUNK_OVERLAP = 100 - + def __init__(self, config: dict, global_config: Config): super().__init__(config, global_config) self._set_config_defaults() def _set_config_defaults(self): if "chunk_size" not in self._config: - self._config["chunk_size"] = RecursiveCharacterTextSplitter.DEFAULT_CHUNK_SIZE + self._config["chunk_size"] = ( + RecursiveCharacterTextSplitter.DEFAULT_CHUNK_SIZE + ) if "chunk_overlap" not in self._config: - self._config["chunk_overlap"] = RecursiveCharacterTextSplitter.DEFAULT_CHUNK_OVERLAP + self._config["chunk_overlap"] = ( + RecursiveCharacterTextSplitter.DEFAULT_CHUNK_OVERLAP + ) def run(self, input: Optional[List[Document]] = None) -> List[Document]: self.logger.info("Running RecursiveCharacterTextSplitter...") - splitter = LCRecursiveCharacterTextSplitter(chunk_size=self._config["chunk_size"], chunk_overlap=self._config["chunk_overlap"]) + splitter = LCRecursiveCharacterTextSplitter( + chunk_size=self._config["chunk_size"], + chunk_overlap=self._config["chunk_overlap"], + ) for doc in input: self.logger.debug(f"Splitting {doc.filename}...")