Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ RUN curl -LsSf https://astral.sh/uv/install.sh | sh \
&& uv python install 3.11 \
&& uv venv \
&& cd /tmp/build \
&& uv --native-tls pip install --no-cache . \
&& uv --native-tls pip install --no-cache --override /tmp/build/etc/uv-overrides.txt . \
&& uv --native-tls run --script /tmp/build/etc/fetchDefaultModels.py \
&& rm -rf /tmp/build

Expand Down
2 changes: 2 additions & 0 deletions etc/uv-overrides.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
numba>=0.60.0
llvmlite>=0.43.0
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ dependencies = [
"python-docx>=1.1.2",
"python-dotenv>=1.0.1",
"python-pptx>=1.0.2",
"numba>=0.60.0",
"unstructured>=0.14.8",
"faiss-cpu>=1.11.0",
"langchain_community>=0.3.18",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import hashlib
from pathlib import Path
from typing import List
from typing import Optional
from typing import List, Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter as LCRecursiveCharacterTextSplitter
from langchain_text_splitters import (
RecursiveCharacterTextSplitter as LCRecursiveCharacterTextSplitter,
)

from docs2vecs.subcommands.indexer.config.config import Config
from docs2vecs.subcommands.indexer.document.chunk import Chunk
Expand All @@ -14,22 +15,29 @@
class RecursiveCharacterTextSplitter(IndexerSkill):
DEFAULT_CHUNK_SIZE = 1000
DEFAULT_CHUNK_OVERLAP = 100

def __init__(self, config: dict, global_config: Config):
super().__init__(config, global_config)
self._set_config_defaults()

def _set_config_defaults(self):
if "chunk_size" not in self._config:
self._config["chunk_size"] = RecursiveCharacterTextSplitter.DEFAULT_CHUNK_SIZE
self._config["chunk_size"] = (
RecursiveCharacterTextSplitter.DEFAULT_CHUNK_SIZE
)

if "chunk_overlap" not in self._config:
self._config["chunk_overlap"] = RecursiveCharacterTextSplitter.DEFAULT_CHUNK_OVERLAP
self._config["chunk_overlap"] = (
RecursiveCharacterTextSplitter.DEFAULT_CHUNK_OVERLAP
)

def run(self, input: Optional[List[Document]] = None) -> List[Document]:
self.logger.info("Running RecursiveCharacterTextSplitter...")

splitter = LCRecursiveCharacterTextSplitter(chunk_size=self._config["chunk_size"], chunk_overlap=self._config["chunk_overlap"])
splitter = LCRecursiveCharacterTextSplitter(
chunk_size=self._config["chunk_size"],
chunk_overlap=self._config["chunk_overlap"],
)

for doc in input:
self.logger.debug(f"Splitting {doc.filename}...")
Expand Down