|
1 | 1 | import hashlib |
2 | 2 | from pathlib import Path |
3 | | -from typing import List |
4 | | -from typing import Optional |
| 3 | +from typing import List, Optional |
5 | 4 |
|
6 | | -from langchain.text_splitter import RecursiveCharacterTextSplitter as LCRecursiveCharacterTextSplitter |
| 5 | +from langchain_text_splitters import ( |
| 6 | + RecursiveCharacterTextSplitter as LCRecursiveCharacterTextSplitter, |
| 7 | +) |
7 | 8 |
|
8 | 9 | from docs2vecs.subcommands.indexer.config.config import Config |
9 | 10 | from docs2vecs.subcommands.indexer.document.chunk import Chunk |
|
14 | 15 | class RecursiveCharacterTextSplitter(IndexerSkill): |
15 | 16 | DEFAULT_CHUNK_SIZE = 1000 |
16 | 17 | DEFAULT_CHUNK_OVERLAP = 100 |
17 | | - |
| 18 | + |
18 | 19 | def __init__(self, config: dict, global_config: Config): |
19 | 20 | super().__init__(config, global_config) |
20 | 21 | self._set_config_defaults() |
21 | 22 |
|
22 | 23 | def _set_config_defaults(self): |
23 | 24 | if "chunk_size" not in self._config: |
24 | | - self._config["chunk_size"] = RecursiveCharacterTextSplitter.DEFAULT_CHUNK_SIZE |
| 25 | + self._config["chunk_size"] = ( |
| 26 | + RecursiveCharacterTextSplitter.DEFAULT_CHUNK_SIZE |
| 27 | + ) |
25 | 28 |
|
26 | 29 | if "chunk_overlap" not in self._config: |
27 | | - self._config["chunk_overlap"] = RecursiveCharacterTextSplitter.DEFAULT_CHUNK_OVERLAP |
| 30 | + self._config["chunk_overlap"] = ( |
| 31 | + RecursiveCharacterTextSplitter.DEFAULT_CHUNK_OVERLAP |
| 32 | + ) |
28 | 33 |
|
29 | 34 | def run(self, input: Optional[List[Document]] = None) -> List[Document]: |
30 | 35 | self.logger.info("Running RecursiveCharacterTextSplitter...") |
31 | 36 |
|
32 | | - splitter = LCRecursiveCharacterTextSplitter(chunk_size=self._config["chunk_size"], chunk_overlap=self._config["chunk_overlap"]) |
| 37 | + splitter = LCRecursiveCharacterTextSplitter( |
| 38 | + chunk_size=self._config["chunk_size"], |
| 39 | + chunk_overlap=self._config["chunk_overlap"], |
| 40 | + ) |
33 | 41 |
|
34 | 42 | for doc in input: |
35 | 43 | self.logger.debug(f"Splitting {doc.filename}...") |
|
0 commit comments