From cc72f0ae0c3c1a2631aef3d6ccd4fe115a8dffb7 Mon Sep 17 00:00:00 2001 From: Mingi Jeong Date: Sat, 6 Jun 2026 21:55:49 +0900 Subject: [PATCH 1/2] Fix RecursionError/IndexError in TokenTextSplitter & SentenceSplitter for oversized units A single indivisible unit larger than chunk_size (e.g. a multi-token CJK or emoji character with a small chunk_size) made _split recurse on the same text forever (RecursionError), after which TokenTextSplitter._merge popped from an empty list (IndexError). _split now keeps such a unit as an oversized split instead of recursing, so SentenceSplitter reaches its existing 'Single token exceeded chunk size' ValueError and TokenTextSplitter keeps it as a chunk. _merge stops trimming overlap once cur_chunk is empty. Adds regression tests; normal inputs unchanged. --- .../core/node_parser/text/sentence.py | 14 +++++++ .../core/node_parser/text/token.py | 12 +++++- .../test_splitter_oversized_unit.py | 38 +++++++++++++++++++ 3 files changed, 63 insertions(+), 1 deletion(-) create mode 100644 llama-index-core/tests/text_splitter/test_splitter_oversized_unit.py diff --git a/llama-index-core/llama_index/core/node_parser/text/sentence.py b/llama-index-core/llama_index/core/node_parser/text/sentence.py index 88daf1b6f5c..953c61653fb 100644 --- a/llama-index-core/llama_index/core/node_parser/text/sentence.py +++ b/llama-index-core/llama_index/core/node_parser/text/sentence.py @@ -223,6 +223,20 @@ def _split(self, text: str, chunk_size: int) -> List[_Split]: token_size=token_size, ) ) + elif len(text_splits_by_fns) == 1: + # Could not split any further (a single indivisible unit larger + # than chunk_size, e.g. a multi-token CJK / emoji character with + # a small chunk_size). Recursing on the same text loops forever + # and raises RecursionError, so keep it as an oversized split -- + # _merge() then raises the intended "Single token exceeded chunk + # size" ValueError instead of crashing. 
+ text_splits.append( + _Split( + text_split_by_fns, + is_sentence=is_sentence, + token_size=token_size, + ) + ) else: recursive_text_splits = self._split( text_split_by_fns, chunk_size=chunk_size diff --git a/llama-index-core/llama_index/core/node_parser/text/token.py b/llama-index-core/llama_index/core/node_parser/text/token.py index 672f5346ee3..9e8dd330c19 100644 --- a/llama-index-core/llama_index/core/node_parser/text/token.py +++ b/llama-index-core/llama_index/core/node_parser/text/token.py @@ -180,6 +180,14 @@ def _split(self, text: str, chunk_size: int) -> List[str]: split_len = len(self._tokenizer(split)) if split_len <= chunk_size: new_splits.append(split) + elif len(splits) == 1: + # The text could not be broken down any further (e.g. a single + # character whose token count already exceeds chunk_size, which + # happens for multi-token CJK / emoji characters with a small + # chunk_size). Recursing on the same text would loop forever and + # raise RecursionError, so keep it as an oversized split here -- + # _merge() already tolerates splits larger than chunk_size. + new_splits.append(split) else: # recursively split new_splits.extend(self._split(split, chunk_size=chunk_size)) @@ -223,7 +231,9 @@ def _merge(self, splits: List[str], chunk_size: int) -> List[str]: # keep popping off the first element of the previous chunk until: # 1. the current chunk length is less than chunk overlap # 2. 
the total length is less than chunk size - while cur_len > self.chunk_overlap or cur_len + split_len > chunk_size: + while cur_chunk and ( + cur_len > self.chunk_overlap or cur_len + split_len > chunk_size + ): # pop off the first element first_chunk = cur_chunk.pop(0) cur_len -= len(self._tokenizer(first_chunk)) diff --git a/llama-index-core/tests/text_splitter/test_splitter_oversized_unit.py b/llama-index-core/tests/text_splitter/test_splitter_oversized_unit.py new file mode 100644 index 00000000000..beb9645f6e1 --- /dev/null +++ b/llama-index-core/tests/text_splitter/test_splitter_oversized_unit.py @@ -0,0 +1,38 @@ +"""Regression tests for splitters crashing on an indivisible unit that is +larger than ``chunk_size`` -- e.g. a multi-token CJK / emoji character with a +small ``chunk_size``. + +Before the fix, ``_split`` recursed on the same text forever (``RecursionError``) +and ``TokenTextSplitter._merge`` then popped from an empty list (``IndexError``). +""" + +import logging + +import pytest +from llama_index.core.node_parser import SentenceSplitter, TokenTextSplitter + + +def test_token_splitter_oversized_unit_is_kept_not_recursed() -> None: + # "🚀" encodes to 3 tokens; with chunk_size=1 it cannot be split further. + # (A separate, pre-existing malformed warning in _merge is handled in #21796, + # so silence that logger here to keep this test focused on the crash.) + logging.getLogger("llama_index.core.node_parser.text.token").setLevel( + logging.ERROR + ) + splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0) + assert splitter.split_text("🚀") == ["🚀"] + assert splitter.split_text("🚀" * 20) == ["🚀"] * 20 + + +def test_sentence_splitter_oversized_unit_raises_clean_error() -> None: + # SentenceSplitter._merge already intends to raise this; the recursion + # previously crashed before that path was ever reached. 
+ splitter = SentenceSplitter(chunk_size=2, chunk_overlap=0) + with pytest.raises(ValueError, match="Single token exceeded chunk size"): + splitter.split_text("보험" * 50) + + +def test_splitters_unaffected_for_normal_text() -> None: + text = "보험계약자는 보험료를 납입할 의무가 있다. 보험자는 보험금을 지급한다. " * 5 + assert len(SentenceSplitter(chunk_size=20, chunk_overlap=5).split_text(text)) > 1 + assert len(TokenTextSplitter(chunk_size=10, chunk_overlap=2).split_text(text)) > 1 From 175f7355d832fe1c4d54041d9620b227bcce0da4 Mon Sep 17 00:00:00 2001 From: jeongmingi Date: Wed, 10 Jun 2026 09:43:15 +0900 Subject: [PATCH 2/2] Apply ruff format and fix D213 in oversized-unit splitter test --- .../tests/text_splitter/test_splitter_oversized_unit.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llama-index-core/tests/text_splitter/test_splitter_oversized_unit.py b/llama-index-core/tests/text_splitter/test_splitter_oversized_unit.py index beb9645f6e1..70360f9fe24 100644 --- a/llama-index-core/tests/text_splitter/test_splitter_oversized_unit.py +++ b/llama-index-core/tests/text_splitter/test_splitter_oversized_unit.py @@ -1,4 +1,5 @@ -"""Regression tests for splitters crashing on an indivisible unit that is +""" +Regression tests for splitters crashing on an indivisible unit that is larger than ``chunk_size`` -- e.g. a multi-token CJK / emoji character with a small ``chunk_size``. @@ -16,9 +17,7 @@ def test_token_splitter_oversized_unit_is_kept_not_recursed() -> None: # "🚀" encodes to 3 tokens; with chunk_size=1 it cannot be split further. # (A separate, pre-existing malformed warning in _merge is handled in #21796, # so silence that logger here to keep this test focused on the crash.) 
- logging.getLogger("llama_index.core.node_parser.text.token").setLevel( - logging.ERROR - ) + logging.getLogger("llama_index.core.node_parser.text.token").setLevel(logging.ERROR) splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0) assert splitter.split_text("🚀") == ["🚀"] assert splitter.split_text("🚀" * 20) == ["🚀"] * 20