Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions llama-index-core/llama_index/core/node_parser/text/sentence.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,20 @@ def _split(self, text: str, chunk_size: int) -> List[_Split]:
token_size=token_size,
)
)
elif len(text_splits_by_fns) == 1:
# Could not split any further (a single indivisible unit larger
# than chunk_size, e.g. a multi-token CJK / emoji character with
# a small chunk_size). Recursing on the same text loops forever
# and raises RecursionError, so keep it as an oversized split --
# _merge() then raises the intended "Single token exceeded chunk
# size" ValueError instead of crashing.
text_splits.append(
_Split(
text_split_by_fns,
is_sentence=is_sentence,
token_size=token_size,
)
)
else:
recursive_text_splits = self._split(
text_split_by_fns, chunk_size=chunk_size
Expand Down
12 changes: 11 additions & 1 deletion llama-index-core/llama_index/core/node_parser/text/token.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,14 @@ def _split(self, text: str, chunk_size: int) -> List[str]:
split_len = len(self._tokenizer(split))
if split_len <= chunk_size:
new_splits.append(split)
elif len(splits) == 1:
# The text could not be broken down any further (e.g. a single
# character whose token count already exceeds chunk_size, which
# happens for multi-token CJK / emoji characters with a small
# chunk_size). Recursing on the same text would loop forever and
# raise RecursionError, so keep it as an oversized split here --
# _merge() already tolerates splits larger than chunk_size.
new_splits.append(split)
else:
# recursively split
new_splits.extend(self._split(split, chunk_size=chunk_size))
Expand Down Expand Up @@ -223,7 +231,9 @@ def _merge(self, splits: List[str], chunk_size: int) -> List[str]:
# keep popping off the first element of the previous chunk until:
# 1. the current chunk length is less than chunk overlap
# 2. the total length is less than chunk size
while cur_len > self.chunk_overlap or cur_len + split_len > chunk_size:
while cur_chunk and (
cur_len > self.chunk_overlap or cur_len + split_len > chunk_size
):
# pop off the first element
first_chunk = cur_chunk.pop(0)
cur_len -= len(self._tokenizer(first_chunk))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""Regression tests for splitters crashing on an indivisible unit that is
larger than ``chunk_size`` -- e.g. a multi-token CJK / emoji character with a
small ``chunk_size``.

Before the fix, ``_split`` recursed on the same text forever (``RecursionError``)
and ``TokenTextSplitter._merge`` then popped from an empty list (``IndexError``).
"""

import logging

import pytest
from llama_index.core.node_parser import SentenceSplitter, TokenTextSplitter


def test_token_splitter_oversized_unit_is_kept_not_recursed() -> None:
    """Check that TokenTextSplitter returns an unsplittable oversized unit as-is.

    A single rocket emoji and a run of twenty of them are split with
    ``chunk_size=1``; each emoji is expected to come back as its own chunk.
    """
    # The rocket emoji (U+1F680) encodes to 3 tokens; with chunk_size=1 it
    # cannot be split further. Spelled with a named escape so the literal
    # survives tools that mangle non-ASCII source text.
    rocket = "\N{ROCKET}"

    # A separate, pre-existing malformed warning in _merge is handled in
    # #21796, so silence that logger here to keep this test focused on the
    # crash. The previous level is restored afterwards so the change does not
    # leak into other tests running in the same process.
    token_logger = logging.getLogger("llama_index.core.node_parser.text.token")
    previous_level = token_logger.level
    token_logger.setLevel(logging.ERROR)
    try:
        splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)
        assert splitter.split_text(rocket) == [rocket]
        assert splitter.split_text(rocket * 20) == [rocket] * 20
    finally:
        token_logger.setLevel(previous_level)


def test_sentence_splitter_oversized_unit_raises_clean_error() -> None:
    """Check that SentenceSplitter raises ValueError for an oversized unit.

    Splitting a long run of Korean text with ``chunk_size=2`` is expected to
    raise ``ValueError`` with the "Single token exceeded chunk size" message.
    """
    # SentenceSplitter._merge already intends to raise this; the recursion
    # previously crashed before that path was ever reached.
    splitter = SentenceSplitter(chunk_size=2, chunk_overlap=0)
    # Fifty repetitions of a two-syllable Korean word with no separators.
    text = "보험" * 50
    with pytest.raises(ValueError, match="Single token exceeded chunk size"):
        splitter.split_text(text)


def test_splitters_unaffected_for_normal_text() -> None:
    """Check that both splitters still produce multiple chunks for normal text.

    The input is two Korean sentences repeated five times, split with chunk
    sizes of 20 (SentenceSplitter) and 10 (TokenTextSplitter).
    """
    text = "보험계약자는 보험료를 납입할 의무가 있다. 보험자는 보험금을 지급한다. " * 5

    # Name each result so a failing assert shows which splitter regressed.
    sentence_chunks = SentenceSplitter(chunk_size=20, chunk_overlap=5).split_text(text)
    assert len(sentence_chunks) > 1

    token_chunks = TokenTextSplitter(chunk_size=10, chunk_overlap=2).split_text(text)
    assert len(token_chunks) > 1