run-llama · Incheonkirin · Jun 6, 2026
diff --git a/llama-index-core/llama_index/core/node_parser/text/sentence.py b/llama-index-core/llama_index/core/node_parser/text/sentence.py
@@ -223,6 +223,20 @@ def _split(self, text: str, chunk_size: int) -> List[_Split]:
                         token_size=token_size,
                     )
                 )
+            elif len(text_splits_by_fns) == 1:
+                # Could not split any further (a single indivisible unit larger
+                # than chunk_size, e.g. a multi-token CJK / emoji character with
+                # a small chunk_size). Recursing on the same text makes no progress
+                # and ends in RecursionError, so keep it as an oversized split --
+                # _merge() then raises the intended "Single token exceeded chunk
+                # size" ValueError instead of crashing.
+                text_splits.append(
+                    _Split(
+                        text_split_by_fns,
+                        is_sentence=is_sentence,
+                        token_size=token_size,
+                    )
+                )
             else:
                 recursive_text_splits = self._split(
                     text_split_by_fns, chunk_size=chunk_size

diff --git a/llama-index-core/llama_index/core/node_parser/text/token.py b/llama-index-core/llama_index/core/node_parser/text/token.py
@@ -180,6 +180,14 @@ def _split(self, text: str, chunk_size: int) -> List[str]:
             split_len = len(self._tokenizer(split))
             if split_len <= chunk_size:
                 new_splits.append(split)
+            elif len(splits) == 1:
+                # The text could not be broken down any further (e.g. a single
+                # character whose token count already exceeds chunk_size, which
+                # happens for multi-token CJK / emoji characters with a small
+                # chunk_size). Recursing on the same text would make no progress
+                # and end in RecursionError, so keep it as an oversized split here --
+                # _merge() already tolerates splits larger than chunk_size.
+                new_splits.append(split)
             else:
                 # recursively split
                 new_splits.extend(self._split(split, chunk_size=chunk_size))
@@ -223,7 +231,9 @@ def _merge(self, splits: List[str], chunk_size: int) -> List[str]:
                 # keep popping off the first element of the previous chunk until:
                 #   1. the current chunk length is less than chunk overlap
                 #   2. the total length is less than chunk size
-                while cur_len > self.chunk_overlap or cur_len + split_len > chunk_size:
+                while cur_chunk and (
+                    cur_len > self.chunk_overlap or cur_len + split_len > chunk_size
+                ):
                     # pop off the first element
                     first_chunk = cur_chunk.pop(0)
                     cur_len -= len(self._tokenizer(first_chunk))

diff --git a/llama-index-core/tests/text_splitter/test_splitter_oversized_unit.py b/llama-index-core/tests/text_splitter/test_splitter_oversized_unit.py
@@ -0,0 +1,38 @@
+"""Regression tests for splitters crashing on an indivisible unit that is
+larger than ``chunk_size`` -- e.g. a multi-token CJK / emoji character with a
+small ``chunk_size``.
+
+Before the fix, ``_split`` recursed on the same text until ``RecursionError``;
+with that avoided, ``TokenTextSplitter._merge`` popped from an empty list (``IndexError``).
+"""
+
+import logging
+
+import pytest
+from llama_index.core.node_parser import SentenceSplitter, TokenTextSplitter
+
+
+def test_token_splitter_oversized_unit_is_kept_not_recursed() -> None:
+    # "🚀" encodes to 3 tokens; with chunk_size=1 it cannot be split further.
+    # (A separate, pre-existing malformed warning in _merge is handled in #21796,
+    # so silence that logger here to keep this test focused on the crash.)
+    logging.getLogger("llama_index.core.node_parser.text.token").setLevel(
+        logging.ERROR
+    )
+    splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)
+    assert splitter.split_text("🚀") == ["🚀"]
+    assert splitter.split_text("🚀" * 20) == ["🚀"] * 20
+
+
+def test_sentence_splitter_oversized_unit_raises_clean_error() -> None:
+    # SentenceSplitter._merge already intends to raise this; the recursion
+    # previously crashed before that path was ever reached.
+    splitter = SentenceSplitter(chunk_size=2, chunk_overlap=0)
+    with pytest.raises(ValueError, match="Single token exceeded chunk size"):
+        splitter.split_text("보험" * 50)
+
+
+def test_splitters_unaffected_for_normal_text() -> None:
+    text = "보험계약자는 보험료를 납입할 의무가 있다. 보험자는 보험금을 지급한다. " * 5
+    assert len(SentenceSplitter(chunk_size=20, chunk_overlap=5).split_text(text)) > 1
+    assert len(TokenTextSplitter(chunk_size=10, chunk_overlap=2).split_text(text)) > 1