From cc72f0ae0c3c1a2631aef3d6ccd4fe115a8dffb7 Mon Sep 17 00:00:00 2001
From: Mingi Jeong <incheonkirin@users.noreply.github.com>
Date: Sat, 6 Jun 2026 21:55:49 +0900
Subject: [PATCH 1/2] Fix RecursionError/IndexError in TokenTextSplitter &
 SentenceSplitter for oversized units

A single indivisible unit larger than chunk_size (e.g. a multi-token CJK or
emoji character with a small chunk_size) made _split recurse on the same text
until Python raised RecursionError. Once that recursion is avoided, the
oversized split reaches TokenTextSplitter._merge, whose overlap-trimming loop
kept popping from cur_chunk after it was already empty (IndexError).

_split (in both splitters) now keeps such a unit as an oversized split instead
of recursing, so SentenceSplitter reaches its existing 'Single token exceeded
chunk size' ValueError and TokenTextSplitter keeps the unit as its own chunk.
TokenTextSplitter._merge stops trimming overlap once cur_chunk is empty. Adds
regression tests; behaviour for normal inputs is unchanged.
---
 .../core/node_parser/text/sentence.py         | 14 +++++++
 .../core/node_parser/text/token.py            | 12 +++++-
 .../test_splitter_oversized_unit.py           | 38 +++++++++++++++++++
 3 files changed, 63 insertions(+), 1 deletion(-)
 create mode 100644 llama-index-core/tests/text_splitter/test_splitter_oversized_unit.py

diff --git a/llama-index-core/llama_index/core/node_parser/text/sentence.py b/llama-index-core/llama_index/core/node_parser/text/sentence.py
index 88daf1b6f5c..953c61653fb 100644
--- a/llama-index-core/llama_index/core/node_parser/text/sentence.py
+++ b/llama-index-core/llama_index/core/node_parser/text/sentence.py
@@ -223,6 +223,20 @@ def _split(self, text: str, chunk_size: int) -> List[_Split]:
                         token_size=token_size,
                     )
                 )
+            elif len(text_splits_by_fns) == 1:
+                # Could not split any further (a single indivisible unit larger
+                # than chunk_size, e.g. a multi-token CJK / emoji character with
+                # a small chunk_size). Recursing on the same text loops forever
+                # and raises RecursionError, so keep it as an oversized split --
+                # _merge() then raises the intended "Single token exceeded chunk
+                # size" ValueError instead of crashing.
+                text_splits.append(
+                    _Split(
+                        text_split_by_fns,
+                        is_sentence=is_sentence,
+                        token_size=token_size,
+                    )
+                )
             else:
                 recursive_text_splits = self._split(
                     text_split_by_fns, chunk_size=chunk_size
diff --git a/llama-index-core/llama_index/core/node_parser/text/token.py b/llama-index-core/llama_index/core/node_parser/text/token.py
index 672f5346ee3..9e8dd330c19 100644
--- a/llama-index-core/llama_index/core/node_parser/text/token.py
+++ b/llama-index-core/llama_index/core/node_parser/text/token.py
@@ -180,6 +180,14 @@ def _split(self, text: str, chunk_size: int) -> List[str]:
             split_len = len(self._tokenizer(split))
             if split_len <= chunk_size:
                 new_splits.append(split)
+            elif len(splits) == 1:
+                # The text could not be broken down any further (e.g. a single
+                # character whose token count already exceeds chunk_size, which
+                # happens for multi-token CJK / emoji characters with a small
+                # chunk_size). Recursing on the same text would loop forever and
+                # raise RecursionError, so keep it as an oversized split here --
+                # _merge() then emits it as an oversized chunk of its own.
+                new_splits.append(split)
             else:
                 # recursively split
                 new_splits.extend(self._split(split, chunk_size=chunk_size))
@@ -223,7 +231,9 @@ def _merge(self, splits: List[str], chunk_size: int) -> List[str]:
                 # keep popping off the first element of the previous chunk until:
                 #   1. the current chunk length is less than chunk overlap
                 #   2. the total length is less than chunk size
-                while cur_len > self.chunk_overlap or cur_len + split_len > chunk_size:
+                while cur_chunk and (
+                    cur_len > self.chunk_overlap or cur_len + split_len > chunk_size
+                ):
                     # pop off the first element
                     first_chunk = cur_chunk.pop(0)
                     cur_len -= len(self._tokenizer(first_chunk))
diff --git a/llama-index-core/tests/text_splitter/test_splitter_oversized_unit.py b/llama-index-core/tests/text_splitter/test_splitter_oversized_unit.py
new file mode 100644
index 00000000000..beb9645f6e1
--- /dev/null
+++ b/llama-index-core/tests/text_splitter/test_splitter_oversized_unit.py
@@ -0,0 +1,38 @@
+"""Regression tests for splitters crashing on an indivisible unit that is
+larger than ``chunk_size`` -- e.g. a multi-token CJK / emoji character with a
+small ``chunk_size``.
+
+Before the fix, ``_split`` recursed on the same text forever (``RecursionError``)
+and ``TokenTextSplitter._merge`` then popped from an empty list (``IndexError``).
+"""
+
+import logging
+
+import pytest
+from llama_index.core.node_parser import SentenceSplitter, TokenTextSplitter
+
+
+def test_token_splitter_oversized_unit_is_kept_not_recursed() -> None:
+    # "🚀" encodes to 3 tokens; with chunk_size=1 it cannot be split further.
+    # (A separate, pre-existing malformed warning in _merge is handled in #21796,
+    # so silence that logger here to keep this test focused on the crash.)
+    logging.getLogger("llama_index.core.node_parser.text.token").setLevel(
+        logging.ERROR
+    )
+    splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)
+    assert splitter.split_text("🚀") == ["🚀"]
+    assert splitter.split_text("🚀" * 20) == ["🚀"] * 20
+
+
+def test_sentence_splitter_oversized_unit_raises_clean_error() -> None:
+    # SentenceSplitter._merge already intends to raise this; the recursion
+    # previously crashed before that path was ever reached.
+    splitter = SentenceSplitter(chunk_size=2, chunk_overlap=0)
+    with pytest.raises(ValueError, match="Single token exceeded chunk size"):
+        splitter.split_text("보험" * 50)
+
+
+def test_splitters_unaffected_for_normal_text() -> None:
+    text = "보험계약자는 보험료를 납입할 의무가 있다. 보험자는 보험금을 지급한다. " * 5
+    assert len(SentenceSplitter(chunk_size=20, chunk_overlap=5).split_text(text)) > 1
+    assert len(TokenTextSplitter(chunk_size=10, chunk_overlap=2).split_text(text)) > 1

From 175f7355d832fe1c4d54041d9620b227bcce0da4 Mon Sep 17 00:00:00 2001
From: jeongmingi <incheonkirin@gmail.com>
Date: Wed, 10 Jun 2026 09:43:15 +0900
Subject: [PATCH 2/2] Apply ruff format and fix D213 in oversized-unit splitter
 test

---
 .../tests/text_splitter/test_splitter_oversized_unit.py    | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/llama-index-core/tests/text_splitter/test_splitter_oversized_unit.py b/llama-index-core/tests/text_splitter/test_splitter_oversized_unit.py
index beb9645f6e1..70360f9fe24 100644
--- a/llama-index-core/tests/text_splitter/test_splitter_oversized_unit.py
+++ b/llama-index-core/tests/text_splitter/test_splitter_oversized_unit.py
@@ -1,4 +1,5 @@
-"""Regression tests for splitters crashing on an indivisible unit that is
+"""
+Regression tests for splitters crashing on an indivisible unit that is
 larger than ``chunk_size`` -- e.g. a multi-token CJK / emoji character with a
 small ``chunk_size``.
 
@@ -16,9 +17,7 @@ def test_token_splitter_oversized_unit_is_kept_not_recursed() -> None:
     # "🚀" encodes to 3 tokens; with chunk_size=1 it cannot be split further.
     # (A separate, pre-existing malformed warning in _merge is handled in #21796,
     # so silence that logger here to keep this test focused on the crash.)
-    logging.getLogger("llama_index.core.node_parser.text.token").setLevel(
-        logging.ERROR
-    )
+    logging.getLogger("llama_index.core.node_parser.text.token").setLevel(logging.ERROR)
     splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)
     assert splitter.split_text("🚀") == ["🚀"]
     assert splitter.split_text("🚀" * 20) == ["🚀"] * 20