run-llama · rautaditya2606 · Jun 27, 2026
diff --git a/llama-index-core/llama_index/core/node_parser/text/semantic_double_merging_splitter.py b/llama-index-core/llama_index/core/node_parser/text/semantic_double_merging_splitter.py
@@ -359,7 +359,7 @@ def _clean_text_advanced(self, text: str) -> str:
         # Remove punctuations
         text = text.translate(str.maketrans("", "", string.punctuation))
         # Remove stopwords
-        tokens = globals_helper.punkt_tokenizer.tokenize(text)
+        tokens = re.findall(r"\w+", text)
         filtered_words = [w for w in tokens if w not in self.language_config.stopwords]
 
         return " ".join(filtered_words)
diff --git a/llama-index-core/tests/node_parser/test_semantic_double_merging_splitter.py b/llama-index-core/tests/node_parser/test_semantic_double_merging_splitter.py
@@ -144,3 +144,15 @@ def test_embed_model_single_sentence_document() -> None:
     nodes = splitter.get_nodes_from_documents([single_doc])
     assert len(nodes) == 1
     assert nodes[0].get_content() == "Only one sentence here."
+
+
+def test_clean_text_advanced() -> None:
+    """Test that _clean_text_advanced drops stopwords and space-joins the rest."""
+    from llama_index.core.utils import globals_helper  # source of the stopword list
+
+    splitter = SemanticDoubleMergingSplitterNodeParser()
+    splitter.language_config.stopwords = set(globals_helper.stopwords)
+    cleaned = splitter._clean_text_advanced(
+        "this is a test text containing some stopwords like the and a"
+    )
+    assert cleaned == "test text containing stopwords like"