fix(memory): extend tokenizer + slug regex to Thai/Arabic/Hebrew/Cyrillic (HKUDS#104)

Teerapat-Vatpitak · web-flow · commit f85f3b8075a1 · 2026-05-14T18:46:40.000+08:00
The previous CJK tokenizer ranges (HKUDS#87, HKUDS#95) only matched ``一-鿿`` and ``㐀-䶿``, so memory entries with Thai, Arabic, Hebrew, or Cyrillic titles: - Tokenized to the empty set, making recall always miss (e.g. ``find_relevant("ถัวเฉลี่ย")`` returned nothing even when the body contained the word). - Had their slug characters stripped to ``_``, so two distinct Thai titles of equal length silently overwrote each other on disk. The new ``_NON_LATIN_SCRIPT_RANGES`` constant covers CJK + Thai + Arabic + Hebrew + Cyrillic and is reused by: - ``_TOKEN_RE`` — single alternation pattern, one ``re.findall`` per ``_tokenize`` call (one text scan instead of two; precompiled at module level so it doesn't go through ``re.compile`` cache lookup on each invocation). - ``_SLUG_DISALLOWED_RE`` — negation pattern used by ``add()``. Arabic and Hebrew are deliberately narrowed to the basic letter blocks (U+0620-U+064A, U+05D0-U+05EA) to keep bidi-control codepoints like U+061C ARABIC LETTER MARK and combining marks out of on-disk slugs, where they would render as invisible-but-distinct filenames. Tests cover tokenization for each script, slug preservation (parametrized across the four new scripts), and a Thai collision- distinction regression. Out of scope: ``agent/src/session/search.py`` has the same CJK-only range in its FTS sanitizer; worth a follow-up PR to consume the same constant.
diff --git a/agent/src/memory/persistent.py b/agent/src/memory/persistent.py
@@ -27,6 +27,21 @@
 METADATA_WEIGHT = 2.0
 MEMORY_TYPES = ("user", "feedback", "project", "reference")
 
+# Script ranges tokenized and slugged at char level (no word-boundary
+# whitespace). Arabic/Hebrew narrowed to letter blocks to exclude bidi
+# controls and combining marks from on-disk slugs.
+_NON_LATIN_SCRIPT_RANGES = (
+    "一-鿿"   # CJK Unified Ideographs   (U+4E00-U+9FFF)
+    "㐀-䶿"   # CJK Extension A          (U+3400-U+4DBF)
+    "฀-๿"   # Thai                     (U+0E00-U+0E7F)
+    "ؠ-ي"   # Arabic letters           (U+0620-U+064A)
+    "א-ת"   # Hebrew letters           (U+05D0-U+05EA)
+    "Ѐ-ӿ"   # Cyrillic                 (U+0400-U+04FF)
+)
+
+_TOKEN_RE = re.compile(rf"[a-zA-Z0-9]{{3,}}|[{_NON_LATIN_SCRIPT_RANGES}]")
+_SLUG_DISALLOWED_RE = re.compile(rf"[^a-z0-9_\-{_NON_LATIN_SCRIPT_RANGES}]")
+
 
 @dataclass(frozen=True)
 class MemoryEntry:
@@ -52,7 +67,9 @@ class MemoryEntry:
 def _tokenize(text: str) -> set[str]:
     """Split text into searchable tokens.
 
-    ASCII words >= 3 chars + CJK individual characters. Underscores are
+    ASCII words >= 3 chars + individual characters from non-Latin scripts
+    listed in ``_NON_LATIN_SCRIPT_RANGES`` (CJK, Thai, Arabic, Hebrew,
+    Cyrillic). Underscores are
     treated as word boundaries so snake_case titles (e.g. ``mcp_wiring_test``)
     match natural-language queries (``"mcp wiring"``) as well as verbatim
     lookups.
@@ -63,9 +80,7 @@ def _tokenize(text: str) -> set[str]:
     Returns:
         Set of tokens.
     """
-    ascii_tokens = set(re.findall(r"[a-zA-Z0-9]{3,}", text.lower()))
-    cjk_tokens = set(re.findall(r"[\u4e00-\u9fff\u3400-\u4dbf]", text))
-    return ascii_tokens | cjk_tokens
+    return set(_TOKEN_RE.findall(text.lower()))
 
 
 def _coerce_str(value: object, default: str = "") -> str:
@@ -223,11 +238,11 @@ def add(self, name: str, content: str, memory_type: str = "project",
         Returns:
             Path to the created memory file.
         """
-        # Preserve CJK characters in the slug — collapsing them all to ``_``
-        # caused any two same-length CJK-only names to share a filename and
-        # silently overwrite each other.
-        slug = re.sub(r"[^a-z0-9_\-一-鿿㐀-䶿]", "_",
-                      name.lower().strip())[:60]
+        # Preserve non-Latin script characters in the slug — collapsing
+        # them all to ``_`` caused two same-length non-Latin names to share a
+        # filename and silently overwrite each other (see PR #95 for CJK;
+        # this generalizes to Thai/Arabic/Hebrew/Cyrillic).
+        slug = _SLUG_DISALLOWED_RE.sub("_", name.lower().strip())[:60]
         filename = f"{memory_type}_{slug}.md"
         path = self._dir / filename
 
diff --git a/agent/tests/test_persistent_memory.py b/agent/tests/test_persistent_memory.py
@@ -84,6 +84,29 @@ def test_underscores_split(self) -> None:
         tokens = _tokenize("mcp_wiring_test")
         assert tokens == {"mcp", "wiring", "test"}
 
+    def test_thai_characters(self) -> None:
+        # Thai script (฀-๿) was not tokenized — recall on Thai
+        # queries always returned the empty set. Char-level like CJK.
+        tokens = _tokenize("นโยบายการเทรด")
+        assert "น" in tokens
+        assert "เ" in tokens
+        assert "ท" in tokens
+
+    def test_arabic_characters(self) -> None:
+        tokens = _tokenize("التداول")
+        assert "ا" in tokens
+        assert "ل" in tokens
+
+    def test_hebrew_characters(self) -> None:
+        tokens = _tokenize("מסחר")
+        assert "מ" in tokens
+        assert "ס" in tokens
+
+    def test_cyrillic_characters(self) -> None:
+        tokens = _tokenize("торговля")
+        assert "т" in tokens
+        assert "о" in tokens
+
 
 # ---------------------------------------------------------------------------
 # PersistentMemory.add
@@ -131,6 +154,24 @@ def test_overwrite_same_name(self, tmp_path: Path) -> None:
         path = tmp_path / "project_overwrite.md"
         assert "v2" in path.read_text(encoding="utf-8")
 
+    @pytest.mark.parametrize("title", ["นโยบาย", "التداول", "מסחר", "торговля"])
+    def test_slug_preserves_non_latin_chars(self, tmp_path: Path, title: str) -> None:
+        # Regression: non-Latin chars used to collapse to "_" in slug,
+        # causing two distinct titles of equal length to collide.
+        pm = PersistentMemory(memory_dir=tmp_path)
+        path = pm.add(title, "body", "user")
+        assert title in path.name
+
+    def test_slug_distinguishes_two_thai_titles(self, tmp_path: Path) -> None:
+        # Two different Thai titles must produce different files. Without the
+        # fix both would collapse to "user________.md".
+        pm = PersistentMemory(memory_dir=tmp_path)
+        a = pm.add("นโยบาย", "rule a", "user")
+        b = pm.add("กลยุทธ์", "rule b", "user")
+        assert a != b
+        assert "rule a" in a.read_text(encoding="utf-8")
+        assert "rule b" in b.read_text(encoding="utf-8")
+
     def test_index_update_not_duplicate(self, tmp_path: Path) -> None:
         pm = PersistentMemory(memory_dir=tmp_path)
         pm.add("dup-check", "v1", "project")