Skip to content

Commit f85f3b8

Browse files
fix(memory): extend tokenizer + slug regex to Thai/Arabic/Hebrew/Cyrillic (HKUDS#104)
The previous CJK tokenizer ranges (HKUDS#87, HKUDS#95) only matched ``一-鿿`` and ``㐀-䶿``, so memory entries with Thai, Arabic, Hebrew, or Cyrillic titles:

- Tokenized to the empty set, making recall always miss (e.g. ``find_relevant("ถัวเฉลี่ย")`` returned nothing even when the body contained the word).
- Had their slug characters stripped to ``_``, so two distinct Thai titles of equal length silently overwrote each other on disk.

The new ``_NON_LATIN_SCRIPT_RANGES`` constant covers CJK + Thai + Arabic + Hebrew + Cyrillic and is reused by:

- ``_TOKEN_RE`` — single alternation pattern, one ``re.findall`` per ``_tokenize`` call (one text scan instead of two; precompiled at module level so it doesn't go through ``re.compile`` cache lookup on each invocation).
- ``_SLUG_DISALLOWED_RE`` — negation pattern used by ``add()``.

Arabic and Hebrew are deliberately narrowed to the basic letter blocks (U+0620-U+064A, U+05D0-U+05EA) to keep bidi-control codepoints like U+061C ARABIC LETTER MARK and combining marks out of on-disk slugs, where they would render as invisible-but-distinct filenames.

Tests cover tokenization for each script, slug preservation (parametrized across the four new scripts), and a Thai collision-distinction regression.

Out of scope: ``agent/src/session/search.py`` has the same CJK-only range in its FTS sanitizer; worth a follow-up PR to consume the same constant.
1 parent 9bfaa4c commit f85f3b8

2 files changed

Lines changed: 65 additions & 9 deletions

File tree

agent/src/memory/persistent.py

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,21 @@
2727
METADATA_WEIGHT = 2.0
2828
MEMORY_TYPES = ("user", "feedback", "project", "reference")
2929

30+
# Script ranges tokenized and slugged at char level (no word-boundary
31+
# whitespace). Arabic/Hebrew narrowed to letter blocks to exclude bidi
32+
# controls and combining marks from on-disk slugs.
33+
_NON_LATIN_SCRIPT_RANGES = (
34+
"一-鿿" # CJK Unified Ideographs (U+4E00-U+9FFF)
35+
"㐀-䶿" # CJK Extension A (U+3400-U+4DBF)
36+
"฀-๿" # Thai (U+0E00-U+0E7F)
37+
"ؠ-ي" # Arabic letters (U+0620-U+064A)
38+
"א-ת" # Hebrew letters (U+05D0-U+05EA)
39+
"Ѐ-ӿ" # Cyrillic (U+0400-U+04FF)
40+
)
41+
42+
_TOKEN_RE = re.compile(rf"[a-zA-Z0-9]{{3,}}|[{_NON_LATIN_SCRIPT_RANGES}]")
43+
_SLUG_DISALLOWED_RE = re.compile(rf"[^a-z0-9_\-{_NON_LATIN_SCRIPT_RANGES}]")
44+
3045

3146
@dataclass(frozen=True)
3247
class MemoryEntry:
@@ -52,7 +67,9 @@ class MemoryEntry:
5267
def _tokenize(text: str) -> set[str]:
5368
"""Split text into searchable tokens.
5469
55-
ASCII words >= 3 chars + CJK individual characters. Underscores are
70+
ASCII words >= 3 chars + individual characters from non-Latin scripts
71+
listed in ``_NON_LATIN_SCRIPT_RANGES`` (CJK, Thai, Arabic, Hebrew,
72+
Cyrillic). Underscores are
5673
treated as word boundaries so snake_case titles (e.g. ``mcp_wiring_test``)
5774
match natural-language queries (``"mcp wiring"``) as well as verbatim
5875
lookups.
@@ -63,9 +80,7 @@ def _tokenize(text: str) -> set[str]:
6380
Returns:
6481
Set of tokens.
6582
"""
66-
ascii_tokens = set(re.findall(r"[a-zA-Z0-9]{3,}", text.lower()))
67-
cjk_tokens = set(re.findall(r"[\u4e00-\u9fff\u3400-\u4dbf]", text))
68-
return ascii_tokens | cjk_tokens
83+
return set(_TOKEN_RE.findall(text.lower()))
6984

7085

7186
def _coerce_str(value: object, default: str = "") -> str:
@@ -223,11 +238,11 @@ def add(self, name: str, content: str, memory_type: str = "project",
223238
Returns:
224239
Path to the created memory file.
225240
"""
226-
# Preserve CJK characters in the slug — collapsing them all to ``_``
227-
# caused any two same-length CJK-only names to share a filename and
228-
# silently overwrite each other.
229-
slug = re.sub(r"[^a-z0-9_\-一-鿿㐀-䶿]", "_",
230-
name.lower().strip())[:60]
241+
# Preserve non-Latin script characters in the slug — collapsing
242+
# them all to ``_`` caused two same-length non-Latin names to share a
243+
# filename and silently overwrite each other (see PR #95 for CJK;
244+
# this generalizes to Thai/Arabic/Hebrew/Cyrillic).
245+
slug = _SLUG_DISALLOWED_RE.sub("_", name.lower().strip())[:60]
231246
filename = f"{memory_type}_{slug}.md"
232247
path = self._dir / filename
233248

agent/tests/test_persistent_memory.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,29 @@ def test_underscores_split(self) -> None:
8484
tokens = _tokenize("mcp_wiring_test")
8585
assert tokens == {"mcp", "wiring", "test"}
8686

87+
def test_thai_characters(self) -> None:
88+
# Thai script (฀-๿) was not tokenized — recall on Thai
89+
# queries always returned the empty set. Char-level like CJK.
90+
tokens = _tokenize("นโยบายการเทรด")
91+
assert "น" in tokens
92+
assert "เ" in tokens
93+
assert "ท" in tokens
94+
95+
def test_arabic_characters(self) -> None:
96+
tokens = _tokenize("التداول")
97+
assert "ا" in tokens
98+
assert "ل" in tokens
99+
100+
def test_hebrew_characters(self) -> None:
101+
tokens = _tokenize("מסחר")
102+
assert "מ" in tokens
103+
assert "ס" in tokens
104+
105+
def test_cyrillic_characters(self) -> None:
106+
tokens = _tokenize("торговля")
107+
assert "т" in tokens
108+
assert "о" in tokens
109+
87110

88111
# ---------------------------------------------------------------------------
89112
# PersistentMemory.add
@@ -131,6 +154,24 @@ def test_overwrite_same_name(self, tmp_path: Path) -> None:
131154
path = tmp_path / "project_overwrite.md"
132155
assert "v2" in path.read_text(encoding="utf-8")
133156

157+
@pytest.mark.parametrize("title", ["นโยบาย", "التداول", "מסחר", "торговля"])
158+
def test_slug_preserves_non_latin_chars(self, tmp_path: Path, title: str) -> None:
159+
# Regression: non-Latin chars used to collapse to "_" in slug,
160+
# causing two distinct titles of equal length to collide.
161+
pm = PersistentMemory(memory_dir=tmp_path)
162+
path = pm.add(title, "body", "user")
163+
assert title in path.name
164+
165+
def test_slug_distinguishes_two_thai_titles(self, tmp_path: Path) -> None:
166+
# Two different Thai titles must produce different files. Without the
167+
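# fix every character of each title is replaced by "_". NOTE(review): these two titles are 6 and 7 code points long, so their all-underscore slugs already differ in length and would not collide even without the fix; use two equal-length titles to make this a true collision regression.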
168+
pm = PersistentMemory(memory_dir=tmp_path)
169+
a = pm.add("นโยบาย", "rule a", "user")
170+
b = pm.add("กลยุทธ์", "rule b", "user")
171+
assert a != b
172+
assert "rule a" in a.read_text(encoding="utf-8")
173+
assert "rule b" in b.read_text(encoding="utf-8")
174+
134175
def test_index_update_not_duplicate(self, tmp_path: Path) -> None:
135176
pm = PersistentMemory(memory_dir=tmp_path)
136177
pm.add("dup-check", "v1", "project")

0 commit comments

Comments
 (0)