Skip to content

Commit f51c00b

Browse files
authored
Fix more problematic attribute values for LangCache (#438)
Following on #437, we find and encode/decode more problematic characters used within attribute values in LangCache. This PR also adds a comprehensive punctuation test for other characters.
1 parent d1f4e76 commit f51c00b

File tree

3 files changed

+94
-5
lines changed

3 files changed

+94
-5
lines changed

redisvl/extensions/cache/llm/langcache.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
{
2020
",": ",", # U+FF0C FULLWIDTH COMMA
2121
"/": "∕", # U+2215 DIVISION SLASH
22+
"\\": "\", # U+FF3C FULLWIDTH REVERSE SOLIDUS (backslash)
23+
"?": "?", # U+FF1F FULLWIDTH QUESTION MARK
2224
}
2325
)
2426

tests/integration/test_langcache_semantic_cache_integration.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,92 @@ def test_attribute_value_with_comma_and_slash_is_encoded_for_llm_string(
268268
hit.get("metadata", {}).get("llm_string") == raw_llm_string for hit in hits
269269
)
270270

271+
def test_attribute_value_with_all_tokenizer_separators_round_trip_and_filter(
272+
self, langcache_with_attrs: LangCacheSemanticCache
273+
) -> None:
274+
"""All tokenizer separator characters should round-trip via filters.
275+
276+
This exercises the set of punctuation described in the underlying
277+
RediSearch text-field tokenization docs to ensure that our
278+
client-side encoding/decoding and LangCache's attribute handling
279+
together can store and filter on values containing these characters.
280+
"""
281+
282+
separators = ",.<>{}[]\"':;!@#$%^&*()-+=~"
283+
raw_llm_string = f"tenant {separators} value"
284+
285+
prompt = "Attribute encoding for all tokenizer separators"
286+
response = "Response for all tokenizer separators."
287+
288+
entry_id = langcache_with_attrs.store(
289+
prompt=prompt,
290+
response=response,
291+
metadata={"llm_string": raw_llm_string},
292+
)
293+
assert entry_id
294+
295+
hits = langcache_with_attrs.check(
296+
prompt=prompt,
297+
attributes={"llm_string": raw_llm_string},
298+
num_results=5,
299+
)
300+
301+
assert hits, "No hits returned for llm_string value with separators"
302+
assert any(
303+
hit.get("prompt") == prompt
304+
and hit.get("response") == response
305+
and hit.get("metadata", {}).get("llm_string") == raw_llm_string
306+
for hit in hits
307+
)
308+
309+
@pytest.mark.parametrize(
310+
"raw_value",
311+
[
312+
r"tenant\\with\\backslash",
313+
"tenant?with?question",
314+
],
315+
)
316+
def test_attribute_values_with_special_chars_round_trip_and_filter(
317+
self,
318+
langcache_with_attrs: LangCacheSemanticCache,
319+
raw_value: str,
320+
) -> None:
321+
"""Backslash and question-mark values should round-trip via filters.
322+
323+
These values previously failed attribute filtering on this LangCache
324+
instance; with client-side encoding/decoding they should now be
325+
filterable and round-trip correctly.
326+
"""
327+
328+
prompt = f"Special chars attribute {raw_value}"
329+
response = f"Response for {raw_value}"
330+
331+
entry_id = langcache_with_attrs.store(
332+
prompt=prompt,
333+
response=response,
334+
metadata={"llm_string": raw_value},
335+
)
336+
assert entry_id
337+
338+
hits = langcache_with_attrs.check(
339+
prompt=prompt,
340+
attributes={"llm_string": raw_value},
341+
num_results=5,
342+
)
343+
344+
# Look for a matching hit for this prompt/response/metadata triple.
345+
match_found = any(
346+
hit.get("prompt") == prompt
347+
and hit.get("response") == response
348+
and hit.get("metadata", {}).get("llm_string") == raw_value
349+
for hit in hits
350+
)
351+
352+
assert match_found, (
353+
"Expected llm_string value to be filterable, but no matching "
354+
f"hit was found: {raw_value!r}"
355+
)
356+
271357

272358
@pytest.mark.requires_api_keys
273359
class TestLangCacheSemanticCacheIntegrationWithoutAttributes:

tests/unit/test_langcache_semantic_cache.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ def test_check_with_attributes(self, mock_langcache_client):
268268
# should decode them before exposing them to callers.
269269
"attributes": {
270270
"language": "python",
271-
"topic": "programming,with∕encoding",
271+
"topic": "programming,with∕encoding\and?",
272272
},
273273
}
274274

@@ -289,7 +289,7 @@ def test_check_with_attributes(self, mock_langcache_client):
289289
prompt="What is Python?",
290290
attributes={
291291
"language": "python",
292-
"topic": "programming,with/encoding",
292+
"topic": r"programming,with/encoding\and?",
293293
},
294294
)
295295

@@ -301,14 +301,15 @@ def test_check_with_attributes(self, mock_langcache_client):
301301
call_kwargs = mock_client.search.call_args.kwargs
302302
assert call_kwargs["attributes"] == {
303303
"language": "python",
304-
# The comma and slash should be encoded for LangCache.
305-
"topic": "programming,with∕encoding",
304+
# The comma, slash, backslash, and question mark should be encoded
305+
# for LangCache.
306+
"topic": "programming,with∕encoding\and?",
306307
}
307308

308309
# And the decoded, original values should appear in metadata
309310
assert results[0]["metadata"] == {
310311
"language": "python",
311-
"topic": "programming,with/encoding",
312+
"topic": r"programming,with/encoding\and?",
312313
}
313314

314315
def test_store_with_empty_metadata_does_not_send_attributes(

0 commit comments

Comments
 (0)