Fix more problematic attribute values for LangCache (#438)

abrookins · web-flow · commit f51c00bd0360 · 2025-11-21T11:52:18.000-08:00
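Following up on #437, this PR encodes and decodes two more characters that are problematic inside LangCache attribute values: the backslash (`\`) and the question mark (`?`). It also adds a comprehensive integration test covering the remaining punctuation characters that RediSearch treats as tokenizer separators.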
diff --git a/redisvl/extensions/cache/llm/langcache.py b/redisvl/extensions/cache/llm/langcache.py
@@ -19,6 +19,8 @@
     {
         ",": "，",  # U+FF0C FULLWIDTH COMMA
         "/": "∕",  # U+2215 DIVISION SLASH
+        "\\": "＼",  # U+FF3C FULLWIDTH REVERSE SOLIDUS (backslash)
+        "?": "？",  # U+FF1F FULLWIDTH QUESTION MARK
     }
 )
 
diff --git a/tests/integration/test_langcache_semantic_cache_integration.py b/tests/integration/test_langcache_semantic_cache_integration.py
@@ -268,6 +268,92 @@ def test_attribute_value_with_comma_and_slash_is_encoded_for_llm_string(
             hit.get("metadata", {}).get("llm_string") == raw_llm_string for hit in hits
         )
 
+    def test_attribute_value_with_all_tokenizer_separators_round_trip_and_filter(
+        self, langcache_with_attrs: LangCacheSemanticCache
+    ) -> None:
+        """All tokenizer separator characters should round-trip via filters.
+
+        This exercises the set of punctuation described in the underlying
+        RediSearch text-field tokenization docs to ensure that our
+        client-side encoding/decoding and LangCache's attribute handling
+        together can store and filter on values containing these characters.
+        """
+
+        separators = ",.<>{}[]\"':;!@#$%^&*()-+=~"
+        raw_llm_string = f"tenant {separators} value"
+
+        prompt = "Attribute encoding for all tokenizer separators"
+        response = "Response for all tokenizer separators."
+
+        entry_id = langcache_with_attrs.store(
+            prompt=prompt,
+            response=response,
+            metadata={"llm_string": raw_llm_string},
+        )
+        assert entry_id
+
+        hits = langcache_with_attrs.check(
+            prompt=prompt,
+            attributes={"llm_string": raw_llm_string},
+            num_results=5,
+        )
+
+        assert hits, "No hits returned for llm_string value with separators"
+        assert any(
+            hit.get("prompt") == prompt
+            and hit.get("response") == response
+            and hit.get("metadata", {}).get("llm_string") == raw_llm_string
+            for hit in hits
+        )
+
+    @pytest.mark.parametrize(
+        "raw_value",
+        [
+            r"tenant\\with\\backslash",
+            "tenant?with?question",
+        ],
+    )
+    def test_attribute_values_with_special_chars_round_trip_and_filter(
+        self,
+        langcache_with_attrs: LangCacheSemanticCache,
+        raw_value: str,
+    ) -> None:
+        """Backslash and question-mark values should round-trip via filters.
+
+        These values previously failed attribute filtering on this LangCache
+        instance; with client-side encoding/decoding they should now be
+        filterable and round-trip correctly.
+        """
+
+        prompt = f"Special chars attribute {raw_value}"
+        response = f"Response for {raw_value}"
+
+        entry_id = langcache_with_attrs.store(
+            prompt=prompt,
+            response=response,
+            metadata={"llm_string": raw_value},
+        )
+        assert entry_id
+
+        hits = langcache_with_attrs.check(
+            prompt=prompt,
+            attributes={"llm_string": raw_value},
+            num_results=5,
+        )
+
+        # Look for a matching hit for this prompt/response/metadata triple.
+        match_found = any(
+            hit.get("prompt") == prompt
+            and hit.get("response") == response
+            and hit.get("metadata", {}).get("llm_string") == raw_value
+            for hit in hits
+        )
+
+        assert match_found, (
+            "Expected llm_string value to be filterable, but no matching "
+            f"hit was found: {raw_value!r}"
+        )
+
 
 @pytest.mark.requires_api_keys
 class TestLangCacheSemanticCacheIntegrationWithoutAttributes:
diff --git a/tests/unit/test_langcache_semantic_cache.py b/tests/unit/test_langcache_semantic_cache.py
@@ -268,7 +268,7 @@ def test_check_with_attributes(self, mock_langcache_client):
             # should decode them before exposing them to callers.
             "attributes": {
                 "language": "python",
-                "topic": "programming，with∕encoding",
+                "topic": "programming，with∕encoding＼and？",
             },
         }
 
@@ -289,7 +289,7 @@ def test_check_with_attributes(self, mock_langcache_client):
             prompt="What is Python?",
             attributes={
                 "language": "python",
-                "topic": "programming,with/encoding",
+                "topic": r"programming,with/encoding\and?",
             },
         )
 
@@ -301,14 +301,15 @@ def test_check_with_attributes(self, mock_langcache_client):
         call_kwargs = mock_client.search.call_args.kwargs
         assert call_kwargs["attributes"] == {
             "language": "python",
-            # The comma and slash should be encoded for LangCache.
-            "topic": "programming，with∕encoding",
+            # The comma, slash, backslash, and question mark should be encoded
+            # for LangCache.
+            "topic": "programming，with∕encoding＼and？",
         }
 
         # And the decoded, original values should appear in metadata
         assert results[0]["metadata"] == {
             "language": "python",
-            "topic": "programming,with/encoding",
+            "topic": r"programming,with/encoding\and?",
         }
 
         def test_store_with_empty_metadata_does_not_send_attributes(

Original file line number	Diff line number	Diff line change
`@@ -19,6 +19,8 @@`
`19`	`19`	`{`
`20`	`20`	`",": "，", # U+FF0C FULLWIDTH COMMA`
`21`	`21`	`"/": "∕", # U+2215 DIVISION SLASH`
	`22`	`+ "\\": "＼", # U+FF3C FULLWIDTH REVERSE SOLIDUS (backslash)`
	`23`	`+ "?": "？", # U+FF1F FULLWIDTH QUESTION MARK`
`22`	`24`	`}`
`23`	`25`	`)`
`24`	`26`