Skip to content

Commit 0c1a952

Browse files
robzor92 and claude authored
[FSTORE-1970][APPEND] Fix similarity-search find_neighbors on OpenSearch 2.19.5 (k-too-large parsing + faiss efficient filtering) (#1008)
* [FSTORE-1970][APPEND] Parse OpenSearch 2.19.5 k-NN "k too large" error

  OpenSearch 2.19.5 (k-NN plugin) reports an out-of-range k via KNNQueryBuilder.Builder.validate() as "[knn] requires k to be in the range (0, N]", whereas 1.3.6 reported "[knn] requires k <= N". The vector-DB error parser only matched the old form, so the max-k discovery probe in VectorDbClient._find_neighbors could not extract the limit, left the exception info empty, and re-raised instead of caching the limit. find_neighbors() on a project index therefore failed with "Requested k is too large".

  Match both message forms (the new (0, N] range and the retained <= N form, both present in the 2.19.5 source). Also tighten the guard so it only classifies genuine upper-bound violations as REQUESTED_K_TOO_LARGE: "[knn] requires k > 0" and "[knn] requires exactly one of k, distance or score to be set" now fall through to OTHERS. The "Result window is too large" parser is unchanged; its bracketed-number format is verified against DefaultSearchContext.java at tag 2.19.5.

  Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* [FSTORE-1970][APPEND] Use faiss efficient k-NN filtering so find_neighbors returns k

  The ee OpenSearch 2.19.5 upgrade switched the embedding-index engine from nmslib to faiss. PR #951 had moved the similarity-search query to faiss efficient filtering (the filter nested inside the knn clause), but #1005 reverted it to a bool/must post-filter after hitting "[knn] unknown token [START_OBJECT] after [filter]" — the signature of an engine that does not support in-knn filtering (nmslib), most likely seen against an index not yet recreated under faiss. Post-filtering retrieves the k nearest first and prunes with the filter afterwards, so a selective filter (the per-feature-group exists clause on a shared project index) returns fewer than k results — e.g. find_neighbors k=10 returning 7.

  faiss supports efficient filtering since OpenSearch 2.9 (GA in 2.19) and applies the filter during graph traversal, guaranteeing k results when at least k exist.

  Re-apply the efficient-filter query form (reverting #1005). The explicit oversized-k path (find_neighbors k=2**31-1) still raises VectorDatabaseException via the first search, and the parser fix in this branch keeps the "k too large" message parseable on 2.19.5.

  Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent dcc96b2 commit 0c1a952

4 files changed

Lines changed: 55 additions & 26 deletions

File tree

python/hopsworks_common/core/opensearch.py

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -231,9 +231,19 @@ def _close(self):
231231

232232
def _create_vector_database_exception(self, message):
233233
"""Create appropriate VectorDatabaseException based on error message."""
234-
if "[knn] requires k" in message:
235-
pattern = r"\[knn\] requires k <= (\d+)"
236-
match = re.search(pattern, message)
234+
# Only an upper-bound violation means "k too large". Older OpenSearch
235+
# reported "[knn] requires k <= N"; newer versions (2.x) report
236+
# "[knn] requires k to be in the range (0, N]". The upper bound N is the
237+
# inclusive max in both forms. Other "[knn] requires k ..." messages
238+
# (e.g. "requires k > 0") are not too-large errors and fall through to
239+
# OTHERS.
240+
if (
241+
"[knn] requires k <=" in message
242+
or "[knn] requires k to be in the range" in message
243+
):
244+
match = re.search(r"\[knn\] requires k <= (\d+)", message) or re.search(
245+
r"\[knn\] requires k to be in the range \(\d+, (\d+)\]", message
246+
)
237247
if match:
238248
k = match.group(1)
239249
reason = VectorDatabaseException.REQUESTED_K_TOO_LARGE

python/hsfs/core/vector_db_client.py

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -133,17 +133,18 @@ def _find_neighbors(
133133
)
134134
self._check_filter(filter, embedding_feature.feature_group)
135135
col_name = embedding_feature.embedding_index.col_prefix + embedding_feature.name
136+
filter_clauses = [
137+
{"exists": {"field": col_name}},
138+
] + self._get_query_filter(filter, embedding_feature.embedding_index.col_prefix)
136139
query = {
137140
"size": k,
138141
"query": {
139-
"bool": {
140-
"must": [
141-
{"knn": {col_name: {"vector": embedding, "k": k}}},
142-
{"exists": {"field": col_name}},
143-
]
144-
+ self._get_query_filter(
145-
filter, embedding_feature.embedding_index.col_prefix
146-
)
142+
"knn": {
143+
col_name: {
144+
"vector": embedding,
145+
"k": k,
146+
"filter": {"bool": {"must": filter_clauses}},
147+
}
147148
}
148149
},
149150
"_source": list(
@@ -170,7 +171,7 @@ def _find_neighbors(
170171
# Get the max number of results allowed to request if it is not available.
171172
# This is expected to be executed once only.
172173
if not VectorDbClient._index_result_limit_k.get(index_name):
173-
query["query"]["bool"]["must"][0]["knn"][col_name]["k"] = 2**31 - 1
174+
query["query"]["knn"][col_name]["k"] = 2**31 - 1
174175
try:
175176
# It is expected that this request ALWAYS fails because requested k is too large.
176177
# The purpose here is to get the max k allowed from the vector database, and cache it.
@@ -189,7 +190,7 @@ def _find_neighbors(
189190
)
190191
else:
191192
raise e
192-
query["query"]["bool"]["must"][0]["knn"][col_name]["k"] = min(
193+
query["query"]["knn"][col_name]["k"] = min(
193194
VectorDbClient._index_result_limit_k.get(index_name, k), 3 * k
194195
)
195196
results = opensearch_client._search(

python/tests/core/test_opensearch.py

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,12 @@ class TestOpenSearchClientSingleton:
3636
VectorDatabaseException.REQUESTED_K_TOO_LARGE,
3737
{},
3838
),
39+
(
40+
# Newer OpenSearch k-NN range format
41+
"[knn] requires k to be in the range (0, 10000]",
42+
VectorDatabaseException.REQUESTED_K_TOO_LARGE,
43+
{VectorDatabaseException.REQUESTED_K_TOO_LARGE_INFO_K: 10000},
44+
),
3945
(
4046
"Result window is too large, from + size must be less than or equal to: [10000] but was [80000]",
4147
VectorDatabaseException.REQUESTED_NUM_RESULT_TOO_LARGE,
@@ -47,6 +53,18 @@ class TestOpenSearchClientSingleton:
4753
VectorDatabaseException.REQUESTED_NUM_RESULT_TOO_LARGE,
4854
{},
4955
),
56+
(
57+
# Lower-bound violation is not a "too large" error
58+
"[knn] requires k > 0",
59+
VectorDatabaseException.OTHERS,
60+
{},
61+
),
62+
(
63+
# Unrelated knn validation must not be misclassified as too large
64+
"[knn] requires exactly one of k, distance or score to be set",
65+
VectorDatabaseException.OTHERS,
66+
{},
67+
),
5068
(
5169
"Some other error message",
5270
VectorDatabaseException.OTHERS,

python/tests/core/test_vector_db_client.py

Lines changed: 13 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -494,25 +494,25 @@ def test_find_neighbors_builds_knn_query_without_filter(self):
494494

495495
body = self.mock_os_wrapper._search.call_args.kwargs["body"]
496496
assert body["size"] == 5
497-
# The knn clause and filters are combined in a bool query. OpenSearch
498-
# 2.19 rejects a `filter` nested inside the knn clause.
499-
must = body["query"]["bool"]["must"]
500-
knn = must[0]["knn"]["f2"]
497+
knn = body["query"]["knn"]["f2"]
501498
assert knn["vector"] == [1.0, 2.0, 3.0]
502499
assert knn["k"] == 5
503-
assert "filter" not in knn
504-
assert must[1] == {"exists": {"field": "f2"}}
500+
# The filter lives inside the knn clause (OpenSearch 2.x KNN syntax),
501+
# not as a sibling bool/post_filter.
502+
assert knn["filter"] == {"bool": {"must": [{"exists": {"field": "f2"}}]}}
505503

506504
def test_find_neighbors_builds_knn_query_with_filter(self):
507505
self.target._find_neighbors(
508506
[1.0, 2.0, 3.0], feature=self.f2, k=5, filter=self.f3 > 10
509507
)
510508

511509
body = self.mock_os_wrapper._search.call_args.kwargs["body"]
512-
must = body["query"]["bool"]["must"]
513-
knn = must[0]["knn"]["f2"]
514-
assert "filter" not in knn
515-
assert must[1:] == [
516-
{"exists": {"field": "f2"}},
517-
{"range": {"f3": {"gt": 10}}},
518-
]
510+
knn = body["query"]["knn"]["f2"]
511+
assert knn["filter"] == {
512+
"bool": {
513+
"must": [
514+
{"exists": {"field": "f2"}},
515+
{"range": {"f3": {"gt": 10}}},
516+
]
517+
}
518+
}

0 commit comments

Comments
 (0)