Skip to content

[Bug]: Milvus produce wrong results when inserting more data #43456

@JZuming

Description

@JZuming

Is there an existing issue for this?

  • I have searched the existing issues

Environment

- Milvus version: commit 26d6918010f4867f5f32a144594cc1460c246cb6
- Deployment mode(standalone or cluster): standalone 
- MQ type(rocksmq, pulsar or kafka):    
- SDK version(e.g. pymilvus v2.0.0rc2): 2.5.10
- OS(Ubuntu or CentOS): Ubuntu 20.04.4 LTS
- CPU/Memory: 
- GPU: 
- Others:

Current Behavior

Milvus returns search results that include data with the primary key -296356. This data does not satisfy the filter.

Expected Behavior

The search results should not include the data with primary key -296356.

Steps To Reproduce

When the collection contains only one row (primary key: -296356), that row is not in the result set. Here is the script.

from pymilvus import MilvusClient, DataType, MilvusException, AnnSearchRequest, RRFRanker, WeightedRanker
# Connect to the Milvus server listening on 127.0.0.1:19530.
# NOTE(review): pymilvus documents this keyword as `db_name`; confirm that
# `dbname` is honored and that "testdb" is really the database being used.
client = MilvusClient("http://127.0.0.1:19530", dbname="testdb")
# Reset: release and drop EVERY collection the client lists so the run starts
# from an empty state. Destructive -- do not point this at real data.
for c in client.list_collections():
    client.release_collection(c)
    client.drop_collection(c)
# Build the schema. Primary keys are supplied by the script (auto_id=False)
# and the dynamic field is enabled.
schema = MilvusClient.create_schema(auto_id=False, enable_dynamic_field=True)
# f0: FLOAT scalar -- this is the field the search filter compares against.
schema.add_field(field_name="f0", datatype=DataType.FLOAT, nullable=False)
# f1: INT64 primary key.
schema.add_field(field_name="f1", datatype=DataType.INT64, is_primary=True, nullable=False)
# f2-f5, f7, f8: additional non-nullable scalar / JSON / VARCHAR / BOOL fields.
schema.add_field(field_name="f2", datatype=DataType.FLOAT, nullable=False)
schema.add_field(field_name="f3", datatype=DataType.JSON, nullable=False)
schema.add_field(field_name="f4", datatype=DataType.FLOAT, nullable=False)
schema.add_field(field_name="f5", datatype=DataType.VARCHAR, is_primary=False, nullable=False, max_length=10)
# f6: sparse float vector.
schema.add_field(field_name="f6", datatype=DataType.SPARSE_FLOAT_VECTOR)
schema.add_field(field_name="f7", datatype=DataType.VARCHAR, is_primary=False, nullable=False, max_length=10)
schema.add_field(field_name="f8", datatype=DataType.BOOL, nullable=False)
# f9 (dim=89) is the dense vector field searched later; f10 (dim=26) is a
# second dense vector field.
schema.add_field(field_name="f9", datatype=DataType.FLOAT_VECTOR, dim=89)
schema.add_field(field_name="f10", datatype=DataType.FLOAT_VECTOR, dim=26)
# Create the collection from the schema above.
client.create_collection(collection_name="test_collection", schema=schema)

# The only row in this variant of the script: primary key f1 = -296356 and
# f0 = 0 (the value the search filter is evaluated against).
data_list = [
{'f0': 0, 'f1': -296356, 'f2': 0.76461, 'f3': {'e3': -331233, 'e4': 'l$D39', 'e1': '', 'e2': 0.95488, 'e0': False}, 'f4': -0.13584, 'f5': '', 'f6': {14: 0.82363, 52: 0.85714, 2: 0.05396, 5: 0.7158, 30: 0.43401, 32: 0.695, 10: 0.56168, 48: 0.47008, 50: 0.0668, 12: 0.50963, 36: 0.08056, 7: 0.0464, 15: 0.07577, 43: 0.29906, 1: 0.40775, 27: 0.2364, 11: 0.13596, 31: 0.62922, 28: 0.92623}, 'f7': '>r`iA/', 'f8': True, 'f9': [0.58333, 0.7374, 0.11658, 0.6786, 0.81488, 0.31308, 0.27886, 0.11212, 0.34482, 0.27999, 0.66568, 0.43929, 0.92385, 0.07938, 0.03509, 0.83712, 0.8836, 0.21087, 0.9604, 0.71506, 0.67393, 0.8263, 0.44555, 0.038, 0.36016, 0.59057, 0.08762, 0.93871, 0.89521, 0.5914, 0.31742, 0.89466, 0.73824, 0.84408, 0.24381, 0.0906, 0.3234, 0.6078, 0.09166, 0.23097, 0.27033, 0.47886, 0.25724, 0.37703, 0.82721, 0.6242, 0.5836, 0.50032, 0.90689, 0.34716, 0.04097, 0.22751, 0.02406, 0.00611, 0.04144, 0.29239, 0.1643, 0.76008, 0.10569, 0.60408, 0.28502, 0.92491, 0.68588, 0.45493, 0.86768, 0.60066, 0.83632, 0.18423, 0.66455, 0.39108, 0.08326, 0.12457, 0.27832, 0.04571, 0.02868, 0.42971, 0.31233, 0.40252, 0.59094, 0.52488, 0.40057, 0.60177, 0.03917, 0.65095, 0.48649, 0.26287, 0.5662, 0.91348, 0.38051], 'f10': [0.96784, 0.67922, 0.49219, 0.02456, 0.94469, 0.70542, 0.07587, 0.67856, 0.70405, 0.37983, 0.47559, 0.95118, 0.26859, 0.35598, 0.18755, 0.39294, 0.96862, 0.20765, 0.9424, 0.89446, 0.09588, 0.24696, 0.01866, 0.58048, 0.51235, 0.06671]},
]
# Write the row into the collection, then flush the collection before the
# indexes are built.
client.upsert(collection_name='test_collection', data=data_list)
client.flush("test_collection")

# Each index is created with its own freshly prepared index-params object and
# its own create_index call, in the order i0..i6.

# i0: INVERTED index on scalar field f0 (the field used by the search filter).
index_params = client.prepare_index_params()
params = {}
index_params.add_index(field_name="f0", index_type="INVERTED", metric_type="", index_name="i0", params=params)
client.create_index(collection_name="test_collection", index_params=index_params)
# i1: STL_SORT index on the primary-key field f1.
index_params = client.prepare_index_params()
params = {}
index_params.add_index(field_name="f1", index_type="STL_SORT", metric_type="", index_name="i1", params=params)
client.create_index(collection_name="test_collection", index_params=index_params)
# i2: INVERTED index on the JSON path f3["e2"], cast to double.
index_params = client.prepare_index_params()
params = {'json_path': 'f3["e2"]', 'json_cast_type': 'double'}
index_params.add_index(field_name="f3", index_type="INVERTED", metric_type="", index_name="i2", params=params)
client.create_index(collection_name="test_collection", index_params=index_params)
# i3: INVERTED index on VARCHAR field f5.
index_params = client.prepare_index_params()
params = {}
index_params.add_index(field_name="f5", index_type="INVERTED", metric_type="", index_name="i3", params=params)
client.create_index(collection_name="test_collection", index_params=index_params)
# i4: SPARSE_INVERTED_INDEX on sparse vector field f6, metric IP.
index_params = client.prepare_index_params()
params = {}
index_params.add_index(field_name="f6", index_type="SPARSE_INVERTED_INDEX", metric_type="IP", index_name="i4", params=params)
client.create_index(collection_name="test_collection", index_params=index_params)
# i5: FLAT index on dense vector field f9 (the field searched later), metric IP.
index_params = client.prepare_index_params()
params = {}
index_params.add_index(field_name="f9", index_type="FLAT", metric_type="IP", index_name="i5", params=params)
client.create_index(collection_name="test_collection", index_params=index_params)
# i6: FLAT index on dense vector field f10, metric COSINE.
index_params = client.prepare_index_params()
params = {}
index_params.add_index(field_name="f10", index_type="FLAT", metric_type="COSINE", index_name="i6", params=params)
client.create_index(collection_name="test_collection", index_params=index_params)
# Load the collection so it can be searched.
client.load_collection(collection_name="test_collection")

# Filtered vector search on f9 with a single query vector.
# Why the filter matters: in IEEE-754 arithmetic (-0.177856) * 0.0 is -0.0,
# and -0.0 compares equal to 0. The inserted row with primary key -296356 has
# f0 = 0, so `f0 != ...` should be false for it and, per the report's expected
# behavior, that row must not be returned.
res = client.search(
                        collection_name='test_collection',
                        data=[[0, 0.53811, -0.82241, 0.65733, 0, -0.01755, -0.34571, 0.32396, -0.19506, 0, 0, -0.1337, -0.36827, -0.01782, 0.2446, 0.7622, 0, 0.88877, -0.54753, -0.51149, 0, 0.02048, -0.10037, 0.56787, -0.29471, -0.10367, 0.50114, 0.8728, 0.28117, 0.68249, 0.17557, 0.13498, 0, 0.93964, 0, 0, -0.12996, 0.78098, -0.50289, -0.03264, 0, -0.46652, 0.01055, 0.57724, -0.7345, -0.95856, 0, -0.87085, -0.28906, 0.33854, 0, -0.96181, 0.37202, 0, 0.53103, 0.07325, -0.93922, 0.90769, -0.30736, 0, -0.15911, 0, 0.78893, 0, 0.34724, -0.43212, 0.51935, -0.68594, 0.39554, 0.30814, -0.15307, -0.19373, 0.63046, -0.58007, 0.06308, 0.2789, -0.95066, -0.18576, -0.55116, -0.9511, -0.74247, 0.49651, -0.1782, -0.71782, 0.30096, -0.56449, 0.7167, 0, -0.48895]],
                        anns_field='f9', 
                        filter='f0 != ((-0.177856) * 0.0)', 
                        limit=5050, 
                        output_fields=['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10'], 
                        search_params={'params': {}})
# Only one query vector was sent, so keep just the first result list.
res = res[0]
res_old = res

def get_key_list_from_res(search_res, primary_field):
    """Return the ``primary_field`` value of every entry in ``search_res``.

    Args:
        search_res: Iterable of entries that support ``entry[primary_field]``
            lookup (here: the hits returned for one query vector).
        primary_field: Key under which each entry stores its primary key.

    Returns:
        A new list with one value per entry, in iteration order; empty when
        ``search_res`` is empty.

    Raises:
        KeyError: If a dict-like entry has no ``primary_field`` key.
    """
    # A comprehension avoids shadowing the builtin ``list`` with a local name.
    return [entry[primary_field] for entry in search_res]

import sys

# Primary-key field of the collection and the key of the row under test.
primary_field = 'f1'
a = -296356

# Primary keys of all hits returned by the search.
old_key_list = get_key_list_from_res(res_old, primary_field)

if a in old_key_list:
    # The row was returned: report it and exit with status 0, which the
    # script treats as the "interesting" outcome.
    print("a is in the result set")
    print(f"data[a]: {res_old[old_key_list.index(a)]}")
    sys.exit(0)

# The row was not returned: exit with a non-zero status.
print("a is not in the result set")
sys.exit(-1)

It outputs "a is not in the result set".

However, after I inserted 1000+ additional rows, the search result unexpectedly contains the row with primary key -296356, which does not satisfy the filter condition. Here is the simplified script (it does not contain all of the 1000+ rows; to reproduce the bug, the additional 1000+ rows must be inserted as well). The complete script is attached as test_1000.py.txt.

from pymilvus import MilvusClient, DataType, MilvusException, AnnSearchRequest, RRFRanker, WeightedRanker
# Connect to the Milvus server listening on 127.0.0.1:19530.
# NOTE(review): pymilvus documents this keyword as `db_name`; confirm that
# `dbname` is honored and that "testdb" is really the database being used.
client = MilvusClient("http://127.0.0.1:19530", dbname="testdb")
# Reset: release and drop EVERY collection the client lists so the run starts
# from an empty state. Destructive -- do not point this at real data.
for c in client.list_collections():
    client.release_collection(c)
    client.drop_collection(c)
# Build the schema. Primary keys are supplied by the script (auto_id=False)
# and the dynamic field is enabled.
schema = MilvusClient.create_schema(auto_id=False, enable_dynamic_field=True)
# f0: FLOAT scalar -- this is the field the search filter compares against.
schema.add_field(field_name="f0", datatype=DataType.FLOAT, nullable=False)
# f1: INT64 primary key.
schema.add_field(field_name="f1", datatype=DataType.INT64, is_primary=True, nullable=False)
# f2-f5, f7, f8: additional non-nullable scalar / JSON / VARCHAR / BOOL fields.
schema.add_field(field_name="f2", datatype=DataType.FLOAT, nullable=False)
schema.add_field(field_name="f3", datatype=DataType.JSON, nullable=False)
schema.add_field(field_name="f4", datatype=DataType.FLOAT, nullable=False)
schema.add_field(field_name="f5", datatype=DataType.VARCHAR, is_primary=False, nullable=False, max_length=10)
# f6: sparse float vector.
schema.add_field(field_name="f6", datatype=DataType.SPARSE_FLOAT_VECTOR)
schema.add_field(field_name="f7", datatype=DataType.VARCHAR, is_primary=False, nullable=False, max_length=10)
schema.add_field(field_name="f8", datatype=DataType.BOOL, nullable=False)
# f9 (dim=89) is the dense vector field searched later; f10 (dim=26) is a
# second dense vector field.
schema.add_field(field_name="f9", datatype=DataType.FLOAT_VECTOR, dim=89)
schema.add_field(field_name="f10", datatype=DataType.FLOAT_VECTOR, dim=26)
# Create the collection from the schema above.
client.create_collection(collection_name="test_collection", schema=schema)

data_list = [
{'f0': 0, 'f1': -296356, 'f2': 0.76461, 'f3': {'e3': -331233, 'e4': 'l$D39', 'e1': '', 'e2': 0.95488, 'e0': False}, 'f4': -0.13584, 'f5': '', 'f6': {14: 0.82363, 52: 0.85714, 2: 0.05396, 5: 0.7158, 30: 0.43401, 32: 0.695, 10: 0.56168, 48: 0.47008, 50: 0.0668, 12: 0.50963, 36: 0.08056, 7: 0.0464, 15: 0.07577, 43: 0.29906, 1: 0.40775, 27: 0.2364, 11: 0.13596, 31: 0.62922, 28: 0.92623}, 'f7': '>r`iA/', 'f8': True, 'f9': [0.58333, 0.7374, 0.11658, 0.6786, 0.81488, 0.31308, 0.27886, 0.11212, 0.34482, 0.27999, 0.66568, 0.43929, 0.92385, 0.07938, 0.03509, 0.83712, 0.8836, 0.21087, 0.9604, 0.71506, 0.67393, 0.8263, 0.44555, 0.038, 0.36016, 0.59057, 0.08762, 0.93871, 0.89521, 0.5914, 0.31742, 0.89466, 0.73824, 0.84408, 0.24381, 0.0906, 0.3234, 0.6078, 0.09166, 0.23097, 0.27033, 0.47886, 0.25724, 0.37703, 0.82721, 0.6242, 0.5836, 0.50032, 0.90689, 0.34716, 0.04097, 0.22751, 0.02406, 0.00611, 0.04144, 0.29239, 0.1643, 0.76008, 0.10569, 0.60408, 0.28502, 0.92491, 0.68588, 0.45493, 0.86768, 0.60066, 0.83632, 0.18423, 0.66455, 0.39108, 0.08326, 0.12457, 0.27832, 0.04571, 0.02868, 0.42971, 0.31233, 0.40252, 0.59094, 0.52488, 0.40057, 0.60177, 0.03917, 0.65095, 0.48649, 0.26287, 0.5662, 0.91348, 0.38051], 'f10': [0.96784, 0.67922, 0.49219, 0.02456, 0.94469, 0.70542, 0.07587, 0.67856, 0.70405, 0.37983, 0.47559, 0.95118, 0.26859, 0.35598, 0.18755, 0.39294, 0.96862, 0.20765, 0.9424, 0.89446, 0.09588, 0.24696, 0.01866, 0.58048, 0.51235, 0.06671]},
... 1k more data
]
# Write all rows of data_list into the collection, then flush the collection
# before the indexes are built.
client.upsert(collection_name='test_collection', data=data_list)
client.flush("test_collection")

# Each index is created with its own freshly prepared index-params object and
# its own create_index call, in the order i0..i6.

# i0: INVERTED index on scalar field f0 (the field used by the search filter).
index_params = client.prepare_index_params()
params = {}
index_params.add_index(field_name="f0", index_type="INVERTED", metric_type="", index_name="i0", params=params)
client.create_index(collection_name="test_collection", index_params=index_params)
# i1: STL_SORT index on the primary-key field f1.
index_params = client.prepare_index_params()
params = {}
index_params.add_index(field_name="f1", index_type="STL_SORT", metric_type="", index_name="i1", params=params)
client.create_index(collection_name="test_collection", index_params=index_params)
# i2: INVERTED index on the JSON path f3["e2"], cast to double.
index_params = client.prepare_index_params()
params = {'json_path': 'f3["e2"]', 'json_cast_type': 'double'}
index_params.add_index(field_name="f3", index_type="INVERTED", metric_type="", index_name="i2", params=params)
client.create_index(collection_name="test_collection", index_params=index_params)
# i3: INVERTED index on VARCHAR field f5.
index_params = client.prepare_index_params()
params = {}
index_params.add_index(field_name="f5", index_type="INVERTED", metric_type="", index_name="i3", params=params)
client.create_index(collection_name="test_collection", index_params=index_params)
# i4: SPARSE_INVERTED_INDEX on sparse vector field f6, metric IP.
index_params = client.prepare_index_params()
params = {}
index_params.add_index(field_name="f6", index_type="SPARSE_INVERTED_INDEX", metric_type="IP", index_name="i4", params=params)
client.create_index(collection_name="test_collection", index_params=index_params)
# i5: FLAT index on dense vector field f9 (the field searched later), metric IP.
index_params = client.prepare_index_params()
params = {}
index_params.add_index(field_name="f9", index_type="FLAT", metric_type="IP", index_name="i5", params=params)
client.create_index(collection_name="test_collection", index_params=index_params)
# i6: FLAT index on dense vector field f10, metric COSINE.
index_params = client.prepare_index_params()
params = {}
index_params.add_index(field_name="f10", index_type="FLAT", metric_type="COSINE", index_name="i6", params=params)
client.create_index(collection_name="test_collection", index_params=index_params)
# Load the collection so it can be searched.
client.load_collection(collection_name="test_collection")

# Filtered vector search on f9 with a single query vector.
# Why the filter matters: in IEEE-754 arithmetic (-0.177856) * 0.0 is -0.0,
# and -0.0 compares equal to 0. The row with primary key -296356 has f0 = 0,
# so `f0 != ...` should be false for it and, per the report's expected
# behavior, that row must not be returned.
res = client.search(
                        collection_name='test_collection',
                        data=[[0, 0.53811, -0.82241, 0.65733, 0, -0.01755, -0.34571, 0.32396, -0.19506, 0, 0, -0.1337, -0.36827, -0.01782, 0.2446, 0.7622, 0, 0.88877, -0.54753, -0.51149, 0, 0.02048, -0.10037, 0.56787, -0.29471, -0.10367, 0.50114, 0.8728, 0.28117, 0.68249, 0.17557, 0.13498, 0, 0.93964, 0, 0, -0.12996, 0.78098, -0.50289, -0.03264, 0, -0.46652, 0.01055, 0.57724, -0.7345, -0.95856, 0, -0.87085, -0.28906, 0.33854, 0, -0.96181, 0.37202, 0, 0.53103, 0.07325, -0.93922, 0.90769, -0.30736, 0, -0.15911, 0, 0.78893, 0, 0.34724, -0.43212, 0.51935, -0.68594, 0.39554, 0.30814, -0.15307, -0.19373, 0.63046, -0.58007, 0.06308, 0.2789, -0.95066, -0.18576, -0.55116, -0.9511, -0.74247, 0.49651, -0.1782, -0.71782, 0.30096, -0.56449, 0.7167, 0, -0.48895]],
                        anns_field='f9', 
                        filter='f0 != ((-0.177856) * 0.0)', 
                        limit=5050, 
                        output_fields=['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10'], 
                        search_params={'params': {}})
# Only one query vector was sent, so keep just the first result list.
res = res[0]
res_old = res

def get_key_list_from_res(search_res, primary_field):
    """Return the ``primary_field`` value of every entry in ``search_res``.

    Args:
        search_res: Iterable of entries that support ``entry[primary_field]``
            lookup (here: the hits returned for one query vector).
        primary_field: Key under which each entry stores its primary key.

    Returns:
        A new list with one value per entry, in iteration order; empty when
        ``search_res`` is empty.

    Raises:
        KeyError: If a dict-like entry has no ``primary_field`` key.
    """
    # A comprehension avoids shadowing the builtin ``list`` with a local name.
    return [entry[primary_field] for entry in search_res]

import sys

# Primary-key field of the collection and the key of the row under test.
primary_field = 'f1'
a = -296356

# Primary keys of all hits returned by the search.
old_key_list = get_key_list_from_res(res_old, primary_field)

if a in old_key_list:
    # The row was returned: report it and exit with status 0, which the
    # script treats as the "interesting" outcome.
    print("a is in the result set")
    print(f"data[a]: {res_old[old_key_list.index(a)]}")
    sys.exit(0)

# The row was not returned: exit with a non-zero status.
print("a is not in the result set")
sys.exit(-1)

It outputs "a is in the result set".

The two complete scripts (test_1.py.txt and test_1000.py.txt) are attached for reproducing the bug.

Milvus Log

No response

Anything else?

test_1.py.txt
test_1000.py.txt

Metadata

Metadata

Assignees

Labels

kind/bug: Issues or changes related to a bug
triage/accepted: Indicates an issue or PR is ready to be actively worked on.

Type

No type

Projects

No projects

Relationships

None yet

Development

No branches or pull requests

Issue actions