Skip to content

Commit d955fa8

Browse files
Add support for OpenSearch as a database (#300)
## Problem Describe the purpose of this change. What problem is being solved and why? ## Solution Describe the approach you took. Link to any relevant bugs, issues, docs, or other resources. ## Type of Change - [ ] Bug fix (non-breaking change which fixes an issue) - [ ] New feature (non-breaking change which adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) - [ ] This change requires a documentation update - [ ] Infrastructure change (CI configs, etc) - [ ] Non-code change (docs, etc) - [ ] None of the above: (explain here) ## Test Plan Describe specific steps for validating this change. --------- Co-authored-by: Vamshi Krishna Enabothala <[email protected]>
1 parent eebcb76 commit d955fa8

File tree

12 files changed

+682
-31
lines changed

12 files changed

+682
-31
lines changed

Diff for: .github/workflows/python-package.yml

+3-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ jobs:
1919
build:
2020

2121
runs-on: ubuntu-latest
22-
timeout-minutes: 15
22+
timeout-minutes: 30
2323
strategy:
2424
fail-fast: false
2525
matrix:
@@ -52,6 +52,7 @@ jobs:
5252
- name: Set up Docker Compose
5353
run: |
5454
docker compose -f docker/pgvector/docker-compose.yml up -d
55+
docker compose -f docker/opensearch/docker-compose.yml up -d
5556
- name: Test with pytest
5657
run: |
5758
# Maximum observed test runtime in CI is ~60s. Set a per-test timeout of
@@ -76,3 +77,4 @@ jobs:
7677
if: always()
7778
run: |
7879
docker compose -f docker/pgvector/docker-compose.yml down
80+
docker compose -f docker/opensearch/docker-compose.yml down

Diff for: .gitignore

+6
Original file line numberDiff line numberDiff line change
@@ -158,3 +158,9 @@ cython_debug/
158158
# and can be added to the global gitignore or merged into this file. For a more nuclear
159159
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
160160
.idea/
161+
162+
# reports
163+
reports/
164+
165+
# DS_Store
166+
.DS_Store

Diff for: README.md

+1
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ The following databases are currently supported by VSB:
110110

111111
* [Pinecone](vsb/databases/pinecone/README.md)
112112
* [pgvector](vsb/databases/pgvector/README.md)
113+
* [OpenSearch](vsb/databases/opensearch/README.md)
113114

114115
> [!TIP]
115116
> You can also display the list of supported databases using the following command:

Diff for: docker/opensearch/docker-compose.yml

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
services:
2+
opensearch:
3+
image: opensearchproject/opensearch:2
4+
container_name: vsb_opensearch
5+
environment:
6+
discovery.type: single-node
7+
# Necessary to be able to use non-TLS (for simplifying local testing)
8+
DISABLE_SECURITY_PLUGIN: "true"
9+
deploy:
10+
resources:
11+
# By default, limit to 8GB RAM & 2 CPU cores - reasonable values which
12+
# should be sufficient for a range of workloads and typical commodity
13+
# hardware can support.
14+
# Adjust as needed for larger workloads.
15+
limits:
16+
memory: 8GB
17+
cpus: '2'
18+
shm_size: 8GB # Equal to limits.memory.
19+
ports:
20+
- "9200:9200"
21+
- "9600:9600"
22+
restart: always

Diff for: poetry.lock

+55-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: pyproject.toml

+2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ hdrhistogram = "^0.10.3"
2626
tenacity = "^9.0.0"
2727
rich = "^13.8.1"
2828
filelock = "^3.18.0"
29+
opensearch-py = "^2.8.0"
30+
requests-aws4auth = "^1.3.1"
2931

3032
[tool.poetry.scripts]
3133
vsb = "vsb.main:main"

Diff for: tests/integration/test_common.py

+61-29
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,12 @@
2626
spawn_vsb_pinecone,
2727
)
2828
from test_pgvector import spawn_vsb_pgvector
29+
from test_opensearch import spawn_vsb_opensearch
2930

3031

31-
@pytest.mark.parametrize("spawn_vsb", [spawn_vsb_pgvector, spawn_vsb_pinecone])
32+
@pytest.mark.parametrize(
33+
"spawn_vsb", [spawn_vsb_pgvector, spawn_vsb_pinecone, spawn_vsb_opensearch]
34+
)
3235
class TestCommon:
3336

3437
# Unfortunately pytest won't let us selectively parametrize with fixtures, so
@@ -52,8 +55,8 @@ def test_mnist_single(
5255
check_request_counts(
5356
stdout,
5457
{
55-
# Populate num_requests counts batches, not individual records.
56-
"Populate": {"num_requests": lambda x: x <= 2, "num_failures": 0},
58+
# Populate num_requests counts batches, not individual records (600).
59+
"Populate": {"num_requests": lambda x: x < 600, "num_failures": 0},
5760
"Search": {
5861
"num_requests": 20,
5962
"num_failures": 0,
@@ -82,10 +85,10 @@ def test_mnist_concurrent(
8285
stdout,
8386
{
8487
# For multiple users the populate phase will chunk the records to be
85-
# loaded into num_users chunks - i.e. 4 here. Given the size of each
86-
# chunk will be less than the batch size (600 / 4 < 1000), then the
87-
# number of requests will be equal to the number of users - i.e. 4
88-
"Populate": {"num_requests": 4, "num_failures": 0},
88+
# loaded into num_users chunks - i.e. 4 here. Different DBs
89+
# use different batch sizes, so just check we have fewer than
90+
# number of records (600) / number of users (4).
91+
"Populate": {"num_requests": lambda x: x < 600 / 4, "num_failures": 0},
8992
"Search": {
9093
"num_requests": 20,
9194
"num_failures": 0,
@@ -114,10 +117,10 @@ def test_mnist_multiprocess(
114117
stdout,
115118
{
116119
# For multiple users the populate phase will chunk the records to be
117-
# loaded into num_users chunks - i.e. 4 here. Given the size of each
118-
# chunk will be less than the batch size (600 / 4 < 1000), then the
119-
# number of requests will be equal to the number of users - i.e. 4
120-
"Populate": {"num_requests": 4, "num_failures": 0},
120+
# loaded into num_users chunks - i.e. 4 here. Different DBs
121+
# use different batch sizes, so just check we have fewer than
122+
# number of records (600) / number of users (4).
123+
"Populate": {"num_requests": lambda x: x < 600 / 4, "num_failures": 0},
121124
# The number of Search requests should equal the number in the dataset
122125
# (20 for mnist-test).
123126
"Search": {
@@ -145,15 +148,22 @@ def test_mnist_double(
145148
check_request_counts(
146149
stdout,
147150
{
148-
"test1.Populate": {"num_requests": lambda x: x <= 2, "num_failures": 0},
151+
# Populate num_requests counts batches, not individual records (600).
152+
"test1.Populate": {
153+
"num_requests": lambda x: x < 600,
154+
"num_failures": 0,
155+
},
149156
# The number of Search requests should equal the number in the dataset
150157
# (20 for mnist-test).
151158
"test1.Search": {
152159
"num_requests": 20,
153160
"num_failures": 0,
154161
"Recall": check_recall_stats,
155162
},
156-
"test2.Populate": {"num_requests": lambda x: x <= 2, "num_failures": 0},
163+
"test2.Populate": {
164+
"num_requests": lambda x: x < 600,
165+
"num_failures": 0,
166+
},
157167
"test2.Search": {
158168
"num_requests": 20,
159169
"num_failures": 0,
@@ -182,18 +192,24 @@ def test_mnist_double_concurrent(
182192
stdout,
183193
{
184194
# For multiple users the populate phase will chunk the records to be
185-
# loaded into num_users chunks - i.e. 4 here. Given the size of each
186-
# chunk will be less than the batch size (600 / 4 < 200), then the
187-
# number of requests will be equal to the number of users - i.e. 4
188-
"test1.Populate": {"num_requests": 4, "num_failures": 0},
195+
# loaded into num_users chunks - i.e. 4 here. Different DBs
196+
# use different batch sizes, so just check we have fewer than
197+
# number of records (600) / number of users (4).
198+
"test1.Populate": {
199+
"num_requests": lambda x: x < 600 / 4,
200+
"num_failures": 0,
201+
},
189202
# The number of Search requests should equal the number in the dataset
190203
# (20 for mnist-test).
191204
"test1.Search": {
192205
"num_requests": 20,
193206
"num_failures": 0,
194207
"Recall": check_recall_stats,
195208
},
196-
"test2.Populate": {"num_requests": 4, "num_failures": 0},
209+
"test2.Populate": {
210+
"num_requests": lambda x: x < 600 / 4,
211+
"num_failures": 0,
212+
},
197213
"test2.Search": {
198214
"num_requests": 20,
199215
"num_failures": 0,
@@ -222,18 +238,28 @@ def test_mnist_double_multiprocess(
222238
stdout,
223239
{
224240
# For multiple users the populate phase will chunk the records to be
225-
# loaded into num_users chunks - i.e. 4 here. Given the size of each
226-
# chunk will be less than the batch size (600 / 4 < 200), then the
227-
# number of requests will be equal to the number of users - i.e. 4
228-
"test1.Populate": {"num_requests": 4, "num_failures": 0},
241+
# loaded into num_users chunks - i.e. 4 here. Different DBs
242+
# use different batch sizes, so just check we have fewer than
243+
# number of records (600) / number of users (4).
244+
"test1.Populate": {
245+
"num_requests": lambda x: x < 600 / 4,
246+
"num_failures": 0,
247+
},
229248
# The number of Search requests should equal the number in the dataset
230249
# (20 for mnist-test).
231250
"test1.Search": {
232251
"num_requests": 20,
233252
"num_failures": 0,
234253
"Recall": check_recall_stats,
235254
},
236-
"test2.Populate": {"num_requests": 4, "num_failures": 0},
255+
# For multiple users the populate phase will chunk the records to be
256+
# loaded into num_users chunks - i.e. 4 here. Different DBs
257+
# use different batch sizes, so just check we have fewer than
258+
# number of records (600) / number of users (4).
259+
"test2.Populate": {
260+
"num_requests": lambda x: x < 600 / 4,
261+
"num_failures": 0,
262+
},
237263
"test2.Search": {
238264
"num_requests": 20,
239265
"num_failures": 0,
@@ -262,8 +288,8 @@ def test_mnist_skip_populate(
262288
check_request_counts(
263289
stdout,
264290
{
265-
# Populate num_requests counts batches, not individual records.
266-
"Populate": {"num_requests": lambda x: x <= 2, "num_failures": 0},
291+
# Populate num_requests counts batches, not individual records (600).
292+
"Populate": {"num_requests": lambda x: x < 600, "num_failures": 0},
267293
"Search": {"num_requests": 20, "num_failures": 0},
268294
},
269295
)
@@ -307,7 +333,7 @@ def test_filtered(
307333
{
308334
# Populate num_requests counts batches, not individual records.
309335
"Populate": {
310-
"num_requests": lambda x: x == 10 or x == 210,
336+
"num_requests": lambda x: x > 1 and x < 10000,
311337
"num_failures": 0,
312338
},
313339
"Search": {
@@ -335,7 +361,7 @@ def test_synthetic(
335361
check_request_counts(
336362
stdout,
337363
{
338-
"Populate": {"num_requests": 10, "num_failures": 0},
364+
"Populate": {"num_failures": 0},
339365
"Search": {
340366
"num_requests": 100,
341367
"num_failures": 0,
@@ -367,7 +393,7 @@ def test_synthetic_runbook(
367393
check_request_counts(
368394
stdout,
369395
{
370-
"Populate": {"num_requests": lambda x: x <= 4, "num_failures": 0},
396+
"Populate": {"num_failures": 0},
371397
"Search": {
372398
"num_requests": 500,
373399
"num_failures": 0,
@@ -382,6 +408,12 @@ def test_synthetic_proportional(
382408
pinecone_api_key,
383409
pinecone_index_synthetic,
384410
):
411+
if spawn_vsb == spawn_vsb_opensearch:
412+
pytest.skip(
413+
"Synthetic proportional test not supported on OpenSearch ("
414+
"fetch_batch not yet implemented for OpenSearch)"
415+
)
416+
385417
(proc, stdout, stderr) = spawn_vsb(
386418
pinecone_api_key=pinecone_api_key,
387419
pinecone_index=pinecone_index_synthetic,
@@ -404,7 +436,7 @@ def test_synthetic_proportional(
404436
check_request_counts(
405437
stdout,
406438
{
407-
"Populate": {"num_requests": lambda x: x <= 4, "num_failures": 0},
439+
"Populate": {"num_failures": 0},
408440
"Search": {
409441
"num_requests": lambda x: (x >= 150 and x <= 250),
410442
"num_failures": 0,

Diff for: tests/integration/test_opensearch.py

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from conftest import (
2+
spawn_vsb_inner,
3+
)
4+
5+
6+
# used in test_common
7+
def spawn_vsb_opensearch(workload, timeout=120, extra_args=None, **kwargs):
8+
"""Spawn an instance of opensearch vsb with the given arguments, returning the proc object,
9+
its stdout and stderr.
10+
"""
11+
extra_env = {
12+
"VSB__OPENSEARCH_USERNAME": "admin",
13+
"VSB__OPENSEARCH_PASSWORD": "opensearch",
14+
"VSB__OPENSEARCH_USE_TLS": "false",
15+
}
16+
return spawn_vsb_inner("opensearch", workload, timeout, extra_args, extra_env)

0 commit comments

Comments
 (0)