Skip to content

Commit 6ae4870

Browse files
feat(supabase): add SupabaseGroongaDocumentStore and SupabaseGroongaRetriever (#3266)
Co-authored-by: David S. Batista <dsbatista@gmail.com>
1 parent 4976119 commit 6ae4870

14 files changed

Lines changed: 1369 additions & 5 deletions

File tree

.github/workflows/supabase.yml

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,11 +120,31 @@ jobs:
120120
name: coverage-comment-supabase
121121
path: python-coverage-comment-action-supabase.txt
122122

123-
- name: Run integration tests
123+
- name: Run pgvector integration tests
124124
if: runner.os == 'Linux'
125125
env:
126126
SUPABASE_DB_URL: "postgresql://postgres:postgres@localhost:5432/postgres"
127-
run: hatch run test:integration-cov-append-retry
127+
run: hatch run test:integration-cov-append-retry --ignore=tests/test_groonga_integration.py
128+
129+
- name: Start PGroonga + PostgREST stack
130+
if: runner.os == 'Linux'
131+
run: docker compose -f docker-compose-groonga.yml up -d --build
132+
133+
- name: Wait for PGroonga stack to be ready
  if: runner.os == 'Linux'
  run: |
    # Poll PostgREST through the nginx proxy (up to 30 attempts, 5 s apart).
    # Exit successfully as soon as it answers; if it never does, fail the step
    # so the integration tests are not run against a stack that is not up.
    for i in $(seq 1 30); do
      if curl -sf http://localhost:8000/rest/v1/ -H "apikey: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hj04zWl196z2-SBc0"; then
        echo "PGroonga stack is ready"
        exit 0
      fi
      echo "Waiting for PGroonga stack... ($i/30)"
      sleep 5
    done
    echo "PGroonga stack did not become ready in time" >&2
    # Dump container logs to make the startup failure diagnosable from the CI output.
    docker compose -f docker-compose-groonga.yml logs || true
    exit 1
144+
145+
- name: Run PGroonga integration tests
146+
if: runner.os == 'Linux'
147+
run: hatch run test:integration-cov-append-retry tests/test_groonga_integration.py
128148

129149
- name: Store combined coverage
130150
if: github.event_name == 'push'
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
FROM postgres:17-bookworm
2+
3+
RUN apt-get update && \
4+
apt-get install -y wget gnupg2 && \
5+
wget -q -O /tmp/groonga-apt-source.deb \
6+
https://packages.groonga.org/debian/groonga-apt-source-latest-bookworm.deb && \
7+
dpkg -i /tmp/groonga-apt-source.deb && \
8+
apt-get update && \
9+
apt-get install -y postgresql-17-pgdg-pgroonga && \
10+
rm -rf /var/lib/apt/lists/* /tmp/groonga-apt-source.deb
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
services:
2+
pgroonga-postgres:
3+
build:
4+
context: .
5+
dockerfile: Dockerfile.pgroonga
6+
environment:
7+
POSTGRES_USER: postgres
8+
POSTGRES_PASSWORD: postgres
9+
POSTGRES_DB: postgres
10+
ports:
11+
- "5433:5432"
12+
volumes:
13+
- ./init-pgroonga.sql:/docker-entrypoint-initdb.d/init-pgroonga.sql
14+
healthcheck:
15+
test: ["CMD-SHELL", "pg_isready -U postgres"]
16+
interval: 10s
17+
timeout: 5s
18+
retries: 10
19+
20+
postgrest:
21+
image: postgrest/postgrest:v12.2.0
22+
environment:
23+
PGRST_DB_URI: postgres://postgres:postgres@pgroonga-postgres:5432/postgres
24+
PGRST_DB_SCHEMAS: public
25+
# No PGRST_JWT_SECRET → JWT validation disabled; all requests run as PGRST_DB_ANON_ROLE.
26+
# supabase-py still sends an apikey header but PostgREST ignores it.
27+
PGRST_DB_ANON_ROLE: postgres
28+
PGRST_LOG_LEVEL: info
29+
ports:
30+
- "3000:3000"
31+
depends_on:
32+
pgroonga-postgres:
33+
condition: service_healthy
34+
35+
nginx:
36+
image: nginx:alpine
37+
ports:
38+
- "8000:8000"
39+
volumes:
40+
- ./nginx-groonga.conf:/etc/nginx/nginx.conf:ro
41+
depends_on:
42+
- postgrest
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
-- Enable PGroonga extension
2+
CREATE EXTENSION IF NOT EXISTS pgroonga;
3+
4+
-- PostgreSQL role that PostgREST switches to when a service_role JWT is presented.
5+
-- The role must exist before PostgREST connects.
6+
DO $$
7+
BEGIN
8+
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'service_role') THEN
9+
CREATE ROLE service_role NOLOGIN;
10+
END IF;
11+
END
12+
$$;
13+
14+
GRANT ALL ON SCHEMA public TO service_role;
15+
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO service_role;
16+
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO service_role;
17+
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON FUNCTIONS TO service_role;
18+
19+
-- exec_sql: allows the document store to create/drop tables and indexes via RPC.
20+
CREATE OR REPLACE FUNCTION exec_sql(query TEXT)
RETURNS VOID AS $$
BEGIN
    -- Executes the caller-supplied statement verbatim. Because the function is
    -- SECURITY DEFINER, the statement runs with the privileges of the function
    -- owner rather than those of the caller.
    EXECUTE query;
END;
$$ LANGUAGE plpgsql SECURITY DEFINER;

-- Functions are executable by PUBLIC by default. Revoke that so only roles that
-- are granted EXECUTE explicitly can run arbitrary SQL with the owner's privileges.
REVOKE ALL ON FUNCTION exec_sql(TEXT) FROM PUBLIC;

GRANT EXECUTE ON FUNCTION exec_sql(TEXT) TO service_role;
28+
29+
-- groonga_search: full-text search via PGroonga, called by _groonga_retrieval().
30+
CREATE OR REPLACE FUNCTION groonga_search(query_text TEXT, table_name TEXT, top_k INT)
RETURNS TABLE(id TEXT, content TEXT, meta JSONB, score REAL)
LANGUAGE plpgsql
AS $func$
BEGIN
    -- Build the statement dynamically because the target table is a parameter.
    -- %I quotes table_name as an identifier, %L quotes query_text as a literal,
    -- and %s interpolates the integer row limit.
    -- Rows matching the PGroonga query operator (&@~) on "content" are returned
    -- with their pgroonga_score, highest score first.
    RETURN QUERY EXECUTE format(
        'SELECT id, content, meta, pgroonga_score(tableoid, ctid)::REAL AS score
FROM %I
WHERE content &@~ %L
ORDER BY score DESC
LIMIT %s',
        table_name, query_text, top_k
    );
END;
$func$;
46+
47+
GRANT EXECUTE ON FUNCTION groonga_search(TEXT, TEXT, INT) TO service_role;
48+
49+
-- Pre-create the test table so PostgREST includes it in its schema cache at startup.
50+
-- Tests use this fixed table and clear data between runs instead of recreating the table.
51+
CREATE TABLE IF NOT EXISTS haystack_groonga_test (
52+
id TEXT PRIMARY KEY,
53+
content TEXT,
54+
meta JSONB,
55+
score REAL
56+
);
57+
58+
CREATE INDEX IF NOT EXISTS pgroonga_haystack_groonga_test_index
59+
ON haystack_groonga_test
60+
USING pgroonga (content);
61+
62+
GRANT ALL ON TABLE haystack_groonga_test TO postgres;
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Minimal reverse proxy so supabase-py (which appends /rest/v1/) reaches PostgREST.
2+
events {}
3+
4+
http {
5+
server {
6+
listen 8000;
7+
8+
location /rest/v1/ {
9+
rewrite ^/rest/v1/(.*)$ /$1 break;
10+
proxy_pass http://postgrest:3000;
11+
proxy_set_header Host $host;
12+
# Strip auth headers — PostgREST has no JWT secret configured,
13+
# so all requests run as the anon role (postgres).
14+
proxy_set_header Authorization "";
15+
proxy_set_header apikey "";
16+
}
17+
}
18+
}

integrations/supabase/pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ classifiers = [
2323
"Programming Language :: Python :: Implementation :: CPython",
2424
"Programming Language :: Python :: Implementation :: PyPy",
2525
]
26-
dependencies = ["haystack-ai>=2.26.1", "pgvector-haystack>=6.3.0", "supabase>=2.9.0"]
26+
dependencies = ["haystack-ai>=2.26.1", "pgvector-haystack>=6.3.0", "supabase>=2.23.0"]
2727

2828
[project.urls]
2929
Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase#readme"
@@ -58,6 +58,7 @@ dependencies = [
5858
"pytest-rerunfailures",
5959
"mypy",
6060
"pip",
61+
"supabase",
6162
]
6263

6364
[tool.hatch.envs.test.scripts]
@@ -153,6 +154,7 @@ show_missing = true
153154
exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]
154155

155156
[tool.pytest.ini_options]
157+
asyncio_mode = "auto"
156158
addopts = "--strict-markers"
157159
markers = [
158160
"integration: integration tests",

integrations/supabase/pytest

Whitespace-only changes.

integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,11 @@
33
# SPDX-License-Identifier: Apache-2.0
44

55
from .embedding_retriever import SupabasePgvectorEmbeddingRetriever
6+
from .groonga_bm25_retriever import SupabaseGroongaBM25Retriever
67
from .keyword_retriever import SupabasePgvectorKeywordRetriever
78

8-
__all__ = ["SupabasePgvectorEmbeddingRetriever", "SupabasePgvectorKeywordRetriever"]
9+
__all__ = [
10+
"SupabaseGroongaBM25Retriever",
11+
"SupabasePgvectorEmbeddingRetriever",
12+
"SupabasePgvectorKeywordRetriever",
13+
]
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import asyncio
import copy
from typing import Any

from haystack import component, default_from_dict, default_to_dict
from haystack.dataclasses import Document
from haystack.document_stores.types import FilterPolicy
from haystack.document_stores.types.filter_policy import apply_filter_policy

from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore
14+
15+
16+
@component
class SupabaseGroongaBM25Retriever:
    """
    Retrieves documents from SupabaseGroongaDocumentStore using PGroonga full-text search.

    This retriever works without embeddings — it searches documents using plain text queries.
    It can be used alongside SupabasePgvectorEmbeddingRetriever in hybrid search pipelines.

    Note: the retrieval itself is synchronous. `run_async()` is provided for use in async
    pipelines; it executes the synchronous `run()` in a worker thread so the event loop is
    not blocked while the query is in flight.

    Example usage:

    ```python
    from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore
    from haystack_integrations.components.retrievers.supabase import SupabaseGroongaBM25Retriever
    from haystack.utils import Secret

    document_store = SupabaseGroongaDocumentStore(
        supabase_url="https://<project>.supabase.co",
        supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"),
        table_name="haystack_fts_documents",
    )
    document_store.warm_up()

    retriever = SupabaseGroongaBM25Retriever(document_store=document_store, top_k=10)
    result = retriever.run(query="python programming")
    print(result["documents"])
    ```
    """

    def __init__(
        self,
        *,
        document_store: SupabaseGroongaDocumentStore,
        filters: dict[str, Any] | None = None,
        top_k: int = 10,
        filter_policy: str | FilterPolicy = FilterPolicy.REPLACE,
    ) -> None:
        """
        Initialize the SupabaseGroongaBM25Retriever.

        :param document_store: An instance of SupabaseGroongaDocumentStore.
        :param filters: Optional filters applied to retrieved Documents.
        :param top_k: Maximum number of Documents to return. Defaults to 10.
        :param filter_policy: Policy to determine how filters are applied. Accepts either a
            FilterPolicy member or its string form.
        :raises ValueError: If document_store is not an instance of SupabaseGroongaDocumentStore.
        """
        if not isinstance(document_store, SupabaseGroongaDocumentStore):
            msg = "document_store must be an instance of SupabaseGroongaDocumentStore"
            raise ValueError(msg)

        self.document_store = document_store
        self.filters = filters or {}
        self.top_k = top_k
        # Normalize to the enum so the rest of the class only deals with FilterPolicy.
        self.filter_policy = (
            filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy)
        )

    @component.output_types(documents=list[Document])
    def run(
        self,
        query: str,
        filters: dict[str, Any] | None = None,
        top_k: int | None = None,
    ) -> dict[str, list[Document]]:
        """
        Runs the retriever on the given query.

        :param query: The text query to search for. An empty query returns no documents
            without contacting the document store.
        :param filters: Optional runtime filters. Merged or replaced based on filter_policy.
        :param top_k: Optional override for maximum number of documents to return.
        :returns: Dictionary with key "documents" containing list of matching Documents.
        """
        if not query:
            return {"documents": []}

        merged_filters = apply_filter_policy(self.filter_policy, self.filters, filters)
        # An explicit runtime top_k wins over the value configured at init time.
        effective_top_k = top_k if top_k is not None else self.top_k

        documents = self.document_store._groonga_retrieval(
            query=query,
            top_k=effective_top_k,
            filters=merged_filters,
        )

        return {"documents": documents}

    @component.output_types(documents=list[Document])
    async def run_async(
        self,
        query: str,
        filters: dict[str, Any] | None = None,
        top_k: int | None = None,
    ) -> dict[str, list[Document]]:
        """
        Async version of run().

        The underlying retrieval is synchronous, so it is executed in a worker thread via
        `asyncio.to_thread`. This keeps the event loop free to serve other tasks while the
        query is in flight, instead of blocking it for the whole round-trip.
        For fully async support, consider using acreate_client() from supabase-py
        and refactoring the document store accordingly.

        :param query: The text query to search for.
        :param filters: Optional runtime filters. Merged or replaced based on filter_policy.
        :param top_k: Optional override for maximum number of documents to return.
        :returns: Dictionary with key "documents" containing list of matching Documents.
        """
        return await asyncio.to_thread(self.run, query=query, filters=filters, top_k=top_k)

    def to_dict(self) -> dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns: Dictionary with serialized data.
        """
        return default_to_dict(
            self,
            filters=self.filters,
            top_k=self.top_k,
            filter_policy=self.filter_policy.value,
            document_store=self.document_store.to_dict(),
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "SupabaseGroongaBM25Retriever":
        """
        Deserializes the component from a dictionary.

        :param data: Dictionary to deserialize from.
        :returns: Deserialized component.
        """
        # Work on a copy so the caller's dictionary is not mutated.
        data = copy.deepcopy(data)
        doc_store_params = data["init_parameters"]["document_store"]
        data["init_parameters"]["document_store"] = SupabaseGroongaDocumentStore.from_dict(doc_store_params)
        if filter_policy := data["init_parameters"].get("filter_policy"):
            data["init_parameters"]["filter_policy"] = FilterPolicy.from_str(filter_policy)
        return default_from_dict(cls, data)

integrations/supabase/src/haystack_integrations/document_stores/supabase/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,9 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44
from .document_store import SupabasePgvectorDocumentStore
5+
from .groonga_document_store import SupabaseGroongaDocumentStore
56

6-
__all__ = ["SupabasePgvectorDocumentStore"]
7+
__all__ = [
8+
"SupabaseGroongaDocumentStore",
9+
"SupabasePgvectorDocumentStore",
10+
]

0 commit comments

Comments
 (0)