Skip to content

Commit ac55936

Browse files
authored
Merge pull request #91 from GenerateNU/71-storage-research
71 storage research
2 parents 3ef67d5 + a488564 commit ac55936

File tree

9 files changed

+253
-11
lines changed

9 files changed

+253
-11
lines changed

.env.example

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ DB_PASSWORD=postgres
3434
SUPABASE_URL=
3535
SUPABASE_SERVICE_ROLE_KEY=
3636

37-
# ── Webhooks (optional) ─────────────────────
38-
WEBHOOK_BASE_URL=http://localhost:8000
39-
WEBHOOK_SECRET=
37+
ENABLE_BACKEND_ACCESS_CONTROL=false
38+
39+
# Cloudflare
40+
CLOUDFLARE_R2_ENDPOINT=
41+
CLOUDFLARE_R2_ACCESS_KEY_ID=
42+
CLOUDFLARE_R2_SECRET_KEY=
43+
CLOUDFLARE_R2_BUCKET_NAME=

backend/app/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
from app.services.supabase_check import wait_for_supabase # noqa: E402
2828

2929
from app.api import api_router # noqa: E402
30-
from app.cognee_config import setup_cognee
30+
from app.cognee_config import setup_cognee # noqa: E402
3131

3232

3333
@asynccontextmanager

backend/app/routes/documents.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,17 @@
22
Document routes for Cognee-powered document upload and search.
33
"""
44

5+
import os
56
import shutil
67
import uuid
78
from pathlib import Path
8-
from typing import Optional
99

10-
from fastapi import APIRouter, HTTPException, UploadFile, File, Query
10+
from backend.app.services.ingest import ingest_document, search_knowledge_graph
11+
from backend.app.services.storage import (
12+
download_file_cloudflare,
13+
upload_file_cloudflare,
14+
)
15+
from fastapi import APIRouter, BackgroundTasks, File, HTTPException, Query, Response, UploadFile
1116
from pydantic import BaseModel
1217

1318
#from app.services.ingest import ingest_document, search_knowledge_graph
@@ -83,6 +88,8 @@ async def upload_document(
8388
suffix = Path(file.filename).suffix if file.filename else ".bin"
8489
temp_path = UPLOAD_DIR / f"{document_id}{suffix}"
8590

91+
upload_file_cloudflare(temp_path, bucket=os.getenv("CLOUDFLARE_R2_BUCKET_NAME"), key=f"{dataset_name}/{document_id}{suffix}")
92+
8693
try:
8794
# Save uploaded file to disk
8895
with temp_path.open("wb") as f:
@@ -102,7 +109,7 @@ async def upload_document(
102109
)
103110

104111
except Exception as e:
105-
raise HTTPException(status_code=500, detail=f"Upload failed: {e}")
112+
raise HTTPException(status_code=500, detail=f"Upload failed: {e}") from e
106113

107114
finally:
108115
# Clean up temp file — never leave orphans
@@ -135,4 +142,13 @@ async def search_documents(
135142
)
136143

137144
except Exception as e:
138-
raise HTTPException(status_code=500, detail=f"Search failed: {e}")
145+
raise HTTPException(status_code=500, detail=f"Search failed: {e}") from e
146+
147+
@router.get("/{document_id}", response_class=Response)
async def get_document(document_id: str, dataset: str) -> Response:
    """
    Download a document by ID.

    Args:
        document_id: The UUID assigned to the document at upload time.
        dataset: The dataset name used as the object-key prefix in R2.

    Returns:
        The raw file bytes as an ``application/octet-stream`` response.
        (A plain ``bytes`` return with ``response_model=bytes`` would be run
        through FastAPI's JSON encoder, which ``.decode()``s it and crashes
        on non-UTF-8 binary files — hence the explicit ``Response``.)

    Raises:
        HTTPException: 500 if the object cannot be fetched from storage.
    """
    # NOTE(review): uploads store objects under
    # "{dataset}/{document_id}{suffix}", but this key carries no suffix —
    # confirm the stored key format before relying on this route.
    key = f"{dataset}/{document_id}"
    try:
        file_bytes = await download_file_cloudflare(
            bucket=os.getenv("CLOUDFLARE_R2_BUCKET_NAME"), key=key
        )
    except Exception as e:
        # Mirror the error style of the other routes in this module.
        raise HTTPException(status_code=500, detail=f"Download failed: {e}") from e
    return Response(content=file_bytes, media_type="application/octet-stream")

backend/app/services/storage.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
"""
Storage service for handling file uploads and downloads.

Thin async wrappers around Cloudflare R2 (via the S3-compatible boto3
client) and Supabase Storage.

NOTE(review): the wrapped SDK calls are synchronous; despite the
``async def`` signatures they block the event loop while running.
Consider ``asyncio.to_thread`` if these sit on a hot request path.
"""
import os

import boto3
from supabase import create_client

# Cloudflare R2 exposes an S3-compatible API, so a plain boto3 S3 client works.
# Credentials: accept the short R2_* names used by the test fixtures, falling
# back to the CLOUDFLARE_R2_* names declared in .env.example (previously only
# R2_* was read, so values set via .env.example were silently ignored).
s3 = boto3.client(
    "s3",
    endpoint_url=os.getenv("CLOUDFLARE_R2_ENDPOINT"),
    aws_access_key_id=os.getenv("R2_ACCESS_KEY") or os.getenv("CLOUDFLARE_R2_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("R2_SECRET_KEY") or os.getenv("CLOUDFLARE_R2_SECRET_KEY"),
    region_name="auto",  # R2 ignores the region, but boto3 requires one
)


async def upload_file_cloudflare(file_path: str, bucket: str, key: str) -> str:
    """Upload a local file to Cloudflare R2 and return its ``s3://`` URI."""
    s3.upload_file(file_path, bucket, key)
    return f"s3://{bucket}/{key}"


async def download_file_cloudflare(bucket: str, key: str) -> bytes:
    """Download an object from Cloudflare R2 and return its raw bytes."""
    response = s3.get_object(Bucket=bucket, Key=key)
    return response["Body"].read()


# Supabase client (import moved to the top of the module; the previous
# mid-file import needed a noqa and served no purpose). Prefer SUPABASE_KEY
# as before, falling back to the SUPABASE_SERVICE_ROLE_KEY name that
# .env.example actually declares.
supabase = create_client(
    os.getenv("SUPABASE_URL"),
    os.getenv("SUPABASE_KEY") or os.getenv("SUPABASE_SERVICE_ROLE_KEY"),
)


async def upload_file_supabase(file_path: str, bucket: str, key: str) -> str:
    """Upload a local file to Supabase Storage and return ``"<bucket>/<key>"``."""
    with open(file_path, "rb") as f:
        supabase.storage.from_(bucket).upload(
            path=key,
            file=f,
            file_options={"content-type": "application/octet-stream"},
        )
    return f"{bucket}/{key}"


async def download_file_supabase(bucket: str, key: str) -> bytes:
    """Download an object from Supabase Storage and return its raw bytes."""
    return supabase.storage.from_(bucket).download(key)

backend/tests/conftest.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
"""
Pytest configuration and shared fixtures.

Seeds fake credentials into the environment at import time so that
storage.py's module-level boto3.client() and supabase.create_client()
calls can run during test collection.
"""
import os

_FAKE_ENV = {
    "CLOUDFLARE_R2_ENDPOINT": "https://fake.r2.cloudflarestorage.com",
    "R2_ACCESS_KEY": "fake-access-key",
    "R2_SECRET_KEY": "fake-secret-key",
    "SUPABASE_URL": "https://fake.supabase.co",
    "SUPABASE_KEY": "fake-supabase-key",
}

# setdefault: never clobber values supplied by the real environment / CI.
for _name, _value in _FAKE_ENV.items():
    os.environ.setdefault(_name, _value)

backend/tests/test_cognee.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22

33
load_dotenv(override=True)
44

5-
import asyncio
5+
import asyncio # noqa: E402
66

7-
import cognee
8-
from cognee.api.v1.search import SearchType
7+
import cognee # noqa: E402
8+
from cognee.api.v1.search import SearchType # noqa: E402
99

1010

1111
async def setup_cognee():

backend/tests/test_storage.py

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
"""
2+
Tests for storage service.
3+
"""
4+
from unittest.mock import ANY, MagicMock, mock_open, patch
5+
6+
import pytest
7+
8+
from app.services.storage import (
9+
download_file_cloudflare,
10+
download_file_supabase,
11+
upload_file_cloudflare,
12+
upload_file_supabase,
13+
)
14+
15+
# ── Cloudflare R2 Tests ────────────────────────────────────────────────────────
16+
17+
class TestUploadFileCloudflare:
    """Unit tests for upload_file_cloudflare with the module's boto3 client mocked."""

    @pytest.mark.asyncio
    @patch("app.services.storage.s3")
    async def test_upload_returns_s3_uri(self, mock_s3):
        # The wrapper should return an s3:// URI built from bucket + key.
        mock_s3.upload_file.return_value = None
        result = await upload_file_cloudflare("local/file.txt", "my-bucket", "folder/file.txt")

        assert result == "s3://my-bucket/folder/file.txt"

    @pytest.mark.asyncio
    @patch("app.services.storage.s3")
    async def test_upload_calls_s3_with_correct_args(self, mock_s3):
        # Arguments must be forwarded positionally: (file_path, bucket, key).
        mock_s3.upload_file.return_value = None

        await upload_file_cloudflare("local/file.txt", "my-bucket", "folder/file.txt")

        mock_s3.upload_file.assert_called_once_with("local/file.txt", "my-bucket", "folder/file.txt")

    @pytest.mark.asyncio
    @patch("app.services.storage.s3")
    async def test_upload_propagates_s3_exception(self, mock_s3):
        # The wrapper adds no error handling, so SDK errors surface unchanged.
        mock_s3.upload_file.side_effect = Exception("S3 upload failed")

        with pytest.raises(Exception, match="S3 upload failed"):
            await upload_file_cloudflare("local/file.txt", "my-bucket", "folder/file.txt")
42+
43+
44+
class TestDownloadFileCloudflare:
    """Unit tests for download_file_cloudflare with the module's boto3 client mocked."""

    @pytest.mark.asyncio
    @patch("app.services.storage.s3")
    async def test_download_returns_bytes(self, mock_s3):
        # get_object returns {"Body": <stream>}; the wrapper must read() it fully.
        mock_body = MagicMock()
        mock_body.read.return_value = b"file content"
        mock_s3.get_object.return_value = {"Body": mock_body}

        result = await download_file_cloudflare("my-bucket", "folder/file.txt")

        assert result == b"file content"

    @pytest.mark.asyncio
    @patch("app.services.storage.s3")
    async def test_download_calls_get_object_with_correct_args(self, mock_s3):
        # boto3 get_object takes keyword args Bucket= and Key=.
        mock_body = MagicMock()
        mock_body.read.return_value = b""
        mock_s3.get_object.return_value = {"Body": mock_body}

        await download_file_cloudflare("my-bucket", "folder/file.txt")

        mock_s3.get_object.assert_called_once_with(Bucket="my-bucket", Key="folder/file.txt")

    @pytest.mark.asyncio
    @patch("app.services.storage.s3")
    async def test_download_propagates_s3_exception(self, mock_s3):
        # SDK errors (e.g. missing key) surface unchanged to the caller.
        mock_s3.get_object.side_effect = Exception("Key not found")

        with pytest.raises(Exception, match="Key not found"):
            await download_file_cloudflare("my-bucket", "folder/file.txt")
await download_file_cloudflare("my-bucket", "folder/file.txt")
74+
75+
76+
# ── Supabase Tests ─────────────────────────────────────────────────────────────
77+
78+
class TestUploadFileSupabase:
    """Unit tests for upload_file_supabase with open() and the supabase client mocked."""

    @pytest.mark.asyncio
    @patch("builtins.open", mock_open(read_data=b"file content"))
    @patch("app.services.storage.supabase")
    async def test_upload_returns_bucket_key_path(self, mock_supabase):
        mock_supabase.storage.from_().upload.return_value = None

        result = await upload_file_supabase("local/file.txt", "my-bucket", "folder/file.txt")

        assert result == "my-bucket/folder/file.txt"

    @pytest.mark.asyncio
    @patch("builtins.open", mock_open(read_data=b"file content"))
    @patch("app.services.storage.supabase")
    async def test_upload_calls_storage_with_correct_args(self, mock_supabase):
        # Bind from_'s return value explicitly so both the bucket selection and
        # the upload kwargs can be asserted. NB: calling from_() in the arrange
        # phase would already count toward assert_called_once_with below.
        mock_storage = MagicMock()
        mock_supabase.storage.from_.return_value = mock_storage

        await upload_file_supabase("local/file.txt", "my-bucket", "folder/file.txt")

        mock_supabase.storage.from_.assert_called_once_with("my-bucket")
        mock_storage.upload.assert_called_once_with(
            path="folder/file.txt",
            file=ANY,  # the open file handle; identity is not meaningful here
            file_options={"content-type": "application/octet-stream"},
        )

    @pytest.mark.asyncio
    @patch("builtins.open", mock_open(read_data=b"file content"))
    @patch("app.services.storage.supabase")
    async def test_upload_propagates_storage_exception(self, mock_supabase):
        # The wrapper adds no error handling, so storage errors surface unchanged.
        mock_supabase.storage.from_().upload.side_effect = Exception("Upload failed")

        with pytest.raises(Exception, match="Upload failed"):
            await upload_file_supabase("local/file.txt", "my-bucket", "folder/file.txt")
await upload_file_supabase("local/file.txt", "my-bucket", "folder/file.txt")
113+
114+
115+
class TestDownloadFileSupabase:
    """Unit tests for download_file_supabase with the supabase client mocked."""

    @pytest.mark.asyncio
    @patch("app.services.storage.supabase")
    async def test_download_returns_bytes(self, mock_supabase):
        # supabase download() returns the raw bytes directly (no stream wrapper).
        mock_supabase.storage.from_().download.return_value = b"file content"

        result = await download_file_supabase("my-bucket", "folder/file.txt")

        assert result == b"file content"

    @pytest.mark.asyncio
    @patch("app.services.storage.supabase")
    async def test_download_calls_storage_with_correct_args(self, mock_supabase):
        # Bind from_'s return value explicitly so the bucket selection and the
        # download key can both be asserted with exact call counts.
        mock_storage = MagicMock()
        mock_storage.download.return_value = b""
        mock_supabase.storage.from_.return_value = mock_storage

        await download_file_supabase("my-bucket", "folder/file.txt")

        mock_supabase.storage.from_.assert_called_once_with("my-bucket")
        mock_storage.download.assert_called_once_with("folder/file.txt")

    @pytest.mark.asyncio
    @patch("app.services.storage.supabase")
    async def test_download_propagates_storage_exception(self, mock_supabase):
        # Storage errors (e.g. missing object) surface unchanged to the caller.
        mock_supabase.storage.from_().download.side_effect = Exception("File not found")

        with pytest.raises(Exception, match="File not found"):
            await download_file_supabase("my-bucket", "folder/file.txt")

docs/object-storage-research.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
2+
Supabase Storage:
3+
- Limited free tier
4+
- Can work with s3 but requires a translation layer
5+
- Probably the easiest to set up since it's already in the stack
6+
7+
Cloudflare R2:
8+
- 10 GB free
9+
- Can use boto3 -> compatible with S3
10+
- better for lots of downloads + if you want predictable costs if we were to pay for it
11+
12+
Backblaze B2:
13+
- 10 GB free
14+
- Can use boto3 -> compatible with S3
15+
- Better for cheap large storage with infrequent downloads
16+
17+
MinIO:
18+
- Can store as much data as the hardware allows (self-hosted)
19+
- Can use boto3 -> compatible with S3
20+
- Probably the most difficult to set up
21+
22+
Supabase Storage is the best choice for simplest setup but has limited storage in the free tier. Cloudflare R2 is the better option for more free storage and easier S3 compatibility.

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[tool.pytest.ini_options]
2+
pythonpath = ["backend"]
3+
testpaths = ["backend/tests"]

0 commit comments

Comments
 (0)