Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion server/app/ml/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""ML pipeline orchestration functions."""

import shutil
import time
from pathlib import Path
from typing import Any
Expand All @@ -12,6 +13,7 @@
build_inference_command,
build_polygonize_command,
)
from app.services.cache_service import check_s2_scene_exists, save_to_cache

logger = get_logger(__name__)

Expand All @@ -25,17 +27,39 @@ async def download_images(
) -> dict[str, Any]:
"""Download satellite images using ftw tool. Handles both single and dual window."""
download_start = time.time()
logger.info("Downloading images", context)

# Check cache first (only for dual window scenarios)
if win_b is not None:
cached, cached_path = await check_s2_scene_exists(win_a, win_b, bbox)

if cached and cached_path is not None:
logger.info("Using cached S2 scene", extra=context)
shutil.copy2(cached_path, image_file)
download_time = round((time.time() - download_start) * 1000, 2)
image_size_mb = round(image_file.stat().st_size / (1024 * 1024), 2)

return {
"download_time_ms": download_time,
"image_size_mb": image_size_mb,
"from_cache": True,
}
Comment on lines +31 to +45

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't look like there is a way to cache single windows. We want to cache the individual files, and then call them if they are needed. It should work for models that only need a single window, as well as models with two windows.


# Not cached or single window - download from S3
logger.info("Downloading images from S3", extra=context)
cmd = build_download_command(image_file, win_a, win_b, bbox)
await run_async(cmd)

download_time = round((time.time() - download_start) * 1000, 2)
image_size_mb = round(image_file.stat().st_size / (1024 * 1024), 2)

# Save to cache for next time (only for dual window scenarios)
if win_b is not None:
await save_to_cache(image_file, win_a, win_b, bbox)

return {
"download_time_ms": download_time,
"image_size_mb": image_size_mb,
"from_cache": False,
}


Expand Down
91 changes: 91 additions & 0 deletions server/app/services/cache_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
"""S2 Scene caching service for improved performance."""

import hashlib
import shutil
from pathlib import Path

import aiofiles.os

from app.core.config import get_settings
from app.core.logging import get_logger

logger = get_logger(__name__)


def extract_year_from_window(win_a: str) -> str:
"""Extract year from window string for partitioning."""
try:
first_date = win_a.split("_")[0]
year = first_date.split("-")[0]
# Validate it's actually a year (4 digits)
if len(year) == 4 and year.isdigit():
return year
logger.warning(
f"Could not extract valid year from win_a: {win_a}, using 'unknown'"
)
return "unknown"
except (IndexError, AttributeError):
logger.warning(f"Could not extract year from win_a: {win_a}, using 'unknown'")
return "unknown"


def generate_scene_id(win_a: str, win_b: str, bbox: list[float]) -> str:
"""Generate unique scene ID from parameters."""
scene_params = f"{win_a}|{win_b}|{','.join(map(str, bbox))}"
scene_hash = hashlib.sha256(scene_params.encode()).hexdigest()[:12]
year = extract_year_from_window(win_a)
return f"{year}_{scene_hash}"


def get_cache_dir() -> Path:
"""Get the cache directory path from settings."""
settings = get_settings()
cache_dir = Path(settings.storage.output_dir).parent / "cache" / "scenes"
return cache_dir


def get_scene_cache_path(win_a: str, win_b: str, bbox: list[float]) -> Path:
"""Get the full cache path for a scene, organized by year."""
cache_dir = get_cache_dir()
year = extract_year_from_window(win_a)
scene_id = generate_scene_id(win_a, win_b, bbox)
year_dir = cache_dir / year
cache_path = year_dir / f"scene_{scene_id}.tif"
return cache_path


async def check_s2_scene_exists(
win_a: str,
win_b: str,
bbox: list[float],
) -> tuple[bool, Path | None]:
"""Check if S2 scene is already cached locally."""
cache_path = get_scene_cache_path(win_a, win_b, bbox)

if await aiofiles.os.path.exists(cache_path):
logger.info(f"Cache HIT: Found cached scene at {cache_path}")
return True, cache_path

logger.info(f"Cache MISS: Scene not cached (would be at {cache_path})")
return False, None


async def save_to_cache(
image_file: Path,
win_a: str,
win_b: str,
bbox: list[float],
) -> None:
"""Save downloaded image to cache for future use."""
cache_path = get_scene_cache_path(win_a, win_b, bbox)
cache_path.parent.mkdir(parents=True, exist_ok=True)

try:
shutil.copy2(image_file, cache_path)
size_mb = cache_path.stat().st_size / (1024 * 1024)
logger.info(
f"Saved scene to cache: {cache_path} ({size_mb:.2f} MB)",
extra={"cache_size_mb": size_mb, "cache_path": str(cache_path)},
)
except Exception as e:
logger.error(f"Failed to save scene to cache: {e}", exc_info=True)
156 changes: 156 additions & 0 deletions server/tests/test_cache_integration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
"""Integration tests for S2 scene caching."""

import tempfile
from pathlib import Path

import pytest
from app.services.cache_service import (
check_s2_scene_exists,
save_to_cache,
)


@pytest.fixture
def temp_cache_dir(monkeypatch):
"""Create a temporary cache directory for testing."""
with tempfile.TemporaryDirectory() as tmpdir:
# Mock get_cache_dir to use our temp directory
temp_path = Path(tmpdir) / "cache" / "scenes"
monkeypatch.setattr(
"app.services.cache_service.get_cache_dir", lambda: temp_path
)
yield temp_path


@pytest.fixture
def sample_image_file():
"""Create a sample image file for testing."""
with tempfile.NamedTemporaryFile(suffix=".tif", delete=False) as f:
f.write(b"fake image data for testing")
temp_path = Path(f.name)
yield temp_path
# Cleanup
if temp_path.exists():
temp_path.unlink()


class TestCacheIntegration:
"""Integration tests for cache functionality."""

@pytest.mark.asyncio
async def test_cache_miss_then_hit(self, temp_cache_dir, sample_image_file):
"""Test that cache miss works, then cache hit after saving."""
win_a = "2023-01-01_2023-03-31"
win_b = "2023-04-01_2023-06-30"
bbox = [10.0, 20.0, 10.5, 20.5]

# First check - should be a cache MISS
exists, path = await check_s2_scene_exists(win_a, win_b, bbox)
assert exists is False
assert path is None

# Save to cache
await save_to_cache(sample_image_file, win_a, win_b, bbox)

# Second check - should be a cache HIT
exists, path = await check_s2_scene_exists(win_a, win_b, bbox)
assert exists is True
assert path is not None
assert path.exists()
assert path.suffix == ".tif"

@pytest.mark.asyncio
async def test_cache_organizes_by_year(self, temp_cache_dir, sample_image_file):
"""Test that cache files are organized in year directories."""
win_a = "2023-01-01_2023-03-31"
win_b = "2023-04-01_2023-06-30"
bbox = [10.0, 20.0, 10.5, 20.5]

await save_to_cache(sample_image_file, win_a, win_b, bbox)

# Check that year directory was created
year_dir = temp_cache_dir / "2023"
assert year_dir.exists()
assert year_dir.is_dir()

# Check that file is in the year directory
cached_files = list(year_dir.glob("scene_*.tif"))
assert len(cached_files) == 1

@pytest.mark.asyncio
async def test_multiple_scenes_cached(self, temp_cache_dir, sample_image_file):
"""Test that multiple different scenes can be cached."""
scenes = [
(
"2023-01-01_2023-03-31",
"2023-04-01_2023-06-30",
[10.0, 20.0, 10.5, 20.5],
),
(
"2023-07-01_2023-09-30",
"2023-10-01_2023-12-31",
[11.0, 21.0, 11.5, 21.5],
),
(
"2024-01-01_2024-03-31",
"2024-04-01_2024-06-30",
[12.0, 22.0, 12.5, 22.5],
),
]

# Cache all scenes
for win_a, win_b, bbox in scenes:
await save_to_cache(sample_image_file, win_a, win_b, bbox)

# Verify all can be found
for win_a, win_b, bbox in scenes:
exists, path = await check_s2_scene_exists(win_a, win_b, bbox)
assert exists is True
assert path is not None

@pytest.mark.asyncio
async def test_cached_file_content_preserved(
self, temp_cache_dir, sample_image_file
):
"""Test that cached file content is preserved correctly."""
win_a = "2023-01-01_2023-03-31"
win_b = "2023-04-01_2023-06-30"
bbox = [10.0, 20.0, 10.5, 20.5]

# Read original content
with open(sample_image_file, "rb") as f:
original_content = f.read()

# Save to cache
await save_to_cache(sample_image_file, win_a, win_b, bbox)

# Get cached file
exists, cached_path = await check_s2_scene_exists(win_a, win_b, bbox)
assert exists is True

# Verify content matches
with open(cached_path, "rb") as f:
cached_content = f.read()

assert cached_content == original_content

@pytest.mark.asyncio
async def test_different_bbox_different_cache(
self, temp_cache_dir, sample_image_file
):
"""Test that different bboxes create different cache entries."""
win_a = "2023-01-01_2023-03-31"
win_b = "2023-04-01_2023-06-30"
bbox1 = [10.0, 20.0, 10.5, 20.5]
bbox2 = [10.0, 20.0, 10.6, 20.6] # Slightly different

# Cache first bbox
await save_to_cache(sample_image_file, win_a, win_b, bbox1)

# First bbox should hit
exists1, _ = await check_s2_scene_exists(win_a, win_b, bbox1)
assert exists1 is True

# Second bbox should miss
exists2, _ = await check_s2_scene_exists(win_a, win_b, bbox2)
assert exists2 is False
77 changes: 77 additions & 0 deletions server/tests/test_cache_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""Tests for cache service functionality."""

from pathlib import Path

from app.services.cache_service import (
extract_year_from_window,
generate_scene_id,
get_scene_cache_path,
)


class TestCacheService:
"""Test cache service functions."""

def test_extract_year_from_window(self):
"""Test year extraction from window string."""
assert extract_year_from_window("2023-01-01_2023-03-31") == "2023"
assert extract_year_from_window("2024-06-01_2024-08-31") == "2024"
assert extract_year_from_window("2022-12-01_2023-02-28") == "2022"

def test_extract_year_from_invalid_window(self):
"""Test year extraction with invalid input."""
assert extract_year_from_window("invalid") == "unknown"
assert extract_year_from_window("") == "unknown"

def test_generate_scene_id(self):
"""Test scene ID generation."""
win_a = "2023-01-01_2023-03-31"
win_b = "2023-04-01_2023-06-30"
bbox = [10.0, 20.0, 10.5, 20.5]

scene_id = generate_scene_id(win_a, win_b, bbox)

# Should start with year
assert scene_id.startswith("2023_")
# Should have a hash component
assert len(scene_id) > 5

def test_generate_scene_id_deterministic(self):
"""Test that same inputs generate same scene ID."""
win_a = "2023-01-01_2023-03-31"
win_b = "2023-04-01_2023-06-30"
bbox = [10.0, 20.0, 10.5, 20.5]

scene_id1 = generate_scene_id(win_a, win_b, bbox)
scene_id2 = generate_scene_id(win_a, win_b, bbox)

assert scene_id1 == scene_id2

def test_generate_scene_id_different_inputs(self):
"""Test that different inputs generate different scene IDs."""
win_a = "2023-01-01_2023-03-31"
win_b1 = "2023-04-01_2023-06-30"
win_b2 = "2023-07-01_2023-09-30"
bbox = [10.0, 20.0, 10.5, 20.5]

scene_id1 = generate_scene_id(win_a, win_b1, bbox)
scene_id2 = generate_scene_id(win_a, win_b2, bbox)

assert scene_id1 != scene_id2

def test_get_scene_cache_path(self):
"""Test cache path generation."""
win_a = "2023-01-01_2023-03-31"
win_b = "2023-04-01_2023-06-30"
bbox = [10.0, 20.0, 10.5, 20.5]

cache_path = get_scene_cache_path(win_a, win_b, bbox)

# Should be a Path object
assert isinstance(cache_path, Path)
# Should contain year directory
assert "2023" in str(cache_path)
# Should be a .tif file
assert cache_path.suffix == ".tif"
# Should contain "scene_" prefix
assert "scene_" in cache_path.name