diff --git a/server/app/ml/pipeline.py b/server/app/ml/pipeline.py index d4464eb..8dd9640 100644 --- a/server/app/ml/pipeline.py +++ b/server/app/ml/pipeline.py @@ -1,5 +1,6 @@ """ML pipeline orchestration functions.""" +import shutil import time from pathlib import Path from typing import Any @@ -12,6 +13,7 @@ build_inference_command, build_polygonize_command, ) +from app.services.cache_service import check_s2_scene_exists, save_to_cache logger = get_logger(__name__) @@ -25,17 +27,39 @@ async def download_images( ) -> dict[str, Any]: """Download satellite images using ftw tool. Handles both single and dual window.""" download_start = time.time() - logger.info("Downloading images", context) + # Check cache first (only for dual window scenarios) + if win_b is not None: + cached, cached_path = await check_s2_scene_exists(win_a, win_b, bbox) + + if cached and cached_path is not None: + logger.info("Using cached S2 scene", extra=context) + shutil.copy2(cached_path, image_file) + download_time = round((time.time() - download_start) * 1000, 2) + image_size_mb = round(image_file.stat().st_size / (1024 * 1024), 2) + + return { + "download_time_ms": download_time, + "image_size_mb": image_size_mb, + "from_cache": True, + } + + # Not cached or single window - download from S3 + logger.info("Downloading images from S3", extra=context) cmd = build_download_command(image_file, win_a, win_b, bbox) await run_async(cmd) download_time = round((time.time() - download_start) * 1000, 2) image_size_mb = round(image_file.stat().st_size / (1024 * 1024), 2) + # Save to cache for next time (only for dual window scenarios) + if win_b is not None: + await save_to_cache(image_file, win_a, win_b, bbox) + return { "download_time_ms": download_time, "image_size_mb": image_size_mb, + "from_cache": False, } diff --git a/server/app/services/cache_service.py b/server/app/services/cache_service.py new file mode 100644 index 0000000..3d86c60 --- /dev/null +++ b/server/app/services/cache_service.py @@ -0,0 +1,91 @@ +"""S2 Scene caching service for improved performance.""" + +import hashlib +import shutil +from pathlib import Path + +import aiofiles.os + +from app.core.config import get_settings +from app.core.logging import get_logger + +logger = get_logger(__name__) + + +def extract_year_from_window(win_a: str) -> str: + """Extract year from window string for partitioning.""" + try: + first_date = win_a.split("_")[0] + year = first_date.split("-")[0] + # Validate it's actually a year (4 digits) + if len(year) == 4 and year.isdigit(): + return year + logger.warning( + f"Could not extract valid year from win_a: {win_a}, using 'unknown'" + ) + return "unknown" + except (IndexError, AttributeError): + logger.warning(f"Could not extract year from win_a: {win_a}, using 'unknown'") + return "unknown" + + +def generate_scene_id(win_a: str, win_b: str, bbox: list[float]) -> str: + """Generate unique scene ID from parameters.""" + scene_params = f"{win_a}|{win_b}|{','.join(map(str, bbox))}" + scene_hash = hashlib.sha256(scene_params.encode()).hexdigest()[:12] + year = extract_year_from_window(win_a) + return f"{year}_{scene_hash}" + + +def get_cache_dir() -> Path: + """Get the cache directory path from settings.""" + settings = get_settings() + cache_dir = Path(settings.storage.output_dir).parent / "cache" / "scenes" + return cache_dir + + +def get_scene_cache_path(win_a: str, win_b: str, bbox: list[float]) -> Path: + """Get the full cache path for a scene, organized by year.""" + cache_dir = get_cache_dir() + year = extract_year_from_window(win_a) + scene_id = generate_scene_id(win_a, win_b, bbox) + year_dir = cache_dir / year + cache_path = year_dir / f"scene_{scene_id}.tif" + return cache_path + + +async def check_s2_scene_exists( + win_a: str, + win_b: str, + bbox: list[float], +) -> tuple[bool, Path | None]: + """Check if S2 scene is already cached locally.""" + cache_path = get_scene_cache_path(win_a, win_b, bbox) + + if await aiofiles.os.path.exists(cache_path): + logger.info(f"Cache HIT: Found cached scene at {cache_path}") + return True, cache_path + + logger.info(f"Cache MISS: Scene not cached (would be at {cache_path})") + return False, None + + +async def save_to_cache( + image_file: Path, + win_a: str, + win_b: str, + bbox: list[float], +) -> None: + """Save downloaded image to cache for future use.""" + cache_path = get_scene_cache_path(win_a, win_b, bbox) + cache_path.parent.mkdir(parents=True, exist_ok=True) + + try: + shutil.copy2(image_file, cache_path) + size_mb = cache_path.stat().st_size / (1024 * 1024) + logger.info( + f"Saved scene to cache: {cache_path} ({size_mb:.2f} MB)", + extra={"cache_size_mb": size_mb, "cache_path": str(cache_path)}, + ) + except Exception as e: + logger.error(f"Failed to save scene to cache: {e}", exc_info=True) diff --git a/server/tests/test_cache_integration.py b/server/tests/test_cache_integration.py new file mode 100644 index 0000000..9e6e4b8 --- /dev/null +++ b/server/tests/test_cache_integration.py @@ -0,0 +1,156 @@ +"""Integration tests for S2 scene caching.""" + +import tempfile +from pathlib import Path + +import pytest +from app.services.cache_service import ( + check_s2_scene_exists, + save_to_cache, +) + + +@pytest.fixture +def temp_cache_dir(monkeypatch): + """Create a temporary cache directory for testing.""" + with tempfile.TemporaryDirectory() as tmpdir: + # Mock get_cache_dir to use our temp directory + temp_path = Path(tmpdir) / "cache" / "scenes" + monkeypatch.setattr( + "app.services.cache_service.get_cache_dir", lambda: temp_path + ) + yield temp_path + + +@pytest.fixture +def sample_image_file(): + """Create a sample image file for testing.""" + with tempfile.NamedTemporaryFile(suffix=".tif", delete=False) as f: + f.write(b"fake image data for testing") + temp_path = Path(f.name) + yield temp_path + # Cleanup + if temp_path.exists(): + temp_path.unlink() + + +class TestCacheIntegration: + """Integration tests for cache functionality.""" + + @pytest.mark.asyncio + async def test_cache_miss_then_hit(self, temp_cache_dir, sample_image_file): + """Test that cache miss works, then cache hit after saving.""" + win_a = "2023-01-01_2023-03-31" + win_b = "2023-04-01_2023-06-30" + bbox = [10.0, 20.0, 10.5, 20.5] + + # First check - should be a cache MISS + exists, path = await check_s2_scene_exists(win_a, win_b, bbox) + assert exists is False + assert path is None + + # Save to cache + await save_to_cache(sample_image_file, win_a, win_b, bbox) + + # Second check - should be a cache HIT + exists, path = await check_s2_scene_exists(win_a, win_b, bbox) + assert exists is True + assert path is not None + assert path.exists() + assert path.suffix == ".tif" + + @pytest.mark.asyncio + async def test_cache_organizes_by_year(self, temp_cache_dir, sample_image_file): + """Test that cache files are organized in year directories.""" + win_a = "2023-01-01_2023-03-31" + win_b = "2023-04-01_2023-06-30" + bbox = [10.0, 20.0, 10.5, 20.5] + + await save_to_cache(sample_image_file, win_a, win_b, bbox) + + # Check that year directory was created + year_dir = temp_cache_dir / "2023" + assert year_dir.exists() + assert year_dir.is_dir() + + # Check that file is in the year directory + cached_files = list(year_dir.glob("scene_*.tif")) + assert len(cached_files) == 1 + + @pytest.mark.asyncio + async def test_multiple_scenes_cached(self, temp_cache_dir, sample_image_file): + """Test that multiple different scenes can be cached.""" + scenes = [ + ( + "2023-01-01_2023-03-31", + "2023-04-01_2023-06-30", + [10.0, 20.0, 10.5, 20.5], + ), + ( + "2023-07-01_2023-09-30", + "2023-10-01_2023-12-31", + [11.0, 21.0, 11.5, 21.5], + ), + ( + "2024-01-01_2024-03-31", + "2024-04-01_2024-06-30", + [12.0, 22.0, 12.5, 22.5], + ), + ] + + # Cache all scenes + for win_a, win_b, bbox in scenes: + await save_to_cache(sample_image_file, win_a, win_b, bbox) + + # Verify all can be found + for win_a, win_b, bbox in scenes: + exists, path = await check_s2_scene_exists(win_a, win_b, bbox) + assert exists is True + assert path is not None + + @pytest.mark.asyncio + async def test_cached_file_content_preserved( + self, temp_cache_dir, sample_image_file + ): + """Test that cached file content is preserved correctly.""" + win_a = "2023-01-01_2023-03-31" + win_b = "2023-04-01_2023-06-30" + bbox = [10.0, 20.0, 10.5, 20.5] + + # Read original content + with open(sample_image_file, "rb") as f: + original_content = f.read() + + # Save to cache + await save_to_cache(sample_image_file, win_a, win_b, bbox) + + # Get cached file + exists, cached_path = await check_s2_scene_exists(win_a, win_b, bbox) + assert exists is True + + # Verify content matches + with open(cached_path, "rb") as f: + cached_content = f.read() + + assert cached_content == original_content + + @pytest.mark.asyncio + async def test_different_bbox_different_cache( + self, temp_cache_dir, sample_image_file + ): + """Test that different bboxes create different cache entries.""" + win_a = "2023-01-01_2023-03-31" + win_b = "2023-04-01_2023-06-30" + bbox1 = [10.0, 20.0, 10.5, 20.5] + bbox2 = [10.0, 20.0, 10.6, 20.6] # Slightly different + + # Cache first bbox + await save_to_cache(sample_image_file, win_a, win_b, bbox1) + + # First bbox should hit + exists1, _ = await check_s2_scene_exists(win_a, win_b, bbox1) + assert exists1 is True + + # Second bbox should miss + exists2, _ = await check_s2_scene_exists(win_a, win_b, bbox2) + assert exists2 is False diff --git a/server/tests/test_cache_service.py b/server/tests/test_cache_service.py new file mode 100644 index 0000000..305171e --- /dev/null +++ b/server/tests/test_cache_service.py @@ -0,0 +1,77 @@ +"""Tests for cache service functionality.""" + +from pathlib import Path + +from app.services.cache_service import ( + extract_year_from_window, + generate_scene_id, + get_scene_cache_path, +) + + +class TestCacheService: + """Test cache service functions.""" + + def test_extract_year_from_window(self): + """Test year extraction from window string.""" + assert extract_year_from_window("2023-01-01_2023-03-31") == "2023" + assert extract_year_from_window("2024-06-01_2024-08-31") == "2024" + assert extract_year_from_window("2022-12-01_2023-02-28") == "2022" + + def test_extract_year_from_invalid_window(self): + """Test year extraction with invalid input.""" + assert extract_year_from_window("invalid") == "unknown" + assert extract_year_from_window("") == "unknown" + + def test_generate_scene_id(self): + """Test scene ID generation.""" + win_a = "2023-01-01_2023-03-31" + win_b = "2023-04-01_2023-06-30" + bbox = [10.0, 20.0, 10.5, 20.5] + + scene_id = generate_scene_id(win_a, win_b, bbox) + + # Should start with year + assert scene_id.startswith("2023_") + # Should have a hash component + assert len(scene_id) > 5 + + def test_generate_scene_id_deterministic(self): + """Test that same inputs generate same scene ID.""" + win_a = "2023-01-01_2023-03-31" + win_b = "2023-04-01_2023-06-30" + bbox = [10.0, 20.0, 10.5, 20.5] + + scene_id1 = generate_scene_id(win_a, win_b, bbox) + scene_id2 = generate_scene_id(win_a, win_b, bbox) + + assert scene_id1 == scene_id2 + + def test_generate_scene_id_different_inputs(self): + """Test that different inputs generate different scene IDs.""" + win_a = "2023-01-01_2023-03-31" + win_b1 = "2023-04-01_2023-06-30" + win_b2 = "2023-07-01_2023-09-30" + bbox = [10.0, 20.0, 10.5, 20.5] + + scene_id1 = generate_scene_id(win_a, win_b1, bbox) + scene_id2 = generate_scene_id(win_a, win_b2, bbox) + + assert scene_id1 != scene_id2 + + def test_get_scene_cache_path(self): + """Test cache path generation.""" + win_a = "2023-01-01_2023-03-31" + win_b = "2023-04-01_2023-06-30" + bbox = [10.0, 20.0, 10.5, 20.5] + + cache_path = get_scene_cache_path(win_a, win_b, bbox) + + # Should be a Path object + assert isinstance(cache_path, Path) + # Should contain year directory + assert "2023" in str(cache_path) + # Should be a .tif file + assert cache_path.suffix == ".tif" + # Should contain "scene_" prefix + assert "scene_" in cache_path.name