-
Notifications
You must be signed in to change notification settings - Fork 5
Add S2 scene caching for faster downloads #61
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft
avis-giri
wants to merge
3
commits into
main
Choose a base branch
from
feature/s2-scene-caching
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Draft
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,91 @@ | ||
| """S2 Scene caching service for improved performance.""" | ||
|
|
||
| import hashlib | ||
| import shutil | ||
| from pathlib import Path | ||
|
|
||
| import aiofiles.os | ||
|
|
||
| from app.core.config import get_settings | ||
| from app.core.logging import get_logger | ||
|
|
||
| logger = get_logger(__name__) | ||
|
|
||
|
|
||
| def extract_year_from_window(win_a: str) -> str: | ||
| """Extract year from window string for partitioning.""" | ||
| try: | ||
| first_date = win_a.split("_")[0] | ||
| year = first_date.split("-")[0] | ||
| # Validate it's actually a year (4 digits) | ||
| if len(year) == 4 and year.isdigit(): | ||
| return year | ||
| logger.warning( | ||
| f"Could not extract valid year from win_a: {win_a}, using 'unknown'" | ||
| ) | ||
| return "unknown" | ||
| except (IndexError, AttributeError): | ||
| logger.warning(f"Could not extract year from win_a: {win_a}, using 'unknown'") | ||
| return "unknown" | ||
|
|
||
|
|
||
| def generate_scene_id(win_a: str, win_b: str, bbox: list[float]) -> str: | ||
| """Generate unique scene ID from parameters.""" | ||
| scene_params = f"{win_a}|{win_b}|{','.join(map(str, bbox))}" | ||
| scene_hash = hashlib.sha256(scene_params.encode()).hexdigest()[:12] | ||
| year = extract_year_from_window(win_a) | ||
| return f"{year}_{scene_hash}" | ||
|
|
||
|
|
||
| def get_cache_dir() -> Path: | ||
| """Get the cache directory path from settings.""" | ||
| settings = get_settings() | ||
| cache_dir = Path(settings.storage.output_dir).parent / "cache" / "scenes" | ||
| return cache_dir | ||
|
|
||
|
|
||
| def get_scene_cache_path(win_a: str, win_b: str, bbox: list[float]) -> Path: | ||
| """Get the full cache path for a scene, organized by year.""" | ||
| cache_dir = get_cache_dir() | ||
| year = extract_year_from_window(win_a) | ||
| scene_id = generate_scene_id(win_a, win_b, bbox) | ||
| year_dir = cache_dir / year | ||
| cache_path = year_dir / f"scene_{scene_id}.tif" | ||
| return cache_path | ||
|
|
||
|
|
||
| async def check_s2_scene_exists( | ||
| win_a: str, | ||
| win_b: str, | ||
| bbox: list[float], | ||
| ) -> tuple[bool, Path | None]: | ||
| """Check if S2 scene is already cached locally.""" | ||
| cache_path = get_scene_cache_path(win_a, win_b, bbox) | ||
|
|
||
| if await aiofiles.os.path.exists(cache_path): | ||
| logger.info(f"Cache HIT: Found cached scene at {cache_path}") | ||
| return True, cache_path | ||
|
|
||
| logger.info(f"Cache MISS: Scene not cached (would be at {cache_path})") | ||
| return False, None | ||
|
|
||
|
|
||
| async def save_to_cache( | ||
| image_file: Path, | ||
| win_a: str, | ||
| win_b: str, | ||
| bbox: list[float], | ||
| ) -> None: | ||
| """Save downloaded image to cache for future use.""" | ||
| cache_path = get_scene_cache_path(win_a, win_b, bbox) | ||
| cache_path.parent.mkdir(parents=True, exist_ok=True) | ||
|
|
||
| try: | ||
| shutil.copy2(image_file, cache_path) | ||
| size_mb = cache_path.stat().st_size / (1024 * 1024) | ||
| logger.info( | ||
| f"Saved scene to cache: {cache_path} ({size_mb:.2f} MB)", | ||
| extra={"cache_size_mb": size_mb, "cache_path": str(cache_path)}, | ||
| ) | ||
| except Exception as e: | ||
| logger.error(f"Failed to save scene to cache: {e}", exc_info=True) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,156 @@ | ||
| """Integration tests for S2 scene caching.""" | ||
|
|
||
| import tempfile | ||
| from pathlib import Path | ||
|
|
||
| import pytest | ||
| from app.services.cache_service import ( | ||
| check_s2_scene_exists, | ||
| save_to_cache, | ||
| ) | ||
|
|
||
|
|
||
| @pytest.fixture | ||
| def temp_cache_dir(monkeypatch): | ||
| """Create a temporary cache directory for testing.""" | ||
| with tempfile.TemporaryDirectory() as tmpdir: | ||
| # Mock get_cache_dir to use our temp directory | ||
| temp_path = Path(tmpdir) / "cache" / "scenes" | ||
| monkeypatch.setattr( | ||
| "app.services.cache_service.get_cache_dir", lambda: temp_path | ||
| ) | ||
| yield temp_path | ||
|
|
||
|
|
||
| @pytest.fixture | ||
| def sample_image_file(): | ||
| """Create a sample image file for testing.""" | ||
| with tempfile.NamedTemporaryFile(suffix=".tif", delete=False) as f: | ||
| f.write(b"fake image data for testing") | ||
| temp_path = Path(f.name) | ||
| yield temp_path | ||
| # Cleanup | ||
| if temp_path.exists(): | ||
| temp_path.unlink() | ||
|
|
||
|
|
||
| class TestCacheIntegration: | ||
| """Integration tests for cache functionality.""" | ||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_cache_miss_then_hit(self, temp_cache_dir, sample_image_file): | ||
| """Test that cache miss works, then cache hit after saving.""" | ||
| win_a = "2023-01-01_2023-03-31" | ||
| win_b = "2023-04-01_2023-06-30" | ||
| bbox = [10.0, 20.0, 10.5, 20.5] | ||
|
|
||
| # First check - should be a cache MISS | ||
| exists, path = await check_s2_scene_exists(win_a, win_b, bbox) | ||
| assert exists is False | ||
| assert path is None | ||
|
|
||
| # Save to cache | ||
| await save_to_cache(sample_image_file, win_a, win_b, bbox) | ||
|
|
||
| # Second check - should be a cache HIT | ||
| exists, path = await check_s2_scene_exists(win_a, win_b, bbox) | ||
| assert exists is True | ||
| assert path is not None | ||
| assert path.exists() | ||
| assert path.suffix == ".tif" | ||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_cache_organizes_by_year(self, temp_cache_dir, sample_image_file): | ||
| """Test that cache files are organized in year directories.""" | ||
| win_a = "2023-01-01_2023-03-31" | ||
| win_b = "2023-04-01_2023-06-30" | ||
| bbox = [10.0, 20.0, 10.5, 20.5] | ||
|
|
||
| await save_to_cache(sample_image_file, win_a, win_b, bbox) | ||
|
|
||
| # Check that year directory was created | ||
| year_dir = temp_cache_dir / "2023" | ||
| assert year_dir.exists() | ||
| assert year_dir.is_dir() | ||
|
|
||
| # Check that file is in the year directory | ||
| cached_files = list(year_dir.glob("scene_*.tif")) | ||
| assert len(cached_files) == 1 | ||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_multiple_scenes_cached(self, temp_cache_dir, sample_image_file): | ||
| """Test that multiple different scenes can be cached.""" | ||
| scenes = [ | ||
| ( | ||
| "2023-01-01_2023-03-31", | ||
| "2023-04-01_2023-06-30", | ||
| [10.0, 20.0, 10.5, 20.5], | ||
| ), | ||
| ( | ||
| "2023-07-01_2023-09-30", | ||
| "2023-10-01_2023-12-31", | ||
| [11.0, 21.0, 11.5, 21.5], | ||
| ), | ||
| ( | ||
| "2024-01-01_2024-03-31", | ||
| "2024-04-01_2024-06-30", | ||
| [12.0, 22.0, 12.5, 22.5], | ||
| ), | ||
| ] | ||
|
|
||
| # Cache all scenes | ||
| for win_a, win_b, bbox in scenes: | ||
| await save_to_cache(sample_image_file, win_a, win_b, bbox) | ||
|
|
||
| # Verify all can be found | ||
| for win_a, win_b, bbox in scenes: | ||
| exists, path = await check_s2_scene_exists(win_a, win_b, bbox) | ||
| assert exists is True | ||
| assert path is not None | ||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_cached_file_content_preserved( | ||
| self, temp_cache_dir, sample_image_file | ||
| ): | ||
| """Test that cached file content is preserved correctly.""" | ||
| win_a = "2023-01-01_2023-03-31" | ||
| win_b = "2023-04-01_2023-06-30" | ||
| bbox = [10.0, 20.0, 10.5, 20.5] | ||
|
|
||
| # Read original content | ||
| with open(sample_image_file, "rb") as f: | ||
| original_content = f.read() | ||
|
|
||
| # Save to cache | ||
| await save_to_cache(sample_image_file, win_a, win_b, bbox) | ||
|
|
||
| # Get cached file | ||
| exists, cached_path = await check_s2_scene_exists(win_a, win_b, bbox) | ||
| assert exists is True | ||
|
|
||
| # Verify content matches | ||
| with open(cached_path, "rb") as f: | ||
| cached_content = f.read() | ||
|
|
||
| assert cached_content == original_content | ||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_different_bbox_different_cache( | ||
| self, temp_cache_dir, sample_image_file | ||
| ): | ||
| """Test that different bboxes create different cache entries.""" | ||
| win_a = "2023-01-01_2023-03-31" | ||
| win_b = "2023-04-01_2023-06-30" | ||
| bbox1 = [10.0, 20.0, 10.5, 20.5] | ||
| bbox2 = [10.0, 20.0, 10.6, 20.6] # Slightly different | ||
|
|
||
| # Cache first bbox | ||
| await save_to_cache(sample_image_file, win_a, win_b, bbox1) | ||
|
|
||
| # First bbox should hit | ||
| exists1, _ = await check_s2_scene_exists(win_a, win_b, bbox1) | ||
| assert exists1 is True | ||
|
|
||
| # Second bbox should miss | ||
| exists2, _ = await check_s2_scene_exists(win_a, win_b, bbox2) | ||
| assert exists2 is False |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,77 @@ | ||
| """Tests for cache service functionality.""" | ||
|
|
||
| from pathlib import Path | ||
|
|
||
| from app.services.cache_service import ( | ||
| extract_year_from_window, | ||
| generate_scene_id, | ||
| get_scene_cache_path, | ||
| ) | ||
|
|
||
|
|
||
| class TestCacheService: | ||
| """Test cache service functions.""" | ||
|
|
||
| def test_extract_year_from_window(self): | ||
| """Test year extraction from window string.""" | ||
| assert extract_year_from_window("2023-01-01_2023-03-31") == "2023" | ||
| assert extract_year_from_window("2024-06-01_2024-08-31") == "2024" | ||
| assert extract_year_from_window("2022-12-01_2023-02-28") == "2022" | ||
|
|
||
| def test_extract_year_from_invalid_window(self): | ||
| """Test year extraction with invalid input.""" | ||
| assert extract_year_from_window("invalid") == "unknown" | ||
| assert extract_year_from_window("") == "unknown" | ||
|
|
||
| def test_generate_scene_id(self): | ||
| """Test scene ID generation.""" | ||
| win_a = "2023-01-01_2023-03-31" | ||
| win_b = "2023-04-01_2023-06-30" | ||
| bbox = [10.0, 20.0, 10.5, 20.5] | ||
|
|
||
| scene_id = generate_scene_id(win_a, win_b, bbox) | ||
|
|
||
| # Should start with year | ||
| assert scene_id.startswith("2023_") | ||
| # Should have a hash component | ||
| assert len(scene_id) > 5 | ||
|
|
||
| def test_generate_scene_id_deterministic(self): | ||
| """Test that same inputs generate same scene ID.""" | ||
| win_a = "2023-01-01_2023-03-31" | ||
| win_b = "2023-04-01_2023-06-30" | ||
| bbox = [10.0, 20.0, 10.5, 20.5] | ||
|
|
||
| scene_id1 = generate_scene_id(win_a, win_b, bbox) | ||
| scene_id2 = generate_scene_id(win_a, win_b, bbox) | ||
|
|
||
| assert scene_id1 == scene_id2 | ||
|
|
||
| def test_generate_scene_id_different_inputs(self): | ||
| """Test that different inputs generate different scene IDs.""" | ||
| win_a = "2023-01-01_2023-03-31" | ||
| win_b1 = "2023-04-01_2023-06-30" | ||
| win_b2 = "2023-07-01_2023-09-30" | ||
| bbox = [10.0, 20.0, 10.5, 20.5] | ||
|
|
||
| scene_id1 = generate_scene_id(win_a, win_b1, bbox) | ||
| scene_id2 = generate_scene_id(win_a, win_b2, bbox) | ||
|
|
||
| assert scene_id1 != scene_id2 | ||
|
|
||
| def test_get_scene_cache_path(self): | ||
| """Test cache path generation.""" | ||
| win_a = "2023-01-01_2023-03-31" | ||
| win_b = "2023-04-01_2023-06-30" | ||
| bbox = [10.0, 20.0, 10.5, 20.5] | ||
|
|
||
| cache_path = get_scene_cache_path(win_a, win_b, bbox) | ||
|
|
||
| # Should be a Path object | ||
| assert isinstance(cache_path, Path) | ||
| # Should contain year directory | ||
| assert "2023" in str(cache_path) | ||
| # Should be a .tif file | ||
| assert cache_path.suffix == ".tif" | ||
| # Should contain "scene_" prefix | ||
| assert "scene_" in cache_path.name |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It doesn't look like there is a way to cache single windows. We want to cache the individual files, and then call them if they are needed. It should work for models that only need a single window, as well as models with two windows.