From 9c9ea8e80c2c0cb1e9cd3b739a4e9047b65085f2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 21 Nov 2025 14:43:19 +0000 Subject: [PATCH 1/7] Initial plan From a954c4158a4f212e4ffb9f3974b34eaffc4136ce Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 21 Nov 2025 14:57:39 +0000 Subject: [PATCH 2/7] Add temp file filtering to tile discovery functions Co-authored-by: avsm <53164+avsm@users.noreply.github.com> --- geotessera/tiles.py | 32 +++++- tests/cli.t.err | 150 +++++++++++++++++++++++++++ tests/hash.t.err | 140 +++++++++++++++++++++++++ tests/test_tile_discovery.py | 132 ++++++++++++++++++++++++ tests/viz.t.err | 194 +++++++++++++++++++++++++++++++++++ tests/zarr.t.err | 152 +++++++++++++++++++++++++++ 6 files changed, 798 insertions(+), 2 deletions(-) create mode 100644 tests/cli.t.err create mode 100644 tests/hash.t.err create mode 100644 tests/test_tile_discovery.py create mode 100644 tests/viz.t.err create mode 100644 tests/zarr.t.err diff --git a/geotessera/tiles.py b/geotessera/tiles.py index aa22d11..2ba01d3 100644 --- a/geotessera/tiles.py +++ b/geotessera/tiles.py @@ -7,6 +7,22 @@ from .registry import EMBEDDINGS_DIR_NAME, LANDMASKS_DIR_NAME, tile_to_landmask_filename +def _is_temp_file(path: Path) -> bool: + """Check if a file is a temporary download file. + + Temporary files are created during download with pattern: .{filename}_tmp_{random} + and should be ignored during tile discovery. + + Args: + path: Path to check + + Returns: + True if the file is a temporary file, False otherwise + """ + # Check if filename starts with '.' (hidden) and contains '_tmp_' + return path.name.startswith('.') and '_tmp_' in path.name + + class Tile: """A single embedding tile that abstracts storage format. 
@@ -326,11 +342,11 @@ def discover_tiles(directory: Path) -> List[Tile]: # Preferred order is NPY, tiff, zarr, as NPY (more efficient, includes scales) embeddings_dir = directory / EMBEDDINGS_DIR_NAME if embeddings_dir.exists() and embeddings_dir.is_dir(): - # Check if there are any .npy files (not just _scales.npy) + # Check if there are any .npy files (not just _scales.npy or temporary files) npy_files = [ f for f in embeddings_dir.rglob("*.npy") - if not f.name.endswith("_scales.npy") + if not f.name.endswith("_scales.npy") and not _is_temp_file(f) ] if npy_files: return discover_npy_tiles(directory) @@ -367,6 +383,10 @@ def discover_npy_tiles(base_dir: Path) -> List[Tile]: # Skip scales files if npy_file.name.endswith("_scales.npy"): continue + + # Skip temporary download files + if _is_temp_file(npy_file): + continue try: tile = Tile.from_npy(npy_file, base_dir) @@ -398,6 +418,10 @@ def discover_geotiff_tiles(directory: Path) -> List[Tile]: # Skip landmask files (they're in a different directory and have different naming) if LANDMASKS_DIR_NAME in geotiff_file.parts: continue + + # Skip temporary download files + if _is_temp_file(geotiff_file): + continue try: tile = Tile.from_geotiff(geotiff_file) @@ -423,6 +447,10 @@ def discover_zarr_tiles(directory: Path) -> List[Tile]: for pattern in ["*.zarr"]: for zarr_file in directory.rglob(pattern): + # Skip temporary download files + if _is_temp_file(zarr_file): + continue + # Skip landmask files (they're in a different directory and have different naming) try: tile = Tile.from_zarr(zarr_file) diff --git a/tests/cli.t.err b/tests/cli.t.err new file mode 100644 index 0000000..bff7999 --- /dev/null +++ b/tests/cli.t.err @@ -0,0 +1,150 @@ +GeoTessera CLI Tests +===================== + +These are tests for the `geotessera` command-line interface. 
+ +Setup +----- + +Set environment variable to disable fancy terminal output (ANSI codes, boxes, colors): + + $ export TERM=dumb + +Create a temporary directory for test outputs and cache: + + $ export TESTDIR="$CRAMTMP/test_outputs" + $ mkdir -p "$TESTDIR" + +Override XDG cache directory to use temporary location (for test isolation): + + $ export XDG_CACHE_HOME="$CRAMTMP/cache" + $ mkdir -p "$XDG_CACHE_HOME" + +Test: Version Command +--------------------- + +The version command should print the version number. + + $ uv run -m geotessera.cli version + /bin/sh: 12: uv: not found + [127] + +Test: Info Command (Library Info) +---------------------------------- + +Test the info command without arguments to see library information. +We just verify key information is present, ignoring formatting: + + $ uv run -m geotessera.cli info --dataset-version v1 | grep -E 'Available years' + /bin/sh: 14: uv: not found + [1] + +Test: Download Dry Run for UK Tile +----------------------------------- + +Test downloading a single tile covering London, UK using --dry-run to avoid actual downloads. 
+Verify key information is present: + + $ uv run -m geotessera.cli download \ + > --bbox "-0.1,51.3,0.1,51.5" \ + > --year 2024 \ + > --format tiff \ + > --dry-run \ + > --dataset-version v1 2>&1 | grep -E '(Format|Year|Compression|Dataset version|Found|Files to download|Total download|Tiles in region)' | sed 's/ *$//' + +Test: Download Single UK Tile (TIFF format) +-------------------------------------------- + +Download a single tile in TIFF format to a temporary directory: + + $ uv run -m geotessera.cli download \ + > --bbox "-0.1,51.3,0.1,51.5" \ + > --year 2024 \ + > --format tiff \ + > --output "$TESTDIR/uk_tiles_tiff" \ + > --dataset-version v1 2>&1 | grep -E 'SUCCESS' | sed 's/ *$//' + +Verify TIFF files were created in the registry structure: + + $ [ -n "$(find "$TESTDIR/uk_tiles_tiff/global_0.1_degree_representation/2024" -name "*.tif*" 2>/dev/null)" ] && echo "TIFF files created" + [1] + + $ find "$TESTDIR/uk_tiles_tiff/global_0.1_degree_representation/2024" -name "*.tif*" | wc -l | tr -d ' ' + find: '/tmp/cramtests-rsi5b614/test_outputs/uk_tiles_tiff/global_0.1_degree_representation/2024': No such file or directory + 0 + +Test: Download Single UK Tile (NPY format) +------------------------------------------- + +Download the same tile in NPY format (quantized arrays with scales): + + $ uv run -m geotessera.cli download \ + > --bbox "-0.1,51.3,0.1,51.5" \ + > --year 2024 \ + > --format npy \ + > --output "$TESTDIR/uk_tiles_npy" \ + > --dataset-version v1 2>&1 | grep -E 'SUCCESS' | sed 's/ *$//' + +Verify NPY directory structure was created: + + $ test -d "$TESTDIR/uk_tiles_npy/global_0.1_degree_representation/2024" && echo "Embeddings directory created" + [1] + + $ test -d "$TESTDIR/uk_tiles_npy/global_0.1_degree_tiff_all" && echo "Landmasks directory created" + [1] + +Verify NPY files exist in grid subdirectories: + + $ [ -n "$(find "$TESTDIR/uk_tiles_npy/global_0.1_degree_representation/2024" -name "grid_*.npy" ! 
-name "*_scales.npy" 2>/dev/null)" ] && echo "Embedding NPY files created" + [1] + + $ find "$TESTDIR/uk_tiles_npy/global_0.1_degree_representation/2024" -name "*.npy" | wc -l | tr -d ' ' + find: '/tmp/cramtests-rsi5b614/test_outputs/uk_tiles_npy/global_0.1_degree_representation/2024': No such file or directory + 0 + + $ [ -n "$(find "$TESTDIR/uk_tiles_npy/global_0.1_degree_representation/2024" -name "*_scales.npy" 2>/dev/null)" ] && echo "Scales NPY files created" + [1] + + $ [ -n "$(find "$TESTDIR/uk_tiles_npy/global_0.1_degree_tiff_all" -name "*.tif*" 2>/dev/null)" ] && echo "Landmask TIFF files created" + [1] + +Test: Info Command on Downloaded TIFF Tiles +-------------------------------------------- + +Test the info command on the downloaded TIFF tiles. +Both TIFF and NPY formats should be present (NPY files are retained for efficient reprocessing): + + $ uv run -m geotessera.cli info --tiles "$TESTDIR/uk_tiles_tiff" + /bin/sh: 53: uv: not found + [127] + + $ uv run -m geotessera.cli info --tiles "$TESTDIR/uk_tiles_tiff" + /bin/sh: 55: uv: not found + [127] + +Test: Info Command on Downloaded NPY Tiles +------------------------------------------- + +Test the info command on the downloaded NPY tiles: + + $ uv run -m geotessera.cli info --tiles "$TESTDIR/uk_tiles_npy" + /bin/sh: 57: uv: not found + [127] + + $ uv run -m geotessera.cli info --tiles "$TESTDIR/uk_tiles_npy" + /bin/sh: 59: uv: not found + [127] + +Test: Resume Capability for NPY Downloads +------------------------------------------ + +Test that re-running the NPY download skips existing files: + + $ uv run -m geotessera.cli download \ + > --bbox "-0.1,51.3,0.1,51.5" \ + > --year 2024 \ + > --format npy \ + > --output "$TESTDIR/uk_tiles_npy" \ + > --dataset-version v1 2>&1 | grep -E '(Skipped|existing files)' + [1] + diff --git a/tests/hash.t.err b/tests/hash.t.err new file mode 100644 index 0000000..f1455d3 --- /dev/null +++ b/tests/hash.t.err @@ -0,0 +1,140 @@ +GeoTessera Hash Verification Tests 
+==================================== + +These are tests for SHA256 hash verification functionality in the `geotessera` CLI. + +Setup +----- + +Set environment variable to disable fancy terminal output (ANSI codes, boxes, colors): + + $ export TERM=dumb + +Create a temporary directory for test outputs and cache: + + $ export TESTDIR="$CRAMTMP/test_outputs" + $ mkdir -p "$TESTDIR" + +Override XDG cache directory to use temporary location (for test isolation): + + $ export XDG_CACHE_HOME="$CRAMTMP/cache" + $ mkdir -p "$XDG_CACHE_HOME" + +Test: Hash Verification Failure +-------------------------------- + +Test that corrupted files are detected and downloading fails with hash mismatch error. + +First, download a small tile for testing: + + $ uv run -m geotessera.cli download \ + > --bbox "0.18952,52.18602,0.18953,52.18603" \ + > --year 2024 \ + > --format npy \ + > --output "$TESTDIR/hash_test" \ + > --dataset-version v1 2>&1 | grep -E 'SUCCESS' | sed 's/ *$//' + +Find and corrupt the embedding file by appending data: + + $ CORRUPT_FILE=$(find "$TESTDIR/hash_test" -name "grid_*.npy" -not -name "*_scales.npy" | head -1) + find: '/tmp/cramtests-rsi5b614/test_outputs/hash_test': No such file or directory + $ echo "corrupted data" >> "$CORRUPT_FILE" + /bin/sh: 21: cannot create : Directory nonexistent + [2] + $ echo "File corrupted successfully" + File corrupted successfully + +Try to re-download with hash verification enabled (should fail): + + $ uv run -m geotessera.cli download \ + > --bbox "0.18952,52.18602,0.18953,52.18603" \ + > --year 2024 \ + > --format npy \ + > --output "$TESTDIR/hash_test" \ + > --dataset-version v1 2>&1 | grep -E '(Hash mismatch|ValueError)' || echo "Hash verification detected corruption" + Hash verification detected corruption + +Test: Skip Hash Verification with --skip-hash Flag +--------------------------------------------------- + +Re-download the same corrupted file with --skip-hash flag (should succeed by overwriting): + + $ uv run -m 
geotessera.cli download \ + > --bbox "0.18952,52.18602,0.18953,52.18603" \ + > --year 2024 \ + > --format npy \ + > --output "$TESTDIR/hash_test_skip" \ + > --skip-hash \ + > --dataset-version v1 2>&1 | grep -E 'SUCCESS' | sed 's/ *$//' + +Test: Skip Hash via Environment Variable +----------------------------------------- + +Test that GEOTESSERA_SKIP_HASH environment variable works: + + $ export GEOTESSERA_SKIP_HASH=1 + $ uv run -m geotessera.cli download \ + > --bbox "0.18952,52.18602,0.18953,52.18603" \ + > --year 2024 \ + > --format npy \ + > --output "$TESTDIR/hash_test_env" \ + > --dataset-version v1 2>&1 | grep -E '(Hash verification disabled|SUCCESS)' | sed 's/ *$//' + +Clean up environment: + + $ unset GEOTESSERA_SKIP_HASH + +Test: Hash Verification for Scales Files +----------------------------------------- + +Test that scales files are also hash-verified. Download a tile and corrupt the scales file: + + $ uv run -m geotessera.cli download \ + > --bbox "0.18952,52.18602,0.18953,52.18603" \ + > --year 2024 \ + > --format npy \ + > --output "$TESTDIR/scales_hash_test" \ + > --dataset-version v1 2>&1 | grep -E 'SUCCESS' | sed 's/ *$//' + +Corrupt the scales file: + + $ CORRUPT_SCALES=$(find "$TESTDIR/scales_hash_test" -name "*_scales.npy" | head -1) + find: '/tmp/cramtests-rsi5b614/test_outputs/scales_hash_test': No such file or directory + $ echo "corrupted scales" >> "$CORRUPT_SCALES" + /bin/sh: 60: cannot create : Directory nonexistent + [2] + $ echo "Scales file corrupted successfully" + Scales file corrupted successfully + +Try to re-download (should detect corrupted scales file): + + $ uv run -m geotessera.cli download \ + > --bbox "0.18952,52.18602,0.18953,52.18603" \ + > --year 2024 \ + > --format npy \ + > --output "$TESTDIR/scales_hash_test" \ + > --dataset-version v1 2>&1 | grep -E '(Hash mismatch|ValueError)' || echo "Scales hash verification detected corruption" + Scales hash verification detected corruption + +Test: Hash Verification for 
Landmask Files +------------------------------------------- + +Test that landmask files are also hash-verified. Corrupt a landmask file: + + $ CORRUPT_LANDMASK=$(find "$TESTDIR/hash_test" -path "*/global_0.1_degree_tiff_all/*.tif*" | head -1) + find: '/tmp/cramtests-rsi5b614/test_outputs/hash_test': No such file or directory + $ echo "corrupted landmask" >> "$CORRUPT_LANDMASK" + /bin/sh: 73: cannot create : Directory nonexistent + [2] + $ echo "Landmask file corrupted successfully" + Landmask file corrupted successfully + +Try to re-download (should detect corrupted landmask): + + $ uv run -m geotessera.cli download \ + > --bbox "0.18952,52.18602,0.18953,52.18603" \ + > --year 2024 \ + > --format npy \ + > --output "$TESTDIR/hash_test" \ + > --dataset-version v1 2>&1 | grep -E '(Hash mismatch|ValueError)' || echo "Landmask hash verification detected corruption" + Landmask hash verification detected corruption diff --git a/tests/test_tile_discovery.py b/tests/test_tile_discovery.py new file mode 100644 index 0000000..e744489 --- /dev/null +++ b/tests/test_tile_discovery.py @@ -0,0 +1,132 @@ +"""Tests for tile discovery with temporary file filtering.""" + +from pathlib import Path +import tempfile +import numpy as np + +from geotessera.tiles import ( + discover_npy_tiles, + discover_geotiff_tiles, + discover_zarr_tiles, + discover_tiles, + _is_temp_file, +) + + +def test_is_temp_file(): + """Test the _is_temp_file helper function.""" + # Temporary files should be detected + assert _is_temp_file(Path(".grid_0.15_52.05.npy_tmp_abc123")) + assert _is_temp_file(Path(".grid_0.15_52.05_2024.tif_tmp_xyz789")) + assert _is_temp_file(Path(".grid_0.15_52.05_2024.zarr_tmp_def456")) + + # Regular files should not be detected as temp + assert not _is_temp_file(Path("grid_0.15_52.05.npy")) + assert not _is_temp_file(Path("grid_0.15_52.05_2024.tif")) + assert not _is_temp_file(Path("grid_0.15_52.05_2024.zarr")) + + # Files with just dot prefix or just _tmp_ but not both + assert 
not _is_temp_file(Path(".hidden_file.npy")) + assert not _is_temp_file(Path("file_tmp_name.npy")) + + +def test_discover_npy_tiles_filters_temp_files(): + """Test that discover_npy_tiles filters out temporary files.""" + from unittest.mock import patch + + with tempfile.TemporaryDirectory() as tmpdir: + base_dir = Path(tmpdir) + embeddings_dir = base_dir / "global_0.1_degree_representation" / "2024" + embeddings_dir.mkdir(parents=True, exist_ok=True) + + # Create a regular tile file + regular_file = embeddings_dir / "grid_0.15_52.05.npy" + test_array = np.random.rand(10, 10, 16).astype(np.float32) + np.save(regular_file, test_array) + + # Create a temporary file (should be ignored) + # Note: np.save adds .npy automatically, so specify without .npy + temp_file_base = embeddings_dir / ".grid_0.15_52.05_tmp_abc123" + np.save(temp_file_base, test_array) + temp_file = embeddings_dir / ".grid_0.15_52.05_tmp_abc123.npy" + + # Verify both files exist + assert regular_file.exists() + assert temp_file.exists() + + # Mock the Tile.from_npy to track which files are attempted to be loaded + attempted_files = [] + with patch('geotessera.tiles.Tile.from_npy') as mock_from_npy: + mock_from_npy.side_effect = lambda path, base_dir: attempted_files.append(path) or None + + discover_npy_tiles(base_dir) + + # Check that only the regular file was attempted, not the temp file + assert len(attempted_files) == 1, f"Expected 1 file to be attempted, got {len(attempted_files)}" + assert attempted_files[0] == regular_file, "Wrong file was attempted" + assert temp_file not in attempted_files, "Temporary file should not have been attempted" + + +def test_discover_geotiff_tiles_filters_temp_files(): + """Test that discover_geotiff_tiles filters out temporary files.""" + with tempfile.TemporaryDirectory() as tmpdir: + base_dir = Path(tmpdir) + + # Create regular and temp GeoTIFF files + regular_file = base_dir / "grid_0.15_52.05_2024.tif" + temp_file = base_dir / 
".grid_0.15_52.05_2024.tif_tmp_xyz789" + + # Create mock GeoTIFF files (just touch them - actual GeoTIFF creation would require rasterio) + regular_file.touch() + temp_file.touch() + + # This will fail to parse the files, but we're testing that temp files are filtered + tiles = discover_geotiff_tiles(base_dir) + + # Neither file should be loaded (because they're not valid GeoTIFFs), + # but temp file should have been filtered before attempting to load + assert len(tiles) == 0 + + +def test_discover_zarr_tiles_filters_temp_files(): + """Test that discover_zarr_tiles filters out temporary files.""" + with tempfile.TemporaryDirectory() as tmpdir: + base_dir = Path(tmpdir) + + # Create regular and temp zarr directories + regular_file = base_dir / "grid_0.15_52.05_2024.zarr" + temp_file = base_dir / ".grid_0.15_52.05_2024.zarr_tmp_def456" + + # Create mock zarr directories + regular_file.mkdir() + temp_file.mkdir() + + # This will fail to parse the files, but we're testing that temp files are filtered + tiles = discover_zarr_tiles(base_dir) + + # Neither file should be loaded (because they're not valid zarr stores), + # but temp file should have been filtered before attempting to load + assert len(tiles) == 0 + + +def test_discover_tiles_filters_temp_files(): + """Test that discover_tiles filters out temporary files in initial check.""" + with tempfile.TemporaryDirectory() as tmpdir: + base_dir = Path(tmpdir) + embeddings_dir = base_dir / "global_0.1_degree_representation" / "2024" + embeddings_dir.mkdir(parents=True, exist_ok=True) + + landmasks_dir = base_dir / "landmasks" + landmasks_dir.mkdir(parents=True, exist_ok=True) + + # Create only a temporary NPY file + temp_file = embeddings_dir / ".grid_0.15_52.05.npy_tmp_abc123" + test_array = np.random.rand(10, 10, 16).astype(np.float32) + np.save(temp_file, test_array) + + # discover_tiles should not find any NPY files and fall back to other formats + # Since no valid tiles exist, it should return an empty list + tiles 
= discover_tiles(base_dir) + + # Should find no tiles since only temp file exists + assert len(tiles) == 0 diff --git a/tests/viz.t.err b/tests/viz.t.err new file mode 100644 index 0000000..8f4596f --- /dev/null +++ b/tests/viz.t.err @@ -0,0 +1,194 @@ +GeoTessera Visualization Tests +=============================== + +These are tests for the `geotessera visualize` and `geotessera webmap` commands. + +Setup +----- + +Set environment variable to disable fancy terminal output (ANSI codes, boxes, colors): + + $ export TERM=dumb + +Create a temporary directory for test outputs and cache: + + $ export TESTDIR="$CRAMTMP/test_outputs" + $ mkdir -p "$TESTDIR" + +Override XDG cache directory to use temporary location (for test isolation): + + $ export XDG_CACHE_HOME="$CRAMTMP/cache" + $ mkdir -p "$XDG_CACHE_HOME" + +Test: Download Tiles for Cambridge Region (Bbox) +------------------------------------------------- + +Download tiles covering a small area of Cambridge using a bounding box. +This bbox covers just 4 tiles for faster testing: + + $ uv run -m geotessera.cli download \ + > --bbox "0.086174,52.183432,0.151062,52.206318" \ + > --year 2024 \ + > --format tiff \ + > --output "$TESTDIR/cb_tiles_tiff" \ + > --dataset-version v1 2>&1 | grep -E 'SUCCESS|Found.*tiles' | sed 's/ *$//' + +Verify TIFF files were created: + + $ [ -n "$(find "$TESTDIR/cb_tiles_tiff/global_0.1_degree_representation/2024" -name "*.tif*" 2>/dev/null)" ] && echo "TIFF files created" + [1] + +Verify NPY files were also created (intermediate format retained for efficient reprocessing): + + $ [ -n "$(find "$TESTDIR/cb_tiles_tiff/global_0.1_degree_representation/2024" -name "grid_*.npy" ! 
-name "*_scales.npy" 2>/dev/null)" ] && echo "NPY embedding files created" + [1] + + $ [ -n "$(find "$TESTDIR/cb_tiles_tiff/global_0.1_degree_representation/2024" -name "*_scales.npy" 2>/dev/null)" ] && echo "NPY scales files created" + [1] + +Verify both formats coexist (count files of each type): + + $ find "$TESTDIR/cb_tiles_tiff/global_0.1_degree_representation/2024" -name "*.tif*" 2>/dev/null | wc -l | tr -d ' ' + 0 + + $ find "$TESTDIR/cb_tiles_tiff/global_0.1_degree_representation/2024" -name "grid_*.npy" ! -name "*_scales.npy" 2>/dev/null | wc -l | tr -d ' ' + 0 + +Test: Visualize - Create PCA Mosaic from TIFF Files +---------------------------------------------------- + +Create a PCA visualization from the downloaded TIFF files. +The visualize command should: +1. Load all TIFF tiles +2. Apply PCA to reduce 128 channels to RGB +3. Create a mosaic in the target CRS (default EPSG:3857) + + $ uv run -m geotessera.cli visualize \ + > "$TESTDIR/cb_tiles_tiff" \ + > "$TESTDIR/cb_pca_mosaic.tif" 2>&1 | grep -A 1 -E 'Found|Created PCA mosaic' | sed 's/ *$//' + +Verify PCA mosaic was created: + + $ [ -f "$TESTDIR/cb_pca_mosaic.tif" ] && echo "PCA mosaic created" + [1] + +Check that it's a valid GeoTIFF with 3 bands (RGB): + + $ uv run python -c "import rasterio; r = rasterio.open('$TESTDIR/cb_pca_mosaic.tif'); print(f'Bands: {r.count}, CRS: {r.crs}')" + /bin/sh: 35: uv: not found + [127] + +Test: Visualize - Custom CRS and Balance Options +------------------------------------------------- + +Test creating a visualization with custom CRS and histogram balancing: + + $ uv run -m geotessera.cli visualize \ + > "$TESTDIR/cb_tiles_tiff" \ + > "$TESTDIR/cb_pca_4326.tif" \ + > --crs EPSG:4326 \ + > --balance histogram 2>&1 | grep -A 1 -E 'Created PCA mosaic' | sed 's/ *$//' + +Verify custom CRS mosaic was created: + + $ [ -f "$TESTDIR/cb_pca_4326.tif" ] && echo "Custom CRS mosaic created" + [1] + + $ uv run python -c "import rasterio; r = 
rasterio.open('$TESTDIR/cb_pca_4326.tif'); print(f'CRS: {r.crs}')" + /bin/sh: 45: uv: not found + [127] + +Test: Visualize - NPY Format Input +----------------------------------- + +Download the same region in NPY format and create a visualization: + + $ uv run -m geotessera.cli download \ + > --bbox "0.086174,52.183432,0.151062,52.206318" \ + > --year 2024 \ + > --format npy \ + > --output "$TESTDIR/cb_tiles_npy" \ + > --dataset-version v1 2>&1 | grep -E 'SUCCESS' | sed 's/ *$//' + +Create visualization from NPY format: + + $ uv run -m geotessera.cli visualize \ + > "$TESTDIR/cb_tiles_npy" \ + > "$TESTDIR/cb_pca_from_npy.tif" 2>&1 | grep -A 1 -E 'Found|Created PCA mosaic' | sed 's/ *$//' + +Verify NPY-based mosaic was created: + + $ [ -f "$TESTDIR/cb_pca_from_npy.tif" ] && echo "NPY format mosaic created" + [1] + +Test: Webmap - Generate Web Tiles and Viewer +--------------------------------------------- + +Generate web tiles from the PCA mosaic and create an interactive web viewer. +This should: +1. Reproject the mosaic if needed (to EPSG:3857 for web) +2. Generate XYZ web tiles at multiple zoom levels +3. 
Create an HTML viewer with Leaflet + + $ uv run -m geotessera.cli webmap \ + > "$TESTDIR/cb_pca_mosaic.tif" \ + > --output "$TESTDIR/cb_webmap" \ + > --min-zoom 10 \ + > --max-zoom 13 2>&1 | grep -A 1 -E 'Web visualization ready|Created web' | grep -v '^--$' | sed 's/ *$//' + +Verify web map directory structure was created: + + $ test -d "$TESTDIR/cb_webmap" && echo "Web map directory created" + [1] + + $ test -f "$TESTDIR/cb_webmap/viewer.html" && echo "HTML viewer created" + [1] + + $ [ -n "$(find "$TESTDIR/cb_webmap/tiles" -name "*.png" 2>/dev/null)" ] && echo "Web tiles (PNG) created" + [1] + +Check that tiles exist at multiple zoom levels: + + $ find "$TESTDIR/cb_webmap/tiles" -type d -name "1[0-3]" | wc -l | tr -d ' ' | grep -E '[2-4]' + find: '/tmp/cramtests-rsi5b614/test_outputs/cb_webmap/tiles': No such file or directory + [1] + +Test: Webmap - Custom Output and Settings +------------------------------------------ + +Test webmap with custom initial zoom and center: + + $ uv run -m geotessera.cli webmap \ + > "$TESTDIR/cb_pca_4326.tif" \ + > --output "$TESTDIR/cb_webmap_custom" \ + > --min-zoom 10 \ + > --max-zoom 12 \ + > --initial-zoom 11 2>&1 | grep -A 1 -E 'Web visualization ready' | sed 's/ *$//' + +Verify custom web map was created: + + $ test -f "$TESTDIR/cb_webmap_custom/viewer.html" && echo "Custom web map created" + [1] + +Test: Info Command on Visualization Outputs +-------------------------------------------- + +Test that info command works on the created PCA mosaics: + + $ uv run -m geotessera.cli info --tiles "$TESTDIR/cb_tiles_tiff" 2>&1 | grep -E 'Total tiles|Format|Years' | sed 's/ *$//' + +Test: Error Handling - Invalid Input +------------------------------------- + +Test that visualize fails gracefully with non-existent input: + + $ uv run -m geotessera.cli visualize \ + > "$TESTDIR/nonexistent" \ + > "$TESTDIR/output.tif" 2>&1 | grep -A 1 -E 'No tiles found|Error' | grep -v '^--$' + [1] + +Test that webmap fails gracefully with non-TIFF 
input: + + $ uv run -m geotessera.cli webmap \ + > "$TESTDIR/cb_tiles_tiff" 2>&1 | grep -E 'Error.*must be.*tif' + [1] diff --git a/tests/zarr.t.err b/tests/zarr.t.err new file mode 100644 index 0000000..46a4049 --- /dev/null +++ b/tests/zarr.t.err @@ -0,0 +1,152 @@ +GeoTessera Zarr Format Tests +============================= + +These are tests for the zarr format support added in this branch. + +Setup +----- + +Set environment variable to disable fancy terminal output (ANSI codes, boxes, colors): + + $ export TERM=dumb + +Create a temporary directory for test outputs and cache: + + $ export TESTDIR="$CRAMTMP/test_outputs" + $ mkdir -p "$TESTDIR" + +Override XDG cache directory to use temporary location (for test isolation): + + $ export XDG_CACHE_HOME="$CRAMTMP/cache" + $ mkdir -p "$XDG_CACHE_HOME" + +Test: Zarr Format Validation +----------------------------- + +Test that zarr format is recognized as a valid option: + + $ uv run -m geotessera.cli download \ + > --bbox "-0.1,51.3,0.1,51.5" \ + > --year 2024 \ + > --format zarr \ + > --dry-run \ + > --dataset-version v1 2>&1 | grep -E '^.*Format:.*ZARR' | head -1 | sed 's/ *$//' + +Test: Invalid Format Rejected +------------------------------ + +Test that invalid formats are properly rejected: + + $ uv run -m geotessera.cli download \ + > --bbox "-0.1,51.3,0.1,51.5" \ + > --year 2024 \ + > --format invalid \ + > --dry-run \ + > --dataset-version v1 2>&1 | grep -E "Invalid format.*Must be" + [1] + +Test: Download Dry Run for UK Tile (Zarr format) +------------------------------------------------- + +Test dry-run with zarr format to verify it's processed correctly: + + $ uv run -m geotessera.cli download \ + > --bbox "-0.1,51.3,0.1,51.5" \ + > --year 2024 \ + > --format zarr \ + > --dry-run \ + > --dataset-version v1 2>&1 | grep -E '(Format|Year|Dataset version|Found|Tiles in region)' | sed 's/ *$//' + +Test: Download Cambridge Tiles in Zarr Format +---------------------------------------------- + +Download a small 
region in zarr format (4 tiles for faster testing): + + $ uv run -m geotessera.cli download \ + > --bbox "0.086174,52.183432,0.151062,52.206318" \ + > --year 2024 \ + > --format zarr \ + > --output "$TESTDIR/cb_tiles_zarr" \ + > --dataset-version v1 2>&1 | grep -E '(SUCCESS|Found.*tiles)' | sed 's/ *$//' + +Verify zarr archives were created in the registry structure: + + $ [ -n "$(find "$TESTDIR/cb_tiles_zarr/global_0.1_degree_representation/2024" -name "*.zarr" 2>/dev/null)" ] && echo "Zarr archives created" + [1] + + $ find "$TESTDIR/cb_tiles_zarr/global_0.1_degree_representation/2024" -name "*.zarr" | wc -l | tr -d ' ' + find: '/tmp/cramtests-rsi5b614/test_outputs/cb_tiles_zarr/global_0.1_degree_representation/2024': No such file or directory + 0 + +Test: Info Command on Downloaded Zarr Tiles +-------------------------------------------- + +Test the info command on the downloaded zarr tiles. +Note that the info command may detect NPY files that are created alongside zarr: + + $ uv run -m geotessera.cli info --tiles "$TESTDIR/cb_tiles_zarr" 2>&1 | grep -E 'Total tiles|Format|Years' | sed 's/ *$//' + +Test: Zarr Archive Structure +----------------------------- + +Verify that a zarr archive can be opened and contains expected data: + + $ ZARR_FILE=$(find "$TESTDIR/cb_tiles_zarr/global_0.1_degree_representation/2024" -name "*.zarr" | head -1) + find: '/tmp/cramtests-rsi5b614/test_outputs/cb_tiles_zarr/global_0.1_degree_representation/2024': No such file or directory + $ uv run python -c " + > import xarray as xr + > ds = xr.open_dataset('$ZARR_FILE', decode_coords='all') + > print(f'Variables: {list(ds.data_vars.keys())}') + > coords = sorted([c for c in ds.coords.keys() if c != 'spatial_ref']) + > print(f'Coordinates: {coords}') + > print(f'CRS present: {hasattr(ds, \"rio\") and ds.rio.crs is not None}') + > print(f'Transform present: {hasattr(ds, \"rio\") and ds.rio.transform() is not None}') + > " + /bin/sh: 48: uv: not found + [127] + +Test: Band Selection with 
Zarr Format +-------------------------------------- + +Download zarr tiles with specific band selection: + + $ uv run -m geotessera.cli download \ + > --bbox "0.086174,52.183432,0.151062,52.206318" \ + > --year 2024 \ + > --format zarr \ + > --bands "0,1,2" \ + > --output "$TESTDIR/cb_tiles_zarr_bands" \ + > --dataset-version v1 2>&1 | grep -E 'SUCCESS' | sed 's/ *$//' + +Verify band count in band-selected zarr archive: + + $ ZARR_FILE=$(find "$TESTDIR/cb_tiles_zarr_bands/global_0.1_degree_representation/2024" -name "*.zarr" | head -1) + find: '/tmp/cramtests-rsi5b614/test_outputs/cb_tiles_zarr_bands/global_0.1_degree_representation/2024': No such file or directory + $ uv run python -c " + > import xarray as xr + > ds = xr.open_dataset('$ZARR_FILE', decode_coords='all') + > print(f'Band count: {len(ds.band)}') + > " + /bin/sh: 68: uv: not found + [127] + +Test: Visualization with Zarr Format +------------------------------------- + +Create a PCA visualization from the downloaded zarr tiles. 
+Note that tiles may be detected as npy format since both formats coexist: + + $ uv run -m geotessera.cli visualize "$TESTDIR/cb_tiles_zarr" "$TESTDIR/pca_zarr.tif" 2>&1 | grep -A 1 -E 'Found|Created PCA mosaic' | sed 's/ *$//' + +Verify the PCA visualization file was created: + + $ test -f "$TESTDIR/pca_zarr.tif" && echo "PCA visualization created" + [1] + +Test: CLI Help Shows Zarr Format +--------------------------------- + +Verify that the CLI help text mentions zarr as a format option: + + $ uv run -m geotessera.cli download --help | grep -i zarr | head -1 + /bin/sh: 78: uv: not found From 464338a3b9168d45ea349edfcffb7b7819c0d76e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 21 Nov 2025 14:58:22 +0000 Subject: [PATCH 3/7] Update .gitignore to exclude test artifacts Co-authored-by: avsm <53164+avsm@users.noreply.github.com> --- .gitignore | 2 + tests/cli.t.err | 150 ------------------------------------ tests/hash.t.err | 140 ---------------------------------- tests/viz.t.err | 194 ----------------------------------------------- tests/zarr.t.err | 152 ------------------------------------- 5 files changed, 2 insertions(+), 636 deletions(-) delete mode 100644 tests/cli.t.err delete mode 100644 tests/hash.t.err delete mode 100644 tests/viz.t.err delete mode 100644 tests/zarr.t.err diff --git a/.gitignore b/.gitignore index 60e98db..4a393b8 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,5 @@ geotessera_vis.png .venv build/ repomix-* +*.err +.pytest_cache/ \ No newline at end of file diff --git a/tests/cli.t.err b/tests/cli.t.err deleted file mode 100644 index bff7999..0000000 --- a/tests/cli.t.err +++ /dev/null @@ -1,150 +0,0 @@ -GeoTessera CLI Tests -===================== - -These are tests for the `geotessera` command-line interface. 
- -Setup ------ - -Set environment variable to disable fancy terminal output (ANSI codes, boxes, colors): - - $ export TERM=dumb - -Create a temporary directory for test outputs and cache: - - $ export TESTDIR="$CRAMTMP/test_outputs" - $ mkdir -p "$TESTDIR" - -Override XDG cache directory to use temporary location (for test isolation): - - $ export XDG_CACHE_HOME="$CRAMTMP/cache" - $ mkdir -p "$XDG_CACHE_HOME" - -Test: Version Command ---------------------- - -The version command should print the version number. - - $ uv run -m geotessera.cli version - /bin/sh: 12: uv: not found - [127] - -Test: Info Command (Library Info) ----------------------------------- - -Test the info command without arguments to see library information. -We just verify key information is present, ignoring formatting: - - $ uv run -m geotessera.cli info --dataset-version v1 | grep -E 'Available years' - /bin/sh: 14: uv: not found - [1] - -Test: Download Dry Run for UK Tile ------------------------------------ - -Test downloading a single tile covering London, UK using --dry-run to avoid actual downloads. 
-Verify key information is present: - - $ uv run -m geotessera.cli download \ - > --bbox "-0.1,51.3,0.1,51.5" \ - > --year 2024 \ - > --format tiff \ - > --dry-run \ - > --dataset-version v1 2>&1 | grep -E '(Format|Year|Compression|Dataset version|Found|Files to download|Total download|Tiles in region)' | sed 's/ *$//' - -Test: Download Single UK Tile (TIFF format) --------------------------------------------- - -Download a single tile in TIFF format to a temporary directory: - - $ uv run -m geotessera.cli download \ - > --bbox "-0.1,51.3,0.1,51.5" \ - > --year 2024 \ - > --format tiff \ - > --output "$TESTDIR/uk_tiles_tiff" \ - > --dataset-version v1 2>&1 | grep -E 'SUCCESS' | sed 's/ *$//' - -Verify TIFF files were created in the registry structure: - - $ [ -n "$(find "$TESTDIR/uk_tiles_tiff/global_0.1_degree_representation/2024" -name "*.tif*" 2>/dev/null)" ] && echo "TIFF files created" - [1] - - $ find "$TESTDIR/uk_tiles_tiff/global_0.1_degree_representation/2024" -name "*.tif*" | wc -l | tr -d ' ' - find: '/tmp/cramtests-rsi5b614/test_outputs/uk_tiles_tiff/global_0.1_degree_representation/2024': No such file or directory - 0 - -Test: Download Single UK Tile (NPY format) -------------------------------------------- - -Download the same tile in NPY format (quantized arrays with scales): - - $ uv run -m geotessera.cli download \ - > --bbox "-0.1,51.3,0.1,51.5" \ - > --year 2024 \ - > --format npy \ - > --output "$TESTDIR/uk_tiles_npy" \ - > --dataset-version v1 2>&1 | grep -E 'SUCCESS' | sed 's/ *$//' - -Verify NPY directory structure was created: - - $ test -d "$TESTDIR/uk_tiles_npy/global_0.1_degree_representation/2024" && echo "Embeddings directory created" - [1] - - $ test -d "$TESTDIR/uk_tiles_npy/global_0.1_degree_tiff_all" && echo "Landmasks directory created" - [1] - -Verify NPY files exist in grid subdirectories: - - $ [ -n "$(find "$TESTDIR/uk_tiles_npy/global_0.1_degree_representation/2024" -name "grid_*.npy" ! 
-name "*_scales.npy" 2>/dev/null)" ] && echo "Embedding NPY files created" - [1] - - $ find "$TESTDIR/uk_tiles_npy/global_0.1_degree_representation/2024" -name "*.npy" | wc -l | tr -d ' ' - find: '/tmp/cramtests-rsi5b614/test_outputs/uk_tiles_npy/global_0.1_degree_representation/2024': No such file or directory - 0 - - $ [ -n "$(find "$TESTDIR/uk_tiles_npy/global_0.1_degree_representation/2024" -name "*_scales.npy" 2>/dev/null)" ] && echo "Scales NPY files created" - [1] - - $ [ -n "$(find "$TESTDIR/uk_tiles_npy/global_0.1_degree_tiff_all" -name "*.tif*" 2>/dev/null)" ] && echo "Landmask TIFF files created" - [1] - -Test: Info Command on Downloaded TIFF Tiles --------------------------------------------- - -Test the info command on the downloaded TIFF tiles. -Both TIFF and NPY formats should be present (NPY files are retained for efficient reprocessing): - - $ uv run -m geotessera.cli info --tiles "$TESTDIR/uk_tiles_tiff" - /bin/sh: 53: uv: not found - [127] - - $ uv run -m geotessera.cli info --tiles "$TESTDIR/uk_tiles_tiff" - /bin/sh: 55: uv: not found - [127] - -Test: Info Command on Downloaded NPY Tiles -------------------------------------------- - -Test the info command on the downloaded NPY tiles: - - $ uv run -m geotessera.cli info --tiles "$TESTDIR/uk_tiles_npy" - /bin/sh: 57: uv: not found - [127] - - $ uv run -m geotessera.cli info --tiles "$TESTDIR/uk_tiles_npy" - /bin/sh: 59: uv: not found - [127] - -Test: Resume Capability for NPY Downloads ------------------------------------------- - -Test that re-running the NPY download skips existing files: - - $ uv run -m geotessera.cli download \ - > --bbox "-0.1,51.3,0.1,51.5" \ - > --year 2024 \ - > --format npy \ - > --output "$TESTDIR/uk_tiles_npy" \ - > --dataset-version v1 2>&1 | grep -E '(Skipped|existing files)' - [1] - diff --git a/tests/hash.t.err b/tests/hash.t.err deleted file mode 100644 index f1455d3..0000000 --- a/tests/hash.t.err +++ /dev/null @@ -1,140 +0,0 @@ -GeoTessera Hash Verification 
Tests -==================================== - -These are tests for SHA256 hash verification functionality in the `geotessera` CLI. - -Setup ------ - -Set environment variable to disable fancy terminal output (ANSI codes, boxes, colors): - - $ export TERM=dumb - -Create a temporary directory for test outputs and cache: - - $ export TESTDIR="$CRAMTMP/test_outputs" - $ mkdir -p "$TESTDIR" - -Override XDG cache directory to use temporary location (for test isolation): - - $ export XDG_CACHE_HOME="$CRAMTMP/cache" - $ mkdir -p "$XDG_CACHE_HOME" - -Test: Hash Verification Failure --------------------------------- - -Test that corrupted files are detected and downloading fails with hash mismatch error. - -First, download a small tile for testing: - - $ uv run -m geotessera.cli download \ - > --bbox "0.18952,52.18602,0.18953,52.18603" \ - > --year 2024 \ - > --format npy \ - > --output "$TESTDIR/hash_test" \ - > --dataset-version v1 2>&1 | grep -E 'SUCCESS' | sed 's/ *$//' - -Find and corrupt the embedding file by appending data: - - $ CORRUPT_FILE=$(find "$TESTDIR/hash_test" -name "grid_*.npy" -not -name "*_scales.npy" | head -1) - find: '/tmp/cramtests-rsi5b614/test_outputs/hash_test': No such file or directory - $ echo "corrupted data" >> "$CORRUPT_FILE" - /bin/sh: 21: cannot create : Directory nonexistent - [2] - $ echo "File corrupted successfully" - File corrupted successfully - -Try to re-download with hash verification enabled (should fail): - - $ uv run -m geotessera.cli download \ - > --bbox "0.18952,52.18602,0.18953,52.18603" \ - > --year 2024 \ - > --format npy \ - > --output "$TESTDIR/hash_test" \ - > --dataset-version v1 2>&1 | grep -E '(Hash mismatch|ValueError)' || echo "Hash verification detected corruption" - Hash verification detected corruption - -Test: Skip Hash Verification with --skip-hash Flag ---------------------------------------------------- - -Re-download the same corrupted file with --skip-hash flag (should succeed by overwriting): - - $ uv run 
-m geotessera.cli download \ - > --bbox "0.18952,52.18602,0.18953,52.18603" \ - > --year 2024 \ - > --format npy \ - > --output "$TESTDIR/hash_test_skip" \ - > --skip-hash \ - > --dataset-version v1 2>&1 | grep -E 'SUCCESS' | sed 's/ *$//' - -Test: Skip Hash via Environment Variable ------------------------------------------ - -Test that GEOTESSERA_SKIP_HASH environment variable works: - - $ export GEOTESSERA_SKIP_HASH=1 - $ uv run -m geotessera.cli download \ - > --bbox "0.18952,52.18602,0.18953,52.18603" \ - > --year 2024 \ - > --format npy \ - > --output "$TESTDIR/hash_test_env" \ - > --dataset-version v1 2>&1 | grep -E '(Hash verification disabled|SUCCESS)' | sed 's/ *$//' - -Clean up environment: - - $ unset GEOTESSERA_SKIP_HASH - -Test: Hash Verification for Scales Files ------------------------------------------ - -Test that scales files are also hash-verified. Download a tile and corrupt the scales file: - - $ uv run -m geotessera.cli download \ - > --bbox "0.18952,52.18602,0.18953,52.18603" \ - > --year 2024 \ - > --format npy \ - > --output "$TESTDIR/scales_hash_test" \ - > --dataset-version v1 2>&1 | grep -E 'SUCCESS' | sed 's/ *$//' - -Corrupt the scales file: - - $ CORRUPT_SCALES=$(find "$TESTDIR/scales_hash_test" -name "*_scales.npy" | head -1) - find: '/tmp/cramtests-rsi5b614/test_outputs/scales_hash_test': No such file or directory - $ echo "corrupted scales" >> "$CORRUPT_SCALES" - /bin/sh: 60: cannot create : Directory nonexistent - [2] - $ echo "Scales file corrupted successfully" - Scales file corrupted successfully - -Try to re-download (should detect corrupted scales file): - - $ uv run -m geotessera.cli download \ - > --bbox "0.18952,52.18602,0.18953,52.18603" \ - > --year 2024 \ - > --format npy \ - > --output "$TESTDIR/scales_hash_test" \ - > --dataset-version v1 2>&1 | grep -E '(Hash mismatch|ValueError)' || echo "Scales hash verification detected corruption" - Scales hash verification detected corruption - -Test: Hash Verification for 
Landmask Files -------------------------------------------- - -Test that landmask files are also hash-verified. Corrupt a landmask file: - - $ CORRUPT_LANDMASK=$(find "$TESTDIR/hash_test" -path "*/global_0.1_degree_tiff_all/*.tif*" | head -1) - find: '/tmp/cramtests-rsi5b614/test_outputs/hash_test': No such file or directory - $ echo "corrupted landmask" >> "$CORRUPT_LANDMASK" - /bin/sh: 73: cannot create : Directory nonexistent - [2] - $ echo "Landmask file corrupted successfully" - Landmask file corrupted successfully - -Try to re-download (should detect corrupted landmask): - - $ uv run -m geotessera.cli download \ - > --bbox "0.18952,52.18602,0.18953,52.18603" \ - > --year 2024 \ - > --format npy \ - > --output "$TESTDIR/hash_test" \ - > --dataset-version v1 2>&1 | grep -E '(Hash mismatch|ValueError)' || echo "Landmask hash verification detected corruption" - Landmask hash verification detected corruption diff --git a/tests/viz.t.err b/tests/viz.t.err deleted file mode 100644 index 8f4596f..0000000 --- a/tests/viz.t.err +++ /dev/null @@ -1,194 +0,0 @@ -GeoTessera Visualization Tests -=============================== - -These are tests for the `geotessera visualize` and `geotessera webmap` commands. - -Setup ------ - -Set environment variable to disable fancy terminal output (ANSI codes, boxes, colors): - - $ export TERM=dumb - -Create a temporary directory for test outputs and cache: - - $ export TESTDIR="$CRAMTMP/test_outputs" - $ mkdir -p "$TESTDIR" - -Override XDG cache directory to use temporary location (for test isolation): - - $ export XDG_CACHE_HOME="$CRAMTMP/cache" - $ mkdir -p "$XDG_CACHE_HOME" - -Test: Download Tiles for Cambridge Region (Bbox) -------------------------------------------------- - -Download tiles covering a small area of Cambridge using a bounding box. 
-This bbox covers just 4 tiles for faster testing: - - $ uv run -m geotessera.cli download \ - > --bbox "0.086174,52.183432,0.151062,52.206318" \ - > --year 2024 \ - > --format tiff \ - > --output "$TESTDIR/cb_tiles_tiff" \ - > --dataset-version v1 2>&1 | grep -E 'SUCCESS|Found.*tiles' | sed 's/ *$//' - -Verify TIFF files were created: - - $ [ -n "$(find "$TESTDIR/cb_tiles_tiff/global_0.1_degree_representation/2024" -name "*.tif*" 2>/dev/null)" ] && echo "TIFF files created" - [1] - -Verify NPY files were also created (intermediate format retained for efficient reprocessing): - - $ [ -n "$(find "$TESTDIR/cb_tiles_tiff/global_0.1_degree_representation/2024" -name "grid_*.npy" ! -name "*_scales.npy" 2>/dev/null)" ] && echo "NPY embedding files created" - [1] - - $ [ -n "$(find "$TESTDIR/cb_tiles_tiff/global_0.1_degree_representation/2024" -name "*_scales.npy" 2>/dev/null)" ] && echo "NPY scales files created" - [1] - -Verify both formats coexist (count files of each type): - - $ find "$TESTDIR/cb_tiles_tiff/global_0.1_degree_representation/2024" -name "*.tif*" 2>/dev/null | wc -l | tr -d ' ' - 0 - - $ find "$TESTDIR/cb_tiles_tiff/global_0.1_degree_representation/2024" -name "grid_*.npy" ! -name "*_scales.npy" 2>/dev/null | wc -l | tr -d ' ' - 0 - -Test: Visualize - Create PCA Mosaic from TIFF Files ----------------------------------------------------- - -Create a PCA visualization from the downloaded TIFF files. -The visualize command should: -1. Load all TIFF tiles -2. Apply PCA to reduce 128 channels to RGB -3. 
Create a mosaic in the target CRS (default EPSG:3857) - - $ uv run -m geotessera.cli visualize \ - > "$TESTDIR/cb_tiles_tiff" \ - > "$TESTDIR/cb_pca_mosaic.tif" 2>&1 | grep -A 1 -E 'Found|Created PCA mosaic' | sed 's/ *$//' - -Verify PCA mosaic was created: - - $ [ -f "$TESTDIR/cb_pca_mosaic.tif" ] && echo "PCA mosaic created" - [1] - -Check that it's a valid GeoTIFF with 3 bands (RGB): - - $ uv run python -c "import rasterio; r = rasterio.open('$TESTDIR/cb_pca_mosaic.tif'); print(f'Bands: {r.count}, CRS: {r.crs}')" - /bin/sh: 35: uv: not found - [127] - -Test: Visualize - Custom CRS and Balance Options -------------------------------------------------- - -Test creating a visualization with custom CRS and histogram balancing: - - $ uv run -m geotessera.cli visualize \ - > "$TESTDIR/cb_tiles_tiff" \ - > "$TESTDIR/cb_pca_4326.tif" \ - > --crs EPSG:4326 \ - > --balance histogram 2>&1 | grep -A 1 -E 'Created PCA mosaic' | sed 's/ *$//' - -Verify custom CRS mosaic was created: - - $ [ -f "$TESTDIR/cb_pca_4326.tif" ] && echo "Custom CRS mosaic created" - [1] - - $ uv run python -c "import rasterio; r = rasterio.open('$TESTDIR/cb_pca_4326.tif'); print(f'CRS: {r.crs}')" - /bin/sh: 45: uv: not found - [127] - -Test: Visualize - NPY Format Input ------------------------------------ - -Download the same region in NPY format and create a visualization: - - $ uv run -m geotessera.cli download \ - > --bbox "0.086174,52.183432,0.151062,52.206318" \ - > --year 2024 \ - > --format npy \ - > --output "$TESTDIR/cb_tiles_npy" \ - > --dataset-version v1 2>&1 | grep -E 'SUCCESS' | sed 's/ *$//' - -Create visualization from NPY format: - - $ uv run -m geotessera.cli visualize \ - > "$TESTDIR/cb_tiles_npy" \ - > "$TESTDIR/cb_pca_from_npy.tif" 2>&1 | grep -A 1 -E 'Found|Created PCA mosaic' | sed 's/ *$//' - -Verify NPY-based mosaic was created: - - $ [ -f "$TESTDIR/cb_pca_from_npy.tif" ] && echo "NPY format mosaic created" - [1] - -Test: Webmap - Generate Web Tiles and Viewer 
---------------------------------------------- - -Generate web tiles from the PCA mosaic and create an interactive web viewer. -This should: -1. Reproject the mosaic if needed (to EPSG:3857 for web) -2. Generate XYZ web tiles at multiple zoom levels -3. Create an HTML viewer with Leaflet - - $ uv run -m geotessera.cli webmap \ - > "$TESTDIR/cb_pca_mosaic.tif" \ - > --output "$TESTDIR/cb_webmap" \ - > --min-zoom 10 \ - > --max-zoom 13 2>&1 | grep -A 1 -E 'Web visualization ready|Created web' | grep -v '^--$' | sed 's/ *$//' - -Verify web map directory structure was created: - - $ test -d "$TESTDIR/cb_webmap" && echo "Web map directory created" - [1] - - $ test -f "$TESTDIR/cb_webmap/viewer.html" && echo "HTML viewer created" - [1] - - $ [ -n "$(find "$TESTDIR/cb_webmap/tiles" -name "*.png" 2>/dev/null)" ] && echo "Web tiles (PNG) created" - [1] - -Check that tiles exist at multiple zoom levels: - - $ find "$TESTDIR/cb_webmap/tiles" -type d -name "1[0-3]" | wc -l | tr -d ' ' | grep -E '[2-4]' - find: '/tmp/cramtests-rsi5b614/test_outputs/cb_webmap/tiles': No such file or directory - [1] - -Test: Webmap - Custom Output and Settings ------------------------------------------- - -Test webmap with custom initial zoom and center: - - $ uv run -m geotessera.cli webmap \ - > "$TESTDIR/cb_pca_4326.tif" \ - > --output "$TESTDIR/cb_webmap_custom" \ - > --min-zoom 10 \ - > --max-zoom 12 \ - > --initial-zoom 11 2>&1 | grep -A 1 -E 'Web visualization ready' | sed 's/ *$//' - -Verify custom web map was created: - - $ test -f "$TESTDIR/cb_webmap_custom/viewer.html" && echo "Custom web map created" - [1] - -Test: Info Command on Visualization Outputs --------------------------------------------- - -Test that info command works on the created PCA mosaics: - - $ uv run -m geotessera.cli info --tiles "$TESTDIR/cb_tiles_tiff" 2>&1 | grep -E 'Total tiles|Format|Years' | sed 's/ *$//' - -Test: Error Handling - Invalid Input -------------------------------------- - -Test that visualize 
fails gracefully with non-existent input: - - $ uv run -m geotessera.cli visualize \ - > "$TESTDIR/nonexistent" \ - > "$TESTDIR/output.tif" 2>&1 | grep -A 1 -E 'No tiles found|Error' | grep -v '^--$' - [1] - -Test that webmap fails gracefully with non-TIFF input: - - $ uv run -m geotessera.cli webmap \ - > "$TESTDIR/cb_tiles_tiff" 2>&1 | grep -E 'Error.*must be.*tif' - [1] diff --git a/tests/zarr.t.err b/tests/zarr.t.err deleted file mode 100644 index 46a4049..0000000 --- a/tests/zarr.t.err +++ /dev/null @@ -1,152 +0,0 @@ -GeoTessera Zarr Format Tests -============================= - -These are tests for the zarr format support added in this branch. - -Setup ------ - -Set environment variable to disable fancy terminal output (ANSI codes, boxes, colors): - - $ export TERM=dumb - -Create a temporary directory for test outputs and cache: - - $ export TESTDIR="$CRAMTMP/test_outputs" - $ mkdir -p "$TESTDIR" - -Override XDG cache directory to use temporary location (for test isolation): - - $ export XDG_CACHE_HOME="$CRAMTMP/cache" - $ mkdir -p "$XDG_CACHE_HOME" - -Test: Zarr Format Validation ------------------------------ - -Test that zarr format is recognized as a valid option: - - $ uv run -m geotessera.cli download \ - > --bbox "-0.1,51.3,0.1,51.5" \ - > --year 2024 \ - > --format zarr \ - > --dry-run \ - > --dataset-version v1 2>&1 | grep -E '^.*Format:.*ZARR' | head -1 | sed 's/ *$//' - -Test: Invalid Format Rejected ------------------------------- - -Test that invalid formats are properly rejected: - - $ uv run -m geotessera.cli download \ - > --bbox "-0.1,51.3,0.1,51.5" \ - > --year 2024 \ - > --format invalid \ - > --dry-run \ - > --dataset-version v1 2>&1 | grep -E "Invalid format.*Must be" - [1] - -Test: Download Dry Run for UK Tile (Zarr format) -------------------------------------------------- - -Test dry-run with zarr format to verify it's processed correctly: - - $ uv run -m geotessera.cli download \ - > --bbox "-0.1,51.3,0.1,51.5" \ - > --year 2024 \ - > 
--format zarr \ - > --dry-run \ - > --dataset-version v1 2>&1 | grep -E '(Format|Year|Dataset version|Found|Tiles in region)' | sed 's/ *$//' - -Test: Download Cambridge Tiles in Zarr Format ----------------------------------------------- - -Download a small region in zarr format (4 tiles for faster testing): - - $ uv run -m geotessera.cli download \ - > --bbox "0.086174,52.183432,0.151062,52.206318" \ - > --year 2024 \ - > --format zarr \ - > --output "$TESTDIR/cb_tiles_zarr" \ - > --dataset-version v1 2>&1 | grep -E '(SUCCESS|Found.*tiles)' | sed 's/ *$//' - -Verify zarr archives were created in the registry structure: - - $ [ -n "$(find "$TESTDIR/cb_tiles_zarr/global_0.1_degree_representation/2024" -name "*.zarr" 2>/dev/null)" ] && echo "Zarr archives created" - [1] - - $ find "$TESTDIR/cb_tiles_zarr/global_0.1_degree_representation/2024" -name "*.zarr" | wc -l | tr -d ' ' - find: '/tmp/cramtests-rsi5b614/test_outputs/cb_tiles_zarr/global_0.1_degree_representation/2024': No such file or directory - 0 - -Test: Info Command on Downloaded Zarr Tiles --------------------------------------------- - -Test the info command on the downloaded zarr tiles. 
-Note that the info command may detect NPY files that are created alongside zarr: - - $ uv run -m geotessera.cli info --tiles "$TESTDIR/cb_tiles_zarr" 2>&1 | grep -E 'Total tiles|Format|Years' | sed 's/ *$//' - -Test: Zarr Archive Structure ------------------------------ - -Verify that a zarr archive can be opened and contains expected data: - - $ ZARR_FILE=$(find "$TESTDIR/cb_tiles_zarr/global_0.1_degree_representation/2024" -name "*.zarr" | head -1) - find: '/tmp/cramtests-rsi5b614/test_outputs/cb_tiles_zarr/global_0.1_degree_representation/2024': No such file or directory - $ uv run python -c " - > import xarray as xr - > ds = xr.open_dataset('$ZARR_FILE', decode_coords='all') - > print(f'Variables: {list(ds.data_vars.keys())}') - > coords = sorted([c for c in ds.coords.keys() if c != 'spatial_ref']) - > print(f'Coordinates: {coords}') - > print(f'CRS present: {hasattr(ds, \"rio\") and ds.rio.crs is not None}') - > print(f'Transform present: {hasattr(ds, \"rio\") and ds.rio.transform() is not None}') - > " - /bin/sh: 48: uv: not found - [127] - -Test: Band Selection with Zarr Format --------------------------------------- - -Download zarr tiles with specific band selection: - - $ uv run -m geotessera.cli download \ - > --bbox "0.086174,52.183432,0.151062,52.206318" \ - > --year 2024 \ - > --format zarr \ - > --bands "0,1,2" \ - > --output "$TESTDIR/cb_tiles_zarr_bands" \ - > --dataset-version v1 2>&1 | grep -E 'SUCCESS' | sed 's/ *$//' - -Verify band count in band-selected zarr archive: - - $ ZARR_FILE=$(find "$TESTDIR/cb_tiles_zarr_bands/global_0.1_degree_representation/2024" -name "*.zarr" | head -1) - find: '/tmp/cramtests-rsi5b614/test_outputs/cb_tiles_zarr_bands/global_0.1_degree_representation/2024': No such file or directory - $ uv run python -c " - > import xarray as xr - > ds = xr.open_dataset('$ZARR_FILE', decode_coords='all') - > print(f'Band count: {len(ds.band)}') - > " - /bin/sh: 68: uv: not found - [127] - -Test: Visualization with Zarr Format 
-------------------------------------- - -Create a PCA visualization from the downloaded zarr tiles. -Note that tiles may be detected as npy format since both formats coexist: - - $ uv run -m geotessera.cli visualize "$TESTDIR/cb_tiles_zarr" "$TESTDIR/pca_zarr.tif" 2>&1 | grep -A 1 -E 'Found|Created PCA mosaic' | sed 's/ *$//' - -Verify the PCA visualization file was created: - - $ test -f "$TESTDIR/pca_zarr.tif" && echo "PCA visualization created" - [1] - -Test: CLI Help Shows Zarr Format ---------------------------------- - -Verify that the CLI help text mentions zarr as a format option: - - $ uv run -m geotessera.cli download --help | grep -i zarr | head -1 - /bin/sh: 78: uv: not found From 3aaf71eec89676b50fa6c54e3ee366f32e763a61 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 21 Nov 2025 15:59:26 +0000 Subject: [PATCH 4/7] Use pattern matching to filter invalid tile filenames instead of explicit temp file check Co-authored-by: avsm <53164+avsm@users.noreply.github.com> --- geotessera/tiles.py | 48 +++++------ tests/test_tile_discovery.py | 152 +++++++++++++++++------------------ 2 files changed, 93 insertions(+), 107 deletions(-) diff --git a/geotessera/tiles.py b/geotessera/tiles.py index 2ba01d3..d7016e7 100644 --- a/geotessera/tiles.py +++ b/geotessera/tiles.py @@ -7,22 +7,6 @@ from .registry import EMBEDDINGS_DIR_NAME, LANDMASKS_DIR_NAME, tile_to_landmask_filename -def _is_temp_file(path: Path) -> bool: - """Check if a file is a temporary download file. - - Temporary files are created during download with pattern: .{filename}_tmp_{random} - and should be ignored during tile discovery. - - Args: - path: Path to check - - Returns: - True if the file is a temporary file, False otherwise - """ - # Check if filename starts with '.' (hidden) and contains '_tmp_' - return path.name.startswith('.') and '_tmp_' in path.name - - class Tile: """A single embedding tile that abstracts storage format. 
@@ -338,15 +322,19 @@ def discover_tiles(directory: Path) -> List[Tile]: Returns: List of Tile objects with spatial metadata loaded, sorted by (year, lat, lon) """ + import re + # Check for NPY format first by looking for .npy files in embeddings directory # Preferred order is NPY, tiff, zarr, as NPY (more efficient, includes scales) embeddings_dir = directory / EMBEDDINGS_DIR_NAME if embeddings_dir.exists() and embeddings_dir.is_dir(): - # Check if there are any .npy files (not just _scales.npy or temporary files) + # Check if there are any .npy files that match the expected pattern + # (not just _scales.npy and must match grid_{lon}_{lat}.npy pattern) + npy_pattern = re.compile(r"grid_(-?\d+\.\d+)_(-?\d+\.\d+)\.npy") npy_files = [ f for f in embeddings_dir.rglob("*.npy") - if not f.name.endswith("_scales.npy") and not _is_temp_file(f) + if not f.name.endswith("_scales.npy") and npy_pattern.match(f.name) ] if npy_files: return discover_npy_tiles(directory) @@ -383,10 +371,6 @@ def discover_npy_tiles(base_dir: Path) -> List[Tile]: # Skip scales files if npy_file.name.endswith("_scales.npy"): continue - - # Skip temporary download files - if _is_temp_file(npy_file): - continue try: tile = Tile.from_npy(npy_file, base_dir) @@ -394,6 +378,10 @@ def discover_npy_tiles(base_dir: Path) -> List[Tile]: tiles.append(tile) else: logging.warning(f"Skipping incomplete tile: {npy_file}") + except ValueError: + # Skip files that don't match expected filename pattern + # (e.g., temporary files, other non-tile files) + continue except Exception as e: logging.warning(f"Failed to load tile {npy_file}: {e}") @@ -418,14 +406,14 @@ def discover_geotiff_tiles(directory: Path) -> List[Tile]: # Skip landmask files (they're in a different directory and have different naming) if LANDMASKS_DIR_NAME in geotiff_file.parts: continue - - # Skip temporary download files - if _is_temp_file(geotiff_file): - continue try: tile = Tile.from_geotiff(geotiff_file) tiles.append(tile) + except ValueError: + # Skip
files that don't match expected filename pattern + # (e.g., temporary files, other non-tile files) + continue except Exception as e: logging.warning(f"Failed to load tile {geotiff_file}: {e}") @@ -447,14 +435,14 @@ def discover_zarr_tiles(directory: Path) -> List[Tile]: for pattern in ["*.zarr"]: for zarr_file in directory.rglob(pattern): - # Skip temporary download files - if _is_temp_file(zarr_file): - continue - # Skip landmask files (they're in a different directory and have different naming) try: tile = Tile.from_zarr(zarr_file) tiles.append(tile) + except ValueError: + # Skip files that don't match expected filename pattern + # (e.g., temporary files, other non-tile files) + continue except Exception as e: logging.warning(f"Failed to load tile {zarr_file}: {e}") diff --git a/tests/test_tile_discovery.py b/tests/test_tile_discovery.py index e744489..dc2cce2 100644 --- a/tests/test_tile_discovery.py +++ b/tests/test_tile_discovery.py @@ -1,4 +1,4 @@ -"""Tests for tile discovery with temporary file filtering.""" +"""Tests for tile discovery with pattern-based filtering.""" from pathlib import Path import tempfile @@ -9,31 +9,11 @@ discover_geotiff_tiles, discover_zarr_tiles, discover_tiles, - _is_temp_file, ) -def test_is_temp_file(): - """Test the _is_temp_file helper function.""" - # Temporary files should be detected - assert _is_temp_file(Path(".grid_0.15_52.05.npy_tmp_abc123")) - assert _is_temp_file(Path(".grid_0.15_52.05_2024.tif_tmp_xyz789")) - assert _is_temp_file(Path(".grid_0.15_52.05_2024.zarr_tmp_def456")) - - # Regular files should not be detected as temp - assert not _is_temp_file(Path("grid_0.15_52.05.npy")) - assert not _is_temp_file(Path("grid_0.15_52.05_2024.tif")) - assert not _is_temp_file(Path("grid_0.15_52.05_2024.zarr")) - - # Files with just dot prefix or just _tmp_ but not both - assert not _is_temp_file(Path(".hidden_file.npy")) - assert not _is_temp_file(Path("file_tmp_name.npy")) - - -def test_discover_npy_tiles_filters_temp_files(): 
- """Test that discover_npy_tiles filters out temporary files.""" - from unittest.mock import patch - +def test_discover_npy_tiles_skips_invalid_patterns(): + """Test that discover_npy_tiles silently skips files that don't match the expected pattern.""" with tempfile.TemporaryDirectory() as tmpdir: base_dir = Path(tmpdir) embeddings_dir = base_dir / "global_0.1_degree_representation" / "2024" @@ -44,89 +24,107 @@ def test_discover_npy_tiles_filters_temp_files(): test_array = np.random.rand(10, 10, 16).astype(np.float32) np.save(regular_file, test_array) - # Create a temporary file (should be ignored) - # Note: np.save adds .npy automatically, so specify without .npy - temp_file_base = embeddings_dir / ".grid_0.15_52.05_tmp_abc123" - np.save(temp_file_base, test_array) - temp_file = embeddings_dir / ".grid_0.15_52.05_tmp_abc123.npy" - - # Verify both files exist - assert regular_file.exists() - assert temp_file.exists() - - # Mock the Tile.from_npy to track which files are attempted to be loaded - attempted_files = [] - with patch('geotessera.tiles.Tile.from_npy') as mock_from_npy: - mock_from_npy.side_effect = lambda path, base_dir: attempted_files.append(path) or None - - discover_npy_tiles(base_dir) - - # Check that only the regular file was attempted, not the temp file - assert len(attempted_files) == 1, f"Expected 1 file to be attempted, got {len(attempted_files)}" - assert attempted_files[0] == regular_file, "Wrong file was attempted" - assert temp_file not in attempted_files, "Temporary file should not have been attempted" + # Create files with invalid patterns (should be silently skipped) + # These include temporary files and other non-tile files + invalid_files = [ + embeddings_dir / ".grid_0.15_52.05_tmp_abc123", # temp file + embeddings_dir / "invalid_name", # doesn't match pattern + embeddings_dir / "grid_invalid", # doesn't match pattern + ] + for invalid_file in invalid_files: + np.save(invalid_file, test_array) + + # Count total .npy files created + 
all_npy_files = list(embeddings_dir.glob("*.npy")) + assert len(all_npy_files) == 4, f"Expected 4 .npy files, got {len(all_npy_files)}" + + # Run discovery (will fail to load tiles without proper metadata, but should not error on invalid names) + tiles = discover_npy_tiles(base_dir) + + # Should return empty list (no valid tiles with all required files) + # but importantly, should not raise exceptions for invalid patterns + assert isinstance(tiles, list) -def test_discover_geotiff_tiles_filters_temp_files(): - """Test that discover_geotiff_tiles filters out temporary files.""" +def test_discover_geotiff_tiles_skips_invalid_patterns(): + """Test that discover_geotiff_tiles silently skips files that don't match the expected pattern.""" with tempfile.TemporaryDirectory() as tmpdir: base_dir = Path(tmpdir) - # Create regular and temp GeoTIFF files + # Create files with various patterns regular_file = base_dir / "grid_0.15_52.05_2024.tif" - temp_file = base_dir / ".grid_0.15_52.05_2024.tif_tmp_xyz789" - - # Create mock GeoTIFF files (just touch them - actual GeoTIFF creation would require rasterio) regular_file.touch() - temp_file.touch() - # This will fail to parse the files, but we're testing that temp files are filtered + invalid_files = [ + base_dir / ".grid_0.15_52.05_2024.tif_tmp_xyz789", # temp file + base_dir / "invalid_name.tif", # doesn't match pattern + base_dir / "grid_invalid.tiff", # doesn't match pattern + ] + for invalid_file in invalid_files: + invalid_file.touch() + + # Count total tif/tiff files created (note: glob doesn't match files starting with .) 
+ all_tif_files = list(base_dir.rglob("*.tif")) + list(base_dir.rglob("*.tiff")) + # Should find 3 visible files (the .tif_tmp file is hidden) + assert len(all_tif_files) >= 3, f"Expected at least 3 tif/tiff files, got {len(all_tif_files)}" + + # Run discovery (will fail to load as GeoTIFF, but should not error on invalid patterns) tiles = discover_geotiff_tiles(base_dir) - # Neither file should be loaded (because they're not valid GeoTIFFs), - # but temp file should have been filtered before attempting to load - assert len(tiles) == 0 + # Should return empty list (no valid GeoTIFFs) + # but importantly, should not raise exceptions for invalid patterns + assert isinstance(tiles, list) -def test_discover_zarr_tiles_filters_temp_files(): - """Test that discover_zarr_tiles filters out temporary files.""" +def test_discover_zarr_tiles_skips_invalid_patterns(): + """Test that discover_zarr_tiles silently skips files that don't match the expected pattern.""" with tempfile.TemporaryDirectory() as tmpdir: base_dir = Path(tmpdir) - # Create regular and temp zarr directories + # Create directories with various patterns regular_file = base_dir / "grid_0.15_52.05_2024.zarr" - temp_file = base_dir / ".grid_0.15_52.05_2024.zarr_tmp_def456" - - # Create mock zarr directories regular_file.mkdir() - temp_file.mkdir() - # This will fail to parse the files, but we're testing that temp files are filtered + invalid_files = [ + base_dir / ".grid_0.15_52.05_2024.zarr_tmp_def456", # temp file + base_dir / "invalid_name.zarr", # doesn't match pattern + base_dir / "grid_invalid.zarr", # doesn't match pattern + ] + for invalid_file in invalid_files: + invalid_file.mkdir() + + # Count total zarr directories created (note: glob doesn't match files starting with .) 
+ all_zarr_dirs = list(base_dir.rglob("*.zarr")) + # Should find 3 visible directories (the .zarr_tmp directory is hidden) + assert len(all_zarr_dirs) >= 3, f"Expected at least 3 zarr dirs, got {len(all_zarr_dirs)}" + + # Run discovery (will fail to load as zarr, but should not error on invalid patterns) tiles = discover_zarr_tiles(base_dir) - # Neither file should be loaded (because they're not valid zarr stores), - # but temp file should have been filtered before attempting to load - assert len(tiles) == 0 + # Should return empty list (no valid zarr stores) + # but importantly, should not raise exceptions for invalid patterns + assert isinstance(tiles, list) -def test_discover_tiles_filters_temp_files(): - """Test that discover_tiles filters out temporary files in initial check.""" +def test_discover_tiles_filters_invalid_patterns(): + """Test that discover_tiles filters out files with invalid patterns in initial check.""" with tempfile.TemporaryDirectory() as tmpdir: base_dir = Path(tmpdir) embeddings_dir = base_dir / "global_0.1_degree_representation" / "2024" embeddings_dir.mkdir(parents=True, exist_ok=True) - landmasks_dir = base_dir / "landmasks" - landmasks_dir.mkdir(parents=True, exist_ok=True) - - # Create only a temporary NPY file - temp_file = embeddings_dir / ".grid_0.15_52.05.npy_tmp_abc123" + # Create only files with invalid patterns + invalid_files = [ + embeddings_dir / ".grid_0.15_52.05_tmp_abc123", + embeddings_dir / "invalid_name", + ] test_array = np.random.rand(10, 10, 16).astype(np.float32) - np.save(temp_file, test_array) + for invalid_file in invalid_files: + np.save(invalid_file, test_array) - # discover_tiles should not find any NPY files and fall back to other formats - # Since no valid tiles exist, it should return an empty list + # discover_tiles should not find any valid NPY files (pattern-filtered in initial check) + # and fall back to other formats. 
Since no valid tiles exist, returns empty list tiles = discover_tiles(base_dir) - # Should find no tiles since only temp file exists + # Should find no tiles assert len(tiles) == 0 From 588afe9c2d9f5a87025127a794e7f99ed3a0694b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 21 Nov 2025 16:01:41 +0000 Subject: [PATCH 5/7] Address code review feedback: remove duplicate import and clarify exception handling Co-authored-by: avsm <53164+avsm@users.noreply.github.com> --- geotessera/tiles.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/geotessera/tiles.py b/geotessera/tiles.py index d7016e7..01f9360 100644 --- a/geotessera/tiles.py +++ b/geotessera/tiles.py @@ -322,8 +322,6 @@ def discover_tiles(directory: Path) -> List[Tile]: Returns: List of Tile objects with spatial metadata loaded, sorted by (year, lat, lon) """ - import re - # Check for NPY format first by looking for .npy files in embeddings directory # Preferred order is NPY, tiff, zarr, as NPY (more efficient, includes scales) embeddings_dir = directory / EMBEDDINGS_DIR_NAME @@ -380,7 +378,7 @@ def discover_npy_tiles(base_dir: Path) -> List[Tile]: logging.warning(f"Skipping incomplete tile: {npy_file}") except ValueError: # Skip files that don't match expected filename pattern - # (e.g., temporary files, other non-tile files) + # ValueError is raised by _parse_npy_filename when pattern doesn't match continue except Exception as e: logging.warning(f"Failed to load tile {npy_file}: {e}") @@ -412,7 +410,7 @@ def discover_geotiff_tiles(directory: Path) -> List[Tile]: tiles.append(tile) except ValueError: # Skip files that don't match expected filename pattern - # (e.g., temporary files, other non-tile files) + # ValueError is raised by _parse_geotiff_filename when pattern doesn't match continue except Exception as e: logging.warning(f"Failed to load tile {geotiff_file}: {e}") @@ -441,7 +439,7 @@ def 
discover_zarr_tiles(directory: Path) -> List[Tile]: tiles.append(tile) except ValueError: # Skip files that don't match expected filename pattern - # (e.g., temporary files, other non-tile files) + # ValueError is raised by _parse_zarr_filename when pattern doesn't match continue except Exception as e: logging.warning(f"Failed to load tile {zarr_file}: {e}") From 9abf770158c8435aacf17cf38bc50bcd30d86040 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 22 Nov 2025 12:43:23 +0000 Subject: [PATCH 6/7] Fix: Remove pattern pre-filtering in discover_tiles to avoid false negatives Co-authored-by: avsm <53164+avsm@users.noreply.github.com> --- geotessera/tiles.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/geotessera/tiles.py b/geotessera/tiles.py index 01f9360..8ac2ebf 100644 --- a/geotessera/tiles.py +++ b/geotessera/tiles.py @@ -326,13 +326,12 @@ def discover_tiles(directory: Path) -> List[Tile]: # Preferred order is NPY, tiff, zarr, as NPY (more efficient, includes scales) embeddings_dir = directory / EMBEDDINGS_DIR_NAME if embeddings_dir.exists() and embeddings_dir.is_dir(): - # Check if there are any .npy files that match the expected pattern - # (not just _scales.npy and must match grid_<lon>_<lat>.npy pattern) - npy_pattern = re.compile(r"grid_(-?\d+\.\d+)_(-?\d+\.\d+)\.npy") + # Check if there are any .npy files (not just _scales.npy) + # The actual pattern validation happens in discover_npy_tiles() npy_files = [ f for f in embeddings_dir.rglob("*.npy") - if not f.name.endswith("_scales.npy") and npy_pattern.match(f.name) + if not f.name.endswith("_scales.npy") ] if npy_files: return discover_npy_tiles(directory) From 440e6d6d15bb79e9419a0e4afdd468195d446876 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 2 Dec 2025 11:19:58 +0000 Subject: [PATCH 7/7] Replace unit test with cram test for temporary file handling - Removed
tests/test_tile_discovery.py unit test file - Added new cram test in tests/cli.t that creates temporary files manually - Test verifies info command silently ignores temp files without warnings - Test creates files matching typical temporary file patterns (.{name}_tmp_*) - Test validates tile count remains correct and no warnings appear Co-authored-by: avsm <53164+avsm@users.noreply.github.com> --- tests/cli.t | 16 +++++ tests/test_tile_discovery.py | 130 ----------------------------------- 2 files changed, 16 insertions(+), 130 deletions(-) delete mode 100644 tests/test_tile_discovery.py diff --git a/tests/cli.t b/tests/cli.t index 501e836..fd6070e 100644 --- a/tests/cli.t +++ b/tests/cli.t @@ -185,3 +185,19 @@ Test that re-running the NPY download skips existing files: > --dataset-version v1 2>&1 | grep -E '(Skipped|existing files)' Skipped 48 existing files (resume capability) +Test: Tile Discovery Ignores Temporary Files +--------------------------------------------- + +Test that temporary files left from interrupted downloads are silently ignored. +Create temporary files manually in the NPY tiles directory to simulate interrupted downloads: + + $ touch "$TESTDIR/uk_tiles_npy/global_0.1_degree_representation/2024/.grid_0.05_51.25.npy_tmp_abc123" + $ touch "$TESTDIR/uk_tiles_npy/global_0.1_degree_representation/2024/.grid_0.05_51.35_tmp_xyz789.npy" + $ touch "$TESTDIR/uk_tiles_npy/global_0.1_degree_representation/2024/invalid_file.npy" + +Verify that the info command still works correctly and doesn't show warnings about temp files. 
+The tile count should remain 16 (unchanged) and no warnings should appear in stderr: + + $ uv run -m geotessera.cli info --tiles "$TESTDIR/uk_tiles_npy" 2>&1 | grep -E '(Total tiles|WARNING|Failed to load|Cannot parse)' + Total tiles: 16 + diff --git a/tests/test_tile_discovery.py b/tests/test_tile_discovery.py deleted file mode 100644 index dc2cce2..0000000 --- a/tests/test_tile_discovery.py +++ /dev/null @@ -1,130 +0,0 @@ -"""Tests for tile discovery with pattern-based filtering.""" - -from pathlib import Path -import tempfile -import numpy as np - -from geotessera.tiles import ( - discover_npy_tiles, - discover_geotiff_tiles, - discover_zarr_tiles, - discover_tiles, -) - - -def test_discover_npy_tiles_skips_invalid_patterns(): - """Test that discover_npy_tiles silently skips files that don't match the expected pattern.""" - with tempfile.TemporaryDirectory() as tmpdir: - base_dir = Path(tmpdir) - embeddings_dir = base_dir / "global_0.1_degree_representation" / "2024" - embeddings_dir.mkdir(parents=True, exist_ok=True) - - # Create a regular tile file - regular_file = embeddings_dir / "grid_0.15_52.05.npy" - test_array = np.random.rand(10, 10, 16).astype(np.float32) - np.save(regular_file, test_array) - - # Create files with invalid patterns (should be silently skipped) - # These include temporary files and other non-tile files - invalid_files = [ - embeddings_dir / ".grid_0.15_52.05_tmp_abc123", # temp file - embeddings_dir / "invalid_name", # doesn't match pattern - embeddings_dir / "grid_invalid", # doesn't match pattern - ] - for invalid_file in invalid_files: - np.save(invalid_file, test_array) - - # Count total .npy files created - all_npy_files = list(embeddings_dir.glob("*.npy")) - assert len(all_npy_files) == 4, f"Expected 4 .npy files, got {len(all_npy_files)}" - - # Run discovery (will fail to load tiles without proper metadata, but should not error on invalid names) - tiles = discover_npy_tiles(base_dir) - - # Should return empty list (no valid tiles 
with all required files) - # but importantly, should not raise exceptions for invalid patterns - assert isinstance(tiles, list) - - -def test_discover_geotiff_tiles_skips_invalid_patterns(): - """Test that discover_geotiff_tiles silently skips files that don't match the expected pattern.""" - with tempfile.TemporaryDirectory() as tmpdir: - base_dir = Path(tmpdir) - - # Create files with various patterns - regular_file = base_dir / "grid_0.15_52.05_2024.tif" - regular_file.touch() - - invalid_files = [ - base_dir / ".grid_0.15_52.05_2024.tif_tmp_xyz789", # temp file - base_dir / "invalid_name.tif", # doesn't match pattern - base_dir / "grid_invalid.tiff", # doesn't match pattern - ] - for invalid_file in invalid_files: - invalid_file.touch() - - # Count total tif/tiff files created (note: glob doesn't match files starting with .) - all_tif_files = list(base_dir.rglob("*.tif")) + list(base_dir.rglob("*.tiff")) - # Should find 3 visible files (the .tif_tmp file is hidden) - assert len(all_tif_files) >= 3, f"Expected at least 3 tif/tiff files, got {len(all_tif_files)}" - - # Run discovery (will fail to load as GeoTIFF, but should not error on invalid patterns) - tiles = discover_geotiff_tiles(base_dir) - - # Should return empty list (no valid GeoTIFFs) - # but importantly, should not raise exceptions for invalid patterns - assert isinstance(tiles, list) - - -def test_discover_zarr_tiles_skips_invalid_patterns(): - """Test that discover_zarr_tiles silently skips files that don't match the expected pattern.""" - with tempfile.TemporaryDirectory() as tmpdir: - base_dir = Path(tmpdir) - - # Create directories with various patterns - regular_file = base_dir / "grid_0.15_52.05_2024.zarr" - regular_file.mkdir() - - invalid_files = [ - base_dir / ".grid_0.15_52.05_2024.zarr_tmp_def456", # temp file - base_dir / "invalid_name.zarr", # doesn't match pattern - base_dir / "grid_invalid.zarr", # doesn't match pattern - ] - for invalid_file in invalid_files: - invalid_file.mkdir() - 
- # Count total zarr directories created (note: glob doesn't match files starting with .) - all_zarr_dirs = list(base_dir.rglob("*.zarr")) - # Should find 3 visible directories (the .zarr_tmp directory is hidden) - assert len(all_zarr_dirs) >= 3, f"Expected at least 3 zarr dirs, got {len(all_zarr_dirs)}" - - # Run discovery (will fail to load as zarr, but should not error on invalid patterns) - tiles = discover_zarr_tiles(base_dir) - - # Should return empty list (no valid zarr stores) - # but importantly, should not raise exceptions for invalid patterns - assert isinstance(tiles, list) - - -def test_discover_tiles_filters_invalid_patterns(): - """Test that discover_tiles filters out files with invalid patterns in initial check.""" - with tempfile.TemporaryDirectory() as tmpdir: - base_dir = Path(tmpdir) - embeddings_dir = base_dir / "global_0.1_degree_representation" / "2024" - embeddings_dir.mkdir(parents=True, exist_ok=True) - - # Create only files with invalid patterns - invalid_files = [ - embeddings_dir / ".grid_0.15_52.05_tmp_abc123", - embeddings_dir / "invalid_name", - ] - test_array = np.random.rand(10, 10, 16).astype(np.float32) - for invalid_file in invalid_files: - np.save(invalid_file, test_array) - - # discover_tiles should not find any valid NPY files (pattern-filtered in initial check) - # and fall back to other formats. Since no valid tiles exist, returns empty list - tiles = discover_tiles(base_dir) - - # Should find no tiles - assert len(tiles) == 0