Skip to content

Commit 3f790e2

Browse files
committed
feat: product-level dedup for Copernicus downloads
Embed the Copernicus product UUID in filenames ({product_id}__{safe_name}.ext) so that different queries returning the same tile share the download.

- Add find_product_on_disk() to detect already-downloaded products by UUID
- Add zip integrity check to catch corrupted/truncated downloads
- Update process_products() to skip downloads for existing products
- Update S1/S2 filename format to include product ID
- Add 16 tests covering dedup, corruption detection, and cross-bbox scenarios
1 parent 91095b7 commit 3f790e2

4 files changed

Lines changed: 510 additions & 11 deletions

File tree

src/data/copernicus/common.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,77 @@
22
33
This module contains common logic used by both S1 and S2 fetching modules,
44
reducing code duplication and providing a consistent interface.
5+
6+
Product-level deduplication:
7+
Files are named with the Copernicus product ID embedded: {product_id}__{safe_name}.ext
8+
This allows us to detect already-downloaded products regardless of which query found them.
9+
Two different bboxes that return the same Copernicus tile will share the download.
510
"""
611

712
import json
13+
import zipfile
814
from pathlib import Path
915
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
1016

1117
if TYPE_CHECKING:
1218
from .client import CopernicusClient
1319

1420

21+
def find_product_on_disk(
    cache_dir: Path,
    satellite_subdir: str,
    product_id: str,
) -> Optional[Path]:
    """Check if a Copernicus product is already downloaded by scanning filenames for its ID.

    Files are named as {product_id}__{safe_name}.ext, so we glob for {product_id}__*
    to find any existing download regardless of which query originally fetched it.

    Args:
        cache_dir: Root cache directory (e.g. data/cache/copernicus)
        satellite_subdir: Subdirectory for the satellite type ("s1" or "s2")
        product_id: Copernicus product UUID (e.g. "a8dd0899-7a3b-4e4b-9b3a-5e7f1234abcd")

    Returns:
        Path to the existing file if found and non-empty, None otherwise
    """
    # Guard: an empty product_id would glob "__*" and match unrelated files.
    if not product_id:
        return None

    subdir = cache_dir / satellite_subdir
    if not subdir.exists():
        return None

    # sorted() makes the pick deterministic when several variants exist
    # (e.g. both a metadata json and a zip for the same product).
    for match in sorted(subdir.glob(f"{product_id}__*")):
        # glob() only yields paths that existed at scan time; guard stat()
        # anyway in case the file vanishes between glob and stat (race).
        try:
            if match.stat().st_size == 0:
                # Zero-byte file = aborted download; not a usable cache hit.
                continue
        except OSError:
            continue

        # For zip files, verify the archive isn't truncated/corrupted
        if match.suffix.lower() == ".zip" and not _is_valid_zip(match):
            print(f"⚠️ Corrupted zip detected, removing: {match.name}")
            # missing_ok: another process may have removed it already.
            match.unlink(missing_ok=True)
            continue
        return match
    return None
53+
54+
55+
def _is_valid_zip(path: Path) -> bool:
56+
"""Quick integrity check for a zip file.
57+
58+
Reads the central directory (at the end of the file) and runs CRC checks
59+
on all entries. A truncated download will fail here because the central
60+
directory is written last.
61+
62+
Args:
63+
path: Path to the zip file
64+
65+
Returns:
66+
True if the zip is structurally valid, False otherwise
67+
"""
68+
try:
69+
with zipfile.ZipFile(path, "r") as zf:
70+
bad = zf.testzip()
71+
return bad is None
72+
except (zipfile.BadZipFile, OSError):
73+
return False
74+
75+
1576
def check_cache(
1677
cache_file: Path,
1778
) -> Optional[List[Path]]:
@@ -143,6 +204,10 @@ def process_products(
143204
) -> List[Path]:
144205
"""Process products by downloading or creating metadata.
145206
207+
Uses product-level deduplication: before downloading, checks if the product
208+
(identified by its Copernicus UUID) already exists on disk from a previous
209+
query. This prevents re-downloading the same tile when the bbox shifts slightly.
210+
146211
Args:
147212
client: CopernicusClient instance
148213
products: List of products to process
@@ -156,12 +221,25 @@ def process_products(
156221
List of paths to downloaded/created files
157222
"""
158223
downloaded_paths: List[Path] = []
224+
sat_subdir = "s1" if "1" in satellite else "s2"
225+
skipped = 0
159226

160227
if download_data:
161228
print(f"\n📥 DOWNLOADING {satellite} IMAGERY")
162229
print("=" * 45)
163230

164231
for i, product in enumerate(products, 1):
232+
product_id = product.get("Id", "")
233+
234+
# Product-level dedup: check if this product ID is already on disk
235+
if product_id:
236+
existing = find_product_on_disk(client.cache_dir, sat_subdir, product_id)
237+
if existing:
238+
print(f"\n⏭️ Product {i}/{len(products)} already on disk: {existing.name}")
239+
downloaded_paths.append(existing)
240+
skipped += 1
241+
continue
242+
165243
print(f"\n🛰️ Downloading product {i}/{len(products)}")
166244

167245
downloaded_file = download_func(client, product, i - 1, **kwargs)
@@ -175,8 +253,22 @@ def process_products(
175253
print("=" * 35)
176254

177255
for i, product in enumerate(products):
256+
product_id = product.get("Id", "")
257+
258+
# Product-level dedup for metadata files too
259+
if product_id:
260+
existing = find_product_on_disk(client.cache_dir, sat_subdir, product_id)
261+
if existing:
262+
print(f"⏭️ Metadata for product already on disk: {existing.name}")
263+
downloaded_paths.append(existing)
264+
skipped += 1
265+
continue
266+
178267
metadata_file = metadata_func(client, product, i, **kwargs)
179268
if metadata_file:
180269
downloaded_paths.append(metadata_file)
181270

271+
if skipped:
272+
print(f"\n🎯 Skipped {skipped}/{len(products)} products (already downloaded)")
273+
182274
return downloaded_paths

src/data/copernicus/s1.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -494,10 +494,10 @@ def _create_product_metadata(
494494
product_id: str = product.get("Id", f"unknown_{index}")
495495
product_name: str = product.get("Name", f"S1_product_{index}")
496496

497-
# Create a safe filename by sanitizing the product name
498-
# Add metadata suffix to make purpose clear
497+
# Create a safe filename with product ID embedded for deduplication
498+
# Format: {product_id}__{safe_name}_metadata.json
499499
safe_name: str = sanitize_filename(product_name)
500-
filename: str = f"{safe_name}_metadata.json"
500+
filename: str = f"{product_id}__{safe_name}_metadata.json"
501501

502502
# Determine file path within the cache directory
503503
# Use s1/ subdirectory to organize by satellite type
@@ -590,10 +590,10 @@ def _download_s1_product(
590590
product_name: str = product.get("Name", f"S1_product_{index}")
591591
content_length: int = product.get("ContentLength", 0) # File size in bytes
592592

593-
# Create safe filename for filesystem storage
594-
# Remove characters that are invalid on Windows/macOS/Linux
593+
# Create safe filename with product ID embedded for deduplication
594+
# Format: {product_id}__{safe_name}.zip
595595
safe_name: str = sanitize_filename(product_name)
596-
filename: str = f"{safe_name}.zip" # S1 products are distributed as ZIP files
596+
filename: str = f"{product_id}__{safe_name}.zip"
597597

598598
# Determine file path within cache directory
599599
# Organize by satellite type: s1/ for Sentinel-1, s2/ for Sentinel-2

src/data/copernicus/s2.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -320,10 +320,10 @@ def _create_product_metadata(
320320
product_id: str = product.get("Id", f"unknown_{index}")
321321
product_name: str = product.get("Name", f"S2_product_{index}")
322322

323-
# Create a safe filename by sanitizing the product name
324-
# Add resolution and metadata suffix to make purpose clear
323+
# Create a safe filename with product ID embedded for deduplication
324+
# Format: {product_id}__{safe_name}_R{resolution}m_metadata.json
325325
safe_name: str = sanitize_filename(product_name)
326-
filename: str = f"{safe_name}_R{resolution}m_metadata.json"
326+
filename: str = f"{product_id}__{safe_name}_R{resolution}m_metadata.json"
327327

328328
# Determine file path within the cache directory
329329
# Use s2/ subdirectory to organize by satellite type
@@ -393,9 +393,10 @@ def _download_s2_product(
393393
product_name: str = product.get("Name", f"S2_product_{index}")
394394
content_length: int = product.get("ContentLength", 0)
395395

396-
# Create safe filename
396+
# Create safe filename with product ID embedded for deduplication
397+
# Format: {product_id}__{safe_name}_R{resolution}m.zip
397398
safe_name: str = sanitize_filename(product_name)
398-
filename: str = f"{safe_name}_R{resolution}m.zip"
399+
filename: str = f"{product_id}__{safe_name}_R{resolution}m.zip"
399400

400401
# Determine file path within cache directory
401402
file_path: Path = client.cache_dir / "s2" / filename

0 commit comments

Comments
 (0)