Skip to content

Commit 3f790e2

Browse files
committed
feat: product-level dedup for Copernicus downloads
Embed the Copernicus product UUID in filenames ({product_id}__{safe_name}.ext) so that different queries returning the same tile share the download.

- Add find_product_on_disk() to detect already-downloaded products by UUID
- Add zip integrity check to catch corrupted/truncated downloads
- Update process_products() to skip downloads for existing products
- Update S1/S2 filename format to include product ID
- Add 16 tests covering dedup, corruption detection, and cross-bbox scenarios
1 parent 91095b7 commit 3f790e2

4 files changed

Lines changed: 510 additions & 11 deletions

File tree

src/data/copernicus/common.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,77 @@
22
33
This module contains common logic used by both S1 and S2 fetching modules,
44
reducing code duplication and providing a consistent interface.
5+
6+
Product-level deduplication:
7+
Files are named with the Copernicus product ID embedded: {product_id}__{safe_name}.ext
8+
This allows us to detect already-downloaded products regardless of which query found them.
9+
Two different bboxes that return the same Copernicus tile will share the download.
510
"""
611

712
import json
13+
import zipfile
814
from pathlib import Path
915
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
1016

1117
if TYPE_CHECKING:
1218
from .client import CopernicusClient
1319

1420

21+
def find_product_on_disk(
    cache_dir: Path,
    satellite_subdir: str,
    product_id: str,
) -> Optional[Path]:
    """Check if a Copernicus product is already downloaded by scanning filenames for its ID.

    Files are named as {product_id}__{safe_name}.ext, so we glob for {product_id}__*
    to find any existing download regardless of which query originally fetched it.

    Args:
        cache_dir: Root cache directory (e.g. data/cache/copernicus)
        satellite_subdir: Subdirectory for the satellite type ("s1" or "s2")
        product_id: Copernicus product UUID (e.g. "a8dd0899-7a3b-4e4b-9b3a-5e7f1234abcd")

    Returns:
        Path to the existing file if found and non-empty, None otherwise
    """
    # Guard: an empty product_id would glob "__*" and match unrelated files.
    if not product_id:
        return None

    subdir = cache_dir / satellite_subdir
    if not subdir.exists():
        return None

    # sorted() makes the pick deterministic when several variants exist
    # (e.g. both a metadata json and a zip for the same product).
    for match in sorted(subdir.glob(f"{product_id}__*")):
        # glob() only yields paths that existed at scan time; guard stat()
        # anyway in case the file vanishes between glob and stat (race).
        try:
            if match.stat().st_size == 0:
                # Zero-byte file = aborted download; not a usable cache hit.
                continue
        except OSError:
            continue

        # For zip files, verify the archive isn't truncated/corrupted
        if match.suffix.lower() == ".zip" and not _is_valid_zip(match):
            print(f"⚠️ Corrupted zip detected, removing: {match.name}")
            # missing_ok: another process may have removed it already.
            match.unlink(missing_ok=True)
            continue
        return match
    return None
53+
54+
55+
def _is_valid_zip(path: Path) -> bool:
56+
"""Quick integrity check for a zip file.
57+
58+
Reads the central directory (at the end of the file) and runs CRC checks
59+
on all entries. A truncated download will fail here because the central
60+
directory is written last.
61+
62+
Args:
63+
path: Path to the zip file
64+
65+
Returns:
66+
True if the zip is structurally valid, False otherwise
67+
"""
68+
try:
69+
with zipfile.ZipFile(path, "r") as zf:
70+
bad = zf.testzip()
71+
return bad is None
72+
except (zipfile.BadZipFile, OSError):
73+
return False
74+
75+
1576
def check_cache(
1677
cache_file: Path,
1778
) -> Optional[List[Path]]:
@@ -143,6 +204,10 @@ def process_products(
143204
) -> List[Path]:
144205
"""Process products by downloading or creating metadata.
145206
207+
Uses product-level deduplication: before downloading, checks if the product
208+
(identified by its Copernicus UUID) already exists on disk from a previous
209+
query. This prevents re-downloading the same tile when the bbox shifts slightly.
210+
146211
Args:
147212
client: CopernicusClient instance
148213
products: List of products to process
@@ -156,12 +221,25 @@ def process_products(
156221
List of paths to downloaded/created files
157222
"""
158223
downloaded_paths: List[Path] = []
224+
sat_subdir = "s1" if "1" in satellite else "s2"
225+
skipped = 0
159226

160227
if download_data:
161228
print(f"\n📥 DOWNLOADING {satellite} IMAGERY")
162229
print("=" * 45)
163230

164231
for i, product in enumerate(products, 1):
232+
product_id = product.get("Id", "")
233+
234+
# Product-level dedup: check if this product ID is already on disk
235+
if product_id:
236+
existing = find_product_on_disk(client.cache_dir, sat_subdir, product_id)
237+
if existing:
238+
print(f"\n⏭️ Product {i}/{len(products)} already on disk: {existing.name}")
239+
downloaded_paths.append(existing)
240+
skipped += 1
241+
continue
242+
165243
print(f"\n🛰️ Downloading product {i}/{len(products)}")
166244

167245
downloaded_file = download_func(client, product, i - 1, **kwargs)
@@ -175,8 +253,22 @@ def process_products(
175253
print("=" * 35)
176254

177255
for i, product in enumerate(products):
256+
product_id = product.get("Id", "")
257+
258+
# Product-level dedup for metadata files too
259+
if product_id:
260+
existing = find_product_on_disk(client.cache_dir, sat_subdir, product_id)
261+
if existing:
262+
print(f"⏭️ Metadata for product already on disk: {existing.name}")
263+
downloaded_paths.append(existing)
264+
skipped += 1
265+
continue
266+
178267
metadata_file = metadata_func(client, product, i, **kwargs)
179268
if metadata_file:
180269
downloaded_paths.append(metadata_file)
181270

271+
if skipped:
272+
print(f"\n🎯 Skipped {skipped}/{len(products)} products (already downloaded)")
273+
182274
return downloaded_paths

src/data/copernicus/s1.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -494,10 +494,10 @@ def _create_product_metadata(
494494
product_id: str = product.get("Id", f"unknown_{index}")
495495
product_name: str = product.get("Name", f"S1_product_{index}")
496496

497-
# Create a safe filename by sanitizing the product name
498-
# Add metadata suffix to make purpose clear
497+
# Create a safe filename with product ID embedded for deduplication
498+
# Format: {product_id}__{safe_name}_metadata.json
499499
safe_name: str = sanitize_filename(product_name)
500-
filename: str = f"{safe_name}_metadata.json"
500+
filename: str = f"{product_id}__{safe_name}_metadata.json"
501501

502502
# Determine file path within the cache directory
503503
# Use s1/ subdirectory to organize by satellite type
@@ -590,10 +590,10 @@ def _download_s1_product(
590590
product_name: str = product.get("Name", f"S1_product_{index}")
591591
content_length: int = product.get("ContentLength", 0) # File size in bytes
592592

593-
# Create safe filename for filesystem storage
594-
# Remove characters that are invalid on Windows/macOS/Linux
593+
# Create safe filename with product ID embedded for deduplication
594+
# Format: {product_id}__{safe_name}.zip
595595
safe_name: str = sanitize_filename(product_name)
596-
filename: str = f"{safe_name}.zip" # S1 products are distributed as ZIP files
596+
filename: str = f"{product_id}__{safe_name}.zip"
597597

598598
# Determine file path within cache directory
599599
# Organize by satellite type: s1/ for Sentinel-1, s2/ for Sentinel-2

src/data/copernicus/s2.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -320,10 +320,10 @@ def _create_product_metadata(
320320
product_id: str = product.get("Id", f"unknown_{index}")
321321
product_name: str = product.get("Name", f"S2_product_{index}")
322322

323-
# Create a safe filename by sanitizing the product name
324-
# Add resolution and metadata suffix to make purpose clear
323+
# Create a safe filename with product ID embedded for deduplication
324+
# Format: {product_id}__{safe_name}_R{resolution}m_metadata.json
325325
safe_name: str = sanitize_filename(product_name)
326-
filename: str = f"{safe_name}_R{resolution}m_metadata.json"
326+
filename: str = f"{product_id}__{safe_name}_R{resolution}m_metadata.json"
327327

328328
# Determine file path within the cache directory
329329
# Use s2/ subdirectory to organize by satellite type
@@ -393,9 +393,10 @@ def _download_s2_product(
393393
product_name: str = product.get("Name", f"S2_product_{index}")
394394
content_length: int = product.get("ContentLength", 0)
395395

396-
# Create safe filename
396+
# Create safe filename with product ID embedded for deduplication
397+
# Format: {product_id}__{safe_name}_R{resolution}m.zip
397398
safe_name: str = sanitize_filename(product_name)
398-
filename: str = f"{safe_name}_R{resolution}m.zip"
399+
filename: str = f"{product_id}__{safe_name}_R{resolution}m.zip"
399400

400401
# Determine file path within cache directory
401402
file_path: Path = client.cache_dir / "s2" / filename

0 commit comments

Comments
 (0)