Skip to content

Commit 4b06132

Browse files
committed
add a file scanner for registries
this lets us build an embedding map across the various generation machines
1 parent 5e7d8ad commit 4b06132

File tree

1 file changed

+192
-0
lines changed

1 file changed

+192
-0
lines changed

geotessera/registry_cli.py

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2368,6 +2368,168 @@ def export_manifests_command(args):
23682368
return 0
23692369

23702370

2371+
def file_scan_command(args):
    """Recursively scan a directory tree for embedding tiles and write a parquet inventory.

    A tile is a pair of files in the same directory named
    ``grid_<lon>_<lat>.npy`` and ``grid_<lon>_<lat>_scales.npy``. For every
    complete pair the scan records the containing directory, the coordinates
    parsed from the filename, both file paths, both modification times and both
    file sizes. The records are sorted by (lon, lat) and written to parquet,
    which is useful for finding potential duplicate embeddings across machines.

    Args:
        args: Parsed CLI namespace. Reads ``args.input_dir`` (directory to scan)
            and ``args.output`` (optional parquet path; when falsy the output is
            ``<input_dir>/embedding_inventory.parquet``).

    Returns:
        0 on success; 1 if the input path is missing or not a directory, if no
        complete tiles were found, or if the parquet file could not be written.
    """
    import re
    from datetime import datetime

    console = Console()

    # Resolve input directory
    input_dir = Path(args.input_dir).resolve()
    if not input_dir.exists():
        console.print(f"[red]Error: Input directory does not exist: {input_dir}[/red]")
        return 1
    if not input_dir.is_dir():
        # os.walk() silently yields nothing for a regular file, which would
        # otherwise be reported as the misleading "No embedding tiles found!".
        console.print(f"[red]Error: Input path is not a directory: {input_dir}[/red]")
        return 1

    # Determine output file
    if args.output:
        output_file = Path(args.output).resolve()
    else:
        output_file = input_dir / "embedding_inventory.parquet"

    console.print(
        Panel.fit(
            f"[bold blue]🔍 Scanning for Embedding Tiles[/bold blue]\n"
            f"📁 Input: {input_dir}\n"
            f"📄 Output: {output_file}",
            style="blue",
        )
    )

    # Matches grid files only; "*_scales.npy" is rejected because ".npy" must
    # directly follow the second coordinate.
    grid_pattern = re.compile(r"grid_(-?\d+\.\d+)_(-?\d+\.\d+)\.npy$")

    records = []
    processed_dirs = set()

    console.print("\n[cyan]Scanning directories...[/cyan]")

    for root, _dirs, files in os.walk(input_dir):
        for grid_file in files:
            match = grid_pattern.match(grid_file)
            if not match:
                continue

            lon = float(match.group(1))
            lat = float(match.group(2))

            # Build paths from the name actually found on disk. Re-formatting
            # the parsed floats (e.g. with ":.2f") would produce a different,
            # non-existent name for files written with another precision.
            grid_stem = os.path.splitext(grid_file)[0]
            grid_path = os.path.join(root, grid_file)
            scales_path = os.path.join(root, f"{grid_stem}_scales.npy")

            try:
                grid_stat = os.stat(grid_path)
            except OSError as e:
                console.print(f"[red]Error processing {grid_path}: {e}[/red]")
                continue

            try:
                scales_stat = os.stat(scales_path)
            except FileNotFoundError:
                console.print(f"[yellow]Warning: Missing scales file for {grid_path}[/yellow]")
                continue
            except OSError as e:
                console.print(f"[red]Error processing {grid_path}: {e}[/red]")
                continue

            # NOTE(review): these are naive datetimes in the scanning machine's
            # local timezone; inventories produced on machines in different
            # timezones are not directly comparable — confirm this is intended.
            records.append({
                'directory': root,
                'lon': lon,
                'lat': lat,
                'grid_path': grid_path,
                'scales_path': scales_path,
                'grid_mtime': datetime.fromtimestamp(grid_stat.st_mtime),
                'scales_mtime': datetime.fromtimestamp(scales_stat.st_mtime),
                'grid_size': grid_stat.st_size,
                'scales_size': scales_stat.st_size,
            })

            # Track unique directories that contributed at least one tile
            processed_dirs.add(root)

    if not records:
        console.print("[yellow]No embedding tiles found![/yellow]")
        return 1

    console.print(f"\n[cyan]Creating parquet file with {len(records):,} tiles...[/cyan]")
    df = pd.DataFrame(records)

    # Sort by lon, lat for easier analysis
    df = df.sort_values(['lon', 'lat'])

    # Only the write itself is guarded, so a failure elsewhere is never
    # misreported as a parquet-write error.
    try:
        output_file.parent.mkdir(parents=True, exist_ok=True)
        df.to_parquet(output_file, index=False)
    except Exception as e:
        console.print(f"[red]Error writing parquet file: {e}[/red]")
        import traceback
        console.print(f"[dim]{traceback.format_exc()}[/dim]")
        return 1

    console.print(
        Panel.fit(
            f"[green]✅ Scan Complete[/green]\n"
            f"📊 Tiles found: {len(records):,}\n"
            f"📁 Unique directories: {len(processed_dirs):,}\n"
            f"📄 Output: {output_file}",
            style="green",
        )
    )

    # Show sample of data
    console.print("\n[cyan]Sample of collected data:[/cyan]")
    table = Table(show_header=True)
    table.add_column("Lon", style="cyan")
    table.add_column("Lat", style="cyan")
    table.add_column("Grid mtime", style="yellow")
    table.add_column("Scales mtime", style="yellow")
    table.add_column("Directory", style="dim")

    for _, row in df.head(5).iterrows():
        directory = str(row['directory'])
        if len(directory) > 50:
            # Keep the table narrow: show only the first 50 characters.
            directory = directory[:50] + "..."
        table.add_row(
            f"{row['lon']:.2f}",
            f"{row['lat']:.2f}",
            row['grid_mtime'].strftime('%Y-%m-%d %H:%M:%S'),
            row['scales_mtime'].strftime('%Y-%m-%d %H:%M:%S'),
            directory,
        )

    console.print(table)

    return 0
2531+
2532+
23712533
def main():
23722534
"""Main entry point for the geotessera-registry CLI tool."""
23732535
# Configure logging with rich handler
@@ -2448,6 +2610,19 @@ def main():
24482610
# Export to custom output directory
24492611
geotessera-registry export-manifests /path/to/v1 --output-dir ~/src/git/ucam-eo/tessera-manifests
24502612
2613+
# Scan directories for embedding tiles and create an inventory
2614+
geotessera-registry file-scan /path/to/embeddings
2615+
2616+
# This will:
2617+
# - Recursively scan for directories containing grid*.npy files
2618+
# - Extract lon/lat coordinates from filenames
2619+
# - Record modification times for both grid*.npy and *_scales.npy files
2620+
# - Generate a parquet file with: directory, lon, lat, grid_mtime, scales_mtime, file sizes
2621+
# - Useful for finding potential duplicate embeddings across machines
2622+
2623+
# Specify custom output path
2624+
geotessera-registry file-scan /path/to/embeddings --output /path/to/inventory.parquet
2625+
24512626
This tool is intended for GeoTessera data maintainers to generate the registry
24522627
files that are distributed with the package. End users typically don't need
24532628
to use this tool.
@@ -2557,6 +2732,23 @@ def main():
25572732
)
25582733
export_parser.set_defaults(func=export_manifests_command)
25592734

2735+
# File-scan command
2736+
file_scan_parser = subparsers.add_parser(
2737+
"file-scan",
2738+
help="Recursively scan directories for embedding tiles and generate an inventory parquet file",
2739+
)
2740+
file_scan_parser.add_argument(
2741+
"input_dir",
2742+
help="Base directory to recursively scan for embedding tiles (grid*.npy files)",
2743+
)
2744+
file_scan_parser.add_argument(
2745+
"--output",
2746+
type=str,
2747+
default=None,
2748+
help="Output parquet file path (default: INPUT_DIR/embedding_inventory.parquet)",
2749+
)
2750+
file_scan_parser.set_defaults(func=file_scan_command)
2751+
25602752
args = parser.parse_args()
25612753

25622754
if not args.command:

0 commit comments

Comments
 (0)