@@ -2368,6 +2368,168 @@ def export_manifests_command(args):
23682368 return 0
23692369
23702370
def file_scan_command(args):
    """Recursively scan a directory tree for embedding tiles and write a parquet inventory.

    A tile is a ``grid_<lon>_<lat>.npy`` file accompanied by a
    ``grid_<lon>_<lat>_scales.npy`` file in the same directory.  For every
    complete pair the directory, parsed lon/lat, both file paths, both
    modification times, and both file sizes are recorded.  Useful for finding
    potential duplicate embeddings across machines.

    Args:
        args: Parsed CLI namespace with ``input_dir`` (base directory to scan)
            and optional ``output`` (parquet destination; defaults to
            ``INPUT_DIR/embedding_inventory.parquet``).

    Returns:
        0 on success; 1 if the input directory is missing, no tiles were
        found, or the parquet file could not be written.
    """
    import re
    from datetime import datetime

    console = Console()

    # Resolve input directory
    input_dir = Path(args.input_dir).resolve()
    if not input_dir.exists():
        console.print(f"[red]Error: Input directory does not exist: {input_dir}[/red]")
        return 1

    # Determine output file
    if args.output:
        output_file = Path(args.output).resolve()
    else:
        output_file = input_dir / "embedding_inventory.parquet"

    console.print(
        Panel.fit(
            f"[bold blue]🔍 Scanning for Embedding Tiles[/bold blue]\n"
            f"📁 Input: {input_dir}\n"
            f"📄 Output: {output_file}",
            style="blue",
        )
    )

    # Filenames look like grid_<lon>_<lat>.npy with signed decimal coordinates
    # of arbitrary precision.
    grid_pattern = re.compile(r"grid_(-?\d+\.\d+)_(-?\d+\.\d+)\.npy$")

    records = []
    processed_dirs = set()

    console.print("\n[cyan]Scanning directories...[/cyan]")

    # Walk the directory tree looking for grid files.
    for root, _dirs, files in os.walk(input_dir):
        for grid_file in files:
            match = grid_pattern.match(grid_file)
            if not match:
                continue

            lon = float(match.group(1))
            lat = float(match.group(2))

            # Derive both paths from the actual filename rather than
            # re-formatting the parsed coordinates: a fixed-precision format
            # (e.g. "%.2f") does not round-trip filenames such as
            # grid_-0.050_52.125.npy and would wrongly report them missing.
            grid_path = os.path.join(root, grid_file)
            scales_path = os.path.join(root, grid_file[: -len(".npy")] + "_scales.npy")

            # A tile is only complete when its scales file is present too.
            if not os.path.exists(scales_path):
                console.print(f"[yellow]Warning: Missing scales file for {grid_path}[/yellow]")
                continue

            try:
                grid_stat = os.stat(grid_path)
                scales_stat = os.stat(scales_path)
            except OSError as e:
                # A file can disappear between the walk and the stat call;
                # report it and keep scanning.
                console.print(f"[red]Error processing {grid_path}: {e}[/red]")
                continue

            # Record the information
            records.append({
                'directory': root,
                'lon': lon,
                'lat': lat,
                'grid_path': grid_path,
                'scales_path': scales_path,
                'grid_mtime': datetime.fromtimestamp(grid_stat.st_mtime),
                'scales_mtime': datetime.fromtimestamp(scales_stat.st_mtime),
                'grid_size': grid_stat.st_size,
                'scales_size': scales_stat.st_size,
            })

            # Track unique directories processed
            processed_dirs.add(root)

    if not records:
        console.print("[yellow]No embedding tiles found![/yellow]")
        return 1

    # Create DataFrame, sorted by lon/lat for easier analysis.
    console.print(f"\n[cyan]Creating parquet file with {len(records):,} tiles...[/cyan]")
    df = pd.DataFrame(records)
    df = df.sort_values(['lon', 'lat'])

    # Keep the try narrow: only the mkdir/write can legitimately raise a
    # "write failed" condition — the summary/preview below should not be
    # reported as a parquet error.
    try:
        # Create output directory if it doesn't exist
        output_file.parent.mkdir(parents=True, exist_ok=True)
        df.to_parquet(output_file, index=False)
    except Exception as e:
        console.print(f"[red]Error writing parquet file: {e}[/red]")
        import traceback
        console.print(f"[dim]{traceback.format_exc()}[/dim]")
        return 1

    console.print(
        Panel.fit(
            f"[green]✅ Scan Complete[/green]\n"
            f"📊 Tiles found: {len(records):,}\n"
            f"📁 Unique directories: {len(processed_dirs):,}\n"
            f"📄 Output: {output_file}",
            style="green",
        )
    )

    # Show a sample so the operator can sanity-check the collected data.
    console.print("\n[cyan]Sample of collected data:[/cyan]")
    table = Table(show_header=True)
    table.add_column("Lon", style="cyan")
    table.add_column("Lat", style="cyan")
    table.add_column("Grid mtime", style="yellow")
    table.add_column("Scales mtime", style="yellow")
    table.add_column("Directory", style="dim")

    for _, row in df.head(5).iterrows():
        directory = str(row['directory'])
        table.add_row(
            f"{row['lon']:.2f}",
            f"{row['lat']:.2f}",
            row['grid_mtime'].strftime('%Y-%m-%d %H:%M:%S'),
            row['scales_mtime'].strftime('%Y-%m-%d %H:%M:%S'),
            directory[:50] + "..." if len(directory) > 50 else directory,
        )

    console.print(table)

    return 0
2532+
23712533def main ():
23722534 """Main entry point for the geotessera-registry CLI tool."""
23732535 # Configure logging with rich handler
@@ -2448,6 +2610,19 @@ def main():
24482610 # Export to custom output directory
24492611 geotessera-registry export-manifests /path/to/v1 --output-dir ~/src/git/ucam-eo/tessera-manifests
24502612
2613+ # Scan directories for embedding tiles and create an inventory
2614+ geotessera-registry file-scan /path/to/embeddings
2615+
2616+ # This will:
2617+ # - Recursively scan for directories containing grid*.npy files
2618+ # - Extract lon/lat coordinates from filenames
2619+ # - Record modification times for both grid*.npy and *_scales.npy files
2620+ # - Generate a parquet file with: directory, lon, lat, grid_mtime, scales_mtime, file sizes
2621+ # - Useful for finding potential duplicate embeddings across machines
2622+
2623+ # Specify custom output path
2624+ geotessera-registry file-scan /path/to/embeddings --output /path/to/inventory.parquet
2625+
24512626This tool is intended for GeoTessera data maintainers to generate the registry
24522627files that are distributed with the package. End users typically don't need
24532628to use this tool.
@@ -2557,6 +2732,23 @@ def main():
25572732 )
25582733 export_parser .set_defaults (func = export_manifests_command )
25592734
2735+ # File-scan command
2736+ file_scan_parser = subparsers .add_parser (
2737+ "file-scan" ,
2738+ help = "Recursively scan directories for embedding tiles and generate an inventory parquet file" ,
2739+ )
2740+ file_scan_parser .add_argument (
2741+ "input_dir" ,
2742+ help = "Base directory to recursively scan for embedding tiles (grid*.npy files)" ,
2743+ )
2744+ file_scan_parser .add_argument (
2745+ "--output" ,
2746+ type = str ,
2747+ default = None ,
2748+ help = "Output parquet file path (default: INPUT_DIR/embedding_inventory.parquet)" ,
2749+ )
2750+ file_scan_parser .set_defaults (func = file_scan_command )
2751+
25602752 args = parser .parse_args ()
25612753
25622754 if not args .command :
0 commit comments