Skip to content

Commit 4045413

Browse files
committed
idx
1 parent a8a1960 commit 4045413

File tree

1 file changed

+48
-53
lines changed

1 file changed

+48
-53
lines changed

geotessera/registry.py

Lines changed: 48 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -231,26 +231,19 @@ def parse_grid_name(filename: str) -> Tuple[Optional[float], Optional[float]]:
231231
return None, None
232232

233233

234-
def coord_to_grid_index(coord: float) -> np.int64:
235-
"""Convert a coordinate to an integer grid index.
234+
def coord_to_grid_int(coord: float) -> np.int32:
235+
"""Convert a float coordinate to an integer grid index.
236236
237237
Multiplies by 100 and rounds to get an integer index.
238-
This avoids floating-point comparison issues since:
239-
- Tiles are on a 0.1° grid with centers at 0.05° offsets
240-
- Multiplying by 100 gives integers like 5, 15, 25, -5, -15, etc.
241-
- Integer comparison is always exact
242-
243-
Uses numpy rounding and int64 to match the DataFrame column computation exactly.
244-
Returns np.int64 to ensure type compatibility with pandas int64 columns.
238+
This is the inverse of grid_int_to_coord().
245239
246240
Args:
247-
coord: Longitude or latitude coordinate
241+
coord: Longitude or latitude coordinate (e.g., -0.15, 51.55)
248242
249243
Returns:
250-
Integer grid index (coordinate * 100, rounded) as numpy.int64
244+
Integer grid index (e.g., -15, 5155) as numpy.int32
251245
"""
252-
# Use numpy round and int64 to match pandas .round().astype(int) exactly
253-
return np.int64(np.round(coord * 100))
246+
return np.int32(np.round(coord * 100))
254247

255248

256249
def tile_to_grid_name(lon: float, lat: float) -> str:
@@ -805,10 +798,11 @@ def _load_registry(self):
805798

806799
self.logger.info(f"Loaded GeoParquet with {len(self._registry_gdf):,} tiles")
807800

808-
# Add pre-computed grid index columns for robust coordinate lookup
809-
# Using integer indices avoids all floating-point comparison issues
810-
self._registry_gdf["lon_idx"] = (self._registry_gdf["lon"] * 100).round().astype(int)
811-
self._registry_gdf["lat_idx"] = (self._registry_gdf["lat"] * 100).round().astype(int)
801+
# Convert float lon/lat to integer grid indices (multiply by 100)
802+
# This avoids all floating-point comparison issues across platforms
803+
# Future registries will store integers directly; this is a translation layer
804+
self._registry_gdf["lon_i"] = (self._registry_gdf["lon"] * 100).round().astype(np.int32)
805+
self._registry_gdf["lat_i"] = (self._registry_gdf["lat"] * 100).round().astype(np.int32)
812806

813807
# Validate registry structure
814808
required_columns = {"lat", "lon", "year", "hash", "file_size"}
@@ -890,9 +884,10 @@ def _load_landmasks_registry(self):
890884
)
891885
self._landmasks_df = None
892886
else:
893-
# Add pre-computed grid index columns for robust coordinate lookup
894-
self._landmasks_df["lon_idx"] = (self._landmasks_df["lon"] * 100).round().astype(int)
895-
self._landmasks_df["lat_idx"] = (self._landmasks_df["lat"] * 100).round().astype(int)
887+
# Convert float lon/lat to integer grid indices (multiply by 100)
888+
# This avoids all floating-point comparison issues across platforms
889+
self._landmasks_df["lon_i"] = (self._landmasks_df["lon"] * 100).round().astype(np.int32)
890+
self._landmasks_df["lat_i"] = (self._landmasks_df["lat"] * 100).round().astype(np.int32)
896891

897892
def iter_tiles_in_region(
898893
self, bounds: Tuple[float, float, float, float], year: int
@@ -941,13 +936,13 @@ def iter_tiles_in_region(
941936

942937
# Drop duplicates and yield (vectorized iteration)
943938
# Use the pre-computed grid indices to ensure consistency with lookups
944-
tiles_unique = tiles[["year", "lon", "lat", "lon_idx", "lat_idx"]].drop_duplicates()
945-
for year_val, lon_val, lat_val, lon_idx, lat_idx in tiles_unique.values:
939+
tiles_unique = tiles[["year", "lon", "lat", "lon_i", "lat_i"]].drop_duplicates()
940+
for year_val, lon_val, lat_val, lon_i, lat_i in tiles_unique.values:
946941
# Recompute the float coordinates from the integer grid indices so lookups use consistent values
947942
# We yield floats for compatibility, but they are recomputed from the grid indices (exact grid centers), not the stored lon/lat values
948943
# This ensures the yielded coordinates exactly match what's in the registry
949-
lon_exact = lon_idx / 100.0
950-
lat_exact = lat_idx / 100.0
944+
lon_exact = lon_i / 100.0
945+
lat_exact = lat_i / 100.0
951946
yield (int(year_val), lon_exact, lat_exact)
952947

953948
def load_blocks_for_region(
@@ -1000,15 +995,15 @@ def get_available_embeddings(self) -> List[Tuple[int, float, float]]:
1000995
Returns:
1001996
List of (year, lon, lat) tuples for all available embedding tiles
1002997
"""
1003-
unique_tiles = self._registry_gdf[["year", "lon_idx", "lat_idx"]].drop_duplicates()
998+
unique_tiles = self._registry_gdf[["year", "lon_i", "lat_i"]].drop_duplicates()
1004999

10051000
# Use grid indices to compute exact grid center coordinates
10061001
# This ensures coordinates round-trip correctly for lookups
10071002
return list(
10081003
zip(
10091004
unique_tiles["year"].astype(int).values,
1010-
(unique_tiles["lon_idx"].values / 100.0),
1011-
(unique_tiles["lat_idx"].values / 100.0),
1005+
(unique_tiles["lon_i"].values / 100.0),
1006+
(unique_tiles["lat_i"].values / 100.0),
10121007
)
10131008
)
10141009

@@ -1066,12 +1061,12 @@ def fetch(
10661061
and lat is not None
10671062
):
10681063
# Use pre-computed integer grid indices for robust comparison
1069-
lon_idx = coord_to_grid_index(lon)
1070-
lat_idx = coord_to_grid_index(lat)
1064+
lon_i = coord_to_grid_int(lon)
1065+
lat_i = coord_to_grid_int(lat)
10711066
matches = self._registry_gdf[
10721067
(self._registry_gdf["year"] == year)
1073-
& (self._registry_gdf["lon_idx"] == lon_idx)
1074-
& (self._registry_gdf["lat_idx"] == lat_idx)
1068+
& (self._registry_gdf["lon_i"] == lon_i)
1069+
& (self._registry_gdf["lat_i"] == lat_i)
10751070
]
10761071
if len(matches) > 0:
10771072
if is_scales:
@@ -1146,11 +1141,11 @@ def fetch_landmask(
11461141
and lat is not None
11471142
):
11481143
# Use pre-computed integer grid indices for robust comparison
1149-
lon_idx = coord_to_grid_index(lon)
1150-
lat_idx = coord_to_grid_index(lat)
1144+
lon_i = coord_to_grid_int(lon)
1145+
lat_i = coord_to_grid_int(lat)
11511146
matches = self._landmasks_df[
1152-
(self._landmasks_df["lon_idx"] == lon_idx)
1153-
& (self._landmasks_df["lat_idx"] == lat_idx)
1147+
(self._landmasks_df["lon_i"] == lon_i)
1148+
& (self._landmasks_df["lat_i"] == lat_i)
11541149
]
11551150
if len(matches) > 0:
11561151
file_hash = matches.iloc[0]["hash"]
@@ -1195,18 +1190,18 @@ def available_landmasks(self) -> List[Tuple[float, float]]:
11951190
"""
11961191
# Use landmasks registry if available
11971192
if self._landmasks_df is not None:
1198-
unique_tiles = self._landmasks_df[["lon_idx", "lat_idx"]].drop_duplicates()
1193+
unique_tiles = self._landmasks_df[["lon_i", "lat_i"]].drop_duplicates()
11991194
# Use grid indices to compute exact grid center coordinates
12001195
return list(zip(
1201-
unique_tiles["lon_idx"].values / 100.0,
1202-
unique_tiles["lat_idx"].values / 100.0
1196+
unique_tiles["lon_i"].values / 100.0,
1197+
unique_tiles["lat_i"].values / 100.0
12031198
))
12041199

12051200
# Fallback: assume landmasks are available for all embedding tiles
1206-
unique_tiles = self._registry_gdf[["lon_idx", "lat_idx"]].drop_duplicates()
1201+
unique_tiles = self._registry_gdf[["lon_i", "lat_i"]].drop_duplicates()
12071202
return list(zip(
1208-
unique_tiles["lon_idx"].values / 100.0,
1209-
unique_tiles["lat_idx"].values / 100.0
1203+
unique_tiles["lon_i"].values / 100.0,
1204+
unique_tiles["lat_i"].values / 100.0
12101205
))
12111206

12121207
def get_manifest_info(self) -> Tuple[Optional[str], Optional[str]]:
@@ -1241,12 +1236,12 @@ def get_tile_file_size(self, year: int, lon: float, lat: float) -> int:
12411236
)
12421237

12431238
# Use pre-computed integer grid indices for robust comparison
1244-
lon_idx = coord_to_grid_index(lon)
1245-
lat_idx = coord_to_grid_index(lat)
1239+
lon_i = coord_to_grid_int(lon)
1240+
lat_i = coord_to_grid_int(lat)
12461241
matches = self._registry_gdf[
12471242
(self._registry_gdf["year"] == year)
1248-
& (self._registry_gdf["lon_idx"] == lon_idx)
1249-
& (self._registry_gdf["lat_idx"] == lat_idx)
1243+
& (self._registry_gdf["lon_i"] == lon_i)
1244+
& (self._registry_gdf["lat_i"] == lat_i)
12501245
]
12511246

12521247
if len(matches) == 0:
@@ -1277,12 +1272,12 @@ def get_scales_file_size(self, year: int, lon: float, lat: float) -> int:
12771272
)
12781273

12791274
# Use pre-computed integer grid indices for robust comparison
1280-
lon_idx = coord_to_grid_index(lon)
1281-
lat_idx = coord_to_grid_index(lat)
1275+
lon_i = coord_to_grid_int(lon)
1276+
lat_i = coord_to_grid_int(lat)
12821277
matches = self._registry_gdf[
12831278
(self._registry_gdf["year"] == year)
1284-
& (self._registry_gdf["lon_idx"] == lon_idx)
1285-
& (self._registry_gdf["lat_idx"] == lat_idx)
1279+
& (self._registry_gdf["lon_i"] == lon_i)
1280+
& (self._registry_gdf["lat_i"] == lat_i)
12861281
]
12871282

12881283
if len(matches) == 0:
@@ -1318,11 +1313,11 @@ def get_landmask_file_size(self, lon: float, lat: float) -> int:
13181313
)
13191314

13201315
# Use pre-computed integer grid indices for robust comparison
1321-
lon_idx = coord_to_grid_index(lon)
1322-
lat_idx = coord_to_grid_index(lat)
1316+
lon_i = coord_to_grid_int(lon)
1317+
lat_i = coord_to_grid_int(lat)
13231318
matches = self._landmasks_df[
1324-
(self._landmasks_df["lon_idx"] == lon_idx)
1325-
& (self._landmasks_df["lat_idx"] == lat_idx)
1319+
(self._landmasks_df["lon_i"] == lon_i)
1320+
& (self._landmasks_df["lat_i"] == lat_i)
13261321
]
13271322

13281323
if len(matches) == 0:

0 commit comments

Comments
 (0)