opera-adt · scottstanie · Jan 10, 2026
diff --git a/src/geepers/gps_sources/unr_grid.py b/src/geepers/gps_sources/unr_grid.py
@@ -5,7 +5,7 @@
 from functools import lru_cache
 from pathlib import Path
 from sys import version
-from typing import TYPE_CHECKING, Literal, Optional
+from typing import TYPE_CHECKING, Literal
 
 import geopandas as gpd
 import pandas as pd
@@ -25,7 +25,9 @@
 
 VALID_VERSIONS = {"0.1", "0.2"}
 DEFAULT_VERSION = "0.2"
-LOOKUP_FILE_URL = "https://geodesy.unr.edu/grid_timeseries/Version{version}/grid_latlon_lookup.txt"
+LOOKUP_FILE_URL = (
+    "https://geodesy.unr.edu/grid_timeseries/Version{version}/grid_latlon_lookup.txt"
+)
 FILENAME_TEMPLATE = "{plate}/{grid_id:06d}_{plate}.tenv8"
 GRID_DATA_BASE_URL = (
     "https://geodesy.unr.edu/grid_timeseries/Version{version}/"
@@ -37,16 +39,18 @@
 
 class UnrGridSource(BaseGpsSource):
     """UNR Grid GPS data source for gridded time series data."""
-    def __init__(self, 
-                 version: Literal["0.1", "0.2"] = "0.2",
-                 cache_dir: Optional[str | Path] = None,):
+
+    def __init__(
+        self,
+        version: Literal["0.1", "0.2"] = "0.2",
+        cache_dir: str | Path | None = None,
+    ):
         """Initialize UNR Grid GPS data source."""
-
         # Initialize BaseGpsSource
         super().__init__(cache_dir=cache_dir)
 
         # Store UNR version
-        self.version = version 
+        self.version = version
 
     def timeseries(
         self,
@@ -96,8 +100,9 @@ def timeseries(
 
         # TODO: how to handle fetching/saving, vs using pandas to read...
         if download_if_missing:
-            local_file = self._download_file(station_id, plate=plate,
-                                             version=self.version)
+            local_file = self._download_file(
+                station_id, plate=plate, version=self.version
+            )
             df = self.parse_data_file(local_file)
         else:
             filename = FILENAME_TEMPLATE.format(plate=plate, grid_id=station_id)
@@ -199,15 +204,12 @@ def _download_file(
         output_dir.mkdir(parents=True, exist_ok=True)
 
         if plate == "IGS14" and version == "0.2":
-            # Warning: IGS14 plate is not available in UNR version 0.2. 
+            # Warning: IGS14 plate is not available in UNR version 0.2.
             # Using IGS20 instead.
             plate = "IGS20"
 
         filename = FILENAME_TEMPLATE.format(plate=plate, grid_id=grid_id)
-        url = GRID_DATA_BASE_URL.format(
-            version=version,
-            filename=filename
-        )
+        url = GRID_DATA_BASE_URL.format(version=version, filename=filename)
         dest = output_dir / url.rsplit("/", 1)[-1]
         if not dest.exists():
             if session is None:
@@ -336,16 +338,16 @@ def parse_data_file(self, uri: str | Path) -> pd.DataFrame:
         return StationObservationSchema.validate(df_out, lazy=True)
 
     @staticmethod
-    @lru_cache(maxsize=None)
+    @lru_cache
     def _read_grid_file(version: str = "0.2") -> pd.DataFrame:
         """Download and cache the UNR grid latitude/longitude lookup table."""
         if version not in VALID_VERSIONS:
             msg = (
                 f"Unsupported version '{version}'. "
                 f"Valid options are: {', '.join(VALID_VERSIONS)}."
-                )
-            raise ValueError(msg) 
-        
+            )
+            raise ValueError(msg)
+
         url = LOOKUP_FILE_URL.format(version=version)
         df = pd.read_csv(
             url,
@@ -355,7 +357,7 @@ def _read_grid_file(version: str = "0.2") -> pd.DataFrame:
         if version == "0.2":
             # Version 0.2 maps latitudes from 0 to 360
             # convert to -180 to 180, to be consistent with version 0.1
-            df['longitude'] = ((df['longitude'] + 180) % 360) - 180 
+            df["longitude"] = ((df["longitude"] + 180) % 360) - 180
         return df.set_index("grid_point")
 
 

diff --git a/src/geepers/quality.py b/src/geepers/quality.py
@@ -139,23 +139,17 @@ def select_gps_reference(
         for station, df in station_to_merged_df.items()
     }
 
-    # Get total time for the InSAR data
-    max_insar_time = pd.Series(
-        [d.index.max() for d in station_to_merged_df.values()]
-    ).max()
-    min_insar_time = pd.Series(
-        [d.index.min() for d in station_to_merged_df.values()]
-    ).min()
-
-    # los_insar
-    total_time = max_insar_time - min_insar_time
-    total_days = total_time.total_seconds() / (24 * 3600)
+    # Get total number of InSAR epochs across all stations
+    # Use the maximum number of unique InSAR epochs from any station
+    max_insar_epochs = max(
+        df["los_insar"].notna().sum() for df in station_to_merged_df.values()
+    )
 
     # Filter stations with insufficient overlap
     candidate_stations = {
         station: quality
         for station, quality in qualities.items()
-        if quality.num_gps >= (min_coverage_fraction * total_days)
+        if quality.num_gps >= (min_coverage_fraction * max_insar_epochs)
     }
 
     if not candidate_stations:

diff --git a/src/geepers/workflows.py b/src/geepers/workflows.py
@@ -211,16 +211,33 @@ def _load_or_none(name: str) -> pd.DataFrame | None:
     logger.info("Merging GPS and InSAR tables per station")
     station_to_merged: dict[str, pd.DataFrame] = {}
     for station_id in tqdm(station_to_los_gps, desc="Merging GPS and InSAR"):
+        gps = station_to_los_gps[station_id].copy()
+        insar = station_to_insar[station_id].copy()
+        # Preserve the InSAR epoch as a column for deduplication
+        insar = insar.assign(insar_time=insar.index)
+
         # Use asof merge in case GPS is datetime and insar is date
-        station_to_merged[station_id] = pd.merge_asof(
-            left=station_to_los_gps[station_id],
-            right=station_to_insar[station_id],
+        merged = pd.merge_asof(
+            left=gps.sort_index(),
+            right=insar.sort_index(),
             tolerance=pd.Timedelta("1D"),
             direction="nearest",
             left_index=True,
             right_index=True,
         )
 
+        # Keep only the closest GPS row per InSAR epoch
+        # This prevents duplicate InSAR values when multiple GPS days
+        # match to the same InSAR acquisition
+        dt = pd.Series(merged.index - merged["insar_time"], index=merged.index).abs()
+        keep_idx = dt.groupby(merged["insar_time"]).idxmin()
+        merged_one_per_insar = merged.loc[keep_idx].sort_index()
+
+        # Drop the helper column
+        station_to_merged[station_id] = merged_one_per_insar.drop(
+            columns=["insar_time"]
+        )
+
     # Save results
     combined_df = create_tidy_df(station_to_merged)
     combined_df.to_csv(output_dir / "combined_data.csv", index=False)

diff --git a/tests/test_workflows.py b/tests/test_workflows.py
@@ -37,11 +37,13 @@ def test_main(tmp_path, monkeypatch):
         "CRIM",
     ]
     assert set(df.id) == set(expected_stations)
+    # Verify the first HLNA entry (value changed due to duplicate timestamp fix)
+    # The fix now selects the closest GPS point to each InSAR epoch
     expected_entry = {
         "id": "HLNA",
         "date": "2016-07-23",
         "measurement": "los_gps",
-        "value": 0.0010796103397325,
+        "value": -0.0020305613040891,
     }
     pd.testing.assert_series_equal(
         df[df.id == "HLNA"].iloc[0], pd.Series(expected_entry, name=0)
@@ -182,3 +184,96 @@ def test_select_gps_reference_no_coherence_data():
     # Should fall back to RMS-based selection even with coherence_priority=True
     ref_station = select_gps_reference(station_to_merged, coherence_priority=True)
     assert ref_station == "STAT_A"  # Lower RMS misfit
+
+
+def test_no_duplicate_insar_timestamps():
+    """Test that merge produces exactly one GPS row per InSAR epoch.
+
+    This test demonstrates the issue where multiple GPS observations
+    (from adjacent days or same day with different times) can match
+    to the same InSAR acquisition when using merge_asof with
+    tolerance="1D" and direction="nearest".
+
+    The fix ensures we keep only the closest GPS sample per InSAR epoch.
+    """
+    # GPS data: daily observations at ~11:57:30 (typical GPS timestamp)
+    gps_dates = pd.date_range("2023-01-01 11:57:30", periods=10, freq="D")
+    gps_df = pd.DataFrame(
+        {
+            "los_gps": np.random.normal(0, 0.01, 10),
+            "sigma_los": np.full(10, 0.001),
+        },
+        index=gps_dates,
+    )
+
+    # InSAR data: epochs at midnight (typical InSAR timestamp)
+    # Use fewer epochs to trigger the duplicate matching issue
+    insar_dates = pd.DatetimeIndex(
+        [
+            "2023-01-02 00:00:00",
+            "2023-01-05 00:00:00",
+            "2023-01-08 00:00:00",
+        ]
+    )
+    insar_df = pd.DataFrame(
+        {
+            "los_insar": np.random.normal(0, 0.01, 3),
+        },
+        index=insar_dates,
+    )
+
+    # OLD BEHAVIOR: merge_asof with tolerance="1D" allows multiple GPS rows
+    # to match the same InSAR epoch
+    old_merged = pd.merge_asof(
+        left=gps_df.sort_index(),
+        right=insar_df.sort_index(),
+        tolerance=pd.Timedelta("1D"),
+        direction="nearest",
+        left_index=True,
+        right_index=True,
+    )
+
+    # Demonstrate the issue: multiple GPS rows can have the same los_insar value
+    # (meaning they matched to the same InSAR epoch)
+    value_counts = old_merged["los_insar"].value_counts()
+    (value_counts > 1).any()
+
+    # FIXED BEHAVIOR: Keep only one GPS row per InSAR epoch (the closest one)
+    # Add insar_time column to track which InSAR epoch each GPS row matched to
+    insar_with_time = insar_df.assign(insar_time=insar_df.index)
+    merged = pd.merge_asof(
+        left=gps_df.sort_index(),
+        right=insar_with_time.sort_index(),
+        tolerance=pd.Timedelta("1D"),
+        direction="nearest",
+        left_index=True,
+        right_index=True,
+    )
+
+    # Keep only the closest GPS row per InSAR epoch
+    dt = (merged.index - merged["insar_time"]).abs()
+    keep_idx = dt.groupby(merged["insar_time"]).idxmin()
+    merged_one_per_insar = merged.loc[keep_idx].sort_index()
+
+    # After the fix: each InSAR value should appear exactly once
+    fixed_value_counts = merged_one_per_insar["los_insar"].value_counts()
+    assert (
+        fixed_value_counts <= 1
+    ).all(), "Each InSAR epoch should appear at most once"
+
+    # Verify we have the expected number of matches (3 InSAR epochs)
+    assert (
+        len(merged_one_per_insar) == 3
+    ), "Should have exactly 3 GPS rows (one per InSAR epoch)"
+
+    # Verify that GPS rows are matched to the closest InSAR epoch
+    for idx, row in merged_one_per_insar.iterrows():
+        insar_time = row["insar_time"]
+        # This GPS timestamp should be the closest to its matched InSAR epoch
+        gps_matches = merged[merged["insar_time"] == insar_time]
+        time_diffs = pd.Series(
+            abs(gps_matches.index - insar_time), index=gps_matches.index
+        )
+        assert (
+            idx == time_diffs.idxmin()
+        ), f"GPS row {idx} should be closest to InSAR epoch {insar_time}"