Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 21 additions & 19 deletions src/geepers/gps_sources/unr_grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from functools import lru_cache
from pathlib import Path
from sys import version
from typing import TYPE_CHECKING, Literal, Optional
from typing import TYPE_CHECKING, Literal

import geopandas as gpd
import pandas as pd
Expand All @@ -25,7 +25,9 @@

VALID_VERSIONS = {"0.1", "0.2"}
DEFAULT_VERSION = "0.2"
LOOKUP_FILE_URL = "https://geodesy.unr.edu/grid_timeseries/Version{version}/grid_latlon_lookup.txt"
LOOKUP_FILE_URL = (
"https://geodesy.unr.edu/grid_timeseries/Version{version}/grid_latlon_lookup.txt"
)
FILENAME_TEMPLATE = "{plate}/{grid_id:06d}_{plate}.tenv8"
GRID_DATA_BASE_URL = (
"https://geodesy.unr.edu/grid_timeseries/Version{version}/"
Expand All @@ -37,16 +39,18 @@

class UnrGridSource(BaseGpsSource):
"""UNR Grid GPS data source for gridded time series data."""
def __init__(self,
version: Literal["0.1", "0.2"] = "0.2",
cache_dir: Optional[str | Path] = None,):

def __init__(
self,
version: Literal["0.1", "0.2"] = "0.2",
cache_dir: str | Path | None = None,
):
"""Initialize UNR Grid GPS data source."""

# Initialize BaseGpsSource
super().__init__(cache_dir=cache_dir)

# Store UNR version
self.version = version
self.version = version

def timeseries(
self,
Expand Down Expand Up @@ -96,8 +100,9 @@ def timeseries(

# TODO: how to handle fetching/saving, vs using pandas to read...
if download_if_missing:
local_file = self._download_file(station_id, plate=plate,
version=self.version)
local_file = self._download_file(
station_id, plate=plate, version=self.version
)
df = self.parse_data_file(local_file)
else:
filename = FILENAME_TEMPLATE.format(plate=plate, grid_id=station_id)
Expand Down Expand Up @@ -199,15 +204,12 @@ def _download_file(
output_dir.mkdir(parents=True, exist_ok=True)

if plate == "IGS14" and version == "0.2":
# Warning: IGS14 plate is not available in UNR version 0.2.
# Warning: IGS14 plate is not available in UNR version 0.2.
# Using IGS20 instead.
plate = "IGS20"

filename = FILENAME_TEMPLATE.format(plate=plate, grid_id=grid_id)
url = GRID_DATA_BASE_URL.format(
version=version,
filename=filename
)
url = GRID_DATA_BASE_URL.format(version=version, filename=filename)
dest = output_dir / url.rsplit("/", 1)[-1]
if not dest.exists():
if session is None:
Expand Down Expand Up @@ -336,16 +338,16 @@ def parse_data_file(self, uri: str | Path) -> pd.DataFrame:
return StationObservationSchema.validate(df_out, lazy=True)

@staticmethod
@lru_cache(maxsize=None)
@lru_cache
def _read_grid_file(version: str = "0.2") -> pd.DataFrame:
"""Download and cache the UNR grid latitude/longitude lookup table."""
if version not in VALID_VERSIONS:
msg = (
f"Unsupported version '{version}'. "
f"Valid options are: {', '.join(VALID_VERSIONS)}."
)
raise ValueError(msg)
)
raise ValueError(msg)

url = LOOKUP_FILE_URL.format(version=version)
df = pd.read_csv(
url,
Expand All @@ -355,7 +357,7 @@ def _read_grid_file(version: str = "0.2") -> pd.DataFrame:
if version == "0.2":
# Version 0.2 maps latitudes from 0 to 360
# convert to -180 to 180, to be consistent with version 0.1
df['longitude'] = ((df['longitude'] + 180) % 360) - 180
df["longitude"] = ((df["longitude"] + 180) % 360) - 180
return df.set_index("grid_point")


Expand Down
18 changes: 6 additions & 12 deletions src/geepers/quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,23 +139,17 @@ def select_gps_reference(
for station, df in station_to_merged_df.items()
}

# Get total time for the InSAR data
max_insar_time = pd.Series(
[d.index.max() for d in station_to_merged_df.values()]
).max()
min_insar_time = pd.Series(
[d.index.min() for d in station_to_merged_df.values()]
).min()

# los_insar
total_time = max_insar_time - min_insar_time
total_days = total_time.total_seconds() / (24 * 3600)
# Get total number of InSAR epochs across all stations
# Use the maximum number of unique InSAR epochs from any station
max_insar_epochs = max(
df["los_insar"].notna().sum() for df in station_to_merged_df.values()
)

# Filter stations with insufficient overlap
candidate_stations = {
station: quality
for station, quality in qualities.items()
if quality.num_gps >= (min_coverage_fraction * total_days)
if quality.num_gps >= (min_coverage_fraction * max_insar_epochs)
}

if not candidate_stations:
Expand Down
23 changes: 20 additions & 3 deletions src/geepers/workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,16 +211,33 @@ def _load_or_none(name: str) -> pd.DataFrame | None:
logger.info("Merging GPS and InSAR tables per station")
station_to_merged: dict[str, pd.DataFrame] = {}
for station_id in tqdm(station_to_los_gps, desc="Merging GPS and InSAR"):
gps = station_to_los_gps[station_id].copy()
insar = station_to_insar[station_id].copy()
# Preserve the InSAR epoch as a column for deduplication
insar = insar.assign(insar_time=insar.index)

# Use asof merge in case GPS is datetime and insar is date
station_to_merged[station_id] = pd.merge_asof(
left=station_to_los_gps[station_id],
right=station_to_insar[station_id],
merged = pd.merge_asof(
left=gps.sort_index(),
right=insar.sort_index(),
tolerance=pd.Timedelta("1D"),
direction="nearest",
left_index=True,
right_index=True,
)

# Keep only the closest GPS row per InSAR epoch
# This prevents duplicate InSAR values when multiple GPS days
# match to the same InSAR acquisition
dt = pd.Series(merged.index - merged["insar_time"], index=merged.index).abs()
keep_idx = dt.groupby(merged["insar_time"]).idxmin()
merged_one_per_insar = merged.loc[keep_idx].sort_index()

# Drop the helper column
station_to_merged[station_id] = merged_one_per_insar.drop(
columns=["insar_time"]
)

# Save results
combined_df = create_tidy_df(station_to_merged)
combined_df.to_csv(output_dir / "combined_data.csv", index=False)
Expand Down
97 changes: 96 additions & 1 deletion tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,13 @@ def test_main(tmp_path, monkeypatch):
"CRIM",
]
assert set(df.id) == set(expected_stations)
# Verify the first HLNA entry (value changed due to duplicate timestamp fix)
# The fix now selects the closest GPS point to each InSAR epoch
expected_entry = {
"id": "HLNA",
"date": "2016-07-23",
"measurement": "los_gps",
"value": 0.0010796103397325,
"value": -0.0020305613040891,
}
pd.testing.assert_series_equal(
df[df.id == "HLNA"].iloc[0], pd.Series(expected_entry, name=0)
Expand Down Expand Up @@ -182,3 +184,96 @@ def test_select_gps_reference_no_coherence_data():
# Should fall back to RMS-based selection even with coherence_priority=True
ref_station = select_gps_reference(station_to_merged, coherence_priority=True)
assert ref_station == "STAT_A" # Lower RMS misfit


def test_no_duplicate_insar_timestamps():
"""Test that merge produces exactly one GPS row per InSAR epoch.

This test demonstrates the issue where multiple GPS observations
(from adjacent days or same day with different times) can match
to the same InSAR acquisition when using merge_asof with
tolerance="1D" and direction="nearest".

The fix ensures we keep only the closest GPS sample per InSAR epoch.
"""
# GPS data: daily observations at ~11:57:30 (typical GPS timestamp)
gps_dates = pd.date_range("2023-01-01 11:57:30", periods=10, freq="D")
gps_df = pd.DataFrame(
{
"los_gps": np.random.normal(0, 0.01, 10),
"sigma_los": np.full(10, 0.001),
},
index=gps_dates,
)

# InSAR data: epochs at midnight (typical InSAR timestamp)
# Use fewer epochs to trigger the duplicate matching issue
insar_dates = pd.DatetimeIndex(
[
"2023-01-02 00:00:00",
"2023-01-05 00:00:00",
"2023-01-08 00:00:00",
]
)
insar_df = pd.DataFrame(
{
"los_insar": np.random.normal(0, 0.01, 3),
},
index=insar_dates,
)

# OLD BEHAVIOR: merge_asof with tolerance="1D" allows multiple GPS rows
# to match the same InSAR epoch
old_merged = pd.merge_asof(
left=gps_df.sort_index(),
right=insar_df.sort_index(),
tolerance=pd.Timedelta("1D"),
direction="nearest",
left_index=True,
right_index=True,
)

# Demonstrate the issue: multiple GPS rows can have the same los_insar value
# (meaning they matched to the same InSAR epoch)
value_counts = old_merged["los_insar"].value_counts()
(value_counts > 1).any()

# FIXED BEHAVIOR: Keep only one GPS row per InSAR epoch (the closest one)
# Add insar_time column to track which InSAR epoch each GPS row matched to
insar_with_time = insar_df.assign(insar_time=insar_df.index)
merged = pd.merge_asof(
left=gps_df.sort_index(),
right=insar_with_time.sort_index(),
tolerance=pd.Timedelta("1D"),
direction="nearest",
left_index=True,
right_index=True,
)

# Keep only the closest GPS row per InSAR epoch
dt = (merged.index - merged["insar_time"]).abs()
keep_idx = dt.groupby(merged["insar_time"]).idxmin()
merged_one_per_insar = merged.loc[keep_idx].sort_index()

# After the fix: each InSAR value should appear exactly once
fixed_value_counts = merged_one_per_insar["los_insar"].value_counts()
assert (
fixed_value_counts <= 1
).all(), "Each InSAR epoch should appear at most once"

# Verify we have the expected number of matches (3 InSAR epochs)
assert (
len(merged_one_per_insar) == 3
), "Should have exactly 3 GPS rows (one per InSAR epoch)"

# Verify that GPS rows are matched to the closest InSAR epoch
for idx, row in merged_one_per_insar.iterrows():
insar_time = row["insar_time"]
# This GPS timestamp should be the closest to its matched InSAR epoch
gps_matches = merged[merged["insar_time"] == insar_time]
time_diffs = pd.Series(
abs(gps_matches.index - insar_time), index=gps_matches.index
)
assert (
idx == time_diffs.idxmin()
), f"GPS row {idx} should be closest to InSAR epoch {insar_time}"
Loading