Improve phasenet_plus clustering (results are still not ideal)

zhuwq0 · zhuwq0 · commit b4fec1ffa8c1 · 2024-11-02T23:02:53.000-07:00
diff --git a/scripts/run_adloc.py b/scripts/run_adloc.py
@@ -44,8 +44,10 @@ def run_adloc(
 
     # %%
     data_path = f"{root_path}/{region}/gamma"
-    picks_file = os.path.join(data_path, f"gamma_picks.csv")
-    events_file = os.path.join(data_path, f"gamma_events.csv")
+    picks_file = f"{data_path}/gamma_picks.csv"
+    events_file = f"{data_path}/gamma_events.csv"
+    # picks_file = f"{root_path}/{region}/gamma_plus/gamma_picks.csv"
+    # events_file = f"{root_path}/{region}/gamma_plus/gamma_events.csv"
     # picks_file = f"{root_path}/{region}/phasenet_plus/phasenet_plus_picks_associated.csv"
     # events_file = f"{root_path}/{region}/phasenet_plus/phasenet_plus_events_associated.csv"
 
@@ -186,6 +188,9 @@ def run_adloc(
     picks = picks.merge(events[["event_index", "idx_eve"]], on="event_index")
     picks = picks.merge(stations[["station_id", "idx_sta"]], on="station_id")
 
+    print(f"Number of picks: {len(picks)}")
+    print(f"Number of events: {len(events)}")
+
     # %%
     estimator = ADLoc(config, stations=stations[["x_km", "y_km", "z_km"]].values, eikonal=config["eikonal"])
 
diff --git a/scripts/run_event_association.py b/scripts/run_event_association.py
@@ -1,15 +1,19 @@
 # %%
 import json
 import os
+from glob import glob
 from typing import Dict
+
 import fsspec
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+from args import parse_args
+from pyproj import Proj
+from scipy.sparse.csgraph import minimum_spanning_tree
+from scipy.spatial.distance import pdist, squareform
 from sklearn.cluster import DBSCAN
 from tqdm import tqdm
-from args import parse_args
-from glob import glob
 
 
 def associate(
@@ -20,6 +24,21 @@ def associate(
 ):
 
     VPVS_RATIO = config["VPVS_RATIO"]
+    VP = config["VP"]
+
+    proj = Proj(proj="merc", datum="WGS84", units="km")
+    stations[["x_km", "y_km"]] = stations.apply(lambda x: pd.Series(proj(x.longitude, x.latitude)), axis=1)
+
+    # dist_matrix = squareform(pdist(stations[["x_km", "y_km"]].values))
+    # mst = minimum_spanning_tree(dist_matrix)
+    # dx = np.median(mst.data[mst.data > 0])
+    # print(f"dx: {dx:.3f}")
+    # eps_t = dx / VP * 2.0
+    # eps_t = 6.0
+    # eps_xy = eps_t * VP * 2 / (1.0 + VPVS_RATIO)
+    # print(f"eps_t: {eps_t:.3f}, eps_xy: {eps_xy:.3f}")
+    eps_xy = 30.0
+    print(f"eps_xy: {eps_xy:.3f}")
 
     # %%
     t0 = min(events["event_time"].min(), picks["phase_time"].min())
@@ -28,8 +47,13 @@ def associate(
     picks["timestamp"] = picks["phase_time"].apply(lambda x: (x - t0).total_seconds())
 
     # %%
-    # clustering = DBSCAN(eps=3, min_samples=3).fit(events[["timestamp", "x_s", "y_s"]])
-    clustering = DBSCAN(eps=3, min_samples=3).fit(events[["timestamp"]])
+    events = events.merge(stations[["station_id", "x_km", "y_km"]], on="station_id", how="left")
+
+    scaling = np.array([1.0, 1.0 / eps_xy, 1.0 / eps_xy])
+    clustering = DBSCAN(eps=2.0, min_samples=4).fit(events[["timestamp", "x_km", "y_km"]] * scaling)
+    # clustering = DBSCAN(eps=2.0, min_samples=4).fit(events[["timestamp"]])
+    # clustering = DBSCAN(eps=3.0, min_samples=3).fit(events[["timestamp"]])
+    # clustering = DBSCAN(eps=1.0, min_samples=3).fit(events[["timestamp"]])
     events["event_index"] = clustering.labels_
     print(f"Number of associated events: {len(events['event_index'].unique())}")
 
diff --git a/scripts/run_gamma.py b/scripts/run_gamma.py
@@ -26,23 +26,26 @@ def run_gamma(
 
     # %%
     data_path = f"{region}/phasenet"
+    # data_path = f"{region}/phasenet_plus"
     result_path = f"{region}/gamma"
     if not os.path.exists(f"{root_path}/{result_path}"):
         os.makedirs(f"{root_path}/{result_path}")
 
     # %%
     station_json = f"{region}/obspy/stations.json"
-    if picks_csv is None:
-        picks_csv = f"{data_path}/phasenet_picks_{node_rank:03d}_{num_nodes:03d}.csv"
-    gamma_events_csv = f"{result_path}/gamma_events_{node_rank:03d}_{num_nodes:03d}.csv"
-    gamma_picks_csv = f"{result_path}/gamma_picks_{node_rank:03d}_{num_nodes:03d}.csv"
+    # if picks_csv is None:
+    picks_csv = f"{data_path}/phasenet_picks.csv"
+    # picks_csv = f"{data_path}/phasenet_plus_picks.csv"
+    gamma_events_csv = f"{result_path}/gamma_events.csv"
+    gamma_picks_csv = f"{result_path}/gamma_picks.csv"
 
     # %%
     ## read picks
     if protocol == "file":
         picks = pd.read_csv(f"{root_path}/{picks_csv}")
     else:
         picks = pd.read_csv(f"{protocol}://{bucket}/{picks_csv}")
+    picks.drop(columns=["event_index"], inplace=True, errors="ignore")
     picks["id"] = picks["station_id"]
     picks["timestamp"] = picks["phase_time"]
     if "phase_amp" in picks.columns:
@@ -126,6 +129,8 @@ def run_gamma(
     for k, v in config.items():
         print(f"{k}: {v}")
 
+    print(f"Number of picks: {len(picks)}")
+
     # %%
     event_idx0 = 0  ## current earthquake index
     assignments = []
diff --git a/scripts/run_phasenet_plus.py b/scripts/run_phasenet_plus.py
@@ -125,6 +125,8 @@ def run_phasenet(
         )
         picks = pd.read_csv(f"{root_path}/{region}/phasenet_plus/picks_phasenet_plus.csv", parse_dates=["phase_time"])
         events, picks = associate(picks, events, stations, config)
+        print(f"Number of picks: {len(picks):,}")
+        print(f"Number of associated events: {len(events['event_index'].unique()):,}")
         events.to_csv(f"{root_path}/{region}/phasenet_plus/phasenet_plus_events_associated.csv", index=False)
         picks.to_csv(f"{root_path}/{region}/phasenet_plus/phasenet_plus_picks_associated.csv", index=False)
 

Original file line number	Diff line number	Diff line change
`@@ -125,6 +125,8 @@ def run_phasenet(`
`125`	`125`	`)`
`126`	`126`	`picks = pd.read_csv(f"{root_path}/{region}/phasenet_plus/picks_phasenet_plus.csv", parse_dates=["phase_time"])`
`127`	`127`	`events, picks = associate(picks, events, stations, config)`
	`128`	`+ print(f"Number of picks: {len(picks):,}")`
	`129`	`+ print(f"Number of associated events: {len(events['event_index'].unique()):,}")`
`128`	`130`	`events.to_csv(f"{root_path}/{region}/phasenet_plus/phasenet_plus_events_associated.csv", index=False)`
`129`	`131`	`picks.to_csv(f"{root_path}/{region}/phasenet_plus/phasenet_plus_picks_associated.csv", index=False)`
`130`	`132`