# %%
import argparse
import json
import multiprocessing as mp
import os
import pickle

import fsspec
import numpy as np
import pandas as pd
from pyproj import Proj
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm


# %%
def pairing_picks(event_pairs, picks, config):
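    """Build differential travel-time observations for each event pair.

    For every (idx_eve1, idx_eve2) pair, picks of the same station and phase type
    are matched, their differential travel time is computed, and only the
    config["MAX_OBS"] highest-score observations are kept; pairs with no more than
    config["MIN_OBS"] matching picks are dropped.
    """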

    picks = picks[["idx_eve", "idx_sta", "phase_type", "phase_score", "phase_time"]].copy()
    merged = pd.merge(
        event_pairs,
        picks,
        left_on="idx_eve1",
        right_on="idx_eve",
    )
    merged = pd.merge(
        merged,
        picks,
        left_on=["idx_eve2", "idx_sta", "phase_type"],
        right_on=["idx_eve", "idx_sta", "phase_type"],
        suffixes=("_1", "_2"),
    )
    merged = merged.rename(columns={"phase_time_1": "phase_time1", "phase_time_2": "phase_time2"})
    merged["phase_score"] = (merged["phase_score_1"] + merged["phase_score_2"]) / 2.0

    merged["travel_time1"] = (merged["phase_time1"] - merged["event_time1"]).dt.total_seconds()
    merged["travel_time2"] = (merged["phase_time2"] - merged["event_time2"]).dt.total_seconds()
    merged["phase_dtime"] = merged["travel_time1"] - merged["travel_time2"]

    # filtering
    # merged = merged.sort_values("phase_score", ascending=False)
    merged = (
        merged.groupby(["idx_eve1", "idx_eve2"], group_keys=False)
        .apply(lambda x: (x.nlargest(config["MAX_OBS"], "phase_score") if len(x) > config["MIN_OBS"] else None))
        .reset_index(drop=True)
    )

    return merged[["idx_eve1", "idx_eve2", "idx_sta", "phase_type", "phase_score", "phase_dtime"]]


# %%
def load_data(year, jday, data_path, root_path, bucket, protocol, token):
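    """Load one day of ADLoc events and picks, either from local disk or from cloud storage."""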

    fs = fsspec.filesystem(protocol, token=token)
    adloc_events_csv = f"{data_path}/{year:04d}/adloc_events_{jday:03d}.csv"
    adloc_picks_csv = f"{data_path}/{year:04d}/adloc_picks_{jday:03d}.csv"
    if protocol == "file":
        events = pd.read_csv(f"{root_path}/{adloc_events_csv}", parse_dates=["time"])
        picks = pd.read_csv(f"{root_path}/{adloc_picks_csv}", parse_dates=["phase_time"])
    else:
        with fs.open(f"{bucket}/{adloc_events_csv}", "r") as fp:
            events = pd.read_csv(fp, parse_dates=["time"])
        with fs.open(f"{bucket}/{adloc_picks_csv}", "r") as fp:
            picks = pd.read_csv(fp, parse_dates=["phase_time"])

    events["year"] = year
    events["jday"] = jday
    picks["year"] = year
    picks["jday"] = jday

    return events, picks


def parse_args():
    parser = argparse.ArgumentParser(description="Build differential-time pick pairs for NCEDC/SCEDC data")
    parser.add_argument("--num_nodes", type=int, default=366)
    parser.add_argument("--node_rank", type=int, default=0)
    parser.add_argument("--year", type=int, default=2023)
    parser.add_argument("--root_path", type=str, default="local")
    parser.add_argument("--region", type=str, default="Cal")
    parser.add_argument("--bucket", type=str, default="quakeflow_catalog")
    return parser.parse_args()


# %%
if __name__ == "__main__":

    # %%
    protocol = "gs"
    token_json = "application_default_credentials.json"
    with open(token_json, "r") as fp:
        token = json.load(fp)

    fs = fsspec.filesystem(protocol, token=token)

    # %%
    args = parse_args()
    region = args.region
    root_path = args.root_path
    bucket = args.bucket
    num_nodes = args.num_nodes
    node_rank = args.node_rank
    year = args.year

    data_path = f"{region}/adloc2"
    result_path = f"{region}/adloc_dd_2022"

    os.makedirs(f"{root_path}/{result_path}", exist_ok=True)

    # %%
    station_json = f"{region}/network/stations.json"
    if protocol == "file":
        stations = pd.read_json(f"{root_path}/{station_json}", orient="index")
    else:
        with fs.open(f"{bucket}/{station_json}", "r") as fp:
            stations = pd.read_json(fp, orient="index")
    stations["station_id"] = stations.index

    # %%
    events = []
    picks = []
    jobs = []
    ctx = mp.get_context("spawn")
    ncpu = min(32, mp.cpu_count())
    # years = range(2015, 2024)
    years = [2022]
    # num_days = sum([366 if (year % 4 == 0 and year % 100 != 0) or year % 400 == 0 else 365 for year in years])
    num_days = 365 * len(years)
    pbar = tqdm(total=num_days, desc="Loading data")
    with ctx.Pool(processes=ncpu) as pool:
        for year in years:
            # num_jday = 366 if (year % 4 == 0 and year % 100 != 0) or year % 400 == 0 else 365
            num_jday = 365
            for jday in range(1, num_jday + 1):
                job = pool.apply_async(
                    load_data,
                    args=(year, jday, data_path, root_path, bucket, protocol, token),
                    callback=lambda x: pbar.update(),
                )
                jobs.append(job)

        pool.close()
        pool.join()
        for job in jobs:
            events_, picks_ = job.get()
            events.append(events_)
            picks.append(picks_)

    pbar.close()
    events = pd.concat(events, ignore_index=True)
    picks = pd.concat(picks, ignore_index=True)

    events = events.sort_values("time")
    events["dummy_id"] = (
        events["year"].astype(str)
        + "."
        + events["jday"].astype(str).str.zfill(3)
        + "."
        + events["event_index"].astype(str).str.zfill(4)
    )
    picks["dummy_id"] = (
        picks["year"].astype(str)
        + "."
        + picks["jday"].astype(str).str.zfill(3)
        + "."
        + picks["event_index"].astype(str).str.zfill(4)
    )
    events["event_index"] = np.arange(len(events))
    picks = picks.drop("event_index", axis=1)
    picks = picks.merge(events[["dummy_id", "event_index"]], on="dummy_id")

    print(f"Processing {len(events)} events, {len(picks)} picks")

    events.to_csv(f"{root_path}/{result_path}/events.csv", index=False)
    picks.to_csv(f"{root_path}/{result_path}/picks.csv", index=False)

    # # %%
    # events = pd.read_csv(f"{root_path}/{result_path}/events.csv", parse_dates=["time"])
    # picks = pd.read_csv(f"{root_path}/{result_path}/picks.csv", parse_dates=["phase_time"])

    # %%
    MAX_PAIR_DIST = 10  # km
    MAX_NEIGHBORS = 50
    MIN_NEIGHBORS = 8
    MIN_OBS = 8
    MAX_OBS = 100
    config = {}
    config["MAX_PAIR_DIST"] = MAX_PAIR_DIST
    config["MAX_NEIGHBORS"] = MAX_NEIGHBORS
    config["MIN_NEIGHBORS"] = MIN_NEIGHBORS
    config["MIN_OBS"] = MIN_OBS
    config["MAX_OBS"] = MAX_OBS
    mapping_phase_type_int = {"P": 0, "S": 1}

    picks = picks[picks["event_index"] != -1]
    # map phase_type to integers only if it is still P/S (leave it alone if already 0/1)
    if set(picks["phase_type"].unique()).issubset(set(mapping_phase_type_int.keys())):
        picks["phase_type"] = picks["phase_type"].map(mapping_phase_type_int)

    # %%
    if "idx_eve" in events.columns:
        events = events.drop("idx_eve", axis=1)
    if "idx_sta" in stations.columns:
        stations = stations.drop("idx_sta", axis=1)
    if "idx_eve" in picks.columns:
        picks = picks.drop("idx_eve", axis=1)
    if "idx_sta" in picks.columns:
        picks = picks.drop("idx_sta", axis=1)

    # %%
    # reindex in case the index does not start from 0 or is not contiguous
    stations = stations[stations["station_id"].isin(picks["station_id"].unique())]
    events = events[events["event_index"].isin(picks["event_index"].unique())]
    stations["idx_sta"] = np.arange(len(stations))
    events["idx_eve"] = np.arange(len(events))

    picks = picks.merge(events[["event_index", "idx_eve"]], on="event_index")
    picks = picks.merge(stations[["station_id", "idx_sta"]], on="station_id")

    # %%
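    # project geographic coordinates to a local Cartesian frame in km, centered on the median station location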
    lon0 = stations["longitude"].median()
    lat0 = stations["latitude"].median()
    proj = Proj(f"+proj=sterea +lon_0={lon0} +lat_0={lat0} +units=km")

    stations[["x_km", "y_km"]] = stations.apply(
        lambda x: pd.Series(proj(longitude=x.longitude, latitude=x.latitude)), axis=1
    )
    stations["depth_km"] = -stations["elevation_m"] / 1000
    stations["z_km"] = stations["depth_km"]

    events[["x_km", "y_km"]] = events.apply(
        lambda x: pd.Series(proj(longitude=x.longitude, latitude=x.latitude)), axis=1
    )
    events["z_km"] = events["depth_km"]

    picks = picks.merge(events[["idx_eve", "time"]], on="idx_eve")
    picks["travel_time"] = (picks["phase_time"] - picks["time"]).dt.total_seconds()
    picks.drop("time", axis=1, inplace=True)

    # %%
    # # Option 1: Radius neighbors
    # neigh = NearestNeighbors(radius=MAX_PAIR_DIST, n_jobs=-1)
    # print("Fitting NearestNeighbors")
    # neigh.fit(events[["x_km", "y_km", "z_km"]].values)
    # pairs = set()
    # print("Get neighbors")
    # neigh_ind = neigh.radius_neighbors(sort_results=True)[1]
    # print("Generating pairs")
    # for i, neighs in enumerate(tqdm(neigh_ind, desc="Generating pairs")):
    #     if len(neighs) < MIN_NEIGHBORS:
    #         continue
    #     for j in neighs[:MAX_NEIGHBORS]:
    #         if i < j:
    #             pairs.add((i, j))
    #         else:
    #             pairs.add((j, i))

    # Option 2: K-nearest neighbors
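    # For each event, take its MAX_NEIGHBORS nearest events, keep only those within MAX_PAIR_DIST km,
    # skip events with fewer than MIN_NEIGHBORS such neighbors, and store each pair once (smaller index first).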
    neigh = NearestNeighbors(n_neighbors=MAX_NEIGHBORS, n_jobs=-1)
    print("Fitting NearestNeighbors...")
    neigh.fit(events[["x_km", "y_km", "z_km"]].values)
    pairs = set()
    print("Get neighbors...")
    neigh_dist, neigh_ind = neigh.kneighbors()
    print("Generating pairs...")
    for i, (dists, inds) in enumerate(tqdm(zip(neigh_dist, neigh_ind), desc="Generating pairs", total=len(neigh_ind))):
        inds = inds[dists <= MAX_PAIR_DIST]
        if len(inds) < MIN_NEIGHBORS:
            continue
        for j in inds:
            if i < j:
                pairs.add((i, j))
            else:
                pairs.add((j, i))

    pairs = list(pairs)
    event_pairs = pd.DataFrame(pairs, columns=["idx_eve1", "idx_eve2"])
    print(f"Number of events: {len(events)}")
    print(f"Number of event pairs: {len(event_pairs)}")
    event_pairs["event_time1"] = events["time"].iloc[event_pairs["idx_eve1"]].values
    event_pairs["event_time2"] = events["time"].iloc[event_pairs["idx_eve2"]].values

    # %%
    chunk_size = 100_000
    num_chunk = max(1, len(event_pairs) // chunk_size)  # at least one chunk for small pair sets
    pbar = tqdm(total=num_chunk, desc="Pairing picks")

    results = []
    jobs = []
    ctx = mp.get_context("spawn")
    ncpu = min(num_chunk, min(32, mp.cpu_count()))
    picks["idx_eve"] = picks["idx_eve"].astype("category")
    with ctx.Pool(processes=ncpu) as pool:
        for i in np.array_split(np.arange(len(event_pairs)), num_chunk):
            event_pairs_ = event_pairs.iloc[i]
            idx = np.unique(event_pairs_[["idx_eve1", "idx_eve2"]].values.flatten())
            picks_ = picks[picks["idx_eve"].isin(idx)]
            job = pool.apply_async(pairing_picks, args=(event_pairs_, picks_, config), callback=lambda x: pbar.update())
            jobs.append(job)
        pool.close()
        pool.join()
        for job in jobs:
            results.append(job.get())

    event_pairs = pd.concat(results, ignore_index=True)
    event_pairs = event_pairs.drop_duplicates()

    print(f"Number of pick pairs: {len(event_pairs)}")

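    # Write the pick pairs to a raw structured binary file via np.memmap and pickle its dtype,
    # so the file can be memory-mapped again later without loading everything into memory.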
    dtypes = np.dtype(
        [
            ("idx_eve1", np.int32),
            ("idx_eve2", np.int32),
            ("idx_sta", np.int32),
            ("phase_type", np.int32),
            ("phase_score", np.float32),
            ("phase_dtime", np.float32),
        ]
    )
    pairs_array = np.memmap(
        f"{root_path}/{result_path}/pair_dt.dat",
        mode="w+",
        shape=(len(event_pairs),),
        dtype=dtypes,
    )
    pairs_array["idx_eve1"] = event_pairs["idx_eve1"].values
    pairs_array["idx_eve2"] = event_pairs["idx_eve2"].values
    pairs_array["idx_sta"] = event_pairs["idx_sta"].values
    pairs_array["phase_type"] = event_pairs["phase_type"].values
    pairs_array["phase_score"] = event_pairs["phase_score"].values
    pairs_array["phase_dtime"] = event_pairs["phase_dtime"].values
    pairs_array.flush()  # make sure the memmap contents are written to disk
    with open(f"{root_path}/{result_path}/pair_dtypes.pkl", "wb") as f:
        pickle.dump(dtypes, f)

    events.to_csv(f"{root_path}/{result_path}/pair_events.csv", index=False)
    stations.to_csv(f"{root_path}/{result_path}/pair_stations.csv", index=False)
    picks.to_csv(f"{root_path}/{result_path}/pair_picks.csv", index=False)

# %%
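# Example (illustrative, not part of the pipeline): reading the saved pair files back,
# assuming the same root_path/result_path as above. The dtype pickle is needed because
# pair_dt.dat is a raw structured-array dump with no header.
# with open(f"{root_path}/{result_path}/pair_dtypes.pkl", "rb") as f:
#     dtypes = pickle.load(f)
# pair_dt = np.memmap(f"{root_path}/{result_path}/pair_dt.dat", mode="r", dtype=dtypes)
# print(pair_dt[:5])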