Commit 32d8176

update california
1 parent 6bf3c4e commit 32d8176

File tree

3 files changed: +709 -0 lines changed

Lines changed: 342 additions & 0 deletions
@@ -0,0 +1,342 @@
# %%
import argparse
import json
import multiprocessing as mp
import os
import pickle
import time
from contextlib import nullcontext

import fsspec
import numpy as np
import pandas as pd
from pyproj import Proj
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm


# %%
def pairing_picks(event_pairs, picks, config):

    picks = picks[["idx_eve", "idx_sta", "phase_type", "phase_score", "phase_time"]].copy()
    merged = pd.merge(
        event_pairs,
        picks,
        left_on="idx_eve1",
        right_on="idx_eve",
    )
    merged = pd.merge(
        merged,
        picks,
        left_on=["idx_eve2", "idx_sta", "phase_type"],
        right_on=["idx_eve", "idx_sta", "phase_type"],
        suffixes=("_1", "_2"),
    )
    merged = merged.rename(columns={"phase_time_1": "phase_time1", "phase_time_2": "phase_time2"})
    merged["phase_score"] = (merged["phase_score_1"] + merged["phase_score_2"]) / 2.0

    merged["travel_time1"] = (merged["phase_time1"] - merged["event_time1"]).dt.total_seconds()
    merged["travel_time2"] = (merged["phase_time2"] - merged["event_time2"]).dt.total_seconds()
    merged["phase_dtime"] = merged["travel_time1"] - merged["travel_time2"]

    # filtering
    # merged = merged.sort_values("phase_score", ascending=False)
    merged = (
        merged.groupby(["idx_eve1", "idx_eve2"], group_keys=False)
        .apply(lambda x: (x.nlargest(config["MAX_OBS"], "phase_score") if len(x) > config["MIN_OBS"] else None))
        .reset_index(drop=True)
    )

    return merged[["idx_eve1", "idx_eve2", "idx_sta", "phase_type", "phase_score", "phase_dtime"]]
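
# A commented-out sketch (not part of the original script) showing pairing_picks on toy
# data; the column names match the frames built in the main block, the values are made up.
# pairs_toy = pd.DataFrame({
#     "idx_eve1": [0], "idx_eve2": [1],
#     "event_time1": [pd.Timestamp("2022-01-01 00:00:00")],
#     "event_time2": [pd.Timestamp("2022-01-01 00:01:00")],
# })
# picks_toy = pd.DataFrame({
#     "idx_eve": [0] * 9 + [1] * 9,
#     "idx_sta": list(range(9)) * 2,
#     "phase_type": [0] * 18,
#     "phase_score": [1.0] * 18,
#     "phase_time": pd.to_datetime(["2022-01-01 00:00:05"] * 9 + ["2022-01-01 00:01:06"] * 9),
# })
# # With MIN_OBS=8 and MAX_OBS=100 all nine shared picks are kept, each with
# # phase_dtime = 5 s - 6 s = -1 s.
# print(pairing_picks(pairs_toy, picks_toy, {"MIN_OBS": 8, "MAX_OBS": 100}))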

# %%
def load_data(year, jday, data_path, root_path, bucket, protocol, token):

    fs = fsspec.filesystem(protocol, token=token)
    adloc_events_csv = f"{data_path}/{year:04d}/adloc_events_{jday:03d}.csv"
    adloc_picks_csv = f"{data_path}/{year:04d}/adloc_picks_{jday:03d}.csv"
    if protocol == "file":
        events = pd.read_csv(f"{root_path}/{adloc_events_csv}", parse_dates=["time"])
        picks = pd.read_csv(f"{root_path}/{adloc_picks_csv}", parse_dates=["phase_time"])
    else:
        with fs.open(f"{bucket}/{adloc_events_csv}", "r") as fp:
            events = pd.read_csv(fp, parse_dates=["time"])
        with fs.open(f"{bucket}/{adloc_picks_csv}", "r") as fp:
            picks = pd.read_csv(fp, parse_dates=["phase_time"])

    events["year"] = year
    events["jday"] = jday
    picks["year"] = year
    picks["jday"] = jday

    return events, picks


def parse_args():
    parser = argparse.ArgumentParser(description="Run Gamma on NCEDC/SCEDC data")
    parser.add_argument("--num_nodes", type=int, default=366)
    parser.add_argument("--node_rank", type=int, default=0)
    parser.add_argument("--year", type=int, default=2023)
    parser.add_argument("--root_path", type=str, default="local")
    parser.add_argument("--region", type=str, default="Cal")
    parser.add_argument("--bucket", type=str, default="quakeflow_catalog")
    return parser.parse_args()
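
# A usage sketch (not part of the original file); "pair_picks.py" is a hypothetical name
# for this script, and the flags mirror parse_args() above:
#   python pair_picks.py --year 2022 --root_path local --region Cal --bucket quakeflow_catalog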

# %%
if __name__ == "__main__":

    # %%
    protocol = "gs"
    token_json = "application_default_credentials.json"
    with open(token_json, "r") as fp:
        token = json.load(fp)

    fs = fsspec.filesystem(protocol, token=token)

    # %%
    args = parse_args()
    region = args.region
    root_path = args.root_path
    bucket = args.bucket
    num_nodes = args.num_nodes
    node_rank = args.node_rank
    year = args.year

    data_path = f"{region}/adloc2"
    result_path = f"{region}/adloc_dd_2022"

    if not os.path.exists(f"{root_path}/{result_path}"):
        os.makedirs(f"{root_path}/{result_path}")

    # %%
    station_json = f"{region}/network/stations.json"
    if protocol == "file":
        stations = pd.read_json(f"{root_path}/{station_json}", orient="index")
    else:
        with fs.open(f"{bucket}/{station_json}", "r") as fp:
            stations = pd.read_json(fp, orient="index")
    stations["station_id"] = stations.index
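
    # Note added for clarity (not in the original): the steps below assume stations.json
    # is keyed by station_id and provides at least "longitude", "latitude", and
    # "elevation_m", which are used for the local projection further down.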

    # %%
    events = []
    picks = []
    jobs = []
    ctx = mp.get_context("spawn")
    ncpu = min(32, mp.cpu_count())
    # years = range(2015, 2024)
    years = [2022]
    # num_days = sum([366 if (year % 4 == 0 and year % 100 != 0) or year % 400 == 0 else 365 for year in years])
    num_days = 365 * len(years)
    pbar = tqdm(total=num_days, desc="Loading data")
    with ctx.Pool(processes=ncpu) as pool:
        for year in years:
            # num_jday = 366 if (year % 4 == 0 and year % 100 != 0) or year % 400 == 0 else 365
            num_jday = 365
            for jday in range(1, num_jday + 1):
                job = pool.apply_async(
                    load_data,
                    args=(year, jday, data_path, root_path, bucket, protocol, token),
                    callback=lambda x: pbar.update(),
                )
                jobs.append(job)

        pool.close()
        pool.join()
        for job in jobs:
            events_, picks_ = job.get()
            events.append(events_)
            picks.append(picks_)

    pbar.close()
    events = pd.concat(events, ignore_index=True)
    picks = pd.concat(picks, ignore_index=True)

    events = events.sort_values("time")
    events["dummy_id"] = (
        events["year"].astype(str)
        + "."
        + events["jday"].astype(str).str.zfill(3)
        + "."
        + events["event_index"].astype(str).str.zfill(4)
    )
    picks["dummy_id"] = (
        picks["year"].astype(str)
        + "."
        + picks["jday"].astype(str).str.zfill(3)
        + "."
        + picks["event_index"].astype(str).str.zfill(4)
    )
    events["event_index"] = np.arange(len(events))
    picks = picks.drop("event_index", axis=1)
    picks = picks.merge(events[["dummy_id", "event_index"]], on="dummy_id")

    print(f"Processing {len(events)} events, {len(picks)} picks")

    events.to_csv(f"{root_path}/{result_path}/events.csv", index=False)
    picks.to_csv(f"{root_path}/{result_path}/picks.csv", index=False)

    # # %%
    # events = pd.read_csv(f"{root_path}/{result_path}/events.csv", parse_dates=["time"])
    # picks = pd.read_csv(f"{root_path}/{result_path}/picks.csv", parse_dates=["phase_time"])
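
    # Comment added for clarity (not in the original): event_index from the daily ADLoc
    # files is presumably only unique within a single (year, jday), so a
    # "year.jday.event_index" dummy_id is built first, a new continuous event_index is
    # assigned over the merged catalog, and the picks inherit it through the merge on
    # dummy_id.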

    # %%
    MAX_PAIR_DIST = 10  # km
    MAX_NEIGHBORS = 50
    MIN_NEIGHBORS = 8
    MIN_OBS = 8
    MAX_OBS = 100
    config = {}
    config["MAX_PAIR_DIST"] = MAX_PAIR_DIST
    config["MAX_NEIGHBORS"] = MAX_NEIGHBORS
    config["MIN_NEIGHBORS"] = MIN_NEIGHBORS
    config["MIN_OBS"] = MIN_OBS
    config["MAX_OBS"] = MAX_OBS
    mapping_phase_type_int = {"P": 0, "S": 1}

    picks = picks[picks["event_index"] != -1]
    # check phase_type is P/S or 0/1
    # if set(picks["phase_type"].unique()).issubset(set(mapping_phase_type_int.keys())):  # P/S
    picks["phase_type"] = picks["phase_type"].map(mapping_phase_type_int)
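
    # A commented-out sketch (not in the original) of the guard hinted at above, applying
    # the map only when phase_type still holds the string labels "P"/"S":
    # if set(picks["phase_type"].unique()).issubset(mapping_phase_type_int.keys()):
    #     picks["phase_type"] = picks["phase_type"].map(mapping_phase_type_int)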

    # %%
    if "idx_eve" in events.columns:
        events = events.drop("idx_eve", axis=1)
    if "idx_sta" in stations.columns:
        stations = stations.drop("idx_sta", axis=1)
    if "idx_eve" in picks.columns:
        picks = picks.drop("idx_eve", axis=1)
    if "idx_sta" in picks.columns:
        picks = picks.drop("idx_sta", axis=1)

    # %%
    # reindex in case the index does not start from 0 or is not continuous
    stations = stations[stations["station_id"].isin(picks["station_id"].unique())]
    events = events[events["event_index"].isin(picks["event_index"].unique())]
    stations["idx_sta"] = np.arange(len(stations))
    events["idx_eve"] = np.arange(len(events))

    picks = picks.merge(events[["event_index", "idx_eve"]], on="event_index")
    picks = picks.merge(stations[["station_id", "idx_sta"]], on="station_id")

    # %%
    lon0 = stations["longitude"].median()
    lat0 = stations["latitude"].median()
    proj = Proj(f"+proj=sterea +lon_0={lon0} +lat_0={lat0} +units=km")

    stations[["x_km", "y_km"]] = stations.apply(
        lambda x: pd.Series(proj(longitude=x.longitude, latitude=x.latitude)), axis=1
    )
    stations["depth_km"] = -stations["elevation_m"] / 1000
    stations["z_km"] = stations["depth_km"]

    events[["x_km", "y_km"]] = events.apply(
        lambda x: pd.Series(proj(longitude=x.longitude, latitude=x.latitude)), axis=1
    )
    events["z_km"] = events["depth_km"]

    picks = picks.merge(events[["idx_eve", "time"]], on="idx_eve")
    picks["travel_time"] = (picks["phase_time"] - picks["time"]).dt.total_seconds()
    picks.drop("time", axis=1, inplace=True)
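
    # Commented-out sanity check (not in the original): the stereographic projection is
    # invertible, so x_km/y_km should map back to the original coordinates.
    # lon_chk, lat_chk = proj(stations["x_km"].values, stations["y_km"].values, inverse=True)
    # assert np.allclose(lon_chk, stations["longitude"].values, atol=1e-6)
    # assert np.allclose(lat_chk, stations["latitude"].values, atol=1e-6)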

    # %%
    # # Option 1: Radius neighbors
    # neigh = NearestNeighbors(radius=MAX_PAIR_DIST, n_jobs=-1)
    # print("Fitting NearestNeighbors")
    # neigh.fit(events[["x_km", "y_km", "z_km"]].values)
    # pairs = set()
    # print("Get neighbors")
    # neigh_ind = neigh.radius_neighbors(sort_results=True)[1]
    # print("Generating pairs")
    # for i, neighs in enumerate(tqdm(neigh_ind, desc="Generating pairs")):
    #     if len(neighs) < MIN_NEIGHBORS:
    #         continue
    #     for j in neighs[:MAX_NEIGHBORS]:
    #         if i < j:
    #             pairs.add((i, j))
    #         else:
    #             pairs.add((j, i))

    # Option 2: K-nearest neighbors
    neigh = NearestNeighbors(n_neighbors=MAX_NEIGHBORS, n_jobs=-1)
    print("Fitting NearestNeighbors...")
    neigh.fit(events[["x_km", "y_km", "z_km"]].values)
    pairs = set()
    print("Get neighbors...")
    neigh_dist, neigh_ind = neigh.kneighbors()
    print("Generating pairs...")
    for i, (dists, inds) in enumerate(tqdm(zip(neigh_dist, neigh_ind), desc="Generating pairs", total=len(neigh_ind))):
        inds = inds[dists <= MAX_PAIR_DIST]
        if len(inds) < MIN_NEIGHBORS:
            continue
        for j in inds:
            if i < j:
                pairs.add((i, j))
            else:
                pairs.add((j, i))

    pairs = list(pairs)
    event_pairs = pd.DataFrame(pairs, columns=["idx_eve1", "idx_eve2"])
    print(f"Number of events: {len(events)}")
    print(f"Number of event pairs: {len(event_pairs)}")
    event_pairs["event_time1"] = events["time"].iloc[event_pairs["idx_eve1"]].values
    event_pairs["event_time2"] = events["time"].iloc[event_pairs["idx_eve2"]].values
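
    # Note added for clarity (not in the original): kneighbors() with no arguments does
    # not count an event as its own neighbor, and storing each pair with the smaller
    # index first keeps every undirected pair only once. Commented-out invariant check:
    # assert all(i < j for i, j in pairs)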

    # %%
    chunk_size = 100_000
    num_chunk = max(1, len(event_pairs) // chunk_size)  # at least one chunk so array_split/Pool stay valid
    pbar = tqdm(total=num_chunk, desc="Pairing picks")

    results = []
    jobs = []
    ctx = mp.get_context("spawn")
    ncpu = min(num_chunk, min(32, mp.cpu_count()))
    picks["idx_eve"] = picks["idx_eve"].astype("category")
    with ctx.Pool(processes=ncpu) as pool:
        for i in np.array_split(np.arange(len(event_pairs)), num_chunk):
            event_pairs_ = event_pairs.iloc[i]
            idx = np.unique(event_pairs_[["idx_eve1", "idx_eve2"]].values.flatten())
            picks_ = picks[picks["idx_eve"].isin(idx)]
            job = pool.apply_async(pairing_picks, args=(event_pairs_, picks_, config), callback=lambda x: pbar.update())
            jobs.append(job)
        pool.close()
        pool.join()
        for job in jobs:
            results.append(job.get())

    event_pairs = pd.concat(results, ignore_index=True)
    event_pairs = event_pairs.drop_duplicates()

    print(f"Number of pick pairs: {len(event_pairs)}")

    dtypes = np.dtype(
        [
            ("idx_eve1", np.int32),
            ("idx_eve2", np.int32),
            ("idx_sta", np.int32),
            ("phase_type", np.int32),
            ("phase_score", np.float32),
            ("phase_dtime", np.float32),
        ]
    )
    pairs_array = np.memmap(
        f"{root_path}/{result_path}/pair_dt.dat",
        mode="w+",
        shape=(len(event_pairs),),
        dtype=dtypes,
    )
    pairs_array["idx_eve1"] = event_pairs["idx_eve1"].values
    pairs_array["idx_eve2"] = event_pairs["idx_eve2"].values
    pairs_array["idx_sta"] = event_pairs["idx_sta"].values
    pairs_array["phase_type"] = event_pairs["phase_type"].values
    pairs_array["phase_score"] = event_pairs["phase_score"].values
    pairs_array["phase_dtime"] = event_pairs["phase_dtime"].values
    with open(f"{root_path}/{result_path}/pair_dtypes.pkl", "wb") as f:
        pickle.dump(dtypes, f)

    events.to_csv(f"{root_path}/{result_path}/pair_events.csv", index=False)
    stations.to_csv(f"{root_path}/{result_path}/pair_stations.csv", index=False)
    picks.to_csv(f"{root_path}/{result_path}/pair_picks.csv", index=False)

    # %%
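
    # Commented-out sketch (not part of the original) of how a downstream step could read
    # the memmap back using the pickled dtype; the paths mirror the ones written above.
    # with open(f"{root_path}/{result_path}/pair_dtypes.pkl", "rb") as f:
    #     dtypes = pickle.load(f)
    # pair_dt = np.memmap(f"{root_path}/{result_path}/pair_dt.dat", mode="r", dtype=dtypes)
    # print(pair_dt[:5])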
