Commit 1d641b9

Merge branch 'main' of github.com:AI4EPS/QuakeFlow

2 parents: 78fbff8 + 18f5b73

File tree: 15 files changed, +2210 -144 lines

datasets/.skyignore

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+NCEDC
+SCEDC

datasets/NCEDC/convert_hdf5_v2.py

Lines changed: 168 additions & 40 deletions
Large diffs are not rendered by default.

datasets/NCEDC/extract_csv.py

Lines changed: 12 additions & 2 deletions
@@ -31,8 +31,18 @@ def process(i, folder, mode):
 
     events_df = pd.DataFrame(events_df)
     picks_df = pd.DataFrame(picks_df)
-    events_df.to_csv(f"{folder}/events{mode}.csv", index=False)
-    picks_df.to_csv(f"{folder}/picks{mode}.csv", index=False)
+    for col in events_df.columns:
+        try:
+            events_df[col] = events_df[col].apply(lambda x: x.__str__().replace("\n", " ").replace("\t", " ").replace("\r", " "))  # prevent csv from breaking  # replace('nan', '')
+        except:
+            pass
+    for col in picks_df.columns:
+        try:
+            picks_df[col] = picks_df[col].apply(lambda x: x.__str__().replace("\n", " ").replace("\t", " ").replace("\r", " "))  # prevent csv from breaking
+        except:
+            pass
+    events_df.to_csv(f"{folder}/events{mode}.csv", index=False, na_rep='')
+    picks_df.to_csv(f"{folder}/picks{mode}.csv", index=False, na_rep='')
 
 
 # %%
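
For context: a field containing a raw newline makes pandas write a quoted, multi-line CSV record, which breaks consumers that read the events/picks files line by line. A minimal sketch of the sanitization idea (not part of the commit; the remark column is hypothetical):

    import pandas as pd

    # A remark with an embedded newline would otherwise be written as a
    # quoted record spanning two physical lines.
    picks_df = pd.DataFrame({"event_id": ["nc123"], "remark": ["felt\nwidely"]})

    for col in picks_df.columns:
        # Same idea as the commit: flatten newlines/tabs/carriage returns
        # so every record stays on one physical line.
        picks_df[col] = picks_df[col].apply(
            lambda x: str(x).replace("\n", " ").replace("\t", " ").replace("\r", " ")
        )

    picks_df.to_csv("picks_clean.csv", index=False, na_rep="")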

datasets/NCEDC/extract_ps.py

Lines changed: 5 additions & 4 deletions
@@ -10,7 +10,7 @@
 h5_files = os.listdir(h5_dir)
 
 # %%
-result_path = "waveform_ps_h5"
+result_path = "quakeflow_nc/waveform_h5"
 if not os.path.exists(result_path):
     os.makedirs(result_path)
 
@@ -19,7 +19,7 @@
 def run(h5_file):
     h5_input = os.path.join(h5_dir, h5_file)
     h5_output = os.path.join(result_path, h5_file)
-    pos = 2022 - int(h5_file.split("/")[-1].split(".")[0])
+    pos = 2023 - int(h5_file.split("/")[-1].split(".")[0])
     with h5py.File(h5_input, "r") as fin:
         with h5py.File(h5_output, "w") as fout:
             for event in tqdm(fin.keys(), desc=h5_file, total=len(fin.keys()), position=pos, leave=True):
@@ -29,7 +29,8 @@ def run(h5_file):
                     gp.attrs[key] = fin[event].attrs[key]
                 num_station = 0
                 for station in fin[event].keys():
-                    if "S" in fin[event][station].attrs["phase_type"]:
+                    attrs = fin[event][station].attrs
+                    if len(set(attrs["phase_type"][attrs["event_id"] == event])) > 1:
                         ds = gp.create_dataset(station, data=fin[event][station])
                         for key in fin[event][station].attrs.keys():
                             ds.attrs[key] = fin[event][station].attrs[key]
@@ -43,7 +44,7 @@ def run(h5_file):
 if __name__ == "__main__":
     # run(0, h5_files[0])
 
-    ncpu = len(h5_files)
+    ncpu = min(len(h5_files), 32)
     print(f"Using {ncpu} CPUs")
    with mp.Pool(ncpu) as p:
        p.map(run, h5_files)
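
The behavioral change here is the station filter: the old check kept any station whose picks contained an S phase; the new check keeps only stations whose picks for the current event span more than one phase type, i.e. both P and S. A standalone sketch of the condition with made-up attribute arrays:

    import numpy as np

    # Made-up per-station pick attributes mirroring the HDF5 attrs above.
    phase_type = np.array(["P", "S", "S"])
    event_id = np.array(["nc001", "nc001", "nc002"])
    event = "nc001"

    # Old filter: any S pick at all, even one belonging to another event.
    keep_old = "S" in phase_type

    # New filter: more than one distinct phase type for this event,
    # i.e. the station has both P and S picks for the event being written.
    keep_new = len(set(phase_type[event_id == event])) > 1

    print(keep_old, keep_new)  # True True; they diverge when the only
                               # S pick belongs to a different event

The bumped base year in the pos computation (2022 to 2023) presumably accounts for 2023 data so each yearly file keeps a non-negative tqdm progress-bar slot, and capping ncpu at 32 bounds the pool size as the file count grows.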

datasets/NCEDC/merge_hdf5.py

Lines changed: 4 additions & 4 deletions
@@ -6,10 +6,10 @@
 from tqdm import tqdm
 
 # %%
-h5_dir = "waveform_ps_h5"
-h5_out = "waveform_ps.h5"
-h5_train = "waveform_ps_train.h5"
-h5_test = "waveform_ps_test.h5"
+h5_dir = "waveform_h5"
+h5_out = "waveform.h5"
+h5_train = "waveform_train.h5"
+h5_test = "waveform_test.h5"
 
 # # %%
 # h5_dir = "waveform_h5"
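
merge_hdf5.py only picks up the renamed paths in this commit, and its body isn't shown. As a rough, hypothetical sketch of what merging the per-file archives into a single waveform.h5 could look like (assuming one top-level group per event, as in extract_ps.py above), h5py's copy can transfer whole groups:

    import os
    import h5py

    h5_dir = "waveform_h5"
    h5_out = "waveform.h5"

    # Copy every top-level (event) group from each archive into one file.
    with h5py.File(h5_out, "w") as fout:
        for name in sorted(os.listdir(h5_dir)):
            with h5py.File(os.path.join(h5_dir, name), "r") as fin:
                for event in fin.keys():
                    fin.copy(event, fout)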

datasets/NCEDC/parse_event.py

Lines changed: 142 additions & 0 deletions
@@ -0,0 +1,142 @@
+# %%
+import fsspec
+import pandas as pd
+from tqdm import tqdm
+import os
+from io import StringIO
+
+
+input_protocol = "s3"
+input_bucket = "ncedc-pds"
+input_folder = "earthquake_catalogs/NCEDC"
+
+output_protocol = "gs"
+output_bucket = "quakeflow_dataset"
+output_folder = "NC/catalog"
+
+result_path = "dataset"
+os.makedirs(result_path, exist_ok=True)
+
+# %%
+# status: (Event status)
+# A: Automatic
+# F: Finalized
+# H: Human Reviewed
+# I: Intermediate
+
+# magType: (Magnitude Type)
+# a : Primary amplitude magnitude (Jerry Eaton's XMAG)
+# b : Body-wave magnitude
+# d : Duration magnitude
+# dl: Low-gain initial P-wave amplitude magnitude
+# e : Energy magnitude
+# h : Human assigned magnitude
+# l : Local magnitude
+# n : No magnitude
+# un: Unknown magnitude type
+# w : Moment magnitude
+
+# type: (EventType)
+# bc: Building collapse/demolition
+# eq: Earthquake
+# ex: Generic chemical blast
+# lp: Long period volcanic earthquake
+# ls: Landslide
+# mi: Meteor/comet impact
+# nt: Nuclear test
+# ot: Other miscellaneous
+# qb: Quarry blast
+# rs: Rockslide
+# sh: Refraction/reflection survey shot
+# sn: Sonic shockwave
+# st: Subnet trigger
+# th: Thunder
+# uk: Unknown type
+
+def map_column_names(df):
+    column_mapping = {
+        'id': 'event_id',
+        'time': 'time',
+        'latitude': 'latitude',
+        'longitude': 'longitude',
+        'depth': 'depth_km',
+        'mag': 'magnitude',
+        'magType': 'magnitude_type',
+        'type': 'event_type',
+        'gap': 'azimuthal_gap',
+        'dmin': 'minimum_distance_km',
+        'rms': 'time_residual',
+        'horizontalError': 'horizontal_error_km',
+        'depthError': 'depth_error_km',
+        'status': 'review_status',
+        'nst': 'num_stations',
+        'net': 'network',
+        'updated': 'updated_time',
+        'place': 'place',
+        'magError': 'magnitude_error',
+        'magNst': 'magnitude_num_stations',
+        'locationSource': 'location_source',
+        'magSource': 'magnitude_source'
+    }
+
+    # Rename columns that exist in the dataframe
+    existing_columns = {col: column_mapping[col] for col in df.columns if col in column_mapping}
+    df = df.rename(columns=existing_columns)
+
+    return df
+
+# %%
+input_fs = fsspec.filesystem(input_protocol, anon=True)
+csv_files = sorted(input_fs.glob(f"{input_bucket}/{input_folder}/*.ehpcsv"), reverse=True)
+output_fs = fsspec.filesystem(output_protocol, token=os.path.expanduser("~/.config/gcloud/application_default_credentials.json"))
+
+# %%
+columns_to_keep = [
+    'event_id',
+    'time',
+    'latitude',
+    'longitude',
+    'depth_km',
+    'magnitude',
+    'magnitude_type',
+    'event_type',
+    'azimuthal_gap',
+    'minimum_distance_km',
+    'time_residual',
+    'horizontal_error_km',
+    'depth_error_km',
+    'review_status',
+]
+
+for csv_file in tqdm(csv_files):
+    print(csv_file)
+
+    df = pd.read_csv(f"{input_protocol}://{csv_file}", dtype=str, encoding='latin-1')
+    df = map_column_names(df)
+
+    df["time"] = pd.to_datetime(df["time"])
+    df["year"] = df["time"].dt.strftime("%Y")
+    df["jday"] = df["time"].dt.strftime("%j")
+    df['time'] = df['time'].apply(lambda x: x.strftime('%Y-%m-%dT%H:%M:%S.%f'))
+    df['event_id'] = df['event_id'].apply(lambda x: "nc" + x)
+
+
+    for (year, jday), df in df.groupby(["year", "jday"]):
+        if len(df) == 0:
+            continue
+        os.makedirs(f"{result_path}/{year}/{jday}", exist_ok=True)
+
+        df = df[columns_to_keep]
+        df.to_csv(f"{result_path}/{year}/{jday}/events.csv", index=False)
+        output_fs.put(
+            f"{result_path}/{year}/{jday}/events.csv",
+            f"{output_bucket}/{output_folder}/{year}/{jday}/events.csv",
+        )
+        # df.to_csv(f"{output_protocol}://{output_bucket}/{output_folder}/{year}/{jday}/events.csv", index=False)
+
+    if year <= "2024":
+        break
+
+
+# %%
+
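
To see what parse_event.py's renaming and partitioning produce, here is a small standalone sketch with two made-up catalog rows (the real script reads *.ehpcsv files from the ncedc-pds S3 bucket and keeps many more columns):

    import pandas as pd

    # Two made-up rows in the NCEDC ehpcsv column layout.
    df = pd.DataFrame(
        {
            "id": ["73201181", "73201190"],
            "time": ["2023-01-02T03:04:05.600", "2023-01-03T04:05:06.700"],
            "mag": ["1.2", "2.3"],
            "magType": ["d", "l"],
        }
    )

    df = df.rename(columns={"id": "event_id", "mag": "magnitude", "magType": "magnitude_type"})
    df["time"] = pd.to_datetime(df["time"])
    df["year"] = df["time"].dt.strftime("%Y")
    df["jday"] = df["time"].dt.strftime("%j")  # Julian day, used as the folder name
    df["event_id"] = "nc" + df["event_id"]

    for (year, jday), group in df.groupby(["year", "jday"]):
        print(f"dataset/{year}/{jday}/events.csv  ({len(group)} event)")
    # dataset/2023/002/events.csv  (1 event)
    # dataset/2023/003/events.csv  (1 event)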
