# %%
import os

import fsspec
import pandas as pd
from tqdm import tqdm

# Source: the public NCEDC earthquake catalog on AWS S3
input_protocol = "s3"
input_bucket = "ncedc-pds"
input_folder = "earthquake_catalogs/NCEDC"

# Destination: the QuakeFlow dataset bucket on Google Cloud Storage
output_protocol = "gs"
output_bucket = "quakeflow_dataset"
output_folder = "NC/catalog"

# Local staging directory for the per-day CSV files
result_path = "dataset"
os.makedirs(result_path, exist_ok=True)

# %%
# status: (Event status)
# A: Automatic
# F: Finalized
# H: Human Reviewed
# I: Intermediate

# magType: (Magnitude Type)
# a : Primary amplitude magnitude (Jerry Eaton's XMAG)
# b : Body-wave magnitude
# d : Duration magnitude
# dl: Low-gain initial P-wave amplitude magnitude
# e : Energy magnitude
# h : Human assigned magnitude
# l : Local magnitude
# n : No magnitude
# un: Unknown magnitude type
# w : Moment magnitude

# type: (EventType)
# bc: Building collapse/demolition
# eq: Earthquake
# ex: Generic chemical blast
# lp: Long period volcanic earthquake
# ls: Landslide
# mi: Meteor/comet impact
# nt: Nuclear test
# ot: Other miscellaneous
# qb: Quarry blast
# rs: Rockslide
# sh: Refraction/reflection survey shot
# sn: Sonic shockwave
# st: Subnet trigger
# th: Thunder
# uk: Unknown type

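# The tables above can be captured as plain dicts for decoding the short codes
# into readable labels when inspecting the output. This is an optional sketch;
# the names MAGNITUDE_TYPES and EVENT_TYPES are ours, not part of the NCEDC
# catalog itself.
MAGNITUDE_TYPES = {
    "a": "Primary amplitude magnitude (Jerry Eaton's XMAG)",
    "b": "Body-wave magnitude",
    "d": "Duration magnitude",
    "dl": "Low-gain initial P-wave amplitude magnitude",
    "e": "Energy magnitude",
    "h": "Human assigned magnitude",
    "l": "Local magnitude",
    "n": "No magnitude",
    "un": "Unknown magnitude type",
    "w": "Moment magnitude",
}
EVENT_TYPES = {
    "bc": "Building collapse/demolition",
    "eq": "Earthquake",
    "ex": "Generic chemical blast",
    "lp": "Long period volcanic earthquake",
    "ls": "Landslide",
    "mi": "Meteor/comet impact",
    "nt": "Nuclear test",
    "ot": "Other miscellaneous",
    "qb": "Quarry blast",
    "rs": "Rockslide",
    "sh": "Refraction/reflection survey shot",
    "sn": "Sonic shockwave",
    "st": "Subnet trigger",
    "th": "Thunder",
    "uk": "Unknown type",
}
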
def map_column_names(df):
    """Rename raw catalog columns to the descriptive names used downstream."""
    column_mapping = {
        'id': 'event_id',
        'time': 'time',
        'latitude': 'latitude',
        'longitude': 'longitude',
        'depth': 'depth_km',
        'mag': 'magnitude',
        'magType': 'magnitude_type',
        'type': 'event_type',
        'gap': 'azimuthal_gap',
        'dmin': 'minimum_distance_km',
        'rms': 'time_residual',
        'horizontalError': 'horizontal_error_km',
        'depthError': 'depth_error_km',
        'status': 'review_status',
        'nst': 'num_stations',
        'net': 'network',
        'updated': 'updated_time',
        'place': 'place',
        'magError': 'magnitude_error',
        'magNst': 'magnitude_num_stations',
        'locationSource': 'location_source',
        'magSource': 'magnitude_source',
    }

    # Rename only the columns that actually exist in the dataframe
    existing_columns = {col: column_mapping[col] for col in df.columns if col in column_mapping}
    df = df.rename(columns=existing_columns)

    return df

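# %%
# Quick sanity check of the renaming on a toy frame (illustrative only; the
# values are made up, not from the catalog): unmapped columns pass through.
_demo = pd.DataFrame({'id': ['73900001'], 'mag': ['1.2'], 'extra': ['kept']})
print(map_column_names(_demo).columns.tolist())  # ['event_id', 'magnitude', 'extra']
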
# %%
# Anonymous read access is enough for the public NCEDC bucket; writes go to
# GCS using application-default credentials.
input_fs = fsspec.filesystem(input_protocol, anon=True)
csv_files = sorted(input_fs.glob(f"{input_bucket}/{input_folder}/*.ehpcsv"), reverse=True)  # most recent files first
output_fs = fsspec.filesystem(output_protocol, token=os.path.expanduser("~/.config/gcloud/application_default_credentials.json"))

# %%
columns_to_keep = [
    'event_id',
    'time',
    'latitude',
    'longitude',
    'depth_km',
    'magnitude',
    'magnitude_type',
    'event_type',
    'azimuthal_gap',
    'minimum_distance_km',
    'time_residual',
    'horizontal_error_km',
    'depth_error_km',
    'review_status',
]

for csv_file in tqdm(csv_files):
    print(csv_file)

    # The source bucket is public, so pass anon=True through to s3fs here too
    df = pd.read_csv(f"{input_protocol}://{csv_file}", dtype=str, encoding='latin-1', storage_options={"anon": True})
    df = map_column_names(df)

    df["time"] = pd.to_datetime(df["time"])
    df["year"] = df["time"].dt.strftime("%Y")
    df["jday"] = df["time"].dt.strftime("%j")
    df['time'] = df['time'].apply(lambda x: x.strftime('%Y-%m-%dT%H:%M:%S.%f'))
    df['event_id'] = df['event_id'].apply(lambda x: "nc" + x)  # NC network prefix

    # Write one CSV per (year, julian day) locally, then upload it to GCS
    for (year, jday), df_day in df.groupby(["year", "jday"]):
        if len(df_day) == 0:
            continue
        os.makedirs(f"{result_path}/{year}/{jday}", exist_ok=True)

        df_day = df_day[columns_to_keep]
        df_day.to_csv(f"{result_path}/{year}/{jday}/events.csv", index=False)
        output_fs.put(
            f"{result_path}/{year}/{jday}/events.csv",
            f"{output_bucket}/{output_folder}/{year}/{jday}/events.csv",
        )
        # df_day.to_csv(f"{output_protocol}://{output_bucket}/{output_folder}/{year}/{jday}/events.csv", index=False)

        # Stop once a day from 2024 or earlier is reached; those years are
        # assumed to have been uploaded by a previous run
        if year <= "2024":
            break

# %%
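# Optional spot check (a sketch: '2025/001' is a placeholder year/day, and it
# assumes the same application-default credentials used for the upload above).
df_check = pd.read_csv(
    f"{output_protocol}://{output_bucket}/{output_folder}/2025/001/events.csv",
    storage_options={"token": os.path.expanduser("~/.config/gcloud/application_default_credentials.json")},
)
print(df_check.head())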