# %%
import os

import fsspec
import pandas as pd
from tqdm import tqdm

# Source: the public NCEDC earthquake catalog on AWS S3
input_protocol = "s3"
input_bucket = "ncedc-pds"
input_folder = "earthquake_catalogs/NCEDC"

# Destination: the QuakeFlow dataset bucket on Google Cloud Storage
output_protocol = "gs"
output_bucket = "quakeflow_dataset"
output_folder = "NC/catalog"

# Local staging directory for the per-day CSV files
result_path = "dataset"
os.makedirs(result_path, exist_ok=True)

# %%
# status: (Event status)
# A: Automatic
# F: Finalized
# H: Human Reviewed
# I: Intermediate

# magType: (Magnitude Type)
# a : Primary amplitude magnitude (Jerry Eaton's XMAG)
# b : Body-wave magnitude
# d : Duration magnitude
# dl: Low-gain initial P-wave amplitude magnitude
# e : Energy magnitude
# h : Human assigned magnitude
# l : Local magnitude
# n : No magnitude
# un: Unknown magnitude type
# w : Moment magnitude

# type: (EventType)
# bc: Building collapse/demolition
# eq: Earthquake
# ex: Generic chemical blast
# lp: Long period volcanic earthquake
# ls: Landslide
# mi: Meteor/comet impact
# nt: Nuclear test
# ot: Other miscellaneous
# qb: Quarry blast
# rs: Rockslide
# sh: Refraction/reflection survey shot
# sn: Sonic shockwave
# st: Subnet trigger
# th: Thunder
# uk: Unknown type

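# The tables above can be captured as plain dicts for decoding the short codes
# into readable labels when inspecting the output. This is an optional sketch;
# the names MAGNITUDE_TYPES and EVENT_TYPES are ours, not part of the NCEDC
# catalog itself.
MAGNITUDE_TYPES = {
    "a": "Primary amplitude magnitude (Jerry Eaton's XMAG)",
    "b": "Body-wave magnitude",
    "d": "Duration magnitude",
    "dl": "Low-gain initial P-wave amplitude magnitude",
    "e": "Energy magnitude",
    "h": "Human assigned magnitude",
    "l": "Local magnitude",
    "n": "No magnitude",
    "un": "Unknown magnitude type",
    "w": "Moment magnitude",
}
EVENT_TYPES = {
    "bc": "Building collapse/demolition",
    "eq": "Earthquake",
    "ex": "Generic chemical blast",
    "lp": "Long period volcanic earthquake",
    "ls": "Landslide",
    "mi": "Meteor/comet impact",
    "nt": "Nuclear test",
    "ot": "Other miscellaneous",
    "qb": "Quarry blast",
    "rs": "Rockslide",
    "sh": "Refraction/reflection survey shot",
    "sn": "Sonic shockwave",
    "st": "Subnet trigger",
    "th": "Thunder",
    "uk": "Unknown type",
}
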
def map_column_names(df):
    """Rename raw catalog columns to the descriptive names used downstream."""
    column_mapping = {
        'id': 'event_id',
        'time': 'time',
        'latitude': 'latitude',
        'longitude': 'longitude',
        'depth': 'depth_km',
        'mag': 'magnitude',
        'magType': 'magnitude_type',
        'type': 'event_type',
        'gap': 'azimuthal_gap',
        'dmin': 'minimum_distance_km',
        'rms': 'time_residual',
        'horizontalError': 'horizontal_error_km',
        'depthError': 'depth_error_km',
        'status': 'review_status',
        'nst': 'num_stations',
        'net': 'network',
        'updated': 'updated_time',
        'place': 'place',
        'magError': 'magnitude_error',
        'magNst': 'magnitude_num_stations',
        'locationSource': 'location_source',
        'magSource': 'magnitude_source',
    }

    # Rename only the columns that actually exist in the dataframe
    existing_columns = {col: column_mapping[col] for col in df.columns if col in column_mapping}
    df = df.rename(columns=existing_columns)

    return df

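# %%
# Quick sanity check of the renaming on a toy frame (illustrative only; the
# values are made up, not from the catalog): unmapped columns pass through.
_demo = pd.DataFrame({'id': ['73900001'], 'mag': ['1.2'], 'extra': ['kept']})
print(map_column_names(_demo).columns.tolist())  # ['event_id', 'magnitude', 'extra']
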
# %%
# Anonymous read access is enough for the public NCEDC bucket; writes go to
# GCS using application-default credentials.
input_fs = fsspec.filesystem(input_protocol, anon=True)
csv_files = sorted(input_fs.glob(f"{input_bucket}/{input_folder}/*.ehpcsv"), reverse=True)  # most recent files first
output_fs = fsspec.filesystem(output_protocol, token=os.path.expanduser("~/.config/gcloud/application_default_credentials.json"))

# %%
columns_to_keep = [
    'event_id',
    'time',
    'latitude',
    'longitude',
    'depth_km',
    'magnitude',
    'magnitude_type',
    'event_type',
    'azimuthal_gap',
    'minimum_distance_km',
    'time_residual',
    'horizontal_error_km',
    'depth_error_km',
    'review_status',
]

for csv_file in tqdm(csv_files):
    print(csv_file)

    # The source bucket is public, so pass anon=True through to s3fs here too
    df = pd.read_csv(f"{input_protocol}://{csv_file}", dtype=str, encoding='latin-1', storage_options={"anon": True})
    df = map_column_names(df)

    df["time"] = pd.to_datetime(df["time"])
    df["year"] = df["time"].dt.strftime("%Y")
    df["jday"] = df["time"].dt.strftime("%j")
    df['time'] = df['time'].apply(lambda x: x.strftime('%Y-%m-%dT%H:%M:%S.%f'))
    df['event_id'] = df['event_id'].apply(lambda x: "nc" + x)  # NC network prefix

    # Write one CSV per (year, julian day) locally, then upload it to GCS
    for (year, jday), df_day in df.groupby(["year", "jday"]):
        if len(df_day) == 0:
            continue
        os.makedirs(f"{result_path}/{year}/{jday}", exist_ok=True)

        df_day = df_day[columns_to_keep]
        df_day.to_csv(f"{result_path}/{year}/{jday}/events.csv", index=False)
        output_fs.put(
            f"{result_path}/{year}/{jday}/events.csv",
            f"{output_bucket}/{output_folder}/{year}/{jday}/events.csv",
        )
        # df_day.to_csv(f"{output_protocol}://{output_bucket}/{output_folder}/{year}/{jday}/events.csv", index=False)

        # Stop once a day from 2024 or earlier is reached; those years are
        # assumed to have been uploaded by a previous run
        if year <= "2024":
            break

# %%
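# Optional spot check (a sketch: '2025/001' is a placeholder year/day, and it
# assumes the same application-default credentials used for the upload above).
df_check = pd.read_csv(
    f"{output_protocol}://{output_bucket}/{output_folder}/2025/001/events.csv",
    storage_options={"token": os.path.expanduser("~/.config/gcloud/application_default_credentials.json")},
)
print(df_check.head())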