Fix: switch file layout from by-hour to by-day directories

zhuwq0 · zhuwq0 · commit a527fddda707 · 2025-01-19T13:18:30.000-08:00
diff --git a/scripts/cut_templates_cc.py b/scripts/cut_templates_cc.py
@@ -415,6 +415,8 @@ def cut_templates(root_path, region, config):
     picks = picks[picks["adloc_mask"] == 1]
     picks["phase_time"] = pd.to_datetime(picks["phase_time"], utc=True)
     min_phase_score = picks["phase_score"].min()
+    print(f"Number of picks: {len(picks)}")
+    print(picks.iloc[:5])
 
     picks = picks.merge(events[["event_index", "event_timestamp"]], on="event_index")
     # picks = picks.merge(stations[["station_id", "station_term_time"]], on="station_id")
@@ -428,6 +430,11 @@ def cut_templates(root_path, region, config):
         picks["phase_timestamp"] - picks["event_timestamp"] - picks["station_term_time"]
     )  ## Separate P and S station term
 
+    # Keep only the pick with highest phase_score for each event-station-phase combination
+    picks = picks.sort_values("phase_score", ascending=False).drop_duplicates(
+        subset=["event_index", "station_id", "phase_type"], keep="first"
+    )
+
     picks = fillin_missing_picks(
         picks,
         events,
@@ -529,15 +536,60 @@ def cut_templates(root_path, region, config):
         key = "/".join(mseed.replace(".mseed", "").split("/")[-subdir - 1 :])
         key = key[:-1]  ## remove the channel suffix
         mseed_3c[key].append(mseed)
-    print(f"Number of mseed files: {len(mseed_3c)}")
 
     def parse_key(key):
         year, jday, name = key.split("/")
         network, station, location, instrument = name.split(".")
         return [year, jday, network, station, location, instrument]
 
     mseeds = [parse_key(k) + [",".join(sorted(mseed_3c[k]))] for k in mseed_3c]
-    mseeds = pd.DataFrame(mseeds, columns=["year", "jday", "network", "station", "location", "instrument", "ENZ"])
+    mseeds_df = pd.DataFrame(mseeds, columns=["year", "jday", "network", "station", "location", "instrument", "ENZ"])
+
+    # protocol = "gs"
+    # bucket = "quakeflow_catalog"
+    # folder = "SC"
+    # token_json = "application_default_credentials.json"
+    # with open(token_json, "r") as fp:
+    #     token = json.load(fp)
+    # fs = fsspec.filesystem(protocol=protocol, token=token)
+    # year = 2019
+    # mseeds_df = []
+    # # for folder in ["SC", "NC"]:
+    # for folder in ["SC"]:
+    #     with fs.open(f"{bucket}/{folder}/mseed_list/{year}_3c.txt", "r") as f:
+    #         mseeds = f.readlines()
+    #     mseeds = [x.strip("\n") for x in mseeds]
+    #     mseeds = pd.DataFrame(mseeds, columns=["ENZ"])
+    #     if folder == "SC":
+    #         mseeds["fname"] = mseeds["ENZ"].apply(lambda x: x.split("/")[-1])
+    #         mseeds["network"] = mseeds["fname"].apply(lambda x: x[:2])
+    #         mseeds["station"] = mseeds["fname"].apply(lambda x: x[2:7].strip("_"))
+    #         mseeds["instrument"] = mseeds["fname"].apply(lambda x: x[7:9])
+    #         mseeds["location"] = mseeds["fname"].apply(lambda x: x[10:12].strip("_"))
+    #         mseeds["year"] = mseeds["fname"].apply(lambda x: x[13:17])
+    #         mseeds["jday"] = mseeds["fname"].apply(lambda x: x[17:20])
+    #     if folder == "NC":
+    #         mseeds["fname"] = mseeds["ENZ"].apply(lambda x: x.split("/")[-1])
+    #         mseeds["network"] = mseeds["fname"].apply(lambda x: x.split(".")[1])
+    #         mseeds["station"] = mseeds["fname"].apply(lambda x: x.split(".")[0])
+    #         mseeds["instrument"] = mseeds["fname"].apply(lambda x: x.split(".")[2][:-1])
+    #         mseeds["location"] = mseeds["fname"].apply(lambda x: x.split(".")[3])
+    #         mseeds["year"] = mseeds["fname"].apply(lambda x: x.split(".")[5])
+    #         mseeds["jday"] = mseeds["fname"].apply(lambda x: x.split(".")[6])
+    #     mseeds_df.append(mseeds)
+    # mseeds_df = pd.concat(mseeds_df)
+    # mseeds_df.drop(columns=["fname"], inplace=True, errors="ignore")
+    # mseeds_df = mseeds_df[["year", "jday", "network", "station", "location", "instrument", "ENZ"]]
+    # mseeds_df = mseeds_df.merge(
+    #     picks[["network", "station", "location", "instrument", "year", "jday"]],
+    #     on=["network", "station", "location", "instrument", "year", "jday"],
+    # )
+    # mseeds_df = mseeds_df.drop_duplicates(subset=["ENZ"], keep="first")
+    # mseeds_df.sort_values(by=["year", "jday", "network", "station", "location", "instrument", "ENZ"], inplace=True)
+    # mseeds_df.to_csv(f"debug_mseed_remote.csv", index=False)
+
+    print(f"Number of mseeds: {len(mseeds_df)}")
+    print(mseeds_df.iloc[:5])
 
     ## Match picks with mseed files
     picks["network"] = picks["station_id"].apply(lambda x: x.split(".")[0])
@@ -546,7 +598,7 @@ def parse_key(key):
     picks["instrument"] = picks["station_id"].apply(lambda x: x.split(".")[3])
     picks["year"] = picks["phase_time"].dt.strftime("%Y")
     picks["jday"] = picks["phase_time"].dt.strftime("%j")
-    picks = picks.merge(mseeds, on=["network", "station", "location", "instrument", "year", "jday"])
+    picks = picks.merge(mseeds_df, on=["network", "station", "location", "instrument", "year", "jday"])
     picks.drop(columns=["station_id", "network", "location", "instrument", "year", "jday"], inplace=True)
 
     picks_group = picks.copy()
diff --git a/scripts/merge_phasenet_plus_picks.py b/scripts/merge_phasenet_plus_picks.py
@@ -23,8 +23,8 @@ def scan_csv(year, root_path, region, model, data="picks", fs=None, bucket=None,
         if protocol != "file":
             csvs = fs.glob(f"{jday}/??/*.csv")
         else:
-            csvs = glob(f"{root_path}/{region}/{model}/{data}_{model}/{year}/{jday}/??/*.csv")
-            # csvs = glob(f"{root_path}/{region}/{model}/{data}_{model}/{year}/{jday}/*.csv")
+            # csvs = glob(f"{root_path}/{region}/{model}/{data}_{model}/{year}/{jday}/??/*.csv")
+            csvs = glob(f"{root_path}/{region}/{model}/{data}_{model}/{year}/{jday}/*.csv")
 
         csv_list.extend([[year, jday, csv] for csv in csvs])
 
diff --git a/scripts/plot_catalog.py b/scripts/plot_catalog.py
@@ -36,6 +36,12 @@
 print(json.dumps(config, indent=4, sort_keys=True))
 xlim = [config["minlongitude"], config["maxlongitude"]]
 ylim = [config["minlatitude"], config["maxlatitude"]]
+if "mindepth" not in config:
+    config["mindepth"] = 0
+if "maxdepth" not in config:
+    config["maxdepth"] = 60
+zlim = [config["mindepth"], config["maxdepth"]]
+
 
 # %%
 # %%
@@ -485,26 +491,30 @@
     for j in range(3):
         ax[i, j].grid()
 
-if routine_exist and (len(routine_catalog) > 0):
-    ax[0, 0].scatter(
-        routine_catalog["longitude"],
-        routine_catalog["depth_km"],
-        c=routine_catalog["depth_km"],
-        s=8000 / len(routine_catalog),
+xlim = None
+ylim = None
+if adloc_exist and (len(adloc_catalog) > 0):
+    ax[0, 2].scatter(
+        adloc_catalog["longitude"],
+        adloc_catalog["depth_km"],
+        c=adloc_catalog["depth_km"],
+        s=8000 / len(adloc_catalog),
         alpha=1.0,
         linewidth=0,
         vmin=cmin,
         vmax=cmax,
         cmap="viridis_r",
-        label=f"Routine: {len(routine_catalog)}",
+        label=f"AdLoc: {len(adloc_catalog)}",
     )
-    ax[0, 0].set_title(f"Routine: {len(routine_catalog)}")
-    # ax[0, 0].invert_yaxis()
-    xlim = ax[0, 0].get_xlim()
-    ylim = ax[0, 0].get_ylim()
-else:
-    xlim = None
-    ylim = None
+    # ax[0, 2].legend()
+    ax[0, 2].set_title(f"AdLoc: {len(adloc_catalog)}")
+    if (xlim is None) and (ylim is None):
+        ax[0, 2].invert_yaxis()
+        xlim = ax[0, 2].get_xlim()
+        ylim = ax[0, 2].get_ylim()
+    else:
+        ax[0, 2].set_xlim(xlim)
+        ax[0, 2].set_ylim(ylim)
 
 if gamma_exist and (len(gamma_catalog) > 0):
     ax[0, 1].scatter(
@@ -520,30 +530,35 @@
         label=f"GaMMA: {len(gamma_catalog)}",
     )
     ax[0, 1].set_title(f"GaMMA: {len(gamma_catalog)}")
-    ax[0, 1].invert_yaxis()
-    xlim = ax[0, 1].get_xlim()
-    ylim = ax[0, 1].get_ylim()
-else:
-    xlim = None
-    ylim = None
+    if (xlim is None) and (ylim is None):
+        ax[0, 1].invert_yaxis()
+        xlim = ax[0, 1].get_xlim()
+        ylim = ax[0, 1].get_ylim()
+    else:
+        ax[0, 1].set_xlim(xlim)
+        ax[0, 1].set_ylim(ylim)
 
-if adloc_exist and (len(adloc_catalog) > 0):
-    ax[0, 2].scatter(
-        adloc_catalog["longitude"],
-        adloc_catalog["depth_km"],
-        c=adloc_catalog["depth_km"],
-        s=8000 / len(adloc_catalog),
+if routine_exist and (len(routine_catalog) > 0):
+    ax[0, 0].scatter(
+        routine_catalog["longitude"],
+        routine_catalog["depth_km"],
+        c=routine_catalog["depth_km"],
+        s=8000 / len(routine_catalog),
         alpha=1.0,
         linewidth=0,
         vmin=cmin,
         vmax=cmax,
         cmap="viridis_r",
-        label=f"AdLoc: {len(adloc_catalog)}",
+        label=f"Routine: {len(routine_catalog)}",
     )
-    # ax[0, 2].legend()
-    ax[0, 2].set_title(f"AdLoc: {len(adloc_catalog)}")
-    ax[0, 2].set_xlim(xlim)
-    ax[0, 2].set_ylim(ylim)
+    ax[0, 0].set_title(f"Routine: {len(routine_catalog)}")
+    if (xlim is None) and (ylim is None):
+        ax[0, 0].invert_yaxis()
+        xlim = ax[0, 0].get_xlim()
+        ylim = ax[0, 0].get_ylim()
+    else:
+        ax[0, 0].set_xlim(xlim)
+        ax[0, 0].set_ylim(ylim)
 
 if qtm_exist and (len(qtm_catalog) > 0):
     ax[1, 2].scatter(
@@ -669,30 +684,34 @@
 fig, ax = plt.subplots(4, 3, squeeze=False, figsize=(20, 30), sharex=True, sharey=True)
 cmin = 0
 cmax = 10
+xlim = None
+ylim = None
 for i in range(4):
     for j in range(3):
         ax[i, j].grid()
 
-if routine_exist and (len(routine_catalog) > 0):
-    ax[0, 0].scatter(
-        routine_catalog["latitude"],
-        routine_catalog["depth_km"],
-        c=routine_catalog["depth_km"],
-        s=8000 / len(routine_catalog),
+if adloc_exist and (len(adloc_catalog) > 0):
+    ax[0, 2].scatter(
+        adloc_catalog["latitude"],
+        adloc_catalog["depth_km"],
+        c=adloc_catalog["depth_km"],
+        s=8000 / len(adloc_catalog),
         alpha=1.0,
         linewidth=0,
         vmin=cmin,
         vmax=cmax,
         cmap="viridis_r",
-        label=f"Routine: {len(routine_catalog)}",
+        label=f"AdLoc: {len(adloc_catalog)}",
     )
-    ax[0, 0].set_title(f"Routine: {len(routine_catalog)}")
-    # ax[0, 0].invert_yaxis()
-    xlim = ax[0, 0].get_xlim()
-    ylim = ax[0, 0].get_ylim()
-else:
-    xlim = None
-    ylim = None
+    # ax[0, 2].legend()
+    ax[0, 2].set_title(f"AdLoc: {len(adloc_catalog)}")
+    if (xlim is None) and (ylim is None):
+        ax[0, 2].invert_yaxis()
+        xlim = ax[0, 2].get_xlim()
+        ylim = ax[0, 2].get_ylim()
+    else:
+        ax[0, 2].set_xlim(xlim)
+        ax[0, 2].set_ylim(ylim)
 
 if gamma_exist and (len(gamma_catalog) > 0):
     ax[0, 1].scatter(
@@ -708,30 +727,35 @@
         label=f"GaMMA: {len(gamma_catalog)}",
     )
     ax[0, 1].set_title(f"GaMMA: {len(gamma_catalog)}")
-    ax[0, 1].invert_yaxis()
-    xlim = ax[0, 1].get_xlim()
-    ylim = ax[0, 1].get_ylim()
-else:
-    xlim = None
-    ylim = None
+    if (xlim is None) and (ylim is None):
+        ax[0, 1].invert_yaxis()
+        xlim = ax[0, 1].get_xlim()
+        ylim = ax[0, 1].get_ylim()
+    else:
+        ax[0, 1].set_xlim(xlim)
+        ax[0, 1].set_ylim(ylim)
 
-if adloc_exist and (len(adloc_catalog) > 0):
-    ax[0, 2].scatter(
-        adloc_catalog["latitude"],
-        adloc_catalog["depth_km"],
-        c=adloc_catalog["depth_km"],
-        s=8000 / len(adloc_catalog),
+if routine_exist and (len(routine_catalog) > 0):
+    ax[0, 0].scatter(
+        routine_catalog["latitude"],
+        routine_catalog["depth_km"],
+        c=routine_catalog["depth_km"],
+        s=8000 / len(routine_catalog),
         alpha=1.0,
         linewidth=0,
         vmin=cmin,
         vmax=cmax,
         cmap="viridis_r",
-        label=f"AdLoc: {len(adloc_catalog)}",
+        label=f"Routine: {len(routine_catalog)}",
     )
-    # ax[0, 2].legend()
-    ax[0, 2].set_title(f"AdLoc: {len(adloc_catalog)}")
-    ax[0, 2].set_xlim(xlim)
-    ax[0, 2].set_ylim(ylim)
+    ax[0, 0].set_title(f"Routine: {len(routine_catalog)}")
+    if (xlim is None) and (ylim is None):
+        ax[0, 0].invert_yaxis()
+        xlim = ax[0, 0].get_xlim()
+        ylim = ax[0, 0].get_ylim()
+    else:
+        ax[0, 0].set_xlim(xlim)
+        ax[0, 0].set_ylim(ylim)
 
 if qtm_exist and (len(qtm_catalog) > 0):
     ax[1, 2].scatter(
@@ -906,8 +930,17 @@
 
 # %%
 fig, ax = plt.subplots(2, 1, squeeze=False, figsize=(10, 10))
-xlim = [int(np.floor(gamma_catalog["magnitude"].min())), int(np.ceil(gamma_catalog["magnitude"].max()))]
-bins = np.arange(xlim[0], xlim[1] + 1, 0.2)
+if gamma_exist:
+    xlim = [int(np.floor(gamma_catalog["magnitude"].min())), int(np.ceil(gamma_catalog["magnitude"].max()))]
+    bins = np.arange(xlim[0], xlim[1] + 1, 0.2)
+elif adloc_exist:
+    xlim = [int(np.floor(adloc_catalog["magnitude"].min())), int(np.ceil(adloc_catalog["magnitude"].max()))]
+    bins = np.arange(xlim[0], xlim[1] + 1, 0.2)
+elif routine_exist:
+    xlim = [int(np.floor(routine_catalog["magnitude"].min())), int(np.ceil(routine_catalog["magnitude"].max()))]
+    bins = np.arange(xlim[0], xlim[1] + 1, 0.2)
+else:
+    raise ValueError("No catalog found")
 if routine_exist:
     ax[0, 0].hist(routine_catalog["magnitude"], bins=bins, alpha=0.5, label="Routine")
     ax[1, 0].hist(routine_catalog["magnitude"], bins=bins, alpha=0.5, label="Routine")
diff --git a/scripts/run_adloc.py b/scripts/run_adloc.py
@@ -51,10 +51,10 @@ def run_adloc(
     # result_path = f"{root_path}/{region}/adloc_gamma_plus"
     # figure_path = f"{root_path}/{region}/adloc_gamma_plus/figures"
 
-    # picks_file = f"{root_path}/{region}/phasenet_plus/phasenet_plus_picks_associated.csv"
-    # events_file = f"{root_path}/{region}/phasenet_plus/phasenet_plus_events_associated.csv"
-    # result_path = f"{root_path}/{region}/adloc_plus"
-    # figure_path = f"{root_path}/{region}/adloc_plus/figures"
+    picks_file = f"{root_path}/{region}/phasenet_plus/phasenet_plus_picks_associated.csv"
+    events_file = f"{root_path}/{region}/phasenet_plus/phasenet_plus_events_associated.csv"
+    result_path = f"{root_path}/{region}/adloc_plus"
+    figure_path = f"{root_path}/{region}/adloc_plus/figures"
 
     # %%
     if not os.path.exists(result_path):
diff --git a/scripts/run_phasenet_plus.py b/scripts/run_phasenet_plus.py
@@ -29,8 +29,8 @@ def run_phasenet(
 
     # %%
     if data_type == "continuous":
-        subdir = 3
-        # subdir = 2
+        # subdir = 3
+        subdir = 2
     elif data_type == "event":
         subdir = 1
 
@@ -49,8 +49,8 @@ def run_phasenet(
     #         fs.get(f"{bucket}/{waveform_dir}/", f"{root_path}/{waveform_dir}/", recursive=True)
 
     if data_type == "continuous":
-        mseed_list = sorted(glob(f"{root_path}/{waveform_dir}/????/???/??/*.mseed"))
-        # mseed_list = sorted(glob(f"{root_path}/{waveform_dir}/????/???/*.mseed"))
+        # mseed_list = sorted(glob(f"{root_path}/{waveform_dir}/????/???/??/*.mseed"))
+        mseed_list = sorted(glob(f"{root_path}/{waveform_dir}/????/???/*.mseed"))
     elif data_type == "event":
         mseed_list = sorted(glob(f"{root_path}/{waveform_dir}/*.mseed"))
     else:
@@ -68,8 +68,8 @@ def run_phasenet(
 
     # %% skip processed files
     if not overwrite:
-        processed = sorted(glob(f"{root_path}/{result_path}/picks_phasenet_plus/????/???/??/*.csv"))
-        # processed = sorted(glob(f"{root_path}/{result_path}/picks_phasenet_plus/????/???/*.csv"))
+        # processed = sorted(glob(f"{root_path}/{result_path}/picks_phasenet_plus/????/???/??/*.csv"))
+        processed = sorted(glob(f"{root_path}/{result_path}/picks_phasenet_plus/????/???/*.csv"))
         processed = ["/".join(f.replace(".csv", "").split("/")[-subdir - 1 :]) for f in processed]
         processed = [p[:-1] for p in processed]  ## remove the channel suffix
         print(f"Number of processed files: {len(processed)}")
diff --git a/scripts/run_qtm.py b/scripts/run_qtm.py
@@ -58,7 +58,7 @@ def parse_args():
         {
             "station_id": lambda x: ",".join(x.unique()),
             "begin_time": lambda x: ",".join(x.unique()),
-            "file_name": lambda x: "_".join(sorted(x)),
+            "file_name": lambda x: "|".join(sorted(x)),
         }
     )
     .reset_index()

Original file line number	Diff line number	Diff line change
`@@ -58,7 +58,7 @@ def parse_args():`
`58`	`58`	`{`
`59`	`59`	`"station_id": lambda x: ",".join(x.unique()),`
`60`	`60`	`"begin_time": lambda x: ",".join(x.unique()),`
`61`		`- "file_name": lambda x: "_".join(sorted(x)),`
	`61`	`+ "file_name": lambda x: "|".join(sorted(x)),`
`62`	`62`	`}`
`63`	`63`	`)`
`64`	`64`	`.reset_index()`