Change waveform and pick organization from hourly to daily directories

zhuwq0 · zhuwq0 · commit 31fe4e69a41e · 2024-12-17T20:59:54.000-08:00
diff --git a/scripts/download_waveform_v2.py b/scripts/download_waveform_v2.py
@@ -210,8 +210,8 @@ def download_waveform(
 
         client = obspy.clients.fdsn.Client(provider)
 
-        DELTATIME = "1H"  # 1H or 1D
-        # DELTATIME = "1D"
+        # DELTATIME = "1H"  # 1H or 1D
+        DELTATIME = "1D"
         if DELTATIME == "1H":
             start = datetime.fromisoformat(config["starttime"]).strftime("%Y-%m-%dT%H")
         elif DELTATIME == "1D":
@@ -290,31 +290,6 @@ def download_waveform(
                     if out is not None:
                         print(out)
 
-    tmp_list = sorted(glob(f"{root_path}/{waveform_dir}/????/???/??/*.mseed", recursive=True))
-    mseed_list = []
-    for mseed in tmp_list:
-        tmp = mseed.split("/")
-        # year, jday = tmp[-3].split("-")
-        # hour = tmp[-2]
-        year, jday, hour = tmp[-4], tmp[-3], tmp[-2]
-        if starttimes[0].strftime("%Y-%jT%H") <= f"{year}-{jday}T{hour}" <= starttimes[-1].strftime("%Y-%jT%H"):
-            mseed_list.append(mseed)
-
-    print(f"rank {rank}: {len(mseed_list) = }, {mseed_list[0]}, {mseed_list[-1]}")
-
-    # %% copy to results/network
-    if not os.path.exists(f"{root_path}/{region}/results/network"):
-        os.makedirs(f"{root_path}/{region}/results/network")
-    with open(f"{root_path}/{region}/results/network/mseed_list_{rank:03d}_{num_nodes:03d}.csv", "w") as fp:
-        fp.write("\n".join(mseed_list))
-    if protocol != "file":
-        fs.put(
-            f"{root_path}/{region}/results/network/mseed_list_{rank:03d}_{num_nodes:03d}.csv",
-            f"{bucket}/{region}/results/network/mseed_list_{rank:03d}_{num_nodes:03d}.csv",
-        )
-
-    return f"{region}/results/network/mseed_list_{rank:03d}_{num_nodes:03d}.csv"
-
 
 if __name__ == "__main__":
 
diff --git a/scripts/merge_adloc_picks.py b/scripts/merge_adloc_picks.py
@@ -4,19 +4,19 @@
 import os
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
 from datetime import datetime, timedelta, timezone
+from glob import glob
 from threading import Lock, Thread
 
 import fsspec
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import pyproj
+from args import parse_args
 from obspy import read_inventory
 from obspy.clients.fdsn import Client
 from sklearn.cluster import DBSCAN
 from tqdm import tqdm
-from args import parse_args
-from glob import glob
-import matplotlib.pyplot as plt
 from utils.plotting import plotting_ransac
 
 # %%
@@ -26,6 +26,7 @@
     root_path = args.root_path
     region = args.region
     iter = args.iter
+    print(f"Merge adloc picks iter={iter}")
 
     data_path = f"{region}/adloc"
     result_path = f"{region}/adloc"
@@ -125,8 +126,12 @@
     picks.to_csv(f"{root_path}/{result_path}/adloc_picks_sst_{iter}.csv", index=False)
     stations.to_csv(f"{root_path}/{result_path}/adloc_stations_sst_{iter}.csv", index=False)
 
-    # %%
+    ## save current iteration as the latest
+    events.to_csv(f"{root_path}/{result_path}/adloc_events.csv", index=False)
+    picks.to_csv(f"{root_path}/{result_path}/adloc_picks.csv", index=False)
+    stations.to_csv(f"{root_path}/{result_path}/adloc_stations.csv", index=False)
 
+    # %%
     events = pd.read_csv(f"{root_path}/{result_path}/adloc_events_sst_{iter}.csv")
     picks = pd.read_csv(f"{root_path}/{result_path}/adloc_picks_sst_{iter}.csv")
     stations = pd.read_csv(f"{root_path}/{result_path}/adloc_stations_sst_{iter}.csv")
diff --git a/scripts/merge_phasenet_picks.py b/scripts/merge_phasenet_picks.py
@@ -30,8 +30,8 @@ def scan_csv(year, root_path, region, model, fs=None, bucket=None, protocol="fil
         if protocol != "file":
             csvs = fs.glob(f"{jday}/??/*.csv")
         else:
-            csvs = glob(f"{root_path}/{region}/{model}/picks/{year}/{jday}/??/*.csv")
-            # csvs = glob(f"{root_path}/{region}/{model}/picks/{year}/{jday}/*.csv")
+            # csvs = glob(f"{root_path}/{region}/{model}/picks/{year}/{jday}/??/*.csv")
+            csvs = glob(f"{root_path}/{region}/{model}/picks/{year}/{jday}/*.csv")
 
         csv_list.extend([[year, jday, csv] for csv in csvs])
 
diff --git a/scripts/run_adloc.py b/scripts/run_adloc.py
@@ -39,10 +39,10 @@ def run_adloc(
     picks_file = f"{data_path}/gamma_picks.csv"
     events_file = f"{data_path}/gamma_events.csv"
 
-    # picks_file = f"{root_path}/{region}/gamma/gamma_picks.csv"
-    # events_file = f"{root_path}/{region}/gamma/gamma_events.csv"
-    # result_path = f"{root_path}/{region}/adloc_gamma"
-    # figure_path = f"{root_path}/{region}/adloc_gamma/figures"
+    picks_file = f"{root_path}/{region}/gamma/gamma_picks.csv"
+    events_file = f"{root_path}/{region}/gamma/gamma_events.csv"
+    result_path = f"{root_path}/{region}/adloc_gamma"
+    figure_path = f"{root_path}/{region}/adloc_gamma/figures"
 
     # picks_file = f"{root_path}/{region}/gamma_plus/gamma_picks.csv"
     # events_file = f"{root_path}/{region}/gamma_plus/gamma_events.csv"
@@ -242,7 +242,7 @@ def run_adloc(
             stations["idx_sta"].map(station_term_time[station_term_time["phase_type"] == 1]["residual_time"]).fillna(0)
         )
 
-        plotting_ransac(stations, figure_path, config, picks, events_init, events, suffix=f"_ransac_sst_{iter}")
+        plotting_ransac(stations, figure_path, config, picks, events_init, events, suffix=f"_adloc_sst_{iter}")
 
         if "event_index" not in events.columns:
             events["event_index"] = events.merge(picks[["idx_eve", "event_index"]], on="idx_eve")["event_index"]
@@ -254,9 +254,9 @@ def run_adloc(
         picks["adloc_mask"] = picks["mask"]
         picks["adloc_residual_time"] = picks["residual_time"]
         picks["adloc_residual_amplitude"] = picks["residual_amplitude"]
-        picks.to_csv(os.path.join(result_path, f"ransac_picks_sst_{iter}.csv"), index=False)
-        events.to_csv(os.path.join(result_path, f"ransac_events_sst_{iter}.csv"), index=False)
-        stations.to_csv(os.path.join(result_path, f"ransac_stations_sst_{iter}.csv"), index=False)
+        picks.to_csv(os.path.join(result_path, f"adloc_picks_sst_{iter}.csv"), index=False)
+        events.to_csv(os.path.join(result_path, f"adloc_events_sst_{iter}.csv"), index=False)
+        stations.to_csv(os.path.join(result_path, f"adloc_stations_sst_{iter}.csv"), index=False)
 
         if iter == 0:
             MIN_SST_S = (
@@ -290,9 +290,9 @@ def run_adloc(
     stations.drop(["idx_sta", "x_km", "y_km", "z_km"], axis=1, inplace=True, errors="ignore")
     # stations.rename({"station_term": "adloc_station_term_s"}, axis=1, inplace=True)
 
-    picks.to_csv(os.path.join(result_path, "ransac_picks.csv"), index=False)
-    events.to_csv(os.path.join(result_path, "ransac_events.csv"), index=False)
-    stations.to_csv(os.path.join(result_path, "ransac_stations.csv"), index=False)
+    picks.to_csv(os.path.join(result_path, "adloc_picks.csv"), index=False)
+    events.to_csv(os.path.join(result_path, "adloc_events.csv"), index=False)
+    stations.to_csv(os.path.join(result_path, "adloc_stations.csv"), index=False)
 
 
 # %%
diff --git a/scripts/run_adloc_v2.py b/scripts/run_adloc_v2.py
@@ -3,6 +3,7 @@
 import json
 import multiprocessing as mp
 import os
+from glob import glob
 from typing import Dict, List, NamedTuple
 
 import fsspec
@@ -13,11 +14,10 @@
 from adloc.sacloc2d import ADLoc
 from adloc.utils import invert_location, invert_location_iter
 from args import parse_args
-from glob import glob
+from pyproj import Proj
 
 # from utils import plotting_ransac
 from utils.plotting import plotting, plotting_ransac
-from pyproj import Proj
 
 
 # %%
@@ -375,28 +375,19 @@ def run_adloc(
     # %%
     print(f"{jdays[node_rank] = }")
     if num_nodes == 1:
-        for i in range(10):
-            run_adloc(
-                root_path=root_path,
-                region=region,
-                config=config,
-                jdays=jdays[node_rank],
-                iter=i,
-                protocol=protocol,
-                token=token,
-                bucket=bucket,
-            )
-            os.system(
-                f"python merge_adloc_picks.py --region {region} --root_path {root_path} --bucket {bucket} --iter {i}"
-            )
+        max_iter = 10
     else:
+        max_iter = 1
+
+    for i in range(max_iter):
         run_adloc(
             root_path=root_path,
             region=region,
             config=config,
             jdays=jdays[node_rank],
-            iter=iter,
+            iter=i,
             protocol=protocol,
             token=token,
             bucket=bucket,
         )
+        os.system(f"python merge_adloc_picks.py --region {region} --root_path {root_path} --bucket {bucket} --iter {i}")
diff --git a/scripts/run_cctorch.py b/scripts/run_cctorch.py
@@ -50,17 +50,25 @@
         f"../CCTorch/run.py --pair_list={root_path}/{data_path}/pairs.txt --data_path1={root_path}/{data_path}/template.dat --data_format1=memmap "
         f"--data_list1={root_path}/{data_path}/cctorch_picks.csv "
         f"--events_csv={root_path}/{data_path}/cctorch_events.csv --picks_csv={root_path}/{data_path}/cctorch_picks.csv --stations_csv={root_path}/{data_path}/cctorch_stations.csv "
-        f"--config={root_path}/{data_path}/config.json  --batch_size={batch} --block_size1={block_size1} --block_size2={block_size2} --result_path={root_path}/{result_path}"
+        f"--config={root_path}/{data_path}/config.json  --batch_size={batch} --block_size1={block_size1} --block_size2={block_size2} "
+        f"--result_path={root_path}/{result_path}"
     )
 
-num_gpu = torch.cuda.device_count()
-if num_gpu == 0:
-    if os.uname().sysname == "Darwin":
-        cmd = f"python {base_cmd} --device=cpu"
-    else:
-        cmd = f"python {base_cmd} --device=cpu"
+
+if torch.cuda.is_available():
+    device = "cuda"
+    num_gpu = torch.cuda.device_count()
+elif torch.backends.mps.is_available():
+    device = "mps"
+    num_gpu = 0
+else:
+    device = "cpu"
+    num_gpu = 0
+
+if num_gpu > 0:
+    cmd = f"torchrun --standalone --nproc_per_node {num_gpu} {base_cmd} --device={device}"
 else:
-    cmd = f"torchrun --standalone --nproc_per_node {num_gpu} {base_cmd}"
+    cmd = f"python {base_cmd} --device={device}"
 print(cmd)
 os.system(cmd)
 
diff --git a/scripts/run_phasenet_v2.py b/scripts/run_phasenet_v2.py
@@ -34,10 +34,10 @@ def run_phasenet(
 
     # %%
     waveform_dir = f"{region}/waveforms"
-    mseed_list = sorted(glob(f"{root_path}/{waveform_dir}/????/???/??/*.mseed"))
-    subdir = 3
-    # mseed_list = sorted(glob(f"{root_path}/{waveform_dir}/????/???/*.mseed"))
-    # subdir = 2
+    # mseed_list = sorted(glob(f"{root_path}/{waveform_dir}/????/???/??/*.mseed"))
+    # subdir = 3
+    mseed_list = sorted(glob(f"{root_path}/{waveform_dir}/????/???/*.mseed"))
+    subdir = 2
 
     # %%
     mseed_3c = defaultdict(list)
@@ -49,8 +49,8 @@ def run_phasenet(
 
     # %%
     if not overwrite:
-        processed = sorted(glob(f"{root_path}/{result_path}/picks/????/???/??/*.csv"))
-        # processed = sorted(glob(f"{root_path}/{result_path}/picks/????/???/*.csv"))
+        # processed = sorted(glob(f"{root_path}/{result_path}/picks/????/???/??/*.csv"))
+        processed = sorted(glob(f"{root_path}/{result_path}/picks/????/???/*.csv"))
         processed = ["/".join(f.replace(".csv", "").split("/")[-subdir - 1 :]) for f in processed]
         processed = [p[:-1] for p in processed]  ## remove the channel suffix
         print(f"Number of processed files: {len(processed)}")