From 994ce0d2b8bc8a55514a184d62877bc0fcc30f7c Mon Sep 17 00:00:00 2001 From: "djgagne@ou.edu" Date: Wed, 20 Jan 2021 11:32:31 -0700 Subject: [PATCH 01/15] Updated hsdata --- bin/hsdata | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/bin/hsdata b/bin/hsdata index 8cea126..7388582 100755 --- a/bin/hsdata +++ b/bin/hsdata @@ -171,11 +171,11 @@ def process_ensemble_member(run_date, member, config): obs_tracks_to_json(mrms_tracks, member, run_date, config, track_proc.model_grid.proj) print("Output csv", run_date, member) for table_name, table_data in obs_data.items(): - csv_filename = config.csv_path + "{0}_{1}_{2}_{3}.csv".format(table_name, + csv_filename = join(config.csv_path, "{0}_{1}_{2}_{3}.csv".format(table_name, "obs", member, run_date.strftime( - config.run_date_format)) + config.run_date_format))) table_data.to_csv(csv_filename, na_rep="nan", float_format="%0.5f", @@ -201,10 +201,10 @@ def process_ensemble_member(run_date, member, config): forecast_data = {} for table_name, table_data in forecast_data.items(): - csv_filename = config.csv_path + "{0}_{1}_{2}_{3}.csv".format(table_name, + csv_filename = join(config.csv_path, "{0}_{1}_{2}_{3}.csv".format(table_name, config.ensemble_name, member, - run_date.strftime(config.run_date_format)) + run_date.strftime(config.run_date_format))) print("Output csv file " + csv_filename) table_data.to_csv(csv_filename, na_rep="nan", @@ -333,10 +333,10 @@ def rematch_ensemble_tracks(run_date, member, config): obs_data = make_obs_track_data(mrms_tracks, member, run_date, config, track_proc.model_grid.proj, ) for table_name, table_data in obs_data.items(): - table_data.to_csv(config.csv_path + "{0}_{1}_{2}_{3}.csv".format(table_name, + table_data.to_csv(join(config.csv_path, "{0}_{1}_{2}_{3}.csv".format(table_name, "obs", member, - run_date.strftime("%Y%m%d")), + run_date.strftime("%Y%m%d"))), na_rep="nan", float_format="%0.5f", index=False) @@ -345,10 +345,10 @@ def rematch_ensemble_tracks(run_date, member, config): else: forecast_data = {} for table_name, table_data in forecast_data.items(): - table_data.to_csv(config.csv_path + "{0}_{1}_{2}_{3}.csv".format(table_name, + table_data.to_csv(join(config.csv_path, "{0}_{1}_{2}_{3}.csv".format(table_name, config.ensemble_name, member, - run_date.strftime("%Y%m%d")), + run_date.strftime("%Y%m%d"))), na_rep="nan", float_format="%0.5f", index=False) From fd8ef447387fb2730638be3ff0094f6a20f6e158 Mon Sep 17 00:00:00 2001 From: "djgagne@ou.edu" Date: Mon, 8 Mar 2021 15:10:20 -0700 Subject: [PATCH 02/15] Added sspf eval script --- demos/eval_sspf_days.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 demos/eval_sspf_days.py diff --git a/demos/eval_sspf_days.py b/demos/eval_sspf_days.py new file mode 100644 index 0000000..ee7e5bf --- /dev/null +++ b/demos/eval_sspf_days.py @@ -0,0 +1,40 @@ +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +from mpl_toolkits.basemap import Basemap +from hagelslag.evaluation import DistributedROC, DistributedReliability +from scipy.ndimage import convolve +from glob import glob +import xarray as xr +import os +from os.path import join + +eval_path = "/glade/p/work/dgagne/ncar_coarse_neighbor_eval_2016_s_2/" +eval_files = sorted(os.listdir(eval_path)) +eval_test = pd.read_csv(join(eval_path, eval_files[0])) +models = eval_test.columns[eval_test.columns.str.contains("mean")] +run_dates = pd.DatetimeIndex([e.split("_")[-1][:8] for e in eval_files]) +thresholds = [25, 50, 75] 
+prob_thresholds = np.concatenate(([0, 0.01], np.arange(0.1, 1.1, 0.1), [1.05])) +brier = {} +roc = {} +for thresh in thresholds: + brier[thresh] = pd.DataFrame(index=run_dates, columns=models, dtype=object) + roc[thresh] = pd.DataFrame(index=run_dates, columns=models, dtype=object) +for ev, eval_file in enumerate(eval_files): + print(eval_file) + eval_data = pd.read_csv(join(eval_path, eval_file)) + us_mask = eval_data["us_mask"] == 1 + for thresh in thresholds: + obs = eval_data.loc[us_mask, "MESH_Max_60min_00.50_{0:2d}".format(thresh)] + for model in models: + brier[thresh].loc[run_dates[ev], model] = DistributedReliability(thresholds=prob_thresholds) + brier[thresh].loc[run_dates[ev], model].update(eval_data.loc[us_mask, model], + obs) + roc[thresh].loc[run_dates[ev], model] = DistributedROC(thresholds=prob_thresholds) + roc[thresh].loc[run_dates[ev], model].update(eval_data.loc[us_mask, model], + obs) +out_path = "/glade/p/work/dgagne/ncar_coarse_neighbor_scores_2016/" +for thresh in [25, 50, 75]: + brier[thresh].to_csv(join(out_path, "ncar_2016_s_2_brier_objs_{0:02d}.csv".format(thresh)), index_label="Date") + roc[thresh].to_csv(join(out_path, "ncar_2016_s_2_roc_objs_{0:02d}.csv".format(thresh)), index_label="Date") From 1a0237e9f1fdde0d05687b1c7cdaf0d9c14c9281 Mon Sep 17 00:00:00 2001 From: "djgagne@ou.edu" Date: Mon, 8 Mar 2021 15:26:30 -0700 Subject: [PATCH 03/15] Updated travis file --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 787f279..ec3da2d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,6 +8,7 @@ before_install: install: - conda env create -f environment.yml - source activate hagelslag + - echo `which python` script: - pytest notifications: From 0f62f696507acca58b98e07e47114b5291f36234 Mon Sep 17 00:00:00 2001 From: David John Gagne Date: Wed, 22 Sep 2021 10:12:15 -0600 Subject: [PATCH 04/15] Updated environment file and Grib2 writing to use Grib2io package instead of ncepgrib2 github. 
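
The GRIB2 encoding path in EnsembleProducts.py now builds messages with grib2io's
Grib2Message instead of ncepgrib2's Grib2Encode and writes the raw message bytes
directly. The same hunks also switch the output filename construction in
write_grib2_files from string concatenation to os.path.join. A minimal,
self-contained sketch of that filename change (the path, ensemble, member, and
date values below are hypothetical stand-ins for the attributes used in
write_grib2_files):

    from os.path import join
    from datetime import datetime

    path = "/glade/scratch/hail_forecasts"      # hypothetical output directory, no trailing slash
    ensemble_name, member, model_name = "HREFv2", "arw_00", "Random-Forest"
    variable, run_date, fhr = "hail", datetime(2021, 9, 22), 24

    name_fmt = "{0}_{1}_{2}_{3}_{4}.grib2"
    stamp = run_date.strftime("%Y%m%d%H") + "f{0:02d}".format(fhr)

    # old pattern: concatenation drops the separator when path has no trailing slash
    old_name = path + name_fmt.format(ensemble_name, member, model_name, variable, stamp)
    # new pattern in this patch: join() always inserts exactly one separator
    new_name = join(path, name_fmt.format(ensemble_name, member, model_name, variable, stamp))
    print(old_name)
    print(new_name)
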
--- environment.yml | 3 ++- hagelslag/data/ModelOutput.py | 1 - hagelslag/processing/EnsembleProducts.py | 31 +++++++++++------------- hagelslag/processing/Watershed.py | 2 +- requirements.txt | 1 - test/test_data_MRMSGrid.py | 2 +- 6 files changed, 18 insertions(+), 22 deletions(-) diff --git a/environment.yml b/environment.yml index 696b873..4113cb1 100644 --- a/environment.yml +++ b/environment.yml @@ -5,6 +5,7 @@ dependencies: - python=3.8 - numpy - scipy + - s3fs - matplotlib - xarray - netcdf4 @@ -23,4 +24,4 @@ dependencies: - sphinx - mock - pip: - - git+https://github.com/jswhit/ncepgrib2.git + - grib2io diff --git a/hagelslag/data/ModelOutput.py b/hagelslag/data/ModelOutput.py index 19de456..f1bba04 100644 --- a/hagelslag/data/ModelOutput.py +++ b/hagelslag/data/ModelOutput.py @@ -12,7 +12,6 @@ import numpy as np from scipy.spatial import cKDTree from scipy.ndimage import gaussian_filter -from pyproj import Proj class ModelOutput(object): """ diff --git a/hagelslag/processing/EnsembleProducts.py b/hagelslag/processing/EnsembleProducts.py index 5c94a97..96bc41d 100644 --- a/hagelslag/processing/EnsembleProducts.py +++ b/hagelslag/processing/EnsembleProducts.py @@ -4,23 +4,16 @@ import pandas as pd from scipy.ndimage import gaussian_filter from scipy.signal import fftconvolve -from scipy.stats import gamma, bernoulli +from scipy.stats import gamma from scipy.interpolate import interp1d from scipy.spatial import cKDTree from skimage.morphology import disk -from netCDF4 import Dataset, date2num, num2date -import os +from netCDF4 import Dataset, num2date from glob import glob from os.path import join, exists import json from datetime import timedelta - - -try: - from ncepgrib2 import Grib2Encode - grib_support = True -except ImportError("ncepgrib2 not available"): - grib_support = False +from grib2io import Grib2Message class EnsembleMemberProduct(object): @@ -219,6 +212,7 @@ def load_forecast_netcdf_data(self, nc_path): self.nc_patches = None return return + def quantile_match(self): """ For each storm object, get the percentiles of the enhanced watershed variable field relative to the training @@ -403,7 +397,9 @@ def encode_grib2_percentile(self): for t, time in enumerate(self.times): time_list = list(self.run_date.utctimetuple()[0:6]) if grib_objects[time] is None: - grib_objects[time] = Grib2Encode(0, np.array(grib_id_start + time_list + [2, 1], dtype=np.int32)) + #grib_objects[time] = Grib2Encode(0, np.array(grib_id_start + time_list + [2, 1], dtype=np.int32)) + grib_objects[time] = Grib2Message(discipline=0, + idsect=np.array(grib_id_start + time_list + [2, 1], dtype=np.int32)) grib_objects[time].addgrid(gdsinfo, gdtmp1) pdtmp1[8] = (time.to_pydatetime() - self.run_date).total_seconds() / 3600.0 data = self.percentile_data[:, t] / 1000.0 @@ -469,7 +465,9 @@ def encode_grib2_data(self): for t, time in enumerate(self.times): time_list = list(self.run_date.utctimetuple()[0:6]) if grib_objects[time] is None: - grib_objects[time] = Grib2Encode(0, np.array(grib_id_start + time_list + [2, 1], dtype=np.int32)) + #grib_objects[time] = Grib2Encode(0, np.array(grib_id_start + time_list + [2, 1], dtype=np.int32)) + grib_objects[time] = Grib2Message(discipline=0, + idsect=np.array(grib_id_start + time_list + [2, 1], dtype=np.int32)) grib_objects[time].addgrid(gdsinfo, gdtmp1) pdtmp1[8] = (time.to_pydatetime() - self.run_date).total_seconds() / 3600.0 data = self.data[t] / 1000.0 @@ -490,13 +488,12 @@ def write_grib2_files(self, grib_objects, path): """ for t, time in 
enumerate(self.times.to_pydatetime()): grib_objects[time].end() - filename = path + "{0}_{1}_{2}_{3}_{4}.grib2".format(self.ensemble_name, + filename = join(path, "{0}_{1}_{2}_{3}_{4}.grib2".format(self.ensemble_name, self.member, self.model_name.replace(" ", "-"), self.variable, self.run_date.strftime("%Y%m%d%H") + "f{0:02d}".format(self.forecast_hours[t]) - ) - fo = open(filename, "wb") - fo.write(grib_objects[time].msg) - fo.close() + )) + with open(filename, "wb") as fo: + fo.write(grib_objects[time]._msg) diff --git a/hagelslag/processing/Watershed.py b/hagelslag/processing/Watershed.py index 16b6b66..6e25308 100644 --- a/hagelslag/processing/Watershed.py +++ b/hagelslag/processing/Watershed.py @@ -1,4 +1,4 @@ -from skimage.morphology import watershed +from skimage.segmentation import watershed from scipy.ndimage import label, find_objects import numpy as np diff --git a/requirements.txt b/requirements.txt index 486ac44..a69d52f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,3 @@ scikit-image pygrib arrow>=0.8 cython -ncepgrib2 diff --git a/test/test_data_MRMSGrid.py b/test/test_data_MRMSGrid.py index 94bdcc2..64fbccc 100644 --- a/test/test_data_MRMSGrid.py +++ b/test/test_data_MRMSGrid.py @@ -11,7 +11,7 @@ def setUp(self): self.mrms = MRMSGrid(self.start_date, self.end_date, self.variable, self.path) def test_constructor(self): - self.assertEquals(self.mrms.all_dates.size, 22, "Number of dates is wrong") + self.assertEqual(self.mrms.all_dates.size, 22, "Number of dates is wrong") self.assertIsNone(self.mrms.data, "Data already loaded") self.assertIsNone(self.mrms.valid_dates, "Valid dates already loaded") From c312c37734624e51b8f62b4e1d5b3071620f9953 Mon Sep 17 00:00:00 2001 From: David John Gagne Date: Wed, 22 Sep 2021 10:16:37 -0600 Subject: [PATCH 05/15] Updated requirements list and version. --- requirements.txt | 6 ++++++ setup.py | 6 ++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index a69d52f..e63baa4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,9 @@ scikit-image pygrib arrow>=0.8 cython +grib2io +s3fs +xarray +sphinx +pytest + diff --git a/setup.py b/setup.py index 76c1ad1..07508cd 100644 --- a/setup.py +++ b/setup.py @@ -7,8 +7,10 @@ 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', ] on_rtd = os.environ.get('READTHEDOCS', None) == 'True' @@ -22,7 +24,7 @@ pkg_description = "Hagelslag is a Python package for storm-based analysis, forecasting, and evaluation." setup(name="hagelslag", - version="0.4.0b1", + version="0.4.1b1", description="Object-based severe weather forecast system", author="David John Gagne", author_email="dgagne@ucar.edu", From ba1102309434813e22aef23dcc0d73f8e3167485 Mon Sep 17 00:00:00 2001 From: David John Gagne Date: Wed, 29 Sep 2021 11:04:11 -0600 Subject: [PATCH 06/15] Updated environment.yml to install hagelslag too --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index 4113cb1..f360e5c 100644 --- a/environment.yml +++ b/environment.yml @@ -25,3 +25,4 @@ dependencies: - mock - pip: - grib2io + - . 
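
With the `- .` entry under pip, `conda env create -f environment.yml` also
pip-installs the local hagelslag source tree into the new environment. A quick
way to confirm the install worked (a minimal check, assuming the command was run
from the repository root and the environment is active) is to import a couple of
the modules touched elsewhere in this series:

    # smoke test for the environment created from environment.yml
    from hagelslag.evaluation import DistributedROC, DistributedReliability
    from hagelslag.processing.EnsembleProducts import EnsembleMemberProduct

    print("hagelslag import OK:", DistributedROC.__name__,
          DistributedReliability.__name__, EnsembleMemberProduct.__name__)
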
From b0bb9a6c0c9a13e449e9695c4a6b687c5c767d2a Mon Sep 17 00:00:00 2001 From: David John Gagne Date: Wed, 29 Sep 2021 11:09:45 -0600 Subject: [PATCH 07/15] move workflow to github actions folder --- .../workflows/python-package-conda.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename python-package-conda.yml => .github/workflows/python-package-conda.yml (100%) diff --git a/python-package-conda.yml b/.github/workflows/python-package-conda.yml similarity index 100% rename from python-package-conda.yml rename to .github/workflows/python-package-conda.yml From d0ca3455245a569774ef858e6afd289e6680f304 Mon Sep 17 00:00:00 2001 From: David John Gagne Date: Wed, 29 Sep 2021 11:33:50 -0600 Subject: [PATCH 08/15] Added jasper to environment to help with grib2io install --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index f360e5c..64c50fe 100644 --- a/environment.yml +++ b/environment.yml @@ -23,6 +23,7 @@ dependencies: - cython - sphinx - mock + - jasper - pip: - grib2io - . From ecd9d035ad40a431153bf45d114f55e8f5293222 Mon Sep 17 00:00:00 2001 From: David John Gagne Date: Wed, 29 Sep 2021 13:12:52 -0600 Subject: [PATCH 09/15] Trying to install jasper with apt-get command in CI --- .github/workflows/python-package-conda.yml | 3 +++ environment.yml | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index 7bae7e2..252f1bc 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -18,6 +18,9 @@ jobs: run: | # $CONDA is an environment variable pointing to the root of the miniconda directory echo $CONDA/bin >> $GITHUB_PATH + - name: Install grib dependencies + run: | + sudo apt-get install -y jasper - name: Install dependencies run: | conda env update --file environment.yml --name base diff --git a/environment.yml b/environment.yml index 64c50fe..f360e5c 100644 --- a/environment.yml +++ b/environment.yml @@ -23,7 +23,6 @@ dependencies: - cython - sphinx - mock - - jasper - pip: - grib2io - . 
From 35cb9482bb63c002ded8d7f0210a59d0c0bf9ff6 Mon Sep 17 00:00:00 2001 From: David John Gagne Date: Wed, 29 Sep 2021 14:00:31 -0600 Subject: [PATCH 10/15] update with new package names to install --- .github/workflows/python-package-conda.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index 252f1bc..0c5ceec 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -20,7 +20,9 @@ jobs: echo $CONDA/bin >> $GITHUB_PATH - name: Install grib dependencies run: | - sudo apt-get install -y jasper + sudo apt-get install -y libjasper-dev + sudo apt-get install -y libpng-dev + sudo apt-get install -y zlib1g-dev - name: Install dependencies run: | conda env update --file environment.yml --name base From 3f5f2bd662ea155b908544a29329149c5d4b54a8 Mon Sep 17 00:00:00 2001 From: David John Gagne Date: Wed, 29 Sep 2021 14:01:47 -0600 Subject: [PATCH 11/15] removed libjasper --- .github/workflows/python-package-conda.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index 0c5ceec..fecd4e2 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -20,7 +20,6 @@ jobs: echo $CONDA/bin >> $GITHUB_PATH - name: Install grib dependencies run: | - sudo apt-get install -y libjasper-dev sudo apt-get install -y libpng-dev sudo apt-get install -y zlib1g-dev - name: Install dependencies From 0ef91f39a8e5ef7c0bf842fc0d688eac94b4126f Mon Sep 17 00:00:00 2001 From: David John Gagne Date: Wed, 29 Sep 2021 14:02:14 -0600 Subject: [PATCH 12/15] jasper in env --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index f360e5c..64c50fe 100644 --- a/environment.yml +++ b/environment.yml @@ -23,6 +23,7 @@ dependencies: - cython - sphinx - mock + - jasper - pip: - grib2io - . From 19f021ff0cc1e12e47205a8d931a42d8375291d4 Mon Sep 17 00:00:00 2001 From: "djgagne@ou.edu" Date: Wed, 29 Sep 2021 17:00:24 -0600 Subject: [PATCH 13/15] Added way to install libjasper with ubuntu --- .github/workflows/python-package-conda.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index fecd4e2..b5db039 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -20,8 +20,10 @@ jobs: echo $CONDA/bin >> $GITHUB_PATH - name: Install grib dependencies run: | - sudo apt-get install -y libpng-dev - sudo apt-get install -y zlib1g-dev + sudo add-apt-repository "deb http://security.ubuntu.com/ubuntu xenial-security main" + sudo apt update + sudo apt install libjasper1 libjasper-dev + ldconfig -p | grep libjasper - name: Install dependencies run: | conda env update --file environment.yml --name base From 6f3b7e54f8776e45a6e757d8cd51d1209239f0b2 Mon Sep 17 00:00:00 2001 From: David John Gagne Date: Thu, 30 Sep 2021 09:10:57 -0600 Subject: [PATCH 14/15] Cleaned up code formatting with auto-format in PyCharm. 
--- bin/hsdata | 111 ++++---- bin/hsforecast | 86 +++--- demos/eval_sspf_days.py | 17 +- demos/ncar_data_2015.config | 41 +-- demos/obj_tracking.py | 269 +++++++++--------- hagelslag/data/FV3ModelGrid.py | 20 +- hagelslag/data/GribModelGrid.py | 20 +- hagelslag/data/HREFv2ModelGrid.py | 38 +-- hagelslag/data/HRRREModelGrid.py | 46 +-- hagelslag/data/HRRRModelGrid.py | 12 +- hagelslag/data/HRRRZarrModelGrid.py | 3 +- hagelslag/data/HailForecastGrid.py | 17 +- hagelslag/data/MRMSGrid.py | 18 +- hagelslag/data/ModelGrid.py | 19 +- hagelslag/data/ModelOutput.py | 105 +++---- hagelslag/data/NCARModelGrid.py | 7 +- hagelslag/data/NCARStormEventModelGrid.py | 7 +- hagelslag/data/NCARWRF2020ModelGrid.py | 6 +- hagelslag/data/SSEFModelGrid.py | 11 +- hagelslag/data/VSEModelGrid.py | 38 +-- hagelslag/data/WRFModelGrid.py | 16 +- hagelslag/data/ZarrModelGrid.py | 7 +- hagelslag/evaluation/ContingencyTable.py | 5 +- hagelslag/evaluation/GridEvaluator.py | 33 +-- hagelslag/evaluation/MetricPlotter.py | 4 +- .../evaluation/MulticlassContingencyTable.py | 6 +- hagelslag/evaluation/NeighborEvaluator.py | 12 +- hagelslag/evaluation/ObjectEvaluator.py | 44 +-- hagelslag/evaluation/ProbabilityMetrics.py | 20 +- .../processing/EnhancedWatershedSegmenter.py | 3 +- hagelslag/processing/EnsembleProducts.py | 137 ++++----- hagelslag/processing/ObjectMatcher.py | 4 +- hagelslag/processing/STObject.py | 28 +- hagelslag/processing/TrackModeler.py | 59 ++-- hagelslag/processing/TrackProcessing.py | 84 +++--- hagelslag/processing/TrackSampler.py | 44 +-- hagelslag/processing/Watershed.py | 5 +- hagelslag/processing/tracker.py | 13 +- hagelslag/util/Config.py | 2 +- hagelslag/util/convert_mrms_grids.py | 31 +- hagelslag/util/create_model_grid_us_mask.py | 17 +- hagelslag/util/create_sector_grid_data.py | 193 +++++++------ hagelslag/util/custom_grib_table.py | 5 +- hagelslag/util/derived_vars.py | 2 +- hagelslag/util/hrefv2_symbolic_Links.py | 141 ++++----- hagelslag/util/lsr_calibration_dataset.py | 128 +++++---- hagelslag/util/make_proj_grids.py | 7 +- hagelslag/util/merge_forecast_data.py | 29 +- .../util/mrms_mesh_calibration_dataset.py | 132 +++++---- hagelslag/util/munkres.py | 50 ++-- hagelslag/util/output_tree_ensembles.py | 8 +- hagelslag/util/show_importance_ranks.py | 4 +- hagelslag/util/storm_patch_center_coords.py | 29 +- hagelslag/util/test_size_distributions.py | 27 +- hagelslag/util/testing_sector.py | 147 +++++----- setup.py | 13 +- 56 files changed, 1233 insertions(+), 1147 deletions(-) diff --git a/bin/hsdata b/bin/hsdata index a7d9f69..0b32dcc 100755 --- a/bin/hsdata +++ b/bin/hsdata @@ -1,18 +1,18 @@ #!/usr/bin/env python -import argparse, pdb -from multiprocessing import Pool -from hagelslag.util.Config import Config -from hagelslag.processing.TrackProcessing import TrackProcessor -from hagelslag.util.make_proj_grids import read_ncar_map_file -from hagelslag.util.create_sector_grid_data import SectorProcessor -from datetime import timedelta -import pandas as pd -import numpy as np +import argparse import os import traceback -from netCDF4 import Dataset, date2num +from datetime import timedelta +from multiprocessing import Pool from os.path import join, exists +import numpy as np +import pandas as pd +from netCDF4 import Dataset + +from hagelslag.processing.TrackProcessing import TrackProcessor +from hagelslag.util.Config import Config + def main(): parser = argparse.ArgumentParser("hsdata - Hagelslag Data Processor") @@ -126,13 +126,13 @@ def process_ensemble_member(run_date, member, config): if 
config.train: print("Find obs tracks", run_date, member) mrms_tracks = track_proc.find_mrms_tracks() - + print("Find model tracks", run_date, member) if patch_radius is None: model_tracks = track_proc.find_model_tracks() else: model_tracks = track_proc.find_model_patch_tracks() - + if model_tracks: print(run_date, member, "Found this many model tracks: {0:d}".format(len(model_tracks))) print("Extract model attributes", run_date, member) @@ -154,62 +154,65 @@ def process_ensemble_member(run_date, member, config): track_errors = None else: track_pairings = track_proc.match_tracks(model_tracks, mrms_tracks, - unique_matches=config.unique_matches, - closest_matches=config.closest_matches) + unique_matches=config.unique_matches, + closest_matches=config.closest_matches) track_proc.match_size_distributions(model_tracks, mrms_tracks, track_pairings) track_errors = track_proc.calc_track_errors(model_tracks, - mrms_tracks, - track_pairings) + mrms_tracks, + track_pairings) print("Output data", run_date, member) forecast_data = make_forecast_track_data(model_tracks, run_date, member, - config, track_proc.model_grid.proj, mrms_tracks, track_errors) + config, track_proc.model_grid.proj, mrms_tracks, + track_errors) if patch_radius is not None: print("Output netCDF", run_date, member) forecast_track_patches_to_netcdf(model_tracks, patch_radius, run_date, member, config) if config.json: print("Output json", run_date, member) forecast_tracks_to_json(model_tracks, run_date, member, config, track_proc.model_grid.proj, - observed_tracks=mrms_tracks, - track_errors=track_errors) + observed_tracks=mrms_tracks, + track_errors=track_errors) obs_data = make_obs_track_data(mrms_tracks, member, run_date, config, track_proc.model_grid.proj) if config.json: obs_tracks_to_json(mrms_tracks, member, run_date, config, track_proc.model_grid.proj) print("Output csv", run_date, member) for table_name, table_data in obs_data.items(): csv_filename = join(config.csv_path, "{0}_{1}_{2}_{3}.csv".format(table_name, - "obs", - member, - run_date.strftime( - config.run_date_format))) + "obs", + member, + run_date.strftime( + config.run_date_format))) table_data.to_csv(csv_filename, - na_rep="nan", - float_format="%0.5f", - index=False) + na_rep="nan", + float_format="%0.5f", + index=False) os.chmod(csv_filename, 0o666) else: forecast_data = make_forecast_track_data(model_tracks, run_date, member, config, - track_proc.model_grid.proj) + track_proc.model_grid.proj) if patch_radius is not None: forecast_track_patches_to_netcdf(model_tracks, patch_radius, run_date, member, config) if config.json: forecast_tracks_to_json(model_tracks, run_date, member, config, track_proc.model_grid.proj) elif len(model_tracks) > 0: print(run_date, member, "Make Forecast Track Data") - forecast_data = make_forecast_track_data(model_tracks, run_date, member, config, track_proc.model_grid.proj) + forecast_data = make_forecast_track_data(model_tracks, run_date, member, config, + track_proc.model_grid.proj) if patch_radius is not None: print(run_date, member, "Track Data to netCDF") forecast_track_patches_to_netcdf(model_tracks, patch_radius, run_date, member, config) if config.json: forecast_tracks_to_json(model_tracks, run_date, member, config, track_proc.model_grid.proj) else: - print('No {0} {1} modeled tracks found'.format(run_date,member)) + print('No {0} {1} modeled tracks found'.format(run_date, member)) forecast_data = {} for table_name, table_data in forecast_data.items(): csv_filename = join(config.csv_path, 
"{0}_{1}_{2}_{3}.csv".format(table_name, - config.ensemble_name, - member, - run_date.strftime(config.run_date_format))) + config.ensemble_name, + member, + run_date.strftime( + config.run_date_format))) print("Output csv file " + csv_filename) table_data.to_csv(csv_filename, na_rep="nan", @@ -264,7 +267,7 @@ def process_observed_tracks(run_date, member, config): mrms_tracks = track_proc.find_mrms_tracks() if len(mrms_tracks) > 0: obs_data = make_obs_track_data(mrms_tracks, member, run_date, config, track_proc.model_grid.proj) - if config.json: + if config.json: obs_tracks_to_json(mrms_tracks, member, run_date, config, track_proc.model_grid.proj) # if not os.access(config.csv_path + run_date.strftime("%Y%m%d"), os.R_OK): # try: @@ -339,9 +342,9 @@ def rematch_ensemble_tracks(run_date, member, config): ) for table_name, table_data in obs_data.items(): table_data.to_csv(join(config.csv_path, "{0}_{1}_{2}_{3}.csv".format(table_name, - "obs", - member, - run_date.strftime("%Y%m%d"))), + "obs", + member, + run_date.strftime("%Y%m%d"))), na_rep="nan", float_format="%0.5f", index=False) @@ -351,9 +354,9 @@ def rematch_ensemble_tracks(run_date, member, config): forecast_data = {} for table_name, table_data in forecast_data.items(): table_data.to_csv(join(config.csv_path, "{0}_{1}_{2}_{3}.csv".format(table_name, - config.ensemble_name, - member, - run_date.strftime("%Y%m%d"))), + config.ensemble_name, + member, + run_date.strftime("%Y%m%d"))), na_rep="nan", float_format="%0.5f", index=False) @@ -483,11 +486,11 @@ def make_forecast_track_data(forecast_tracks, run_date, member, config, proj, ob else: if forecast_track.observations is not None: num_labels = len(forecast_track.observations) - for l in range(num_labels): + for label in range(num_labels): if config.label_type == "gamma": - hail_label = forecast_track.observations[l].loc[step].values.tolist() + hail_label = forecast_track.observations[label].loc[step].values.tolist() else: - hail_label = [forecast_track.observations[l].loc[step, "Max_Hail_Size"]] + hail_label = [forecast_track.observations[label].loc[step, "Max_Hail_Size"]] forecast_data['track_step'].loc[track_step_count] = record + hail_label track_step_count += 1 else: @@ -545,11 +548,11 @@ def forecast_tracks_to_json(forecast_tracks, run_date, member, config, proj, obs except OSError: print("directory already created") - json_filename = config.geojson_path + "/".join(full_path) + \ - "/{0}_{1}_{2}_model_track_{3:03d}.json".format(ensemble_name, - run_date.strftime("%Y%m%d"), - member, - f) + json_filename = join(config.geojson_path, "/".join(full_path), + "{0}_{1}_{2}_model_track_{3:03d}.json".format(ensemble_name, + run_date.strftime("%Y%m%d"), + member, + f)) json_metadata = dict(id=track_id, ensemble_name=ensemble_name, ensemble_member=member, @@ -662,7 +665,7 @@ def forecast_track_patches_to_netcdf(forecast_tracks, patch_radius, run_date, me if hasattr(config, "future_variables"): for f_variable in config.future_variables: out_file.variables[f_variable + "_future"][:] = np.vstack([f_track.attributes[f_variable + "-future"] - for f_track in forecast_tracks]) + for f_track in forecast_tracks]) if config.train: for label_column in label_columns: try: @@ -672,12 +675,12 @@ def forecast_track_patches_to_netcdf(forecast_tracks, patch_radius, run_date, me out_file.variables[label_column][:] = 0 # Save configuration dictionary as global attributes. 
- for k,v in config.__dict__.items(): + for k, v in config.__dict__.items(): # Don't save attributes that are already netCDF varaible names - if k=="dates": continue - if k=="storm_variables": continue - if k=="potential_variables": continue - if k=="tendency_variables": continue + if k == "dates": continue + if k == "storm_variables": continue + if k == "potential_variables": continue + if k == "tendency_variables": continue v = str(v) # Don't clobber existing attribute. if hasattr(out_file, k): @@ -685,7 +688,7 @@ def forecast_track_patches_to_netcdf(forecast_tracks, patch_radius, run_date, me alt_key = "config_" + k if hasattr(out_file, alt_key): # If alternative key exists too, raise attribute error. - raise AttributeError("Can't save "+k+":"+" "+v+" as global attribute. It already exists.") + raise AttributeError("Can't save " + k + ":" + " " + v + " as global attribute. It already exists.") else: setattr(out_file, alt_key, v) else: diff --git a/bin/hsforecast b/bin/hsforecast index aba3219..02a4263 100755 --- a/bin/hsforecast +++ b/bin/hsforecast @@ -1,18 +1,12 @@ #!/usr/bin/env python import argparse -from netCDF4 import Dataset -from hagelslag.util.Config import Config -from hagelslag.processing.TrackModeler import TrackModeler -from multiprocessing import Pool, Manager -from hagelslag.processing.EnsembleProducts import * -from hagelslag.util.make_proj_grids import read_ncar_map_file -from scipy.ndimage import gaussian_filter -import pandas as pd -import numpy as np -import traceback -from datetime import timedelta -from glob import glob import os +import traceback +from multiprocessing import Pool + +from hagelslag.processing.EnsembleProducts import * +from hagelslag.processing.TrackModeler import TrackModeler +from hagelslag.util.Config import Config def main(): @@ -48,11 +42,11 @@ def main(): if args.train: train_models(track_modeler, config) training_data_percentiles(config.ensemble_members, - config.ensemble_name, - config.watershed_variable, - config.train_data_path, - config.size_dis_training_path, - config.weighting_function) + config.ensemble_name, + config.watershed_variable, + config.train_data_path, + config.size_dis_training_path, + config.weighting_function) if args.fore: forecasts = make_forecasts(track_modeler, config) output_forecasts_csv(forecasts, track_modeler, config) @@ -106,9 +100,9 @@ def make_forecasts(track_modeler, config): print("Size Distribution Forecasts") forecasts["dist"] = track_modeler.predict_size_distribution_models(config.size_distribution_model_names, - config.size_distribution_input_columns, - config.metadata_columns, - location=config.size_distribution_loc) + config.size_distribution_input_columns, + config.metadata_columns, + location=config.size_distribution_loc) return forecasts @@ -156,13 +150,13 @@ def generate_ml_grids(config, mode="forecast"): """ pool = Pool(config.num_procs) run_dates = pd.date_range(start=config.start_dates[mode], - end=config.end_dates[mode], - freq='1D') + end=config.end_dates[mode], + freq='1D') ml_model_list = config.size_distribution_model_names print(ml_model_list) print() print('Size Distribution coming from: {0}'.format(config.size_dis_training_path)) - print() + print() ml_var = "hail" for run_date in run_dates: @@ -218,7 +212,7 @@ def generate_ml_member_grid(ensemble_name, model_names, member, run_date, variab ep.load_forecast_csv_data(forecast_csv_path) ep.load_forecast_netcdf_data(netcdf_path) ep.quantile_match() - #ep.load_data(num_samples=num_samples, percentiles=ml_grid_percentiles) + # 
ep.load_data(num_samples=num_samples, percentiles=ml_grid_percentiles) grib_objects = ep.encode_grib2_data() if grib_objects is None: return @@ -236,9 +230,9 @@ def generate_ml_member_grid(ensemble_name, model_names, member, run_date, variab return -def training_data_percentiles(member_list, ensemble_name, watershed_obj, csv_data_path, - csv_outpath, weighting_function=None, - percentiles=np.linspace(0.1, 99.9, 100)): +def training_data_percentiles(member_list, ensemble_name, watershed_obj, csv_data_path, + csv_outpath, weighting_function=None, + percentiles=np.linspace(0.1, 99.9, 100)): """ Creates watershed object distribution of sizes for a given set of training netcdf patches @@ -257,52 +251,54 @@ def training_data_percentiles(member_list, ensemble_name, watershed_obj, csv_dat ensemble_obj_data = [] ######################### - #Individual Member Distributions + # Individual Member Distributions ######################### for member in member_list: member_obj_data = [] - member_files = sorted(glob(csv_data_path+'*step*{0}*{1}*.csv'.format(ensemble_name,member))) - dataset = pd.concat(map(pd.read_csv, member_files),ignore_index=True,sort='True') + member_files = sorted(glob(csv_data_path + '*step*{0}*{1}*.csv'.format(ensemble_name, member))) + dataset = pd.concat(map(pd.read_csv, member_files), ignore_index=True, sort=True) match_inds = np.where(dataset["Matched"] == 1)[0] - match_dataset = dataset.loc[match_inds,:] + match_dataset = dataset.loc[match_inds, :] if weighting_function is None: - objs_stat = match_dataset.loc[:,"{0}_max".format(watershed_obj)].dropna() + objs_stat = match_dataset.loc[:, "{0}_max".format(watershed_obj)].dropna() member_obj_data.append(objs_stat) else: training_weights = weighting_function(match_dataset) weight_inds = np.where(training_weights >= 1.0)[0] - weights_objs_stat = match_dataset.reindex(index=weight_inds,columns=["{0}_max".format(watershed_obj)]).dropna() + weights_objs_stat = match_dataset.reindex(index=weight_inds, + columns=["{0}_max".format(watershed_obj)]).dropna() member_obj_data.append(weights_objs_stat) if member_obj_data: obj_values = np.concatenate(member_obj_data) - print('{0} {1} model objects: {2}'.format(watershed_obj,member,obj_values.shape[0])) - print() + print('{0} {1} model objects: {2}'.format(watershed_obj, member, obj_values.shape[0])) + print() obj_percent_val = np.percentile(obj_values, percentiles) - dataframe_values = {'Obj_Values':obj_percent_val, 'Percentiles':percentiles} + dataframe_values = {'Obj_Values': obj_percent_val, 'Percentiles': percentiles} obj_value_percentile = pd.DataFrame(data=dataframe_values) if csv_outpath: - csv_path = csv_outpath+'{0}_{1}_{2}_Size_Distribution.csv'.format(ensemble_name, - watershed_obj,member) + csv_path = csv_outpath + '{0}_{1}_{2}_Size_Distribution.csv'.format(ensemble_name, + watershed_obj, member) obj_value_percentile.to_csv(csv_path, index=False) print('Output csv: {0}'.format(csv_path)) ensemble_obj_data.append(obj_values) - + ######################### - #Ensemble Distribution + # Ensemble Distribution ######################### if ensemble_obj_data: ens_obj_values = [obj for member in ensemble_obj_data for obj in member] - print() - print('Total ensemble {0} model objects: {1}'.format(watershed_obj,np.shape(ens_obj_values)[0])) + print() + print('Total ensemble {0} model objects: {1}'.format(watershed_obj, np.shape(ens_obj_values)[0])) ens_obj_percent_val = np.percentile(ens_obj_values, percentiles) - dataframe_values = {'Obj_Values':ens_obj_percent_val, 
'Percentiles':percentiles} + dataframe_values = {'Obj_Values': ens_obj_percent_val, 'Percentiles': percentiles} ens_obj_value_percentile = pd.DataFrame(data=dataframe_values) if csv_outpath: - csv_path = csv_outpath+'{0}_{1}_Size_Distribution.csv'.format(ensemble_name, - watershed_obj) + csv_path = csv_outpath + '{0}_{1}_Size_Distribution.csv'.format(ensemble_name, + watershed_obj) ens_obj_value_percentile.to_csv(csv_path, index=False) print('Output csv: {0}'.format(csv_path)) return + if __name__ == "__main__": main() diff --git a/demos/eval_sspf_days.py b/demos/eval_sspf_days.py index ee7e5bf..8343a70 100644 --- a/demos/eval_sspf_days.py +++ b/demos/eval_sspf_days.py @@ -1,14 +1,11 @@ -import pandas as pd -import numpy as np -import matplotlib.pyplot as plt -from mpl_toolkits.basemap import Basemap -from hagelslag.evaluation import DistributedROC, DistributedReliability -from scipy.ndimage import convolve -from glob import glob -import xarray as xr import os from os.path import join +import numpy as np +import pandas as pd + +from hagelslag.evaluation import DistributedROC, DistributedReliability + eval_path = "/glade/p/work/dgagne/ncar_coarse_neighbor_eval_2016_s_2/" eval_files = sorted(os.listdir(eval_path)) eval_test = pd.read_csv(join(eval_path, eval_files[0])) @@ -29,10 +26,10 @@ obs = eval_data.loc[us_mask, "MESH_Max_60min_00.50_{0:2d}".format(thresh)] for model in models: brier[thresh].loc[run_dates[ev], model] = DistributedReliability(thresholds=prob_thresholds) - brier[thresh].loc[run_dates[ev], model].update(eval_data.loc[us_mask, model], + brier[thresh].loc[run_dates[ev], model].update(eval_data.loc[us_mask, model], obs) roc[thresh].loc[run_dates[ev], model] = DistributedROC(thresholds=prob_thresholds) - roc[thresh].loc[run_dates[ev], model].update(eval_data.loc[us_mask, model], + roc[thresh].loc[run_dates[ev], model].update(eval_data.loc[us_mask, model], obs) out_path = "/glade/p/work/dgagne/ncar_coarse_neighbor_scores_2016/" for thresh in [25, 50, 75]: diff --git a/demos/ncar_data_2015.config b/demos/ncar_data_2015.config index bc2670c..bbe28f2 100644 --- a/demos/ncar_data_2015.config +++ b/demos/ncar_data_2015.config @@ -1,36 +1,37 @@ #!/usr/bin/env python -from hagelslag.processing.ObjectMatcher import shifted_centroid_distance, start_time_distance -from hagelslag.processing.ObjectMatcher import mean_minimum_centroid_distance, mean_area_distance, \ - duration_distance, start_centroid_distance,nonoverlap,mean_min_time_distance -import pandas as pd import numpy as np +import pandas as pd + +from hagelslag.processing.ObjectMatcher import mean_minimum_centroid_distance, duration_distance, nonoverlap, \ + mean_min_time_distance scratch_path = "/glade/p/nmmm0001/sobash/OBJECT_TRACK/" date_index = pd.DatetimeIndex(start="2015-06-20T00:00", end="2015-06-20T00:00", freq="1D") config = dict(dates=date_index.to_pydatetime(), start_hour=12, end_hour=36, - #watershed_variable="GRPL_MAX", + # watershed_variable="GRPL_MAX", watershed_variable="UP_HELI_MAX", ensemble_name="NCAR", - ensemble_members=["mem{0:d}".format(m) for m in range(1,11)], + ensemble_members=["mem{0:d}".format(m) for m in range(1, 11)], model_path="/glade/scratch/sobash/RT2015/", - model_watershed_params=(25, 5, 250, 200, 200), #decreasing the size threshold results in more broken tracks - #model_watershed_params=(6, 1, 100, 100, 100), - #size_filter=10, + model_watershed_params=(25, 5, 250, 200, 200), + # decreasing the size threshold results in more broken tracks + # model_watershed_params=(6, 1, 100, 100, 100), + # 
size_filter=10, size_filter=0, gaussian_window=0, - #gaussian_window=1, + # gaussian_window=1, mrms_path=None, mrms_variable="MRMS_RotationTrackML60min_00.50", mrms_watershed_params=(2, 1, 50, 100, 100), object_matcher_params=( [nonoverlap], np.array([1.0]), - np.array([24000])), #used 24000 for 11/16 - #object_matcher_params=([shifted_centroid_distance], np.array([1.0]), + np.array([24000])), # used 24000 for 11/16 + # object_matcher_params=([shifted_centroid_distance], np.array([1.0]), # np.array([24000])), - #object_matcher_params=( + # object_matcher_params=( # [shifted_centroid_distance,nonoverlap], # np.array([0.5,0.5]), # np.array([24000,24000])), #used 24000 for 11/16 @@ -38,15 +39,17 @@ config = dict(dates=date_index.to_pydatetime(), [mean_minimum_centroid_distance, mean_min_time_distance, duration_distance], np.array([0.4, 0.3, 0.3]), np.array([500000, 1, 6])), - #storm_variables=["UP_HELI_MAX", "GRPL_MAX", "W_UP_MAX", "W_DN_MAX", "HAIL_MAX2D", "HAIL_MAXK1", + # storm_variables=["UP_HELI_MAX", "GRPL_MAX", "W_UP_MAX", "W_DN_MAX", "HAIL_MAX2D", "HAIL_MAXK1", # "LTG3_MAX", "RVORT1_MAX", "UP_HELI_MAX03", "UP_HELI_MIN", "WSPD10MAX"], - storm_variables=["UP_HELI_MAX", "UP_HELI_MAX03", "RVORT1_MAX", "GRPL_MAX", "W_UP_MAX", "W_DN_MAX", "WSPD10MAX", "T2", "Q2"], - potential_variables=["CAPE_SFC", "MLCAPE", "MUCAPE", "CIN_SFC", "MLCIN", "LCL_HEIGHT", "SRH1", "SRH3", "UBSHR1", "VBSHR1", "UBSHR6", "VBSHR6", "T2", "Q2", "PREC_ACC_NC"], + storm_variables=["UP_HELI_MAX", "UP_HELI_MAX03", "RVORT1_MAX", "GRPL_MAX", "W_UP_MAX", "W_DN_MAX", + "WSPD10MAX", "T2", "Q2"], + potential_variables=["CAPE_SFC", "MLCAPE", "MUCAPE", "CIN_SFC", "MLCIN", "LCL_HEIGHT", "SRH1", "SRH3", + "UBSHR1", "VBSHR1", "UBSHR6", "VBSHR6", "T2", "Q2", "PREC_ACC_NC"], tendency_variables=[], - #shape_variables=["area", "eccentricity", "major_axis_length", "minor_axis_length", "orientation", + # shape_variables=["area", "eccentricity", "major_axis_length", "minor_axis_length", "orientation", # "extent"] + ["weighted_moments_hu_{0:d}".format(h) for h in range(7)], - shape_variables=["area","major_axis_length","minor_axis_length"], - #variable_statistics=["mean", "max", "min", "std", "mean_dt", "max_dt"], + shape_variables=["area", "major_axis_length", "minor_axis_length"], + # variable_statistics=["mean", "max", "min", "std", "mean_dt", "max_dt"], variable_statistics=["max", "mean", "min"], csv_path=scratch_path + "track_data_ncar_2015_csv_UH25_NEW/", geojson_path=scratch_path + "track_data_ncar_2015_json/", diff --git a/demos/obj_tracking.py b/demos/obj_tracking.py index 1cf1080..3b20a2a 100755 --- a/demos/obj_tracking.py +++ b/demos/obj_tracking.py @@ -9,41 +9,40 @@ # This module demonstrates how data science tools from the image processing and machine learning families can be used to create a forecast of severe hail. It aims to teach the advantages, challenges, and limitations of these tools through hands-on interaction. 
# -import numpy as np -import matplotlib.pyplot as plt -from matplotlib.patches import Polygon, PathPatch -from matplotlib.collections import PatchCollection -from datetime import datetime, timedelta -from scipy.ndimage import gaussian_filter, find_objects +import argparse +import os from copy import deepcopy -import pdb, sys, argparse, os - +from datetime import datetime, timedelta -# In[2]: +import matplotlib.pyplot as plt +import numpy as np +from scipy.ndimage import gaussian_filter, find_objects -from hagelslag.processing.EnhancedWatershedSegmenter import EnhancedWatershed from hagelslag.data import ModelOutput -from hagelslag.processing.ObjectMatcher import ObjectMatcher, closest_distance from hagelslag.processing import STObject +from hagelslag.processing.EnhancedWatershedSegmenter import EnhancedWatershed +from hagelslag.processing.ObjectMatcher import ObjectMatcher, closest_distance + +# In[2]: parser = argparse.ArgumentParser(description='object tracker') parser.add_argument('-m', '--member', type=str, help='member description (e.g. 1km_pbl7)', default='1km_on_3km_pbl1') parser.add_argument('-d', '--date', help='date yyyymmddhh', default='2005011312') parser.add_argument('-f', '--field', default='UP_HELI_MAX03', help='field in which to find objects') -parser.add_argument('-t','--timethresh', type=int, default=3, help='time threshold (hours)') -parser.add_argument('-v','--verbose', action="store_true", help='print more output. useful for debugging') +parser.add_argument('-t', '--timethresh', type=int, default=3, help='time threshold (hours)') +parser.add_argument('-v', '--verbose', action="store_true", help='print more output. useful for debugging') args = parser.parse_args() -if args.verbose: +if args.verbose: print(args) odir = '/glade/p/work/ahijevyc/hagelslag/out/' model_path = "/glade/scratch/ahijevyc/VSE/" ensemble_name = "NCAR" -run_date = datetime.strptime(args.date,'%Y%m%d%H') +run_date = datetime.strptime(args.date, '%Y%m%d%H') member = args.member field = args.field -start_date = run_date + timedelta(hours=10) # remember first time is usually all zeros +start_date = run_date + timedelta(hours=10) # remember first time is usually all zeros # Attributes: # min_thresh (int): minimum pixel value for pixel to be part of a region @@ -54,25 +53,29 @@ # From ahij's config file. 
if field == "MAX_UPDRAFT_HELICITY" or field == "UP_HELI_MAX03": - params = {"min_thresh":75, "step":5, "max_thresh":250, "max_size":50, "delta":75, "min_size":1, "filter_size":0} + params = {"min_thresh": 75, "step": 5, "max_thresh": 250, "max_size": 50, "delta": 75, "min_size": 1, + "filter_size": 0} if field == "HAIL2D": - params = {"min_thresh":0.025, "step":0.005, "max_thresh":0.1, "max_size":150, "delta":75, "min_size":0, "filter_size":1} -levels = params['min_thresh'] * np.arange(1,8) + params = {"min_thresh": 0.025, "step": 0.005, "max_thresh": 0.1, "max_size": 150, "delta": 75, "min_size": 0, + "filter_size": 1} +levels = params['min_thresh'] * np.arange(1, 8) levels = np.append(levels, params['min_thresh'] * 15) -model_watershed_params = (params['min_thresh'],params['step'],params['max_thresh'],params["max_size"],params["delta"]) +model_watershed_params = ( +params['min_thresh'], params['step'], params['max_thresh'], params["max_size"], params["delta"]) end_date = start_date + timedelta(hours=0) from netCDF4 import Dataset -model_grid = ModelOutput(ensemble_name, - member, - run_date, - field, - start_date, + +model_grid = ModelOutput(ensemble_name, + member, + run_date, + field, + start_date, end_date, model_path, single_step=True) -model_map_file="/glade/p/work/ahijevyc/hagelslag/mapfiles/VSE.txt" +model_map_file = "/glade/p/work/ahijevyc/hagelslag/mapfiles/VSE.txt" model_grid.load_map_info(model_map_file) model_grid.data = [] @@ -80,88 +83,95 @@ d = start_date deltat = timedelta(minutes=60) + def add_grid(m): - m.drawstates() - m.drawcoastlines(linewidth=0.5) - parallels = np.arange(0.,81.,1.) - m.drawparallels(parallels,labels=[True,False,False,False],linewidth=0.5) - meridians = np.arange(0.,351.,2.) - m.drawmeridians(meridians,labels=[False,False,False,True],linewidth=0.5) - return m - + m.drawstates() + m.drawcoastlines(linewidth=0.5) + parallels = np.arange(0., 81., 1.) + m.drawparallels(parallels, labels=[True, False, False, False], linewidth=0.5) + meridians = np.arange(0., 351., 2.) + m.drawmeridians(meridians, labels=[False, False, False, True], linewidth=0.5) + return m + while d <= end_date: - fhr = (d - run_date).total_seconds()/3600 - # list of potential paths to diagnostic files - dfiles = [model_path+member+run_date.strftime("/%Y%m%d%H")+"/wrf/wrfout_d01_"+d.strftime("%Y-%m-%d_%H:%M:%S"), - model_path+member+run_date.strftime("/%Y%m%d%H")+"/wrf/diags_d01."+d.strftime("%Y-%m-%d_%H:%M:%S")+".nc", - model_path+member+run_date.strftime("/%Y%m%d%H")+"/post_AGAIN/"+'fhr_%d'%fhr+"/WRFTWO"+'%02d'%fhr+".nc", - model_path+member+run_date.strftime("/%Y%m%d%H")+"/wrf/vse_d01."+d.strftime("%Y-%m-%d_%H:%M:%S")+".nc"] - for dfile in dfiles: - # see if each path exists - if not os.path.isfile(dfile): - continue - # If it does, see if 'field' is a variable. - ncf = Dataset(dfile) - if field in ncf.variables: - print(dfile) - model_grid.data.append(ncf.variables[field][0,:,:]) - ncf.close() - break - ncf.close() - d += deltat - -print(model_grid.lon.shape, np.maximum.reduce(model_grid.data).shape) # max across time dimension + fhr = (d - run_date).total_seconds() / 3600 + # list of potential paths to diagnostic files + dfiles = [ + model_path + member + run_date.strftime("/%Y%m%d%H") + "/wrf/wrfout_d01_" + d.strftime("%Y-%m-%d_%H:%M:%S"), + model_path + member + run_date.strftime("/%Y%m%d%H") + "/wrf/diags_d01." 
+ d.strftime( + "%Y-%m-%d_%H:%M:%S") + ".nc", + model_path + member + run_date.strftime( + "/%Y%m%d%H") + "/post_AGAIN/" + 'fhr_%d' % fhr + "/WRFTWO" + '%02d' % fhr + ".nc", + model_path + member + run_date.strftime("/%Y%m%d%H") + "/wrf/vse_d01." + d.strftime( + "%Y-%m-%d_%H:%M:%S") + ".nc"] + for dfile in dfiles: + # see if each path exists + if not os.path.isfile(dfile): + continue + # If it does, see if 'field' is a variable. + ncf = Dataset(dfile) + if field in ncf.variables: + print(dfile) + model_grid.data.append(ncf.variables[field][0, :, :]) + ncf.close() + break + ncf.close() + d += deltat + +print(model_grid.lon.shape, np.maximum.reduce(model_grid.data).shape) # max across time dimension print(model_grid.data[0].max(), model_grid.data[-1].max(), np.maximum.reduce(model_grid.data).max()) -plt.figure(figsize=(10,8)) +plt.figure(figsize=(10, 8)) plt.contourf(model_grid.lon, model_grid.lat, - np.maximum.reduce(model_grid.data), # max across time dimension - levels, - extend="max", - latlon= True, - cmap="Accent") + np.maximum.reduce(model_grid.data), # max across time dimension + levels, + extend="max", + latlon=True, + cmap="Accent") plt.colorbar(shrink=0.9, fraction=0.1, ticks=levels) -title_info = plt.title(field + "\n"+member+" {0}-{1}".format(start_date.strftime("%d %b %Y %H:%M"), - end_date.strftime("%d %b %Y %H:%M")), - fontweight="bold", fontsize=14) -dtstr = "_"+member+run_date.strftime("_%Y%m%d%H") -ret = plt.savefig(odir+"uh_swaths/"+field+"_swaths"+dtstr+".png") +title_info = plt.title(field + "\n" + member + " {0}-{1}".format(start_date.strftime("%d %b %Y %H:%M"), + end_date.strftime("%d %b %Y %H:%M")), + fontweight="bold", fontsize=14) +dtstr = "_" + member + run_date.strftime("_%Y%m%d%H") +ret = plt.savefig(odir + "uh_swaths/" + field + "_swaths" + dtstr + ".png") def get_forecast_objects(model_grid, ew_params, min_size, gaussian_window): - ew = EnhancedWatershed(*ew_params) - model_objects = [] - print("Find model objects Hour:") - for h in range(int((model_grid.end_date - model_grid.start_date).total_seconds()/deltat.total_seconds())+1): - print(h) - hour_labels = ew.size_filter(ew.label(gaussian_filter(model_grid.data[h], gaussian_window)), min_size) - obj_slices = find_objects(hour_labels) - num_slices = len(obj_slices) - model_objects.append([]) - if num_slices > 0: - fig, ax = plt.subplots() - t = plt.contourf(model_grid.lon,model_grid.lat,hour_labels,np.arange(0,num_slices+1)+0.5,extend="max",cmap="Set1",latlon=True,title=str(run_date)+" "+field+" "+str(h)) - ret = plt.savefig(odir+"enh_watershed_ex/ew{0:02d}.png".format(h)) - for s, sl in enumerate(obj_slices): - model_objects[-1].append(STObject(model_grid.data[h][sl], - #np.where(hour_labels[sl] > 0, 1, 0), - # For some objects (especially long, diagonal ones), the rectangular - # slice encompasses part of other objects (i.e. non-zero elements of slice). - # We don't want them in our mask. 
- np.where(hour_labels[sl] == s+1, 1, 0), - model_grid.x[sl], - model_grid.y[sl], - model_grid.i[sl], - model_grid.j[sl], - h, - h, - dx=model_grid.dx)) - if h > 0: - dims = model_objects[-1][-1].timesteps[0].shape - model_objects[-1][-1].estimate_motion(h, model_grid.data[h-1], dims[1], dims[0]) - return model_objects + ew = EnhancedWatershed(*ew_params) + model_objects = [] + print("Find model objects Hour:") + for h in range(int((model_grid.end_date - model_grid.start_date).total_seconds() / deltat.total_seconds()) + 1): + print(h) + hour_labels = ew.size_filter(ew.label(gaussian_filter(model_grid.data[h], gaussian_window)), min_size) + obj_slices = find_objects(hour_labels) + num_slices = len(obj_slices) + model_objects.append([]) + if num_slices > 0: + fig, ax = plt.subplots() + t = plt.contourf(model_grid.lon, model_grid.lat, hour_labels, np.arange(0, num_slices + 1) + 0.5, + extend="max", cmap="Set1", latlon=True, title=str(run_date) + " " + field + " " + str(h)) + ret = plt.savefig(odir + "enh_watershed_ex/ew{0:02d}.png".format(h)) + for s, sl in enumerate(obj_slices): + model_objects[-1].append(STObject(model_grid.data[h][sl], + # np.where(hour_labels[sl] > 0, 1, 0), + # For some objects (especially long, diagonal ones), the rectangular + # slice encompasses part of other objects (i.e. non-zero elements of slice). + # We don't want them in our mask. + np.where(hour_labels[sl] == s + 1, 1, 0), + model_grid.x[sl], + model_grid.y[sl], + model_grid.i[sl], + model_grid.j[sl], + h, + h, + dx=model_grid.dx)) + if h > 0: + dims = model_objects[-1][-1].timesteps[0].shape + model_objects[-1][-1].estimate_motion(h, model_grid.data[h - 1], dims[1], dims[0]) + return model_objects + model_objects = get_forecast_objects(model_grid, model_watershed_params, params['min_size'], params['filter_size']) @@ -169,37 +179,36 @@ def get_forecast_objects(model_grid, ew_params, min_size, gaussian_window): # In[12]: def track_forecast_objects(input_model_objects, model_grid, object_matcher): - model_objects = deepcopy(input_model_objects) - hours = np.arange(int((model_grid.end_date-model_grid.start_date).total_seconds()/deltat.total_seconds()) + 1) - print("hours = ",hours) - tracked_model_objects = [] - for h in hours: - past_time_objs = [] - for obj in tracked_model_objects: - # Potential trackable objects are identified - # In other words, objects whose end_time is the previous hour - if obj.end_time == h - 1: - past_time_objs.append(obj) - # If no objects existed in the last time step, then consider objects in current time step all new - if len(past_time_objs) == 0: - print("time",h, " no objects existed in the last time step. 
consider objects in current time step all new") - tracked_model_objects.extend(deepcopy(model_objects[h])) - # Match from previous time step with current time step - elif len(past_time_objs) > 0 and len(model_objects[h]) > 0: - assignments = object_matcher.match_objects(past_time_objs, model_objects[h], h - 1, h) - print("assignments:", assignments) - unpaired = range(len(model_objects[h])) - for pair in assignments: - past_time_objs[pair[0]].extend(model_objects[h][pair[1]]) - unpaired.remove(pair[1]) - if len(unpaired) > 0: - for up in unpaired: - tracked_model_objects.append(model_objects[h][up]) - print("Tracked Model Objects: {0:03d} hour {1:2d}".format(len(tracked_model_objects), h)) - return tracked_model_objects - -#object_matcher = ObjectMatcher([shifted_centroid_distance, centroid_distance], + model_objects = deepcopy(input_model_objects) + hours = np.arange(int((model_grid.end_date - model_grid.start_date).total_seconds() / deltat.total_seconds()) + 1) + print("hours = ", hours) + tracked_model_objects = [] + for h in hours: + past_time_objs = [] + for obj in tracked_model_objects: + # Potential trackable objects are identified + # In other words, objects whose end_time is the previous hour + if obj.end_time == h - 1: + past_time_objs.append(obj) + # If no objects existed in the last time step, then consider objects in current time step all new + if len(past_time_objs) == 0: + print("time", h, " no objects existed in the last time step. consider objects in current time step all new") + tracked_model_objects.extend(deepcopy(model_objects[h])) + # Match from previous time step with current time step + elif len(past_time_objs) > 0 and len(model_objects[h]) > 0: + assignments = object_matcher.match_objects(past_time_objs, model_objects[h], h - 1, h) + print("assignments:", assignments) + unpaired = range(len(model_objects[h])) + for pair in assignments: + past_time_objs[pair[0]].extend(model_objects[h][pair[1]]) + unpaired.remove(pair[1]) + if len(unpaired) > 0: + for up in unpaired: + tracked_model_objects.append(model_objects[h][up]) + print("Tracked Model Objects: {0:03d} hour {1:2d}".format(len(tracked_model_objects), h)) + return tracked_model_objects + + +# object_matcher = ObjectMatcher([shifted_centroid_distance, centroid_distance], # np.array([dist_weight, 1-dist_weight]), np.array([max_distance] * 2)) -object_matcher = ObjectMatcher([closest_distance],np.array([1]),np.array([4*model_grid.dx])) - - +object_matcher = ObjectMatcher([closest_distance], np.array([1]), np.array([4 * model_grid.dx])) diff --git a/hagelslag/data/FV3ModelGrid.py b/hagelslag/data/FV3ModelGrid.py index e391d89..5232c80 100644 --- a/hagelslag/data/FV3ModelGrid.py +++ b/hagelslag/data/FV3ModelGrid.py @@ -1,5 +1,6 @@ #!/usr/bin/env python import numpy as np + from .GribModelGrid import GribModelGrid from .ModelGrid import ModelGrid @@ -18,24 +19,24 @@ class FV3ModelGrid(GribModelGrid): file (True) or one file containing all timesteps (False). 
""" - def __init__(self, member, run_date, variable, start_date, - end_date, path, single_step=True): + def __init__(self, member, run_date, variable, start_date, + end_date, path, single_step=True): self.path = path self.member = member filenames = [] self.forecast_hours = np.arange((start_date - run_date).total_seconds() / 3600, (end_date - run_date).total_seconds() / 3600 + 1, dtype=int) - + for forecast_hr in self.forecast_hours: - file_name=self.path+'{0}/{1}_{2}f{3:03d}.grib2'.format( - run_date.strftime("%Y%m%d"), - self.member, - run_date.strftime("%Y%m%d%H"), - forecast_hr) + file_name = self.path + '{0}/{1}_{2}f{3:03d}.grib2'.format( + run_date.strftime("%Y%m%d"), + self.member, + run_date.strftime("%Y%m%d%H"), + forecast_hr) filenames.append(file_name) self.netcdf_variables = ["hske_1000", "hske_3000", "hmf_1000", "hmf_3000", "ihm_1000", "ihm_3000"] super(FV3ModelGrid, self).__init__(filenames, run_date, start_date, end_date, variable, member) - + return def load_data(self): @@ -43,4 +44,3 @@ def load_data(self): return ModelGrid.load_data(self) else: return GribModelGrid.load_data(self) - diff --git a/hagelslag/data/GribModelGrid.py b/hagelslag/data/GribModelGrid.py index 67ae733..7fe78e6 100644 --- a/hagelslag/data/GribModelGrid.py +++ b/hagelslag/data/GribModelGrid.py @@ -1,9 +1,9 @@ -#!/usr/bin/env python +import datetime +from os.path import exists + +import numpy as np import pandas as pd import pygrib -import numpy as np -from os.path import exists -import datetime from netCDF4 import Dataset @@ -81,11 +81,11 @@ def format_grib_name(self, selected_variable): data values of unknown variable name, given the ID. """ names = self.unknown_names - Id = None + grib_id = None for key, value in names.items(): if selected_variable == value: - Id = key - return Id, None + grib_id = key + return grib_id, None def load_lightning_data(self): """ @@ -135,11 +135,7 @@ def load_data(self): if type(self.variable) is int: data_values = grib[self.variable].values print(grib[self.variable]) - if grib[self.variable].units == 'unknown': - Id = grib[self.variable].parameterNumber - # units = self.unknown_units[Id] - # else: - # units = grib[self.variable].units + elif type(self.variable) is str: if '_' in self.variable: # Multiple levels diff --git a/hagelslag/data/HREFv2ModelGrid.py b/hagelslag/data/HREFv2ModelGrid.py index a434148..de6d5ac 100644 --- a/hagelslag/data/HREFv2ModelGrid.py +++ b/hagelslag/data/HREFv2ModelGrid.py @@ -1,9 +1,11 @@ #!/usr/bin/env python -from .GribModelGrid import GribModelGrid -from datetime import timedelta -import numpy as np +from datetime import timedelta from glob import glob +import numpy as np + +from .GribModelGrid import GribModelGrid + class HREFv2ModelGrid(GribModelGrid): """ @@ -17,35 +19,39 @@ class HREFv2ModelGrid(GribModelGrid): path (str): Path to model output files """ - def __init__(self, member, run_date, variable, start_date, - end_date, path): + def __init__(self, member, run_date, variable, start_date, + end_date, path): self.path = path self.member = member filenames = [] self.forecast_hours = np.arange((start_date - run_date).total_seconds() / 3600, (end_date - run_date).total_seconds() / 3600 + 1, dtype=int) - - day_before_date = (run_date-timedelta(days=1)).strftime("%Y%m%d") + + day_before_date = (run_date - timedelta(days=1)).strftime("%Y%m%d") member_name = str(self.member.split("_")[0]) if '00' in self.member: - inilization='00' + initial_hour = '00' hours = self.forecast_hours date = run_date.strftime("%Y%m%d") elif '12' in 
self.member: - inilization='12' - hours = self.forecast_hours+12 + initial_hour = '12' + hours = self.forecast_hours + 12 date = day_before_date + else: + initial_hour = '00' + hours = self.forecast_hours + date = run_date.strftime("%Y%m%d") for forecast_hr in hours: if 'nam' in self.member: files = glob('{0}/{1}/nam*conusnest*{2}f*{3}*'.format(self.path, - date,inilization,forecast_hr)) + date, initial_hour, forecast_hr)) if not files: files = glob('{0}/{1}/nam*t{2}z*conusnest*{3}*'.format(self.path, - date,inilization,forecast_hr)) + date, initial_hour, forecast_hr)) else: files = glob('{0}/{1}/*hiresw*conus{2}*{3}f*{4}*'.format(self.path, - date,member_name,inilization,forecast_hr)) - if len(files) >=1: + date, member_name, initial_hour, forecast_hr)) + if len(files) >= 1: filenames.append(files[0]) - super(HREFv2ModelGrid, self).__init__(filenames,run_date,start_date,end_date,variable,member) - return + super(HREFv2ModelGrid, self).__init__(filenames, run_date, start_date, end_date, variable, member) + return diff --git a/hagelslag/data/HRRREModelGrid.py b/hagelslag/data/HRRREModelGrid.py index 8804b8d..8c07924 100644 --- a/hagelslag/data/HRRREModelGrid.py +++ b/hagelslag/data/HRRREModelGrid.py @@ -1,6 +1,8 @@ #!/usr/bin/env python -import numpy as np from os.path import exists + +import numpy as np + from .GribModelGrid import GribModelGrid @@ -18,33 +20,33 @@ class HRRREModelGrid(GribModelGrid): file (True) or one file containing all timesteps (False). """ - def __init__(self, member, run_date, variable, start_date, - end_date, path,single_step=True): + def __init__(self, member, run_date, variable, start_date, + end_date, path, single_step=True): self.path = path self.member = member filenames = [] self.forecast_hours = np.arange((start_date - run_date).total_seconds() / 3600, (end_date - run_date).total_seconds() / 3600 + 1, dtype=int) - + for forecast_hr in self.forecast_hours: - file_name=self.path+'{1}00/{0}_{1}00f0{2:02}.grib2'.format( - self.member, - run_date.strftime("%Y%m%d"), - forecast_hr) + file_name = self.path + '{1}00/{0}_{1}00f0{2:02}.grib2'.format( + self.member, + run_date.strftime("%Y%m%d"), + forecast_hr) if not exists(file_name): - member_number=self.member.split("0")[-1] - file_name=self.path+'{0}00/wrftwo_hrrre_clue_mem000{1}_{2}.grib2'.format( - run_date.strftime("%Y%m%d"), - member_number, - forecast_hr) + member_number = self.member.split("0")[-1] + file_name = self.path + '{0}00/wrftwo_hrrre_clue_mem000{1}_{2}.grib2'.format( + run_date.strftime("%Y%m%d"), + member_number, + forecast_hr) if not exists(file_name): - file_name = self.path+'{0}00/wrftwo_conus_mem000{1}_{2}.grib2'.format( - run_date.strftime("%Y%m%d"), - member_number, - forecast_hr) - + file_name = self.path + '{0}00/wrftwo_conus_mem000{1}_{2}.grib2'.format( + run_date.strftime("%Y%m%d"), + member_number, + forecast_hr) + filenames.append(file_name) - - super(HRRREModelGrid, self).__init__(filenames,run_date,start_date,end_date,variable,member) - - return + + super(HRRREModelGrid, self).__init__(filenames, run_date, start_date, end_date, variable, member) + + return diff --git a/hagelslag/data/HRRRModelGrid.py b/hagelslag/data/HRRRModelGrid.py index 8adcfb8..ab425bc 100644 --- a/hagelslag/data/HRRRModelGrid.py +++ b/hagelslag/data/HRRRModelGrid.py @@ -1,7 +1,9 @@ +from os.path import join + import numpy as np import pandas as pd + from hagelslag.data.ModelGrid import ModelGrid -from os.path import join class HRRRModelGrid(ModelGrid): @@ -12,9 +14,9 @@ def __init__(self, run_date, variable, 
start_date, end_date, path, frequency="1H self.end_date = pd.Timestamp(end_date) self.frequency = frequency self.path = path - self.valid_dates = pd.DatetimeIndex(start=self.start_date, - end=self.end_date, - freq=self.frequency) + self.valid_dates = pd.date_range(start=self.start_date, + end=self.end_date, + freq=self.frequency) self.forecast_hours = np.array((self.valid_dates - self.run_date).total_seconds() / 3600, dtype=int) filenames = [] for forecast_hour in self.forecast_hours: @@ -23,5 +25,3 @@ def __init__(self, run_date, variable, start_date, end_date, path, frequency="1H forecast_hour))) super(HRRRModelGrid, self).__init__(filenames, run_date, start_date, end_date, variable) - - diff --git a/hagelslag/data/HRRRZarrModelGrid.py b/hagelslag/data/HRRRZarrModelGrid.py index 515f9ee..35de55b 100644 --- a/hagelslag/data/HRRRZarrModelGrid.py +++ b/hagelslag/data/HRRRZarrModelGrid.py @@ -1,7 +1,6 @@ -import numpy as np import pandas as pd + from hagelslag.data.ZarrModelGrid import ZarrModelGrid -from os.path import join class HRRRZarrModelGrid(ZarrModelGrid): diff --git a/hagelslag/data/HailForecastGrid.py b/hagelslag/data/HailForecastGrid.py index 99fdaad..1e1456f 100644 --- a/hagelslag/data/HailForecastGrid.py +++ b/hagelslag/data/HailForecastGrid.py @@ -1,10 +1,11 @@ +from os.path import exists + import numpy as np import pandas as pd -from pyproj import Proj import pygrib -from scipy.spatial import cKDTree +from pyproj import Proj from scipy.ndimage import gaussian_filter -from os.path import exists +from scipy.spatial import cKDTree class HailForecastGrid(object): @@ -33,6 +34,7 @@ class HailForecastGrid(object): proj (Proj): a pyproj projection object used for converting lat-lon points to x-y coordinate values projparams (dict): PROJ4 parameters describing map projection """ + def __init__(self, run_date, start_date, end_date, ensemble_name, ml_model, members, variable, message_number, path): self.run_date = run_date @@ -91,7 +93,7 @@ def load_data(self): grbs.close() return - def period_neighborhood_probability(self, radius, smoothing, threshold, stride,start_time,end_time): + def period_neighborhood_probability(self, radius, smoothing, threshold, stride, start_time, end_time): """ Calculate the neighborhood probability over the full period of the forecast @@ -110,14 +112,15 @@ def period_neighborhood_probability(self, radius, smoothing, threshold, stride,s neighbor_prob = np.zeros((self.data.shape[0], neighbor_x.shape[0], neighbor_x.shape[1])) print('Forecast Hours: {0}-{1}'.format(start_time, end_time)) for m in range(len(self.members)): - period_max = self.data[m,start_time:end_time,:,:].max(axis=0) + period_max = self.data[m, start_time:end_time, :, :].max(axis=0) valid_i, valid_j = np.where(period_max >= threshold) print(self.members[m], len(valid_i)) if len(valid_i) > 0: var_kd_tree = cKDTree(np.vstack((self.x[valid_i, valid_j], self.y[valid_i, valid_j])).T) - exceed_points = np.unique(np.concatenate(var_kd_tree.query_ball_tree(neighbor_kd_tree, radius))).astype(int) + exceed_points = np.unique(np.concatenate(var_kd_tree.query_ball_tree(neighbor_kd_tree, radius))).astype( + int) exceed_i, exceed_j = np.unravel_index(exceed_points, neighbor_x.shape) neighbor_prob[m][exceed_i, exceed_j] = 1 if smoothing > 0: - neighbor_prob[m] = gaussian_filter(neighbor_prob[m], smoothing,mode='constant') + neighbor_prob[m] = gaussian_filter(neighbor_prob[m], smoothing, mode='constant') return neighbor_prob diff --git a/hagelslag/data/MRMSGrid.py b/hagelslag/data/MRMSGrid.py index 
311f026..d24ab1f 100644 --- a/hagelslag/data/MRMSGrid.py +++ b/hagelslag/data/MRMSGrid.py @@ -1,9 +1,10 @@ -from netCDF4 import Dataset, num2date -import pandas as pd -import numpy as np import os -from scipy.spatial import cKDTree + +import numpy as np +import pandas as pd +from netCDF4 import Dataset, num2date from scipy.ndimage import gaussian_filter +from scipy.spatial import cKDTree class MRMSGrid(object): @@ -31,6 +32,7 @@ class MRMSGrid(object): data (ndarray or None): Array of gridded observations after load_data is called. None otherwise. valid_dates (ndarray): Contains the dates where data loaded successfully. """ + def __init__(self, start_date, end_date, variable, path, freq="1H"): self.start_date = start_date self.end_date = end_date @@ -49,7 +51,7 @@ def load_data(self): valid_dates = [] mrms_files = np.array(sorted(os.listdir(self.path + self.variable + "/"))) mrms_file_dates = np.array([m_file.split("_")[-2].split("-")[0] - for m_file in mrms_files]) + for m_file in mrms_files]) old_mrms_file = None file_obj = None for t in range(self.all_dates.shape[0]): @@ -60,8 +62,8 @@ def load_data(self): if file_obj is not None: file_obj.close() file_obj = Dataset(self.path + self.variable + "/" + mrms_file) - #old_mrms_file = mrms_file - + # old_mrms_file = mrms_file + if "time" in file_obj.variables.keys(): time_var = "time" else: @@ -76,7 +78,7 @@ def load_data(self): valid_dates.append(self.all_dates[t]) if file_obj is not None: file_obj.close() - + self.data = np.array(data) self.data[self.data < 0] = 0 self.data[self.data > 150] = 150 diff --git a/hagelslag/data/ModelGrid.py b/hagelslag/data/ModelGrid.py index 6a1eefa..a8fa9ed 100644 --- a/hagelslag/data/ModelGrid.py +++ b/hagelslag/data/ModelGrid.py @@ -1,7 +1,8 @@ -from netCDF4 import Dataset +from os.path import exists + import numpy as np +from netCDF4 import Dataset from pandas import date_range -from os.path import exists class ModelGrid(object): @@ -21,10 +22,11 @@ class ModelGrid(object): forecast_hours: array of all hours in the forecast file_objects (list): List of the file objects for each model time step """ - def __init__(self, - filenames, - run_date, - start_date, + + def __init__(self, + filenames, + run_date, + start_date, end_date, variable, frequency="1H"): @@ -35,8 +37,8 @@ def __init__(self, self.end_date = np.datetime64(end_date) self.frequency = frequency self.valid_dates = date_range(start=self.start_date, - end=self.end_date, - freq=self.frequency) + end=self.end_date, + freq=self.frequency) self.forecast_hours = (self.valid_dates.values - self.run_date).astype("timedelta64[h]").astype(int) self.file_objects = [] self.__enter__() @@ -164,4 +166,3 @@ def close(self): Close links to all open file objects and delete the objects. 
""" self.__exit__() - diff --git a/hagelslag/data/ModelOutput.py b/hagelslag/data/ModelOutput.py index 2276785..4a4d196 100644 --- a/hagelslag/data/ModelOutput.py +++ b/hagelslag/data/ModelOutput.py @@ -1,18 +1,21 @@ from __future__ import division -from .SSEFModelGrid import SSEFModelGrid -from .VSEModelGrid import VSEModelGrid -from .NCARModelGrid import NCARModelGrid + +import numpy as np +from scipy.ndimage import gaussian_filter +from scipy.spatial import cKDTree + +from hagelslag.util.derived_vars import relative_humidity_pressure_level, melting_layer_height +from hagelslag.util.make_proj_grids import make_proj_grids, read_arps_map_file, read_ncar_map_file, get_proj_obj from .FV3ModelGrid import FV3ModelGrid -from .HRRRModelGrid import HRRRModelGrid -from .HRRREModelGrid import HRRREModelGrid from .HREFv2ModelGrid import HREFv2ModelGrid +from .HRRREModelGrid import HRRREModelGrid +from .HRRRModelGrid import HRRRModelGrid from .HRRRZarrModelGrid import HRRRZarrModelGrid +from .NCARModelGrid import NCARModelGrid from .NCARStormEventModelGrid import NCARStormEventModelGrid -from hagelslag.util.make_proj_grids import make_proj_grids, read_arps_map_file, read_ncar_map_file, get_proj_obj -from hagelslag.util.derived_vars import relative_humidity_pressure_level, melting_layer_height -import numpy as np -from scipy.spatial import cKDTree -from scipy.ndimage import gaussian_filter +from .SSEFModelGrid import SSEFModelGrid +from .VSEModelGrid import VSEModelGrid + class ModelOutput(object): """ @@ -31,12 +34,13 @@ class ModelOutput(object): map_file (str): path to data map file """ - def __init__(self, - ensemble_name, - member_name, - run_date, - variable, - start_date, + + def __init__(self, + ensemble_name, + member_name, + run_date, + variable, + start_date, end_date, path, map_file, @@ -128,41 +132,41 @@ def load_data(self): mg.close() elif self.ensemble_name.upper() == "HREFV2": mg = HREFv2ModelGrid(self.member_name, - self.run_date, - self.variable, - self.start_date, - self.end_date, - self.path) + self.run_date, + self.variable, + self.start_date, + self.end_date, + self.path) self.data, self.units = mg.load_data() - + elif self.ensemble_name.upper() == "HRRRE": mg = HRRREModelGrid(self.member_name, - self.run_date, - self.variable, - self.start_date, - self.end_date, - self.path, - single_step=self.single_step) + self.run_date, + self.variable, + self.start_date, + self.end_date, + self.path, + single_step=self.single_step) self.data, self.units = mg.load_data() - + elif self.ensemble_name.upper() == "SAR-FV3": mg = FV3ModelGrid(self.member_name, - self.run_date, - self.variable, - self.start_date, - self.end_date, - self.path, - single_step=self.single_step) + self.run_date, + self.variable, + self.start_date, + self.end_date, + self.path, + single_step=self.single_step) self.data, self.units = mg.load_data() elif self.ensemble_name.upper() == "VSE": mg = VSEModelGrid(self.member_name, - self.run_date, - self.variable, - self.start_date, - self.end_date, - self.path, - single_step=self.single_step) + self.run_date, + self.variable, + self.start_date, + self.end_date, + self.path, + single_step=self.single_step) self.data, self.units = mg.load_data() mg.close() elif self.ensemble_name.upper() == "HRRR": @@ -173,7 +177,7 @@ def load_data(self): self.path) self.data, self.units = mg.load_data() mg.close() - + elif self.ensemble_name.upper() == "NCARSTORM": mg = NCARStormEventModelGrid(self.run_date, self.variable, @@ -182,13 +186,13 @@ def load_data(self): self.path) self.data, 
self.units = mg.load_data() mg.close() - + elif self.ensemble_name.upper() == "HRRR-ZARR": mg = HRRRZarrModelGrid(self.run_date, - self.variable, - self.start_date, - self.end_date, - self.path) + self.variable, + self.start_date, + self.end_date, + self.path) self.data, self.units = mg.load_data() else: print(self.ensemble_name + " not supported.") @@ -210,13 +214,14 @@ def load_map_info(self, map_file): self.proj = get_proj_obj(proj_dict) else: proj_dict, grid_dict = read_ncar_map_file(map_file) - if self.member_name[0:7] == "1km_pbl": # Don't just look at the first 3 characters. You have to differentiate '1km_pbl1' and '1km_on_3km_pbl1' + if self.member_name[ + 0:7] == "1km_pbl": # Don't just look at the first 3 characters. You have to differentiate '1km_pbl1' and '1km_on_3km_pbl1' grid_dict["dx"] = 1000 grid_dict["dy"] = 1000 grid_dict["sw_lon"] = 258.697 grid_dict["sw_lat"] = 23.999 grid_dict["ne_lon"] = 282.868269206236 - grid_dict["ne_lat"] = 36.4822338520542 + grid_dict["ne_lat"] = 36.4822338520542 self.dx = int(grid_dict["dx"]) mapping_data = make_proj_grids(proj_dict, grid_dict) @@ -225,7 +230,6 @@ def load_map_info(self, map_file): self.i, self.j = np.indices(self.lon.shape) self.proj = get_proj_obj(proj_dict) - def period_neighborhood_probability(self, radius, smoothing, threshold, stride, x=None, y=None, dx=None): """ Calculate the neighborhood probability over the full period of the forecast @@ -259,4 +263,3 @@ def period_neighborhood_probability(self, radius, smoothing, threshold, stride, if smoothing > 0: neighbor_prob = gaussian_filter(neighbor_prob, smoothing) return neighbor_prob - diff --git a/hagelslag/data/NCARModelGrid.py b/hagelslag/data/NCARModelGrid.py index 8c9f906..533aa35 100644 --- a/hagelslag/data/NCARModelGrid.py +++ b/hagelslag/data/NCARModelGrid.py @@ -1,7 +1,9 @@ -from .ModelGrid import ModelGrid -import numpy as np from datetime import timedelta +import numpy as np + +from .ModelGrid import ModelGrid + class NCARModelGrid(ModelGrid): """ @@ -16,6 +18,7 @@ class NCARModelGrid(ModelGrid): single_step (boolean (default=False)): Whether variable information is stored with each time step in a separate file or one file containing all timesteps. """ + def __init__(self, member, run_date, variable, start_date, end_date, path, single_step=False): self.member = member self.path = path diff --git a/hagelslag/data/NCARStormEventModelGrid.py b/hagelslag/data/NCARStormEventModelGrid.py index 439897b..655a1e3 100644 --- a/hagelslag/data/NCARStormEventModelGrid.py +++ b/hagelslag/data/NCARStormEventModelGrid.py @@ -1,14 +1,17 @@ -import numpy as np -from .ModelGrid import ModelGrid from datetime import timedelta from os.path import join +import numpy as np + +from .ModelGrid import ModelGrid + class NCARStormEventModelGrid(ModelGrid): """ Loads model output from the NCAR MMM 1 and 3 km WRF runs on Cheyenne. 
""" + def __init__(self, run_date, variable, start_date, end_date, path): self.pressure_levels = np.array([1000, 925, 850, 700, 600, 500, 400, 300, 250, 200, 150, 100]) self.path = path diff --git a/hagelslag/data/NCARWRF2020ModelGrid.py b/hagelslag/data/NCARWRF2020ModelGrid.py index 5dd6c4c..9cfc6b8 100644 --- a/hagelslag/data/NCARWRF2020ModelGrid.py +++ b/hagelslag/data/NCARWRF2020ModelGrid.py @@ -1,7 +1,8 @@ +from os.path import join + import numpy as np + from .ModelGrid import ModelGrid -from datetime import timedelta -from os.path import join class NCARWRF2020ModelGrid(ModelGrid): @@ -9,6 +10,7 @@ class NCARWRF2020ModelGrid(ModelGrid): Loads model output from the NCAR MMM WRF 2020 3 km real-time runs """ + def __init__(self, member_name, run_date, variable, start_date, end_date, path): self.member_name = member_name self.pressure_levels = np.array([1000, 925, 850, 700, 600, 500, 400, 300, 250, 200, 150, 100]) diff --git a/hagelslag/data/SSEFModelGrid.py b/hagelslag/data/SSEFModelGrid.py index 364ebe5..750ab11 100644 --- a/hagelslag/data/SSEFModelGrid.py +++ b/hagelslag/data/SSEFModelGrid.py @@ -1,8 +1,10 @@ -from .ModelGrid import ModelGrid -import numpy as np import os from os.path import exists +import numpy as np + +from .ModelGrid import ModelGrid + class SSEFModelGrid(ModelGrid): """ @@ -17,6 +19,7 @@ class SSEFModelGrid(ModelGrid): single_step (boolean (default=False)): Whether variable information is stored with each time step in a separate file or one file containing all timesteps. """ + def __init__(self, member, run_date, variable, start_date, end_date, path, single_step=False): self.path = path self.member = member @@ -31,9 +34,9 @@ def __init__(self, member, run_date, variable, start_date, end_date, path, singl potential_filenames = [] if single_step: for hour in forecast_hours: - potential_filenames.append("{0}ar{1}00.net{2}{3:06d}".format(full_path, + potential_filenames.append("{0}ar{1}00.net{2}{3:06d}".format(full_path, run_date.strftime("%Y%m%d"), - variable.ljust(6,"_"), + variable.ljust(6, "_"), int(hour) * 3600)) else: potential_filenames.append("{0}ssef_{1}_{2}_{3}.nc".format(full_path, diff --git a/hagelslag/data/VSEModelGrid.py b/hagelslag/data/VSEModelGrid.py index 4363efd..05c789a 100644 --- a/hagelslag/data/VSEModelGrid.py +++ b/hagelslag/data/VSEModelGrid.py @@ -1,10 +1,10 @@ -from .ModelGrid import ModelGrid -from glob import glob -import pandas as pd +import os from datetime import timedelta + import numpy as np -import os, pdb -from os.path import exists + +from .ModelGrid import ModelGrid + class VSEModelGrid(ModelGrid): """ @@ -19,24 +19,30 @@ class VSEModelGrid(ModelGrid): single_step (boolean (default=False)): Whether variable information is stored with each time step in a separate file or one file containing all timesteps. """ + def __init__(self, member, run_date, variable, start_date, end_date, path, single_step=False): self.path = path self.member = member forecast_hours = np.arange((start_date - run_date).total_seconds() / 3600, (end_date - run_date).total_seconds() / 3600 + 1) # Maybe you need the vse-style file. 
- if variable in ["XLAT","XLONG", "MU", "MUB", "Q2", "T2", "PSFC", "U10", "V10", "HGT", "RAINNC", "GRAUPELNC", "HAILNC", "PBLH", "WSPD10MAX", "W_UP_MAX", "W_DN_MAX", "REFD_MAX", "UP_HELI_MAX", "UP_HELI_MAX03", "UP_HELI_MAX01", "UP_HELI_MIN", "RVORT1_MAX", "RVORT0_MAX", "W_MEAN", "GRPL_MAX", "C_PBLH", "W_PBLH", "W_MAX_PBL", "W_1KM_AGL", "HAIL_MAXK1", "HAIL_MAX2D", "PREC_ACC_NC", "REFD_COM", "REFD", "ECHOTOP", "AFWA_MSLP", "AFWA_HAIL", "AFWA_HAIL_NEWMEAN", "AFWA_HAIL_NEWSTD"]: - full_path = "/".join([self.path, member, run_date.strftime("%Y%m%d%H"), "wrf"]) - potential_filenames = [] - for hour in forecast_hours: - valid_time = run_date + timedelta(hours=hour) - potential_filenames.append(full_path +"/vse_d01."+valid_time.strftime("%Y-%m-%d_%H:%M:%S")+".nc") + if variable in ["XLAT", "XLONG", "MU", "MUB", "Q2", "T2", "PSFC", "U10", "V10", "HGT", "RAINNC", "GRAUPELNC", + "HAILNC", "PBLH", "WSPD10MAX", "W_UP_MAX", "W_DN_MAX", "REFD_MAX", "UP_HELI_MAX", + "UP_HELI_MAX03", "UP_HELI_MAX01", "UP_HELI_MIN", "RVORT1_MAX", "RVORT0_MAX", "W_MEAN", + "GRPL_MAX", "C_PBLH", "W_PBLH", "W_MAX_PBL", "W_1KM_AGL", "HAIL_MAXK1", "HAIL_MAX2D", + "PREC_ACC_NC", "REFD_COM", "REFD", "ECHOTOP", "AFWA_MSLP", "AFWA_HAIL", "AFWA_HAIL_NEWMEAN", + "AFWA_HAIL_NEWSTD"]: + full_path = "/".join([self.path, member, run_date.strftime("%Y%m%d%H"), "wrf"]) + potential_filenames = [] + for hour in forecast_hours: + valid_time = run_date + timedelta(hours=hour) + potential_filenames.append(full_path + "/vse_d01." + valid_time.strftime("%Y-%m-%d_%H:%M:%S") + ".nc") - else: # use the WRFTWO??.nc file - full_path = "/".join([self.path, member, run_date.strftime("%Y%m%d%H"), "post_AGAIN"]) - potential_filenames = [] - for hour in forecast_hours: - potential_filenames.append("{0}/fhr_{1:d}/WRFTWO{2:02d}.nc".format(full_path, int(hour), int(hour) )) + else: # use the WRFTWO??.nc file + full_path = "/".join([self.path, member, run_date.strftime("%Y%m%d%H"), "post_AGAIN"]) + potential_filenames = [] + for hour in forecast_hours: + potential_filenames.append("{0}/fhr_{1:d}/WRFTWO{2:02d}.nc".format(full_path, int(hour), int(hour))) filenames = [] for filename in potential_filenames: if os.access(filename, os.R_OK): diff --git a/hagelslag/data/WRFModelGrid.py b/hagelslag/data/WRFModelGrid.py index a979d3e..3a412b2 100644 --- a/hagelslag/data/WRFModelGrid.py +++ b/hagelslag/data/WRFModelGrid.py @@ -1,14 +1,16 @@ -from netCDF4 import Dataset -from os.path import isdir, exists, join -import numpy as np from glob import glob +from os.path import isdir, exists, join + import arrow +import numpy as np +from netCDF4 import Dataset class WRFModelGrid(object): """ Load data from raw WRF model output files. 
""" + def __init__(self, forecast_date, variable, domain, path, nocolons=False): self.forecast_date = arrow.get(forecast_date) self.variable = variable @@ -47,7 +49,7 @@ def load_time_var(self, time_var="XTIME"): if time_var in var_list: for attr in patch_zero.variables[time_var].ncattrs(): var_attrs[attr] = getattr(patch_zero.variables[time_var], attr) - var_val = patch_zero.variables[time_var][:] + var_val = patch_zero.variables[time_var][:] patch_zero.close() else: wrf_data = Dataset(join(self.path, self.wrf_filename)) @@ -55,9 +57,9 @@ def load_time_var(self, time_var="XTIME"): if time_var in var_list: for attr in wrf_data.variables[time_var].ncattrs(): var_attrs[attr] = getattr(wrf_data.variables[time_var], attr) - var_val = wrf_data.variables[time_var][:] + var_val = wrf_data.variables[time_var][:] wrf_data.close() - return var_val, var_attrs + return var_val, var_attrs def load_full_grid(self): var_data = None @@ -149,7 +151,7 @@ def load_full_grid(self): else: var_attrs[attr] = getattr(wrf_data.variables[self.variable], attr) - var_data = wrf_data.variables[self.variable][:] + var_data = wrf_data.variables[self.variable][:] if np.any(is_stag): stag_dim = np.where(is_stag)[0][0] if stag_dim == 1: diff --git a/hagelslag/data/ZarrModelGrid.py b/hagelslag/data/ZarrModelGrid.py index 9ea770f..2417487 100644 --- a/hagelslag/data/ZarrModelGrid.py +++ b/hagelslag/data/ZarrModelGrid.py @@ -1,10 +1,10 @@ +from os.path import join + import numpy as np import pandas as pd -from pandas import date_range -from os.path import exists, join import s3fs import xarray as xr -from datetime import timedelta +from pandas import date_range class ZarrModelGrid(object): @@ -23,6 +23,7 @@ class ZarrModelGrid(object): valid_dates: DatetimeIndex of all model timesteps forecast_hours: array of all hours in the forecast """ + def __init__(self, path, run_date, diff --git a/hagelslag/evaluation/ContingencyTable.py b/hagelslag/evaluation/ContingencyTable.py index 78e6f0d..9a8ead1 100644 --- a/hagelslag/evaluation/ContingencyTable.py +++ b/hagelslag/evaluation/ContingencyTable.py @@ -16,6 +16,7 @@ class ContingencyTable(object): N: total number of items in table """ + def __init__(self, a, b, c, d): self.table = np.array([[a, b], [c, d]], dtype=float) self.N = self.table.sum() @@ -111,8 +112,8 @@ def ets(self): def hss(self): """Doolittle (Heidke) Skill Score. 
2(ad-bc)/((a+b)(b+d) + (a+c)(c+d))""" return 2 * (self.table[0, 0] * self.table[1, 1] - self.table[0, 1] * self.table[1, 0]) / ( - (self.table[0, 0] + self.table[0, 1]) * (self.table[0, 1] + self.table[1, 1]) + - (self.table[0, 0] + self.table[1, 0]) * (self.table[1, 0] + self.table[1, 1])) + (self.table[0, 0] + self.table[0, 1]) * (self.table[0, 1] + self.table[1, 1]) + + (self.table[0, 0] + self.table[1, 0]) * (self.table[1, 0] + self.table[1, 1])) def pss(self): """Peirce (Hansen-Kuipers, True) Skill Score (ad - bc)/((a+c)(b+d))""" diff --git a/hagelslag/evaluation/GridEvaluator.py b/hagelslag/evaluation/GridEvaluator.py index 6a747ba..0fe18f7 100644 --- a/hagelslag/evaluation/GridEvaluator.py +++ b/hagelslag/evaluation/GridEvaluator.py @@ -1,9 +1,11 @@ -from netCDF4 import Dataset +from datetime import timedelta + import numpy as np +from netCDF4 import Dataset +from scipy.ndimage import binary_dilation + from hagelslag.data.MRMSGrid import MRMSGrid from hagelslag.evaluation.ProbabilityMetrics import DistributedROC, DistributedReliability -from datetime import timedelta -from scipy.ndimage import binary_dilation class GridEvaluator(object): @@ -45,6 +47,7 @@ class GridEvaluator(object): mask_variable : str, optional (default="RadarQualityIndex_00.00") Name of the MRMS variable used for masking. """ + def __init__(self, run_date, ensemble_name, ensemble_member, model_names, size_thresholds, start_hour, end_hour, window_size, time_skip, forecast_path, mrms_path, mrms_variable, obs_mask=True, @@ -60,9 +63,9 @@ def __init__(self, run_date, ensemble_name, ensemble_member, self.window_size = window_size self.time_skip = time_skip self.hour_windows = [] - for hour in self.valid_hours[self.window_size-1::self.time_skip]: - self.hour_windows.append(slice(hour-self.window_size+1 - self.start_hour, - hour+1-self.start_hour)) + for hour in self.valid_hours[self.window_size - 1::self.time_skip]: + self.hour_windows.append(slice(hour - self.window_size + 1 - self.start_hour, + hour + 1 - self.start_hour)) self.forecast_path = forecast_path self.mrms_path = mrms_path self.mrms_variable = mrms_variable @@ -82,7 +85,8 @@ def load_forecasts(self): for model_name in self.model_names: self.raw_forecasts[model_name] = {} forecast_file = self.forecast_path + run_date_str + "/" + \ - model_name.replace(" ", "-") + "_hailprobs_{0}_{1}.nc".format(self.ensemble_member, run_date_str) + model_name.replace(" ", "-") + "_hailprobs_{0}_{1}.nc".format(self.ensemble_member, + run_date_str) forecast_obj = Dataset(forecast_file) forecast_hours = forecast_obj.variables["forecast_hour"][:] valid_hour_indices = np.where((self.start_hour <= forecast_hours) & (forecast_hours <= self.end_hour))[0] @@ -102,7 +106,7 @@ def get_window_forecasts(self): np.array([self.raw_forecasts[model_name][size_threshold][sl].sum(axis=0) for sl in self.hour_windows]) - def load_obs(self, mask_threshold=0.5): + def load_obs(self, mask_threshold=0.5): """ Loads observations and masking grid (if needed). 
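The reindented hour_windows loop above builds rolling slices into the forecast arrays loaded later. A minimal sketch of the windows it produces, assuming a hypothetical 12-35 hour evaluation period with 3-hour windows and no time skipping (these settings are illustrative, not from the patch):

    import numpy as np

    # Hypothetical settings; the real values come from the GridEvaluator arguments.
    start_hour, end_hour = 12, 35
    window_size, time_skip = 3, 1
    valid_hours = np.arange(start_hour, end_hour + 1)

    hour_windows = []
    for hour in valid_hours[window_size - 1::time_skip]:
        hour_windows.append(slice(hour - window_size + 1 - start_hour,
                                  hour + 1 - start_hour))

    # First window covers array indices 0-2 (forecast hours 12-14), the next 1-3 (hours 13-15), ...
    print(hour_windows[0], hour_windows[1])  # slice(0, 3, None) slice(1, 4, None)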
@@ -122,7 +126,7 @@ def load_obs(self, mask_threshold=0.5): mask_grid.load_data() self.raw_obs[self.mask_variable] = np.where(mask_grid.data >= mask_threshold, 1, 0) self.window_obs[self.mask_variable] = np.array([self.raw_obs[self.mask_variable][sl].max(axis=0) - for sl in self.hour_windows]) + for sl in self.hour_windows]) def dilate_obs(self, dilation_radius): """ @@ -134,7 +138,8 @@ def dilate_obs(self, dilation_radius): for s in self.size_thresholds: self.dilated_obs[s] = np.zeros(self.window_obs[self.mrms_variable].shape) for t in range(self.dilated_obs[s].shape[0]): - self.dilated_obs[s][t][binary_dilation(self.window_obs[self.mrms_variable][t] >= s, iterations=dilation_radius)] = 1 + self.dilated_obs[s][t][ + binary_dilation(self.window_obs[self.mrms_variable][t] >= s, iterations=dilation_radius)] = 1 def roc_curves(self, prob_thresholds): """ @@ -195,11 +200,3 @@ def reliability_curves(self, prob_thresholds): self.dilated_obs[size_threshold][h] ) return all_rel_curves - - - - - - - - diff --git a/hagelslag/evaluation/MetricPlotter.py b/hagelslag/evaluation/MetricPlotter.py index 06fbee9..4a40f40 100644 --- a/hagelslag/evaluation/MetricPlotter.py +++ b/hagelslag/evaluation/MetricPlotter.py @@ -1,5 +1,5 @@ -import numpy as np import matplotlib.pyplot as plt +import numpy as np from mpl_toolkits.axes_grid1.inset_locator import inset_axes, InsetPosition @@ -218,7 +218,7 @@ def reliability_diagram(rel_objs, obj_labels, colors, markers, filename, figsize def attributes_diagram(rel_objs, obj_labels, colors, markers, filename, figsize=(8, 8), xlabel="Forecast Probability", - ylabel="Observed Relative Frequency", ticks=np.arange(0, 1.05, 0.05), dpi=300, + ylabel="Observed Relative Frequency", ticks=np.arange(0, 1.05, 0.05), dpi=300, title="Attributes Diagram", legend_params=None, inset_params=None, inset_position=(0.12, 0.72, 0.25, 0.25), bootstrap_sets=None, ci=(2.5, 97.5)): """ diff --git a/hagelslag/evaluation/MulticlassContingencyTable.py b/hagelslag/evaluation/MulticlassContingencyTable.py index 81bdfa5..ef301a9 100644 --- a/hagelslag/evaluation/MulticlassContingencyTable.py +++ b/hagelslag/evaluation/MulticlassContingencyTable.py @@ -1,4 +1,5 @@ import numpy as np + __author__ = 'David John Gagne ' @@ -19,6 +20,7 @@ class MulticlassContingencyTable(object): The contingency table is stored in table as a numpy array with the rows corresponding to forecast categories, and the columns corresponding to observation categories. 
""" + def __init__(self, table=None, n_classes=2, class_names=("1", "0")): self.table = table self.n_classes = n_classes @@ -55,9 +57,9 @@ def gerrity_score(self): s = np.zeros(self.table.shape, dtype=float) for (i, j) in np.ndindex(*s.shape): if i == j: - s[i, j] = 1.0 / (k - 1.0) * (np.sum(1.0 / a[0:j]) + np.sum(a[j:k-1])) + s[i, j] = 1.0 / (k - 1.0) * (np.sum(1.0 / a[0:j]) + np.sum(a[j:k - 1])) elif i < j: - s[i, j] = 1.0 / (k - 1.0) * (np.sum(1.0 / a[0:i]) - (j - i) + np.sum(a[j:k-1])) + s[i, j] = 1.0 / (k - 1.0) * (np.sum(1.0 / a[0:i]) - (j - i) + np.sum(a[j:k - 1])) else: s[i, j] = s[j, i] return np.sum(self.table / float(self.table.sum()) * s) diff --git a/hagelslag/evaluation/NeighborEvaluator.py b/hagelslag/evaluation/NeighborEvaluator.py index 916518f..fa46a49 100644 --- a/hagelslag/evaluation/NeighborEvaluator.py +++ b/hagelslag/evaluation/NeighborEvaluator.py @@ -1,12 +1,14 @@ +from datetime import timedelta + import numpy as np import pandas as pd -from hagelslag.data.MRMSGrid import MRMSGrid -from hagelslag.evaluation.ProbabilityMetrics import DistributedReliability, DistributedROC from netCDF4 import Dataset -from datetime import timedelta from scipy.signal import fftconvolve from skimage.morphology import disk +from hagelslag.data.MRMSGrid import MRMSGrid +from hagelslag.evaluation.ProbabilityMetrics import DistributedReliability, DistributedROC + class NeighborEvaluator(object): """ @@ -92,7 +94,7 @@ def load_forecasts(self): self.period_forecasts[period_var] = forecast_data.variables[period_var][:] forecast_data.close() - def load_obs(self, mask_threshold=0.5): + def load_obs(self, mask_threshold=0.5): """ Loads observations and masking grid (if needed). @@ -177,7 +179,7 @@ def evaluate_period_forecasts(self): A pandas DataFrame with full-period metadata and verification statistics """ score_columns = ["Run_Date", "Ensemble Name", "Model_Name", "Forecast_Variable", "Neighbor_Radius", - "Smoothing_Radius", "Size_Threshold", "ROC", "Reliability"] + "Smoothing_Radius", "Size_Threshold", "ROC", "Reliability"] all_scores = pd.DataFrame(columns=score_columns) if self.coordinate_file is not None: coord_mask = np.where((self.coordinates["lon"] >= self.lon_bounds[0]) & diff --git a/hagelslag/evaluation/ObjectEvaluator.py b/hagelslag/evaluation/ObjectEvaluator.py index 1f6c7bb..9f45958 100644 --- a/hagelslag/evaluation/ObjectEvaluator.py +++ b/hagelslag/evaluation/ObjectEvaluator.py @@ -1,10 +1,13 @@ -import numpy as np -import pandas as pd import json from glob import glob -from hagelslag.evaluation.ProbabilityMetrics import DistributedCRPS, DistributedReliability, DistributedROC +from os.path import join + +import numpy as np +import pandas as pd from scipy.stats import gamma +from hagelslag.evaluation.ProbabilityMetrics import DistributedCRPS, DistributedReliability, DistributedROC + class ObjectEvaluator(object): """ @@ -31,6 +34,7 @@ class ObjectEvaluator(object): matched_forecasts (dict): Forecasts merged with observation information. 
""" + def __init__(self, run_date, ensemble_name, ensemble_member, model_names, model_types, forecast_bins, dist_thresholds, forecast_json_path, track_data_csv_path): self.run_date = run_date @@ -57,7 +61,9 @@ def __init__(self, run_date, ensemble_name, ensemble_member, model_names, model_ self.forecasts[model_type] = {} for model_name in self.model_names[model_type]: self.forecasts[model_type][model_name] = pd.DataFrame(columns=self.metadata_columns + - list(self.forecast_bins[model_type].astype(str))) + list( + self.forecast_bins[model_type].astype( + str))) def load_forecasts(self): """ @@ -90,14 +96,14 @@ def load_obs(self): """ Loads the track total and step files and merges the information into a single data frame. """ - track_total_file = self.track_data_csv_path + \ - "track_total_{0}_{1}_{2}.csv".format(self.ensemble_name, - self.ensemble_member, - self.run_date.strftime("%Y%m%d")) - track_step_file = self.track_data_csv_path + \ - "track_step_{0}_{1}_{2}.csv".format(self.ensemble_name, - self.ensemble_member, - self.run_date.strftime("%Y%m%d")) + track_total_file = join(self.track_data_csv_path, + "track_total_{0}_{1}_{2}.csv".format(self.ensemble_name, + self.ensemble_member, + self.run_date.strftime("%Y%m%d"))) + track_step_file = join(self.track_data_csv_path, + "track_step_{0}_{1}_{2}.csv".format(self.ensemble_name, + self.ensemble_member, + self.run_date.strftime("%Y%m%d"))) track_total_cols = ["Track_ID", "Translation_Error_X", "Translation_Error_Y", "Start_Time_Error"] track_step_cols = ["Step_ID", "Track_ID", "Hail_Size", "Shape", "Location", "Scale"] track_total_data = pd.read_csv(track_total_file, usecols=track_total_cols) @@ -159,7 +165,7 @@ def gamma_cdf(x, a, loc, b): f_params = sub_forecasts[self.forecast_bins[model_type]].values[f] forecast_cdfs[f] = gamma_cdf(self.dist_thresholds, f_params[0], f_params[1], f_params[2]) obs_cdfs = np.array([gamma_cdf(self.dist_thresholds, *params) - for params in sub_forecasts[self.type_cols[model_type]].values]) + for params in sub_forecasts[self.type_cols[model_type]].values]) crps_obj.update(forecast_cdfs, obs_cdfs) else: crps_obj.update(sub_forecasts[self.forecast_bins[model_type].astype(str)].values, @@ -193,12 +199,12 @@ def roc(self, model_type, model_name, intensity_threshold, prob_thresholds, quer forecast_values = np.array([gamma_sf(intensity_threshold, *params) for params in sub_forecasts[self.forecast_bins[model_type]].values]) obs_probs = np.array([gamma_sf(intensity_threshold, *params) - for params in sub_forecasts[self.type_cols[model_type]].values]) + for params in sub_forecasts[self.type_cols[model_type]].values]) obs_values[obs_probs >= 0.01] = 1 elif len(self.forecast_bins[model_type]) > 1: fbin = np.argmin(np.abs(self.forecast_bins[model_type] - intensity_threshold)) - forecast_values = 1 - sub_forecasts[self.forecast_bins[model_type].astype(str)].values.cumsum(axis=1)[:, - fbin] + forecast_values = 1 - sub_forecasts[ + self.forecast_bins[model_type].astype(str)].values.cumsum(axis=1)[:, fbin] obs_values[sub_forecasts[self.type_cols[model_type]].values >= intensity_threshold] = 1 else: forecast_values = sub_forecasts[self.forecast_bins[model_type].astype(str)[0]].values @@ -232,12 +238,12 @@ def reliability(self, model_type, model_name, intensity_threshold, prob_threshol forecast_values = np.array([gamma_sf(intensity_threshold, *params) for params in sub_forecasts[self.forecast_bins[model_type]].values]) obs_probs = np.array([gamma_sf(intensity_threshold, *params) - for params in 
sub_forecasts[self.type_cols[model_type]].values]) + for params in sub_forecasts[self.type_cols[model_type]].values]) obs_values[obs_probs >= 0.01] = 1 elif len(self.forecast_bins[model_type]) > 1: fbin = np.argmin(np.abs(self.forecast_bins[model_type] - intensity_threshold)) forecast_values = 1 - sub_forecasts[self.forecast_bins[model_type].astype(str)].values.cumsum(axis=1)[:, - fbin] + fbin] obs_values[sub_forecasts[self.type_cols[model_type]].values >= intensity_threshold] = 1 else: forecast_values = sub_forecasts[self.forecast_bins[model_type].astype(str)[0]].values @@ -307,8 +313,6 @@ def max_hail_sample_crps(self, forecast_max_hail, obs_max_hail): return crps - - def gamma_sf(x, a, loc, b): if a == 0 or b == 0: sf = 0 diff --git a/hagelslag/evaluation/ProbabilityMetrics.py b/hagelslag/evaluation/ProbabilityMetrics.py index 51618a4..5b550bd 100644 --- a/hagelslag/evaluation/ProbabilityMetrics.py +++ b/hagelslag/evaluation/ProbabilityMetrics.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd + from .ContingencyTable import ContingencyTable __author__ = "David John Gagne " @@ -56,6 +57,7 @@ class DistributedROC(object): >>> roc.update(forecasts, obs) >>> print(roc.auc()) """ + def __init__(self, thresholds=np.arange(0, 1.1, 0.1), obs_threshold=1.0, input_str=None): """ Initializes the DistributedROC object. If input_str is not None, then the DistributedROC object is @@ -90,7 +92,7 @@ def update(self, forecasts, observations): tn = np.count_nonzero((forecasts < threshold) & (observations < self.obs_threshold)) self.contingency_tables.iloc[t] += [tp, fp, fn, tn] - + def clear(self): self.contingency_tables.loc[:, :] = 0 @@ -207,7 +209,7 @@ def from_str(self, in_str): self.thresholds = np.array(value.split(), dtype=float) self.contingency_tables = pd.DataFrame(columns=self.contingency_tables.columns, data=np.zeros((self.thresholds.size, - self.contingency_tables.columns.size))) + self.contingency_tables.columns.size))) elif var_name in self.contingency_tables.columns: self.contingency_tables[var_name] = np.array(value.split(), dtype=int) @@ -227,10 +229,10 @@ def calc_reliability_curve(self): total_frequency = np.zeros(self.thresholds.shape) for t, threshold in enumerate(self.thresholds[:-1]): pos_frequency[t] = np.count_nonzero((threshold <= self.forecasts) & - (self.forecasts < self.thresholds[t+1]) & + (self.forecasts < self.thresholds[t + 1]) & (self.observations > self.obs_threshold)) total_frequency[t] = np.count_nonzero((threshold <= self.forecasts) & - (self.forecasts < self.thresholds[t+1])) + (self.forecasts < self.thresholds[t + 1])) if total_frequency[t] > 0: self.pos_relative_frequency[t] = pos_frequency[t] / float(total_frequency[t]) self.total_relative_frequency[t] = total_frequency / self.forecasts.size @@ -315,11 +317,11 @@ def update(self, forecasts, observations): """ for t, threshold in enumerate(self.thresholds[:-1]): self.frequencies.loc[t, "Positive_Freq"] += np.count_nonzero((threshold <= forecasts) & - (forecasts < self.thresholds[t+1]) & + (forecasts < self.thresholds[t + 1]) & (observations >= self.obs_threshold)) self.frequencies.loc[t, "Total_Freq"] += np.count_nonzero((threshold <= forecasts) & - (forecasts < self.thresholds[t+1])) - + (forecasts < self.thresholds[t + 1])) + def clear(self): self.frequencies.loc[:, :] = 0 @@ -429,7 +431,7 @@ def from_str(self, in_str): self.thresholds = np.array(value.split(), dtype=float) self.frequencies = pd.DataFrame(columns=self.frequencies.columns, data=np.zeros((self.thresholds.size, - 
self.frequencies.columns.size))) + self.frequencies.columns.size))) elif var_name in ["Positive_Freq", "Total_Freq"]: self.frequencies[var_name] = np.array(value.split(), dtype=int) @@ -490,7 +492,7 @@ def crps(self): Calculates the continuous ranked probability score. """ return np.sum(self.errors["F_2"].values - self.errors["F_O"].values * 2.0 + self.errors["O_2"].values) / \ - (self.thresholds.size * self.num_forecasts) + (self.thresholds.size * self.num_forecasts) def crps_climo(self): """ diff --git a/hagelslag/processing/EnhancedWatershedSegmenter.py b/hagelslag/processing/EnhancedWatershedSegmenter.py index af37baa..c5622a2 100644 --- a/hagelslag/processing/EnhancedWatershedSegmenter.py +++ b/hagelslag/processing/EnhancedWatershedSegmenter.py @@ -10,9 +10,10 @@ @author: David John Gagne (djgagne@ou.edu) """ +from collections import OrderedDict + import numpy as np from scipy.ndimage import find_objects -from collections import OrderedDict class EnhancedWatershed(object): diff --git a/hagelslag/processing/EnsembleProducts.py b/hagelslag/processing/EnsembleProducts.py index 96bc41d..0b77606 100644 --- a/hagelslag/processing/EnsembleProducts.py +++ b/hagelslag/processing/EnsembleProducts.py @@ -1,19 +1,21 @@ -from hagelslag.data.ModelOutput import ModelOutput -from hagelslag.util.make_proj_grids import read_arps_map_file, read_ncar_map_file, make_proj_grids +import json +from datetime import timedelta +from glob import glob +from os.path import join, exists + import numpy as np import pandas as pd +from grib2io import Grib2Message +from netCDF4 import Dataset, num2date +from scipy.interpolate import interp1d from scipy.ndimage import gaussian_filter from scipy.signal import fftconvolve -from scipy.stats import gamma -from scipy.interpolate import interp1d from scipy.spatial import cKDTree +from scipy.stats import gamma from skimage.morphology import disk -from netCDF4 import Dataset, num2date -from glob import glob -from os.path import join, exists -import json -from datetime import timedelta -from grib2io import Grib2Message + +from hagelslag.data.ModelOutput import ModelOutput +from hagelslag.util.make_proj_grids import read_arps_map_file, read_ncar_map_file, make_proj_grids class EnsembleMemberProduct(object): @@ -37,6 +39,7 @@ class EnsembleMemberProduct(object): condition_model_name (str): Name of the condition ML model being used if different from model_name condition_threshold (float): Probability threshold for including or excluding storms. 
""" + def __init__(self, ensemble_name, model_name, member, run_date, variable, start_date, end_date, path, single_step, size_distribution_training_path, watershed_var, map_file=None, condition_model_name=None, condition_threshold=0.5): @@ -106,7 +109,7 @@ def load_data(self, num_samples=1000, percentiles=None): self.data = np.zeros((self.forecast_hours.size, self.mapping_data["lon"].shape[0], self.mapping_data["lon"].shape[1]), dtype=np.float32) - + if self.percentiles is not None: self.percentile_data = np.zeros([len(self.percentiles)] + list(self.data.shape)) full_condition_name = "condition_" + self.condition_model_name.replace(" ", "-") @@ -123,7 +126,7 @@ def load_data(self, num_samples=1000, percentiles=None): if forecast_time in self.times: t = np.where(self.times == forecast_time)[0][0] mask = np.array(step["properties"]["masks"], dtype=int).ravel() - rankings = np.argsort(np.array(step["properties"]["timesteps"]).ravel()[mask==1]) + rankings = np.argsort(np.array(step["properties"]["timesteps"]).ravel()[mask == 1]) i = np.array(step["properties"]["i"], dtype=int).ravel()[mask == 1][rankings] j = np.array(step["properties"]["j"], dtype=int).ravel()[mask == 1][rankings] if rankings.size > 0 and forecast_params[0] > 0.1 and 1 < forecast_params[2] < 100: @@ -176,8 +179,8 @@ def load_forecast_csv_data(self, csv_path): """ forecast_file = join(csv_path, "hail_forecasts_{0}_{1}_{2}.csv".format(self.ensemble_name, - self.member, - self.run_date.strftime("%Y%m%d-%H%M"))) + self.member, + self.run_date.strftime("%Y%m%d-%H%M"))) if exists(forecast_file): self.hail_forecast_table = pd.read_csv(forecast_file) return @@ -196,7 +199,7 @@ def load_forecast_netcdf_data(self, nc_path): if exists(nc_file): nc_patches = Dataset(nc_file) nc_times = pd.DatetimeIndex(num2date(nc_patches.variables["time"][:], - nc_patches.variables["time"].units)) + nc_patches.variables["time"].units)) time_indices = np.isin(nc_times, self.times) self.nc_patches = dict() self.nc_patches["time"] = nc_times[time_indices] @@ -208,9 +211,9 @@ def load_forecast_netcdf_data(self, nc_path): nc_patches.close() print(nc_file) else: - print('no {0} {1} netCDF4 file'.format(self.member,self.run_date.strftime("%Y%m%d"))) + print('no {0} {1} netCDF4 file'.format(self.member, self.run_date.strftime("%Y%m%d"))) self.nc_patches = None - return + return return def quantile_match(self): @@ -223,13 +226,13 @@ def quantile_match(self): """ if self.nc_patches is None: self.data = None - return + return mask_indices = np.where(self.nc_patches["masks"] == 1) obj_values = self.nc_patches["obj_values"][mask_indices] obj_values = np.array(obj_values) percentiles = np.linspace(0.1, 99.9, 100) - + try: filename = join(self.size_distribution_training_path, '{0}_{1}_{2}_Size_Distribution.csv'.format(self.ensemble_name, @@ -237,11 +240,11 @@ def quantile_match(self): self.member)) if not exists(filename): filename = join(self.size_distribution_training_path, - '{0}_{1}_Size_Distribution.csv'.format(self.ensemble_name, - self.watershed_var)) + '{0}_{1}_Size_Distribution.csv'.format(self.ensemble_name, + self.watershed_var)) train_period_obj_per_vals = pd.read_csv(filename) - train_period_obj_per_vals = train_period_obj_per_vals.loc[:,"Obj_Values"].values - per_func = interp1d(train_period_obj_per_vals, percentiles / 100.0, + train_period_obj_per_vals = train_period_obj_per_vals.loc[:, "Obj_Values"].values + per_func = interp1d(train_period_obj_per_vals, percentiles / 100.0, bounds_error=False, fill_value=(0.1, 99.9)) except FileNotFoundError: 
obj_per_vals = np.percentile(obj_values, percentiles) @@ -374,22 +377,22 @@ def encode_grib2_percentile(self): self.grid_dict["dx"] * 1e3, self.grid_dict["dy"] * 1e3, 0b00000000, 0b01000000, self.proj_dict["lat_1"] * lscale, self.proj_dict["lat_2"] * lscale, -90 * lscale, 0] - pdtmp1 = np.array([1, # parameter category Moisture - 31, # parameter number Hail - 4, # Type of generating process Ensemble Forecast - 0, # Background generating process identifier - 31, # Generating process or model from NCEP - 0, # Hours after reference time data cutoff - 0, # Minutes after reference time data cutoff - 1, # Forecast time units Hours - 0, # Forecast time - 1, # Type of first fixed surface Ground - 1, # Scale value of first fixed surface - 0, # Value of first fixed surface - 1, # Type of second fixed surface - 1, # Scale value of 2nd fixed surface - 0, # Value of 2nd fixed surface - 0, # Derived forecast type + pdtmp1 = np.array([1, # parameter category Moisture + 31, # parameter number Hail + 4, # Type of generating process Ensemble Forecast + 0, # Background generating process identifier + 31, # Generating process or model from NCEP + 0, # Hours after reference time data cutoff + 0, # Minutes after reference time data cutoff + 1, # Forecast time units Hours + 0, # Forecast time + 1, # Type of first fixed surface Ground + 1, # Scale value of first fixed surface + 0, # Value of first fixed surface + 1, # Type of second fixed surface + 1, # Scale value of 2nd fixed surface + 0, # Value of 2nd fixed surface + 0, # Derived forecast type self.num_samples # Number of ensemble members ], dtype=np.int32) grib_objects = pd.Series(index=self.times, data=[None] * self.times.size, dtype=object) @@ -397,7 +400,7 @@ def encode_grib2_percentile(self): for t, time in enumerate(self.times): time_list = list(self.run_date.utctimetuple()[0:6]) if grib_objects[time] is None: - #grib_objects[time] = Grib2Encode(0, np.array(grib_id_start + time_list + [2, 1], dtype=np.int32)) + # grib_objects[time] = Grib2Encode(0, np.array(grib_id_start + time_list + [2, 1], dtype=np.int32)) grib_objects[time] = Grib2Message(discipline=0, idsect=np.array(grib_id_start + time_list + [2, 1], dtype=np.int32)) grib_objects[time].addgrid(gdsinfo, gdtmp1) @@ -405,8 +408,8 @@ def encode_grib2_percentile(self): data = self.percentile_data[:, t] / 1000.0 masked_data = np.ma.array(data, mask=data <= 0) for p, percentile in enumerate(self.percentiles): - print("GRIB {3} Percentile {0}. Max: {1} Min: {2}".format(percentile, - masked_data[p].max(), + print("GRIB {3} Percentile {0}. 
Max: {1} Min: {2}".format(percentile, + masked_data[p].max(), masked_data[p].min(), time)) if percentile in range(1, 100): @@ -425,7 +428,7 @@ def encode_grib2_data(self): Series of GRIB2 messages """ if self.data is None: - return None + return None lscale = 1e6 grib_id_start = [7, 0, 14, 14, 2] gdsinfo = np.array([0, np.product(self.data.shape[-2:]), 0, 0, 30], dtype=np.int32) @@ -442,30 +445,30 @@ def encode_grib2_data(self): self.grid_dict["dx"] * 1e3, self.grid_dict["dy"] * 1e3, 0b00000000, 0b01000000, self.proj_dict["lat_1"] * lscale, self.proj_dict["lat_2"] * lscale, -90 * lscale, 0] - pdtmp1 = np.array([1, # parameter category Moisture - 31, # parameter number Hail - 4, # Type of generating process Ensemble Forecast - 0, # Background generating process identifier - 31, # Generating process or model from NCEP - 0, # Hours after reference time data cutoff - 0, # Minutes after reference time data cutoff - 1, # Forecast time units Hours - 0, # Forecast time - 1, # Type of first fixed surface Ground - 1, # Scale value of first fixed surface - 0, # Value of first fixed surface - 1, # Type of second fixed surface - 1, # Scale value of 2nd fixed surface - 0, # Value of 2nd fixed surface - 0, # Derived forecast type - 1 # Number of ensemble members + pdtmp1 = np.array([1, # parameter category Moisture + 31, # parameter number Hail + 4, # Type of generating process Ensemble Forecast + 0, # Background generating process identifier + 31, # Generating process or model from NCEP + 0, # Hours after reference time data cutoff + 0, # Minutes after reference time data cutoff + 1, # Forecast time units Hours + 0, # Forecast time + 1, # Type of first fixed surface Ground + 1, # Scale value of first fixed surface + 0, # Value of first fixed surface + 1, # Type of second fixed surface + 1, # Scale value of 2nd fixed surface + 0, # Value of 2nd fixed surface + 0, # Derived forecast type + 1 # Number of ensemble members ], dtype=np.int32) grib_objects = pd.Series(index=self.times, data=[None] * self.times.size, dtype=object) drtmp1 = np.array([0, 0, 4, 8, 0], dtype=np.int32) for t, time in enumerate(self.times): time_list = list(self.run_date.utctimetuple()[0:6]) if grib_objects[time] is None: - #grib_objects[time] = Grib2Encode(0, np.array(grib_id_start + time_list + [2, 1], dtype=np.int32)) + # grib_objects[time] = Grib2Encode(0, np.array(grib_id_start + time_list + [2, 1], dtype=np.int32)) grib_objects[time] = Grib2Message(discipline=0, idsect=np.array(grib_id_start + time_list + [2, 1], dtype=np.int32)) grib_objects[time].addgrid(gdsinfo, gdtmp1) @@ -489,11 +492,11 @@ def write_grib2_files(self, grib_objects, path): for t, time in enumerate(self.times.to_pydatetime()): grib_objects[time].end() filename = join(path, "{0}_{1}_{2}_{3}_{4}.grib2".format(self.ensemble_name, - self.member, - self.model_name.replace(" ", "-"), - self.variable, - self.run_date.strftime("%Y%m%d%H") + - "f{0:02d}".format(self.forecast_hours[t]) - )) + self.member, + self.model_name.replace(" ", "-"), + self.variable, + self.run_date.strftime("%Y%m%d%H") + + "f{0:02d}".format(self.forecast_hours[t]) + )) with open(filename, "wb") as fo: fo.write(grib_objects[time]._msg) diff --git a/hagelslag/processing/ObjectMatcher.py b/hagelslag/processing/ObjectMatcher.py index 5832035..f284bb4 100644 --- a/hagelslag/processing/ObjectMatcher.py +++ b/hagelslag/processing/ObjectMatcher.py @@ -1,7 +1,8 @@ import numpy as np -from hagelslag.util.munkres import Munkres import pandas as pd +from hagelslag.util.munkres import Munkres + class 
ObjectMatcher(object): """ @@ -183,6 +184,7 @@ class TrackStepMatcher(object): """ Determine if each step in a track is in close proximity to steps from another set of tracks """ + def __init__(self, cost_function_components, max_values): self.cost_function_components = cost_function_components self.max_values = max_values diff --git a/hagelslag/processing/STObject.py b/hagelslag/processing/STObject.py index 1009c7c..f846d54 100644 --- a/hagelslag/processing/STObject.py +++ b/hagelslag/processing/STObject.py @@ -1,8 +1,9 @@ +import json + import numpy as np from skimage.measure import regionprops -from skimage.segmentation import find_boundaries from skimage.morphology import convex_hull_image -import json +from skimage.segmentation import find_boundaries class STObject(object): @@ -71,7 +72,7 @@ def __init__(self, grid, mask, x, y, i, j, start_time, end_time, step=1, dx=4000 def __str__(self): com_x, com_y = self.center_of_mass(self.start_time) data = dict(maxsize=self.max_size(), comx=com_x, comy=com_y, start=self.start_time, end=self.end_time) - return "ST Object [maxSize=%(maxsize)d,initialCenter=%(comx)0.2f,%(comy)0.2f,duration=%(start)02d-%(end)02d]" %\ + return "ST Object [maxSize=%(maxsize)d,initialCenter=%(comx)0.2f,%(comy)0.2f,duration=%(start)02d-%(end)02d]" % \ data def center_of_mass(self, time): @@ -235,7 +236,7 @@ def boundary_polygon(self, time): chull = convex_hull_image(padded_mask) boundary_image = find_boundaries(chull, mode='inner', background=0) # Now remove the padding. - boundary_image = boundary_image[1:-1,1:-1] + boundary_image = boundary_image[1:-1, 1:-1] boundary_x = self.x[ti].ravel()[boundary_image.ravel()] boundary_y = self.y[ti].ravel()[boundary_image.ravel()] r = np.sqrt((boundary_x - com_x) ** 2 + (boundary_y - com_y) ** 2) @@ -273,7 +274,7 @@ def estimate_motion(self, time, intensity_grid, max_u, max_v): for v in v_shifts: i_shift = i_vals - v if np.all((0 <= i_shift) & (i_shift < intensity_grid.shape[0]) & - (0 <= j_shift) & (j_shift < intensity_grid.shape[1])): + (0 <= j_shift) & (j_shift < intensity_grid.shape[1])): shift_vals = intensity_grid[i_shift, j_shift] else: shift_vals = np.zeros(i_shift.shape) @@ -284,7 +285,7 @@ def estimate_motion(self, time, intensity_grid, max_u, max_v): best_u = u * self.dx best_v = v * self.dx # 60 seems arbitrarily high - #if min_error > 60: + # if min_error > 60: # best_u = 0 # best_v = 0 self.u[ti] = best_u @@ -344,7 +345,6 @@ def extract_attribute_array(self, data_array, var_name): for t in range(self.times.size): self.attributes[var_name].append(data_array[self.i[t], self.j[t]]) - def extract_tendency_grid(self, model_grid): """ Extracts the difference in model outputs @@ -360,7 +360,7 @@ def extract_tendency_grid(self, model_grid): t_index = t - model_grid.start_hour self.attributes[var_name].append( model_grid.data[t_index, self.i[ti], self.j[ti]] - model_grid.data[t_index - 1, self.i[ti], self.j[ti]] - ) + ) def calc_attribute_statistics(self, statistic_name): """ @@ -405,7 +405,7 @@ def calc_attribute_statistic(self, attribute, statistic, time): stat_val = np.median(self.attributes[attribute][ti].ravel()[ma]) elif statistic == "skew": stat_val = np.mean(self.attributes[attribute][ti].ravel()[ma]) - \ - np.median(self.attributes[attribute][ti].ravel()[ma]) + np.median(self.attributes[attribute][ti].ravel()[ma]) elif 'percentile' in statistic: per = int(statistic.split("_")[1]) stat_val = np.percentile(self.attributes[attribute][ti].ravel()[ma], per) @@ -415,7 +415,7 @@ def calc_attribute_statistic(self, 
attribute, statistic, time): stat_val = 0 else: stat_val = self.calc_attribute_statistic(attribute, stat_name, time) \ - - self.calc_attribute_statistic(attribute, stat_name, time - 1) + - self.calc_attribute_statistic(attribute, stat_name, time - 1) else: stat_val = np.nan return stat_val @@ -445,8 +445,8 @@ def calc_timestep_statistic(self, statistic, time): if ti == 0: stat_val = 0 else: - stat_val = self.calc_timestep_statistic(stat_name, time) -\ - self.calc_timestep_statistic(stat_name, time - 1) + stat_val = self.calc_timestep_statistic(stat_name, time) - \ + self.calc_timestep_statistic(stat_name, time - 1) else: stat_val = np.nan return stat_val @@ -485,7 +485,7 @@ def calc_shape_step(self, stat_names, time): ti = np.where(self.times == time)[0][0] shape_stats = [] try: - props = regionprops(self.masks[ti], self.timesteps[ti])[0] + props = regionprops(self.masks[ti], self.timesteps[ti])[0] except: for stat_name in stat_names: shape_stats.append(np.nan) @@ -544,6 +544,7 @@ def to_geojson(self, filename, proj, metadata=None): file_obj.close() return + def read_geojson(filename): """ Reads a geojson file containing an STObject and initializes a new STObject from the information in the file. @@ -577,4 +578,3 @@ def read_geojson(filename): for k, v in attribute_data.items(): sto.attributes[k] = v return sto - diff --git a/hagelslag/processing/TrackModeler.py b/hagelslag/processing/TrackModeler.py index 71aa2e6..3b6a8df 100644 --- a/hagelslag/processing/TrackModeler.py +++ b/hagelslag/processing/TrackModeler.py @@ -1,17 +1,20 @@ -import numpy as np -import pandas as pd -import pickle import json import os -from sklearn.decomposition import PCA +import pickle from copy import deepcopy from glob import glob from multiprocessing import Pool +from os.path import join + +import numpy as np +import pandas as pd +from sklearn.decomposition import PCA from sklearn.linear_model import LinearRegression +from sklearn.model_selection import KFold + from hagelslag.evaluation.ProbabilityMetrics import DistributedROC -from os.path import join from hagelslag.util.make_proj_grids import read_arps_map_file, read_ncar_map_file -from sklearn.model_selection import KFold + class TrackModeler(object): """ @@ -30,7 +33,7 @@ def __init__(self, weighting_function, map_file, group_col="Microphysics"): - + self.train_data_path = train_data_path self.forecast_data_path = forecast_data_path self.ensemble_name = ensemble_name @@ -50,8 +53,8 @@ def __init__(self, if self.map_file[-3:] == "map": self.proj_dict, self.grid_dict = read_arps_map_file(self.map_file) - else: - self.proj_dict, self.grid_dict = read_ncar_map_file(self.map_file) + else: + self.proj_dict, self.grid_dict = read_ncar_map_file(self.map_file) return def load_data(self, mode="train", format="csv"): @@ -66,7 +69,7 @@ def load_data(self, mode="train", format="csv"): """ if mode in self.data.keys(): run_dates = pd.date_range(start=self.start_dates[mode], - end=self.end_dates[mode], freq="1D") + end=self.end_dates[mode], freq="1D") run_date_str = [d.strftime("%Y%m%d-%H%M") for d in run_dates.date] print(np.unique(run_dates.strftime('%Y%m'))) all_total_track_files = sorted(glob(getattr(self, mode + "_data_path") + @@ -84,12 +87,13 @@ def load_data(self, mode="train", format="csv"): if file_date in run_date_str: step_track_files.append(step_file) - self.data[mode]["total"] = pd.concat(map(pd.read_csv, total_track_files), - ignore_index=True,sort='True') + self.data[mode]["total"] = pd.concat([pd.read_csv(total_track_file) + for total_track_file in 
total_track_files], + ignore_index=True, sort='True') self.data[mode]["total"] = self.data[mode]["total"].fillna(value=0) self.data[mode]["total"] = self.data[mode]["total"].replace([np.inf, -np.inf], 0) self.data[mode]["step"] = pd.concat(map(pd.read_csv, step_track_files), - ignore_index=True,sort='True') + ignore_index=True, sort='True') self.data[mode]["step"] = self.data[mode]["step"].fillna(value=0) self.data[mode]["step"] = self.data[mode]["step"].replace([np.inf, -np.inf], 0) if mode == "forecast": @@ -316,8 +320,8 @@ def fit_size_distribution_models(self, model_names, model_objs, input_columns, self.size_distribution_models[group]["multi"][model_name] = deepcopy(model_objs[m]) try: self.size_distribution_models[group]["multi"][model_name].fit(group_data[input_columns], - (log_labels - log_means) / log_sds, - sample_weight=weights) + (log_labels - log_means) / log_sds, + sample_weight=weights) except: self.size_distribution_models[group]["multi"][model_name].fit(group_data[input_columns], (log_labels - log_means) / log_sds) @@ -389,10 +393,10 @@ def fit_size_distribution_component_models(self, model_names, model_objs, input_ "pc_{0:d}".format(comp)][model_name].fit(group_data[input_columns], out_pc_labels[:, comp]) return - + def predict_size_distribution_models(self, model_names, input_columns, metadata_cols, - data_mode="forecast", location=6, - calibrate=False): + data_mode="forecast", location=6, + calibrate=False): """ Make predictions using fitted size distribution models. Each ML model predicts the normalized shape and scale parameters simultaneously using multitask learning. Only scikit learn models that support @@ -419,7 +423,7 @@ def predict_size_distribution_models(self, model_names, input_columns, metadata_ log_sd = self.size_distribution_models[group]["lognorm"]["sd"] for m, model_name in enumerate(model_names): multi_predictions = self.size_distribution_models[group]["multi"][model_name].predict( - self.data[data_mode]["combo"].loc[group_idxs,input_columns]) + self.data[data_mode]["combo"].loc[group_idxs, input_columns]) if calibrate: multi_predictions[:, 0] = self.size_distribution_models[group]["calshape"][model_name].predict( multi_predictions[:, 0:1]) @@ -428,9 +432,9 @@ def predict_size_distribution_models(self, model_names, input_columns, metadata_ multi_predictions = np.exp(multi_predictions * log_sd + log_mean) if multi_predictions.shape[1] == 2: multi_predictions_temp = np.zeros((multi_predictions.shape[0], 3)) - multi_predictions_temp[:, 0] = multi_predictions[:,0] + multi_predictions_temp[:, 0] = multi_predictions[:, 0] multi_predictions_temp[:, 1] = location - multi_predictions_temp[:, 2] = multi_predictions[:,1] + multi_predictions_temp[:, 2] = multi_predictions[:, 1] for p, pred_col in enumerate(["shape", "location", "scale"]): predictions.loc[group_idxs, model_name.replace(" ", "-") + "_" + pred_col] = \ multi_predictions_temp[:, p] @@ -844,9 +848,7 @@ def output_forecasts_csv(self, forecasts, mode, csv_path, run_date_format="%Y%m% def output_forecasts_json_parallel(self, forecasts, condition_model_names, - size_model_names, dist_model_names, - track_model_names, json_data_path, out_path, num_procs): @@ -864,17 +866,9 @@ def output_forecasts_json_parallel(self, forecasts, for ml_model in condition_model_names: step_forecasts["condition_" + ml_model.replace(" ", "-")] = forecasts["condition"][group].loc[ forecasts["condition"][group]["Track_ID"] == track_id, ml_model].copy() - # for ml_model in size_model_names: - # step_forecasts["size_" + 
ml_model.replace(" ", "-")] = forecasts["size"][group][ml_model].loc[ - # forecasts["size"][group][ml_model]["Track_ID"] == track_id].copy() for ml_model in dist_model_names: step_forecasts["dist_" + ml_model.replace(" ", "-")] = forecasts["dist"][group][ml_model].loc[ forecasts["dist"][group][ml_model]["Track_ID"] == track_id].copy() - # for model_type in forecasts["track"].keys(): - # for ml_model in track_model_names: - # mframe = forecasts["track"][model_type][group][ml_model] - # step_forecasts[model_type + "_" + ml_model.replace(" ", "-")] = mframe.loc[ - # mframe["Track_ID"] == track_id].copy() pool.apply_async(output_forecast, (step_forecasts, run_date, ensemble_name, member, track_num, json_data_path, out_path)) @@ -925,6 +919,3 @@ def output_forecast(step_forecasts, run_date, ensemble_name, member, track_num, print(out_json_filename + " not found") return return - - - diff --git a/hagelslag/processing/TrackProcessing.py b/hagelslag/processing/TrackProcessing.py index e88535c..489d6e0 100644 --- a/hagelslag/processing/TrackProcessing.py +++ b/hagelslag/processing/TrackProcessing.py @@ -1,19 +1,21 @@ -from hagelslag.data.ModelOutput import ModelOutput +from datetime import timedelta +from glob import glob + +import numpy as np +import pandas as pd +from netCDF4 import Dataset +from scipy.interpolate import interp1d +from scipy.ndimage import find_objects, gaussian_filter +from scipy.stats import gamma + from hagelslag.data.MRMSGrid import MRMSGrid +from hagelslag.data.ModelOutput import ModelOutput from hagelslag.processing.EnhancedWatershedSegmenter import EnhancedWatershed, rescale_data -from hagelslag.processing.Watershed import Watershed from hagelslag.processing.Hysteresis import Hysteresis +from hagelslag.processing.Watershed import Watershed from hagelslag.processing.tracker import label_storm_objects, extract_storm_patches, track_storms from .ObjectMatcher import ObjectMatcher, TrackMatcher, TrackStepMatcher -from scipy.ndimage import find_objects, gaussian_filter from .STObject import STObject, read_geojson -import numpy as np -from scipy.interpolate import interp1d -from glob import glob -import pandas as pd -from datetime import timedelta -from scipy.stats import gamma -from netCDF4 import Dataset class TrackProcessor(object): @@ -46,6 +48,7 @@ class TrackProcessor(object): single_step: Whether model timesteps are in separate files or aggregated into one file. mask_file: netCDF filename containing a mask of valid grid points on the model domain. 
""" + def __init__(self, run_date, start_date, @@ -99,7 +102,7 @@ def __init__(self, self.single_step = single_step self.model_grid = ModelOutput(self.ensemble_name, self.ensemble_member, self.run_date, self.variable, self.start_date, self.end_date, self.model_path, self.model_map_file, - single_step=self.single_step) + single_step=self.single_step) self.model_grid.load_map_info(self.model_map_file) if self.mrms_path is not None: self.mrms_variable = mrms_variable @@ -181,7 +184,7 @@ def find_model_patch_tracks(self): if len(slices) > 0: dims = (slices[0][0].stop - slices[0][0].start, slices[0][1].stop - slices[0][1].start) if h > 0: - model_obj.estimate_motion(hour, self.model_grid.data[h-1], dims[1], dims[0]) + model_obj.estimate_motion(hour, self.model_grid.data[h - 1], dims[1], dims[0]) del model_data del hour_labels @@ -250,16 +253,16 @@ def find_model_tracks(self): for s, sl in enumerate(obj_slices): model_objects[-1].append(STObject(self.model_grid.data[h][sl], np.where(hour_labels[sl] == s + 1, 1, 0), - self.model_grid.x[sl], - self.model_grid.y[sl], - self.model_grid.i[sl], + self.model_grid.x[sl], + self.model_grid.y[sl], + self.model_grid.i[sl], self.model_grid.j[sl], hour, hour, dx=self.model_grid.dx)) if h > 0: dims = model_objects[-1][-1].timesteps[0].shape - model_objects[-1][-1].estimate_motion(hour, self.model_grid.data[h-1], dims[1], dims[0]) + model_objects[-1][-1].estimate_motion(hour, self.model_grid.data[h - 1], dims[1], dims[0]) del hour_labels del scaled_data del model_data @@ -315,18 +318,18 @@ def find_mrms_tracks(self): tracked_obs_objects = [] if self.mrms_ew is not None: self.mrms_grid.load_data() - + if len(self.mrms_grid.data) != len(self.hours): print('Less than 24 hours of observation data found') - + return tracked_obs_objects - + for h, hour in enumerate(self.hours): mrms_data = np.zeros(self.mrms_grid.data[h].shape) mrms_data[:] = np.array(self.mrms_grid.data[h]) mrms_data[mrms_data < 0] = 0 hour_labels = self.mrms_ew.size_filter(self.mrms_ew.label(gaussian_filter(mrms_data, - self.gaussian_window)), + self.gaussian_window)), self.size_filter) hour_labels[mrms_data < self.mrms_ew.min_intensity] = 0 obj_slices = find_objects(hour_labels) @@ -345,8 +348,8 @@ def find_mrms_tracks(self): dx=self.model_grid.dx)) if h > 0: dims = obs_objects[-1][-1].timesteps[0].shape - obs_objects[-1][-1].estimate_motion(hour, self.mrms_grid.data[h-1], dims[1], dims[0]) - + obs_objects[-1][-1].estimate_motion(hour, self.mrms_grid.data[h - 1], dims[1], dims[0]) + for h, hour in enumerate(self.hours): past_time_objs = [] for obj in tracked_obs_objects: @@ -364,7 +367,7 @@ def find_mrms_tracks(self): for up in unpaired: tracked_obs_objects.append(obs_objects[h][up]) print("Tracked Obs Objects: {0:03d} Hour: {1:02d}".format(len(tracked_obs_objects), hour)) - + return tracked_obs_objects def match_tracks(self, model_tracks, obs_tracks, unique_matches=True, closest_matches=False): @@ -414,11 +417,11 @@ def extract_model_attributes(self, tracked_model_objects, storm_variables, poten for model_obj in tracked_model_objects: model_obj.extract_attribute_array(getattr(self.model_grid, l_var), l_var) for storm_var in storm_variables: - print("Storm {0} {1} {2}".format(storm_var,self.ensemble_member, self.run_date.strftime("%Y%m%d"))) + print("Storm {0} {1} {2}".format(storm_var, self.ensemble_member, self.run_date.strftime("%Y%m%d"))) model_grids[storm_var] = ModelOutput(self.ensemble_name, self.ensemble_member, self.run_date, storm_var, self.start_date - timedelta(hours=1), self.end_date 
+ timedelta(hours=1), - self.model_path,self.model_map_file, + self.model_path, self.model_map_file, self.single_step) model_grids[storm_var].load_data() for model_obj in tracked_model_objects: @@ -426,13 +429,13 @@ def extract_model_attributes(self, tracked_model_objects, storm_variables, poten if storm_var not in potential_variables + tendency_variables + future_variables: del model_grids[storm_var] for potential_var in potential_variables: - print("Potential {0} {1} {2}".format(potential_var,self.ensemble_member, self.run_date.strftime("%Y%m%d"))) + print("Potential {0} {1} {2}".format(potential_var, self.ensemble_member, self.run_date.strftime("%Y%m%d"))) if potential_var not in model_grids.keys(): model_grids[potential_var] = ModelOutput(self.ensemble_name, self.ensemble_member, self.run_date, potential_var, self.start_date - timedelta(hours=1), self.end_date + timedelta(hours=1), - self.model_path, self.model_map_file, + self.model_path, self.model_map_file, self.single_step) model_grids[potential_var].load_data() for model_obj in tracked_model_objects: @@ -443,11 +446,11 @@ def extract_model_attributes(self, tracked_model_objects, storm_variables, poten print("Future {0} {1} {2}".format(future_var, self.ensemble_member, self.run_date.strftime("%Y%m%d"))) if future_var not in model_grids.keys(): model_grids[future_var] = ModelOutput(self.ensemble_name, self.ensemble_member, - self.run_date, future_var, - self.start_date - timedelta(hours=1), - self.end_date + timedelta(hours=1), - self.model_path, self.model_map_file, - self.single_step) + self.run_date, future_var, + self.start_date - timedelta(hours=1), + self.end_date + timedelta(hours=1), + self.model_path, self.model_map_file, + self.single_step) model_grids[future_var].load_data() for model_obj in tracked_model_objects: model_obj.extract_attribute_grid(model_grids[future_var], future=True) @@ -460,13 +463,12 @@ def extract_model_attributes(self, tracked_model_objects, storm_variables, poten self.run_date, tendency_var, self.start_date - timedelta(hours=1), self.end_date, - self.model_path, self.model_map_file, + self.model_path, self.model_map_file, self.single_step) for model_obj in tracked_model_objects: model_obj.extract_tendency_grid(model_grids[tendency_var]) del model_grids[tendency_var] - @staticmethod def match_hail_sizes(model_tracks, obs_tracks, track_pairings): """ @@ -487,10 +489,10 @@ def match_hail_sizes(model_tracks, obs_tracks, track_pairings): obs_hail_sizes = np.array([step[obs_track.masks[t] == 1].max() for t, step in enumerate(obs_track.timesteps)]) if obs_track.times.size > 1 and model_track.times.size > 1: - normalized_obs_times = 1.0 / (obs_track.times.max() - obs_track.times.min())\ - * (obs_track.times - obs_track.times.min()) - normalized_model_times = 1.0 / (model_track.times.max() - model_track.times.min())\ - * (model_track.times - model_track.times.min()) + normalized_obs_times = 1.0 / (obs_track.times.max() - obs_track.times.min()) \ + * (obs_track.times - obs_track.times.min()) + normalized_model_times = 1.0 / (model_track.times.max() - model_track.times.min()) \ + * (model_track.times - model_track.times.min()) hail_interp = interp1d(normalized_obs_times, obs_hail_sizes, kind="nearest", bounds_error=False, fill_value=0) model_track.observations = hail_interp(normalized_model_times) @@ -498,7 +500,7 @@ def match_hail_sizes(model_tracks, obs_tracks, track_pairings): model_track.observations = np.ones(model_track.times.shape) * obs_hail_sizes[0] elif model_track.times.size == 1: 
model_track.observations = np.array([obs_hail_sizes.max()]) - print(pair[0], "obs", obs_hail_sizes) + print(pair[0], "obs", obs_hail_sizes) print(pair[0], "model", model_track.observations) for u in unpaired: model_tracks[u].observations = np.zeros(model_tracks[u].times.shape) @@ -529,6 +531,7 @@ def match_single_track_dist(model_track, obs_track): for param in obs_hail_dists.columns: model_hail_dists.loc[model_track.times, param] = obs_hail_dists.loc[obs_track.times[0], param] return model_hail_dists + unpaired = list(range(len(model_tracks))) for p, pair in enumerate(track_pairings): unpaired.remove(pair[0]) @@ -611,6 +614,5 @@ def calc_track_errors(model_tracks, obs_tracks, track_pairings): track_errors.loc[pair[0], 'translation_error_x'] = model_com[0] - obs_com[0] track_errors.loc[pair[0], 'translation_error_y'] = model_com[1] - obs_com[1] track_errors.loc[pair[0], 'start_time_difference'] = model_track.start_time - obs_track.start_time - track_errors.loc[pair[0], 'end_time_difference'] = model_track.end_time - obs_track.end_time + track_errors.loc[pair[0], 'end_time_difference'] = model_track.end_time - obs_track.end_time return track_errors - diff --git a/hagelslag/processing/TrackSampler.py b/hagelslag/processing/TrackSampler.py index 3327bc7..d1f27b8 100644 --- a/hagelslag/processing/TrackSampler.py +++ b/hagelslag/processing/TrackSampler.py @@ -1,16 +1,17 @@ -from netCDF4 import Dataset -import numpy as np -import pandas as pd +import argparse import json -from glob import glob import os +import pickle +import traceback from datetime import datetime +from glob import glob from multiprocessing import Pool -import argparse -import pickle + +import numpy as np +import pandas as pd +from netCDF4 import Dataset from scipy.ndimage import binary_dilation from scipy.stats import multivariate_normal -import traceback def load_grid_info(grid_file): @@ -73,20 +74,20 @@ def main(): for run_date in run_dates: for member in members: group = member_info.loc[member, "Microphysics"] - apply(sample_member_run_tracks, (member, - group, - run_date, - model_names, - start_hour, - end_hour, - grid_shape, - dx, - track_path, - num_samples, - thresholds, - copula_file, - out_path - )) + sample_member_run_tracks(member, + group, + run_date, + model_names, + start_hour, + end_hour, + grid_shape, + dx, + track_path, + num_samples, + thresholds, + copula_file, + out_path + ) return @@ -133,6 +134,7 @@ class TrackSampler(object): Monte Carlo sampler of forecast storm tracks. """ + def __init__(self, member, group, run_date, model_names, start_hour, end_hour, grid_shape, dx, track_path, num_samples, copula_file=None): self.member = member diff --git a/hagelslag/processing/Watershed.py b/hagelslag/processing/Watershed.py index 6e25308..174ba12 100644 --- a/hagelslag/processing/Watershed.py +++ b/hagelslag/processing/Watershed.py @@ -1,6 +1,6 @@ -from skimage.segmentation import watershed -from scipy.ndimage import label, find_objects import numpy as np +from scipy.ndimage import label, find_objects +from skimage.segmentation import watershed class Watershed(object): @@ -14,6 +14,7 @@ class Watershed(object): core_intensity: the intensity used to determine the initial objects. 
""" + def __init__(self, min_intensity, max_intensity): self.min_intensity = min_intensity self.max_intensity = max_intensity diff --git a/hagelslag/processing/tracker.py b/hagelslag/processing/tracker.py index 8f02be4..05b44d7 100644 --- a/hagelslag/processing/tracker.py +++ b/hagelslag/processing/tracker.py @@ -1,10 +1,11 @@ -from .STObject import STObject +import numpy as np +from scipy.ndimage import find_objects, center_of_mass, gaussian_filter + +from hagelslag.processing.ObjectMatcher import ObjectMatcher from .EnhancedWatershedSegmenter import EnhancedWatershed -from .Watershed import Watershed from .Hysteresis import Hysteresis -from hagelslag.processing.ObjectMatcher import ObjectMatcher -from scipy.ndimage import find_objects, center_of_mass, gaussian_filter -import numpy as np +from .STObject import STObject +from .Watershed import Watershed def label_storm_objects(data, method, min_intensity, max_intensity, min_area=1, max_area=100, max_range=1, @@ -220,7 +221,7 @@ def track_storms(storm_objects, times, distance_components, distance_maxima, dis if len(past_time_objects) == 0: tracked_objects.extend(storm_objects[t]) elif len(past_time_objects) > 0 and len(storm_objects[t]) > 0: - assignments = obj_matcher.match_objects(past_time_objects, storm_objects[t], times[t-1], times[t]) + assignments = obj_matcher.match_objects(past_time_objects, storm_objects[t], times[t - 1], times[t]) unpaired = list(range(len(storm_objects[t]))) for pair in assignments: past_time_objects[pair[0]].extend(storm_objects[t][pair[1]]) diff --git a/hagelslag/util/Config.py b/hagelslag/util/Config.py index af1e873..2e9248a 100644 --- a/hagelslag/util/Config.py +++ b/hagelslag/util/Config.py @@ -3,6 +3,7 @@ class Config(object): Class that loads options from a config file and converts them into attributes. """ + def __init__(self, filename, required_attributes=()): config = None print(filename) @@ -25,4 +26,3 @@ def __init__(self, filename, required_attributes=()): if not has_required[i]: print("{0} not found.".format(required_attributes[i])) exit(1) - diff --git a/hagelslag/util/convert_mrms_grids.py b/hagelslag/util/convert_mrms_grids.py index ef96901..8e0df46 100644 --- a/hagelslag/util/convert_mrms_grids.py +++ b/hagelslag/util/convert_mrms_grids.py @@ -1,18 +1,21 @@ -import pandas as pd +import argparse import os +import pickle import subprocess +import traceback +import warnings +from datetime import datetime, timedelta +from multiprocessing import Pool + import numpy as np +import pandas as pd +import xarray as xr +from netCDF4 import Dataset, date2num from scipy.interpolate import RectBivariateSpline from scipy.spatial import cKDTree -import pickle -from netCDF4 import Dataset, date2num -from datetime import datetime, timedelta + from hagelslag.util.make_proj_grids import read_arps_map_file, read_ncar_map_file, make_proj_grids -from multiprocessing import Pool -import warnings -import traceback -import argparse -import xarray as xr + def main(): warnings.simplefilter("ignore") @@ -117,6 +120,7 @@ class MRMSGrid(object): MRMSGrid reads time series of MRMS grib2 files, interpolates them, and outputs them to netCDF4 format. 
""" + def __init__(self, start_date, end_date, variable, path_start, freq="1H"): self.start_date = start_date self.end_date = end_date @@ -145,7 +149,7 @@ def load_data(self): data_files = sorted(os.listdir(full_path)) file_dates = pd.to_datetime([d.split("_")[-1][0:13] for d in data_files]) if timestamp in file_dates: - data_file = data_files[np.where(timestamp==file_dates)[0][0]] + data_file = data_files[np.where(timestamp == file_dates)[0][0]] print(full_path + data_file) if data_file[-2:] == "gz": subprocess.call(["gunzip", full_path + data_file]) @@ -217,7 +221,8 @@ def max_neighbor(self, in_lon, in_lat, radius=0.05): if len(nz_points[0]) > 0: nz_vals = self.data[d][nz_points] nz_rank = np.argsort(nz_vals) - original_points = cKDTree(np.vstack((self.lat[nz_points[0][nz_rank]], self.lon[nz_points[1][nz_rank]])).T) + original_points = cKDTree( + np.vstack((self.lat[nz_points[0][nz_rank]], self.lon[nz_points[1][nz_rank]])).T) all_neighbors = original_points.query_ball_tree(in_tree, radius, p=2, eps=0) for n, neighbors in enumerate(all_neighbors): if len(neighbors) > 0: @@ -246,7 +251,7 @@ def interpolate_to_netcdf(self, in_lon, in_lat, out_path, date_unit="seconds sin out_obj.createDimension("time", out_data.shape[0]) out_obj.createDimension("y", out_data.shape[1]) out_obj.createDimension("x", out_data.shape[2]) - data_var = out_obj.createVariable(self.variable, "f4", ("time", "y", "x"), zlib=True, + data_var = out_obj.createVariable(self.variable, "f4", ("time", "y", "x"), zlib=True, fill_value=-9999.0, least_significant_digit=3) data_var[:] = out_data @@ -270,7 +275,7 @@ def interpolate_to_netcdf(self, in_lon, in_lat, out_path, date_unit="seconds sin dates[:] = np.round(date2num(self.all_dates.to_pydatetime(), date_unit)).astype(np.int64) dates.long_name = "Valid date" dates.units = date_unit - out_obj.Conventions="CF-1.6" + out_obj.Conventions = "CF-1.6" out_obj.close() return diff --git a/hagelslag/util/create_model_grid_us_mask.py b/hagelslag/util/create_model_grid_us_mask.py index f387d62..92dd5cb 100644 --- a/hagelslag/util/create_model_grid_us_mask.py +++ b/hagelslag/util/create_model_grid_us_mask.py @@ -1,11 +1,12 @@ -from skimage.draw import polygon +import argparse + import numpy as np import shapefile from netCDF4 import Dataset -from hagelslag.util.make_proj_grids import read_arps_map_file, read_ncar_map_file, make_proj_grids from pyproj import Proj -import argparse -import matplotlib.pyplot as plt +from skimage.draw import polygon + +from hagelslag.util.make_proj_grids import read_arps_map_file, read_ncar_map_file, make_proj_grids def main(): @@ -19,9 +20,9 @@ def main(): print("Creating mask grid") mask_grid = create_mask_grid(args.shape, mapping_data, proj_dict, grid_dict) output_netcdf_file(args.out, mask_grid, proj_dict, grid_dict) - #plt.figure(figsize=(10, 6)) - #plt.contourf(mapping_data["lon"], mapping_data["lat"], mask_grid, cmap="Reds", vmin=0, vmax=1) - #plt.show() + # plt.figure(figsize=(10, 6)) + # plt.contourf(mapping_data["lon"], mapping_data["lat"], mask_grid, cmap="Reds", vmin=0, vmax=1) + # plt.show() return @@ -42,7 +43,7 @@ def create_mask_grid(mask_shape_file, mapping_data, proj_dict, grid_dict): sf = shapefile.Reader(mask_shape_file) s = 0 for state_shape in sf.shapeRecords(): - print(s, state_shape.record[4]) #changed [5]) + print(s, state_shape.record[4]) # changed [5]) part_start = list(state_shape.shape.parts) part_end = list(state_shape.shape.parts[1:]) + [len(state_shape.shape.points)] for p in range(len(part_start)): diff --git 
a/hagelslag/util/create_sector_grid_data.py b/hagelslag/util/create_sector_grid_data.py index e3b4b43..4177358 100644 --- a/hagelslag/util/create_sector_grid_data.py +++ b/hagelslag/util/create_sector_grid_data.py @@ -1,31 +1,33 @@ #!/home/tmp/aburke/miniconda3/bin/python -u -from hagelslag.util.make_proj_grids import read_ncar_map_file -from netCDF4 import Dataset -import pandas as pd -import numpy as np import os -from netCDF4 import Dataset from os.path import exists +import numpy as np +import pandas as pd +from netCDF4 import Dataset + +from hagelslag.util.make_proj_grids import read_ncar_map_file + + class SectorProcessor(object): - def __init__(self,mapfile, - ensemble_name,member, - run_date,date_format): + def __init__(self, mapfile, + ensemble_name, member, + run_date, date_format): self.mapfile = mapfile self.ensemble_name = ensemble_name self.member = member self.run_date = run_date self.date_format = date_format - self.inds = None - proj_dict, grid_dict = read_ncar_map_file(self.mapfile) - - self.ne_lat, self.sw_lat = grid_dict["ne_lat"],grid_dict["sw_lat"] - self.ne_lon, self.sw_lon = grid_dict["ne_lon"],grid_dict["sw_lon"] + self.inds = None + proj_dict, grid_dict = read_ncar_map_file(self.mapfile) + + self.ne_lat, self.sw_lat = grid_dict["ne_lat"], grid_dict["sw_lat"] + self.ne_lon, self.sw_lon = grid_dict["ne_lon"], grid_dict["sw_lon"] return - - def output_sector_csv(self,csv_path,file_dict_key,out_path): + + def output_sector_csv(self, csv_path, file_dict_key, out_path): """ Segment forecast tracks to only output data contined within a region in the CONUS, as defined by the mapfile. @@ -39,73 +41,72 @@ def output_sector_csv(self,csv_path,file_dict_key,out_path): Segmented forecast tracks in a csv file. """ csv_file = csv_path + "{0}_{1}_{2}_{3}.csv".format( - file_dict_key, - self.ensemble_name, - self.member, - self.run_date.strftime(self.date_format)) + file_dict_key, + self.ensemble_name, + self.member, + self.run_date.strftime(self.date_format)) if exists(csv_file): csv_data = pd.read_csv(csv_file) - + if self.inds is None: - lon_obj = csv_data.loc[:,"Centroid_Lon"] - lat_obj = csv_data.loc[:,"Centroid_Lat"] - - self.inds = np.where((self.ne_lat>=lat_obj)&(self.sw_lat<=lat_obj)\ - &(self.ne_lon>=lon_obj)&(self.sw_lon<=lon_obj))[0] - + lon_obj = csv_data.loc[:, "Centroid_Lon"] + lat_obj = csv_data.loc[:, "Centroid_Lat"] + + self.inds = np.where((self.ne_lat >= lat_obj) & (self.sw_lat <= lat_obj) \ + & (self.ne_lon >= lon_obj) & (self.sw_lon <= lon_obj))[0] + if np.shape(self.inds)[0] > 0: - csv_data = csv_data.reindex(np.array(self.inds)) + csv_data = csv_data.reindex(np.array(self.inds)) sector_csv_filename = out_path + "{0}_{1}_{2}_{3}.csv".format( - file_dict_key, - self.ensemble_name, - self.member, - self.run_date.strftime(self.date_format)) + file_dict_key, + self.ensemble_name, + self.member, + self.run_date.strftime(self.date_format)) print("Output sector csv file " + sector_csv_filename) csv_data.to_csv(sector_csv_filename, - na_rep="nan", - float_format="%0.5f", - index=False) + na_rep="nan", + float_format="%0.5f", + index=False) os.chmod(sector_csv_filename, 0o666) else: print('No {0} {1} sector data found'.format(self.member, - self.run_date.strftime("%Y%m%d"))) - + self.run_date.strftime("%Y%m%d"))) + else: print('No {0} {1} csv file found'.format(self.member, - self.run_date.strftime("%Y%m%d"))) - return - + self.run_date.strftime("%Y%m%d"))) + return - def load_netcdf_data(self,netcdf_path,patch_radius): - nc_file = netcdf_path+ 
"{0}_{1}_{2}_model_patches.nc".format( - self.ensemble_name, - self.run_date.strftime(self.date_format), - self.member) + def load_netcdf_data(self, netcdf_path, patch_radius): + nc_file = netcdf_path + "{0}_{1}_{2}_model_patches.nc".format( + self.ensemble_name, + self.run_date.strftime(self.date_format), + self.member) if exists(nc_file): nc_data = Dataset(nc_file) - + lon_obj = nc_data.variables['centroid_lon'][:] lat_obj = nc_data.variables['centroid_lat'][:] - - inds = np.where((self.ne_lat>=lat_obj)&(self.sw_lat<=lat_obj)\ - &(self.ne_lon>=lon_obj)&(self.sw_lon<=lon_obj))[0] + + inds = np.where((self.ne_lat >= lat_obj) & (self.sw_lat <= lat_obj) \ + & (self.ne_lon >= lon_obj) & (self.sw_lon <= lon_obj))[0] if np.shape(inds)[0] > 0: for var in nc_data.variables: - if np.shape(nc_data.variables[var])[0] > (2*patch_radius): + if np.shape(nc_data.variables[var])[0] > (2 * patch_radius): try: - nc_data.variables[var] =\ - nc_data.variables[var][np.array(inds),:,:] + nc_data.variables[var] = \ + nc_data.variables[var][np.array(inds), :, :] except: - nc_data.variables[var] =\ + nc_data.variables[var] = \ nc_data.variables[var][np.array(inds)] else: - nc_data=None + nc_data = None else: - nc_data=None - + nc_data = None + return nc_data - - def output_sector_netcdf(self,netcdf_path,out_path,patch_radius,config): + + def output_sector_netcdf(self, netcdf_path, out_path, patch_radius, config): """ Segment patches of forecast tracks to only output data contined within a region in the CONUS, as defined by the mapfile. @@ -119,14 +120,14 @@ def output_sector_netcdf(self,netcdf_path,out_path,patch_radius,config): Returns: Segmented patch netcdf files. """ - - nc_data = self.load_netcdf_data(netcdf_path,patch_radius) - + + nc_data = self.load_netcdf_data(netcdf_path, patch_radius) + if nc_data is not None: out_filename = out_path + "{0}_{1}_{2}_model_patches.nc".format( - self.ensemble_name, - self.run_date.strftime(self.date_format), - self.member) + self.ensemble_name, + self.run_date.strftime(self.date_format), + self.member) out_file = Dataset(out_filename, "w") out_file.createDimension("p", np.shape(nc_data.variables['p'])[0]) out_file.createDimension("row", np.shape(nc_data.variables['row'])[0]) @@ -135,26 +136,26 @@ def output_sector_netcdf(self,netcdf_path,out_path,patch_radius,config): out_file.createVariable("row", "i4", ("row",)) out_file.createVariable("col", "i4", ("col",)) out_file.variables["p"][:] = nc_data.variables['p'][:] - out_file.variables["row"][:] = nc_data.variables['row'][:] - out_file.variables["col"][:] = nc_data.variables['col'][:] + out_file.variables["row"][:] = nc_data.variables['row'][:] + out_file.variables["col"][:] = nc_data.variables['col'][:] out_file.Conventions = "CF-1.6" out_file.title = "{0} Storm Patches for run {1} member {2}".format(self.ensemble_name, - self.run_date.strftime(self.date_format), - self.member) + self.run_date.strftime(self.date_format), + self.member) out_file.object_variable = config.watershed_variable meta_variables = ["lon", "lat", "i", "j", "x", "y", "masks"] meta_units = ["degrees_east", "degrees_north", "", "", "m", "m", ""] center_vars = ["time", "centroid_lon", "centroid_lat", "centroid_i", "centroid_j", "track_id", "track_step"] center_units = ["hours since {0}".format(self.run_date.strftime("%Y-%m-%d %H:%M:%S")), - "degrees_east", - "degrees_north", - "", - "", - "", - ""] + "degrees_east", + "degrees_north", + "", + "", + "", + ""] label_columns = ["Matched", "Max_Hail_Size", "Num_Matches", "Shape", "Location", "Scale"] - + 
for m, meta_variable in enumerate(meta_variables): if meta_variable in ["i", "j", "masks"]: dtype = "i4" @@ -163,7 +164,7 @@ def output_sector_netcdf(self,netcdf_path,out_path,patch_radius,config): m_var = out_file.createVariable(meta_variable, dtype, ("p", "row", "col"), complevel=1, zlib=True) m_var.long_name = meta_variable m_var.units = meta_units[m] - + for c, center_var in enumerate(center_vars): if center_var in ["time", "track_id", "track_step"]: dtype = "i4" @@ -171,19 +172,20 @@ def output_sector_netcdf(self,netcdf_path,out_path,patch_radius,config): dtype = "f4" c_var = out_file.createVariable(center_var, dtype, ("p",), zlib=True, complevel=1) c_var.long_name = center_var - c_var.units =center_units[c] - + c_var.units = center_units[c] + for storm_variable in config.storm_variables: - s_var = out_file.createVariable(storm_variable + "_curr", "f4", ("p", "row", "col"), complevel=1, zlib=True) + s_var = out_file.createVariable(storm_variable + "_curr", "f4", ("p", "row", "col"), complevel=1, + zlib=True) s_var.long_name = storm_variable s_var.units = "" - + for potential_variable in config.potential_variables: p_var = out_file.createVariable(potential_variable + "_prev", "f4", ("p", "row", "col"), - complevel=1, zlib=True) + complevel=1, zlib=True) p_var.long_name = potential_variable p_var.units = "" - + if config.train: for label_column in label_columns: if label_column in ["Matched", "Num_Matches"]: @@ -193,30 +195,30 @@ def output_sector_netcdf(self,netcdf_path,out_path,patch_radius,config): l_var = out_file.createVariable(label_column, dtype, ("p",), zlib=True, complevel=1) l_var.long_name = label_column l_var.units = "" - + out_file.variables["time"][:] = nc_data.variables['time'][:] - + for c_var in ["lon", "lat"]: - out_file.variables["centroid_" + c_var][:] = nc_data.variables['centroid_' + c_var][:] + out_file.variables["centroid_" + c_var][:] = nc_data.variables['centroid_' + c_var][:] for c_var in ["i", "j"]: - out_file.variables["centroid_" + c_var][:] = nc_data.variables["centroid_" + c_var][:] - - out_file.variables["track_id"][:] = nc_data.variables['track_id'][:] - out_file.variables["track_step"][:] = nc_data.variables['track_step'][:] - + out_file.variables["centroid_" + c_var][:] = nc_data.variables["centroid_" + c_var][:] + + out_file.variables["track_id"][:] = nc_data.variables['track_id'][:] + out_file.variables["track_step"][:] = nc_data.variables['track_step'][:] + for meta_var in meta_variables: if meta_var in ["lon", "lat"]: out_file.variables[meta_var][:] = nc_data.variables[meta_var][:] else: out_file.variables[meta_var][:] = nc_data.variables[meta_var][:] - + for storm_variable in config.storm_variables: out_file.variables[storm_variable + "_curr"][:] = nc_data.variables[storm_variable + '_curr'][:] - + for p_variable in config.potential_variables: out_file.variables[p_variable + "_prev"][:] = nc_data.variables[p_variable + '_prev'][:] - + if config.train: for label_column in label_columns: try: @@ -225,13 +227,10 @@ def output_sector_netcdf(self,netcdf_path,out_path,patch_radius,config): out_file.variables[label_column][:] = 0 out_file.close() - + print("Output sector nc file " + out_filename) else: print('No {0} {1} netcdf file/sector data found'.format(self.member, - self.run_date.strftime("%Y%m%d"))) + self.run_date.strftime("%Y%m%d"))) return - -if __name__ == "__main__": - main() diff --git a/hagelslag/util/custom_grib_table.py b/hagelslag/util/custom_grib_table.py index 324c18e..5c644a6 100644 --- a/hagelslag/util/custom_grib_table.py +++ 
b/hagelslag/util/custom_grib_table.py @@ -1,6 +1,7 @@ -import pandas as pd import argparse +import pandas as pd + def main(): parser = argparse.ArgumentParser() @@ -23,4 +24,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/hagelslag/util/derived_vars.py b/hagelslag/util/derived_vars.py index 0ed99bc..8b3aa8e 100644 --- a/hagelslag/util/derived_vars.py +++ b/hagelslag/util/derived_vars.py @@ -10,4 +10,4 @@ def melting_layer_height(height_surface, height_700mb, height_500mb, temperature def relative_humidity_pressure_level(temperature, specific_humidity, pressure): saturation_vapor_pressure = 611.2 * np.exp((17.67 * temperature) / (temperature + 243.5)) vapor_pressure = specific_humidity / 1000.0 * pressure / 0.622 - return vapor_pressure / saturation_vapor_pressure * 100.0 \ No newline at end of file + return vapor_pressure / saturation_vapor_pressure * 100.0 diff --git a/hagelslag/util/hrefv2_symbolic_Links.py b/hagelslag/util/hrefv2_symbolic_Links.py index f9d17a4..98f6e57 100644 --- a/hagelslag/util/hrefv2_symbolic_Links.py +++ b/hagelslag/util/hrefv2_symbolic_Links.py @@ -1,114 +1,117 @@ -from datetime import datetime, timedelta -from os.path import exists -import pandas as pd import argparse import os +from datetime import timedelta + +import pandas as pd + def main(): parser = argparse.ArgumentParser() parser.add_argument("-s", "--start_date", required=True, help="Start date for symbolic links in YYYY-MM-DD format") parser.add_argument("-e", "--end_date", required=False, help="End date for symbolic links in YYYY-MM-DD format") parser.add_argument("-p", "--source_path", required=True, help="Path to the source directory of HREFv2 dataset") - parser.add_argument("-o", "--destination_path", required=True, help="Path to the destination of symbolic HREFv2 dataset") + parser.add_argument("-o", "--destination_path", required=True, + help="Path to the destination of symbolic HREFv2 dataset") args = parser.parse_args() source_path = args.source_path - destination_path = args.destination_path + destination_path = args.destination_path if args.end_date: date_index = pd.DatetimeIndex(start=args.start_date, end=args.end_date, freq='1D') else: date_index = pd.DatetimeIndex(start=args.start_date, end=args.start_date, freq='1D') - member = ['nam_00' ,'nam_12','nmmb_00','nmmb_12','nssl_00','nssl_12','arw_00','arw_12'] + member = ['nam_00', 'nam_12', 'nmmb_00', 'nmmb_12', 'nssl_00', 'nssl_12', 'arw_00', 'arw_12'] - for datetime_date in date_index: + for datetime_date in date_index: source_date = datetime_date.strftime("%Y%m%d") - print('\nSource Date',source_date) + print('\nSource Date', source_date) for m in member: source_member = str(m.split('_')[0]) if 'nam' in source_member: - source_date_path = source_path+'nam_conusnest/' - elif source_member in ['arw','nssl','nmmb']: - source_date_path = source_path+'hiresw_conus{0}/'.format(source_member) + source_date_path = source_path + 'nam_conusnest/' + elif source_member in ['arw', 'nssl', 'nmmb']: + source_date_path = source_path + 'hiresw_conus{0}/'.format(source_member) + + destination_date_path = destination_path + '{0}/{1}/'.format(m, source_date) - destination_date_path = destination_path+'{0}/{1}/'.format(m,source_date) - if not os.path.exists(destination_date_path): os.makedirs(destination_date_path) - if '00' in m: - if m in ['nam_00']: - for fore_hr in range(0,37): - source = source_date_path+'nam_conusnest.{1}00/nam_conusnest.{1}00f{0:02}'.format( - fore_hr, - source_date) - destination = 
destination_date_path+'nam_conusnest_{1}00f0{0:02}.grib2'.format( - fore_hr, - source_date) + if '00' in m: + if m in ['nam_00']: + for fore_hr in range(0, 37): + source = source_date_path + 'nam_conusnest.{1}00/nam_conusnest.{1}00f{0:02}'.format( + fore_hr, + source_date) + destination = destination_date_path + 'nam_conusnest_{1}00f0{0:02}.grib2'.format( + fore_hr, + source_date) if os.path.exists(destination): - continue - if os.path.exists(source): - os.symlink(source,destination) + continue + if os.path.exists(source): + os.symlink(source, destination) else: print('Incorrect source file:') print(source) - elif m in ['arw_00','nssl_00','nmmb_00']: - for fore_hr in range(0,37): - source = source_date_path+'hiresw_conus{0}.{1}00/hiresw_conus{0}.{1}00f{2:02}'.format( - source_member, - source_date, - fore_hr) - destination = destination_date_path+'hiresw_conus{2}_{1}00f0{0:02}.grib2'.format( - fore_hr, - source_date, - source_member) - + elif m in ['arw_00', 'nssl_00', 'nmmb_00']: + for fore_hr in range(0, 37): + source = source_date_path + 'hiresw_conus{0}.{1}00/hiresw_conus{0}.{1}00f{2:02}'.format( + source_member, + source_date, + fore_hr) + destination = destination_date_path + 'hiresw_conus{2}_{1}00f0{0:02}.grib2'.format( + fore_hr, + source_date, + source_member) + if os.path.exists(destination): - continue - if os.path.exists(source): - os.symlink(source,destination) + continue + if os.path.exists(source): + os.symlink(source, destination) else: print('Incorrect source file:') print(source) elif '12' in m: - changed_date = (datetime_date-timedelta(days=1)).strftime("%Y%m%d") + changed_date = (datetime_date - timedelta(days=1)).strftime("%Y%m%d") if 'nam_12' in m: - for fore_hr in range(0,37): - source = source_date_path+'nam_conusnest.{1}12/nam_conusnest.{1}12f{0:02}'.format( - (fore_hr+12), - changed_date) - - destination = destination_date_path+'nam_conusnest_{1}12f0{0:02}.grib2'.format( - fore_hr, - source_date) - + for fore_hr in range(0, 37): + source = source_date_path + 'nam_conusnest.{1}12/nam_conusnest.{1}12f{0:02}'.format( + (fore_hr + 12), + changed_date) + + destination = destination_date_path + 'nam_conusnest_{1}12f0{0:02}.grib2'.format( + fore_hr, + source_date) + if os.path.exists(destination): - continue - if os.path.exists(source): - os.symlink(source,destination) + continue + if os.path.exists(source): + os.symlink(source, destination) else: print('Incorrect source file:') print(source) - - elif m in ['arw_12','nssl_12','nmmb_12']: - for fore_hr in range(0,37): - source = source_date_path+'hiresw_conus{0}.{1}12/hiresw_conus{0}.{1}12f{2:02}'.format( - source_member, - changed_date, - (fore_hr+12)) - - destination = destination_date_path+'hiresw_conus{2}_{1}12f0{0:02}.grib2'.format( - fore_hr, - source_date, - source_member) + + elif m in ['arw_12', 'nssl_12', 'nmmb_12']: + for fore_hr in range(0, 37): + source = source_date_path + 'hiresw_conus{0}.{1}12/hiresw_conus{0}.{1}12f{2:02}'.format( + source_member, + changed_date, + (fore_hr + 12)) + + destination = destination_date_path + 'hiresw_conus{2}_{1}12f0{0:02}.grib2'.format( + fore_hr, + source_date, + source_member) if os.path.exists(destination): - continue - if os.path.exists(source): - os.symlink(source,destination) + continue + if os.path.exists(source): + os.symlink(source, destination) else: print('Incorrect source file:') print(source) - + return + if __name__ == "__main__": - main() + main() diff --git a/hagelslag/util/lsr_calibration_dataset.py b/hagelslag/util/lsr_calibration_dataset.py index 
278959b..9872482 100644 --- a/hagelslag/util/lsr_calibration_dataset.py +++ b/hagelslag/util/lsr_calibration_dataset.py @@ -1,11 +1,12 @@ -from make_proj_grids import * -from netCDF4 import Dataset -import cartopy.crs as ccrs -import pandas as pd -import numpy as np import argparse -import os + +import numpy as np +import pandas as pd import pyproj +from netCDF4 import Dataset + +from make_proj_grids import * + def main(): parser = argparse.ArgumentParser() @@ -18,13 +19,14 @@ def main(): run_dates = pd.date_range(start=args.start_date, end=args.end_date, freq='1D').strftime("%y%m%d") else: run_dates = pd.date_range(start=args.start_date, end=args.start_date, freq='1D').strftime("%y%m%d") - out_path = args.out_path + out_path = args.out_path mapfile = args.map_file - LSR_calibration_data(mapfile,out_path,run_dates) - return + LSR_calibration_data(mapfile, out_path, run_dates) + return -def LSR_calibration_data(mapfile,out_path,run_dates,hours=[17,19,21],sector=None): + +def LSR_calibration_data(mapfile, out_path, run_dates, hours=[17, 19, 21], sector=None): """ Using the grid from input ML forecast (netcdf) data, SPC storm reports with a 25 mile radius around the reports can be plotted. @@ -34,11 +36,11 @@ def LSR_calibration_data(mapfile,out_path,run_dates,hours=[17,19,21],sector=None Currently only supports netcdf files. """ - hail_threshold = [25,50] + hail_threshold = [25, 50] lsr_dict = dict() - - proj_dict, grid_dict = read_ncar_map_file(mapfile) + + proj_dict, grid_dict = read_ncar_map_file(mapfile) projection = pyproj.Proj(proj_dict) mapping_data = make_proj_grids(proj_dict, grid_dict) @@ -47,96 +49,102 @@ def LSR_calibration_data(mapfile,out_path,run_dates,hours=[17,19,21],sector=None forecast_x = np.array(mapping_data['x']) forecast_y = np.array(mapping_data['y']) - for date in run_dates: + for date in run_dates: print(date) csv_file = 'https://www.spc.noaa.gov/climo/reports/{0}_rpts_hail.csv'.format(date) try: hail_reports = pd.read_csv(csv_file) except: print('Report CSV file could not be opened.') - continue + continue for threshold in hail_threshold: - #if os.path.exists(out_path+'{0}_{1}_lsr_mask.nc'.format(date,threshold)): + # if os.path.exists(out_path+'{0}_{1}_lsr_mask.nc'.format(date,threshold)): # print('>{0}mm file already exists'.format(threshold)) # continue - #print('Creating LSR mask >{0}mm'.format(threshold)) - - #Get size values from hail reports - inches_thresh = round((threshold)*0.03937)*100 - report_size = hail_reports.loc[:,'Size'].values - + # print('Creating LSR mask >{0}mm'.format(threshold)) + + # Get size values from hail reports + inches_thresh = round((threshold) * 0.03937) * 100 + report_size = hail_reports.loc[:, 'Size'].values + lsr_dict['full_day'] = np.zeros(forecast_lats.shape) full_day_indices = np.where(report_size >= inches_thresh)[0] if len(full_day_indices) < 1: print('No >{0}mm LSRs found'.format(threshold)) - continue - reports_lat_full = hail_reports.loc[full_day_indices,'Lat'].values - reports_lon_full = hail_reports.loc[full_day_indices,'Lon'].values - - lsr_dict['full_day'] = calculate_distance(reports_lat_full,reports_lon_full,forecast_y,forecast_x,projection) - - #Get time values from hail reports - report_time = (hail_reports.loc[:,'Time'].values)/100 - #Get lat/lon of different time periods and hail sizes - + continue + reports_lat_full = hail_reports.loc[full_day_indices, 'Lat'].values + reports_lon_full = hail_reports.loc[full_day_indices, 'Lon'].values + + lsr_dict['full_day'] = calculate_distance(reports_lat_full, 
reports_lon_full, forecast_y, forecast_x, + projection) + + # Get time values from hail reports + report_time = (hail_reports.loc[:, 'Time'].values) / 100 + # Get lat/lon of different time periods and hail sizes + for start_hour in hours: lsr_dict['{0}'.format(start_hour)] = np.zeros(forecast_lats.shape) - end_hour = (start_hour+4)%24 + end_hour = (start_hour + 4) % 24 if end_hour > 12: - hour_indices = np.where((start_hour <= report_time) & (end_hour >= report_time) & (report_size >= inches_thresh))[0] + hour_indices = \ + np.where((start_hour <= report_time) & (end_hour >= report_time) & (report_size >= inches_thresh))[ + 0] else: - #Find reports before and after 0z + # Find reports before and after 0z hour_before_0z = np.where((start_hour <= report_time) & (report_size >= inches_thresh))[0] - hour_after_0z = np.where((end_hour >= report_time) & (report_size >= inches_thresh))[0] - #Combine two arrays + hour_after_0z = np.where((end_hour >= report_time) & (report_size >= inches_thresh))[0] + # Combine two arrays hour_indices = np.hstack((hour_before_0z, hour_after_0z)) - if len(hour_indices) < 1: + if len(hour_indices) < 1: continue - reports_lat = hail_reports.loc[hour_indices,'Lat'].values - reports_lon = hail_reports.loc[hour_indices,'Lon'].values - lsr_dict['{0}'.format(start_hour)] = calculate_distance(reports_lat,reports_lon,forecast_y,forecast_x,projection) - + reports_lat = hail_reports.loc[hour_indices, 'Lat'].values + reports_lon = hail_reports.loc[hour_indices, 'Lon'].values + lsr_dict['{0}'.format(start_hour)] = calculate_distance(reports_lat, reports_lon, forecast_y, + forecast_x, projection) + # Create netCDF file if sector: - out_filename = out_path+'{0}_{1}_{2}_lsr_mask.nc'.format(date,threshold,sector) + out_filename = out_path + '{0}_{1}_{2}_lsr_mask.nc'.format(date, threshold, sector) else: - out_filename = out_path+'{0}_{1}_lsr_mask.nc'.format(date,threshold) - + out_filename = out_path + '{0}_{1}_lsr_mask.nc'.format(date, threshold) + out_file = Dataset(out_filename, "w") out_file.createDimension("x", forecast_lons.shape[0]) out_file.createDimension("y", forecast_lons.shape[1]) out_file.createVariable("Longitude", "f4", ("x", "y")) - out_file.createVariable("Latitude", "f4",("x", "y")) + out_file.createVariable("Latitude", "f4", ("x", "y")) out_file.createVariable("24_Hour_All_12z_12z", "f4", ("x", "y")) out_file.createVariable("4_Hour_All_17z_21z", "f4", ("x", "y")) out_file.createVariable("4_Hour_All_19z_23z", "f4", ("x", "y")) out_file.createVariable("4_Hour_All_21z_25z", "f4", ("x", "y")) - out_file.variables["Longitude"][:,:] = forecast_lons - out_file.variables["Latitude"][:,:] = forecast_lats - out_file.variables["24_Hour_All_12z_12z"][:,:] = lsr_dict['full_day'] - out_file.variables["4_Hour_All_17z_21z"][:,:] = lsr_dict['17'] - out_file.variables["4_Hour_All_19z_23z"][:,:] = lsr_dict['19'] - out_file.variables["4_Hour_All_21z_25z"][:,:] = lsr_dict['21'] + out_file.variables["Longitude"][:, :] = forecast_lons + out_file.variables["Latitude"][:, :] = forecast_lats + out_file.variables["24_Hour_All_12z_12z"][:, :] = lsr_dict['full_day'] + out_file.variables["4_Hour_All_17z_21z"][:, :] = lsr_dict['17'] + out_file.variables["4_Hour_All_19z_23z"][:, :] = lsr_dict['19'] + out_file.variables["4_Hour_All_21z_25z"][:, :] = lsr_dict['21'] out_file.close() print("Writing to " + out_filename) - print() + print() return -def calculate_distance(obs_lat,obs_lon,forecast_y,forecast_x,projection): + +def calculate_distance(obs_lat, obs_lon, forecast_y, forecast_x, projection): 
""" Calculate the difference between forecast data points and observed data. Returns: Binary array where ones are within a 40km radius """ - x,y = projection(obs_lon,obs_lat) + x, y = projection(obs_lon, obs_lat) mask_array = np.zeros(forecast_y.shape) for index, point in enumerate(obs_lat): - y_diff = (y[index]-forecast_y)**2.0 - x_diff = (x[index]-forecast_x)**2.0 - total_dist = np.sqrt(y_diff+x_diff) + y_diff = (y[index] - forecast_y) ** 2.0 + x_diff = (x[index] - forecast_x) ** 2.0 + total_dist = np.sqrt(y_diff + x_diff) row, col = np.where(total_dist < 40234.0) - mask_array[row,col] =+ 1.0 + mask_array[row, col] = + 1.0 return mask_array + if __name__ == "__main__": - main() + main() diff --git a/hagelslag/util/make_proj_grids.py b/hagelslag/util/make_proj_grids.py index f9cac39..194071f 100644 --- a/hagelslag/util/make_proj_grids.py +++ b/hagelslag/util/make_proj_grids.py @@ -1,5 +1,5 @@ -from pyproj import Proj import numpy as np +from pyproj import Proj def main(): @@ -22,10 +22,10 @@ def read_arps_map_file(map_filename): grid_names = ['sw_lat', 'sw_lon', 'ne_lat', 'ne_lon', 'dx', 'dy'] for i in range(len(proj_names) + len(grid_names)): if i < len(proj_names): - proj_dict[proj_names[i]] = float(map_params[i+2]) + proj_dict[proj_names[i]] = float(map_params[i + 2]) else: j = i - len(proj_names) - grid_dict[grid_names[j]] = float(map_params[i+2]) + grid_dict[grid_names[j]] = float(map_params[i + 2]) return proj_dict, grid_dict @@ -70,5 +70,6 @@ def make_proj_grids(proj_dict, grid_dict): def get_proj_obj(proj_dict): return Proj(proj_dict) + if __name__ == "__main__": main() diff --git a/hagelslag/util/merge_forecast_data.py b/hagelslag/util/merge_forecast_data.py index 864405c..4a055c9 100644 --- a/hagelslag/util/merge_forecast_data.py +++ b/hagelslag/util/merge_forecast_data.py @@ -1,12 +1,13 @@ -import pandas as pd -import numpy as np -import json import argparse -from os.path import exists -from multiprocessing import Pool -from glob import glob -from datetime import datetime +import json import traceback +from datetime import datetime +from glob import glob +from multiprocessing import Pool +from os.path import exists + +import pandas as pd + def main(): parser = argparse.ArgumentParser() @@ -37,6 +38,7 @@ def output_combined_files(output): else: output[0].to_csv(out_file, mode="w", index_label="Step_ID") return + csv_files = sorted(glob(args.csv + "track_step_{0}_*.csv".format(args.ens))) print(csv_files) for csv_file in csv_files: @@ -83,20 +85,20 @@ def merge_input_csv_forecast_json(input_csv_file, forecast_json_path, condition_ for param in gamma_params: model_pred_cols.append(dist_model.replace(" ", "-") + "_" + param) pred_data = pd.DataFrame(index=input_data.index, columns=model_pred_cols, - dtype=float) + dtype=float) for track_id in track_ids: track_id_num = track_id.split("_")[-1] json_filename = full_json_path + "{0}_{1}_{2}_model_track_{3}.json".format(ens_name, - run_date, - ens_member, - track_id_num) + run_date, + ens_member, + track_id_num) json_file = open(json_filename) json_data = json.load(json_file) json_file.close() for s, step in enumerate(json_data["features"]): step_id = track_id + "_{0:02d}".format(s) for cond_model in condition_models_ns: - pred_data.loc[step_id, cond_model + "_Condition"] = step["properties"]["condition_" + cond_model] + pred_data.loc[step_id, cond_model + "_Condition"] = step["properties"]["condition_" + cond_model] for dist_model in dist_models_ns: pred_data.loc[step_id, [dist_model + "_" + p for p in gamma_params]] = 
step["properties"]["dist_" + dist_model] @@ -105,6 +107,7 @@ def merge_input_csv_forecast_json(input_csv_file, forecast_json_path, condition_ except Exception as e: print(traceback.format_exc()) raise e + + if __name__ == "__main__": main() - diff --git a/hagelslag/util/mrms_mesh_calibration_dataset.py b/hagelslag/util/mrms_mesh_calibration_dataset.py index eb86d62..bb30a19 100644 --- a/hagelslag/util/mrms_mesh_calibration_dataset.py +++ b/hagelslag/util/mrms_mesh_calibration_dataset.py @@ -1,12 +1,14 @@ -from mpl_toolkits.basemap import Basemap -import matplotlib.pyplot as plt -from make_proj_grids import * -from netCDF4 import Dataset -import pandas as pd -import numpy as np import argparse import os +import numpy as np +import pandas as pd +from pyproj import Proj +from netCDF4 import Dataset + +from make_proj_grids import make_proj_grids, read_ncar_map_file + + def main(): parser = argparse.ArgumentParser() parser.add_argument("-s", "--start_date", required=True, help="Start date in YYYY-MM-DD format") @@ -21,122 +23,118 @@ def main(): else: run_dates = pd.DatetimeIndex(start=args.start_date, end=args.start_date, freq='1D').strftime("%y%m%d") data_path = args.data_path - out_path = args.out_path + out_path = args.out_path mapfile = args.map_file maskfile = args.mask_file - - MESH_verification_data(data_path,maskfile,mapfile,run_dates,out_path) - return -def MESH_verification_data(data_path,maskfile,mapfile,run_dates,out_path,hours=[17,19,21]): + MESH_verification_data(data_path, maskfile, mapfile, run_dates, out_path) + return + + +def MESH_verification_data(data_path, maskfile, mapfile, run_dates, out_path, hours=[17, 19, 21]): """ Calculate 40 km halos around MESH values greater than a threshold value """ - hail_threshold = [25,50] + hail_threshold = [25, 50] MESH_dict = dict() - + mask_data = Dataset(maskfile) mask = mask_data.variables['usa_mask'][:] - proj_dict, grid_dict = read_ncar_map_file(mapfile) + proj_dict, grid_dict = read_ncar_map_file(mapfile) mapping_data = make_proj_grids(proj_dict, grid_dict) ML_forecast_lons = mapping_data['lon'] ML_forecast_lats = mapping_data['lat'] - m = Basemap(projection='lcc', resolution="l", - rsphere=6371229.0, - lon_0=proj_dict['lon_0'], - lat_0=proj_dict['lat_0'], - lat_1=proj_dict['lat_1'], - lat_2=proj_dict['lat_2'], - llcrnrlon=grid_dict['sw_lon'], - llcrnrlat=grid_dict['sw_lat'], - urcrnrlon=grid_dict['ne_lon'], - urcrnrlat=grid_dict['ne_lat']) + m = Proj(proj_dict) x1, y1 = m(ML_forecast_lons, ML_forecast_lats) forecast_lat = np.array(y1) forecast_lon = np.array(x1) - - for date in run_dates: + + for date in run_dates: print(date) MESH_file = data_path + 'MESH_Max_60min_00.50_20{0}-00:00_20{0}-23:00.nc'.format(date) if not os.path.exists(MESH_file): print('No MESH file') continue MESH_dataset = Dataset(MESH_file) - MESH_var = MESH_dataset.variables['MESH_Max_60min_00.50']*mask + MESH_var = MESH_dataset.variables['MESH_Max_60min_00.50'] * mask MESH_lats = MESH_dataset.variables['latitude'][:] MESH_lons = MESH_dataset.variables['longitude'][:] - - fullday_MESH = MESH_var[0:23,:,:].max(axis=0) - - for threshold in hail_threshold: - if os.path.exists(out_path+'{0}_{1}_mesh_mask.nc'.format(date,threshold)): + + fullday_MESH = MESH_var[0:23, :, :].max(axis=0) + + for threshold in hail_threshold: + if os.path.exists(out_path + '{0}_{1}_mesh_mask.nc'.format(date, threshold)): print('>{0}mm file already exists'.format(threshold)) continue print('Creating MESH mask >{0}mm'.format(threshold)) - + MESH_dict['full_day'] = np.zeros(MESH_lons.shape) 
- + thresh_row, thresh_col = np.where(fullday_MESH >= threshold) - if len(thresh_row)<1: + if len(thresh_row) < 1: print('No >{0}mm MESH values found'.format(threshold)) - continue - threshold_MESH_lats = MESH_lats[thresh_row,thresh_col] - threshold_MESH_lons = MESH_lons[thresh_row,thresh_col] - MESH_dict['full_day'] = calculate_distance(threshold_MESH_lats,threshold_MESH_lons,forecast_lat,forecast_lon,m) - + continue + threshold_MESH_lats = MESH_lats[thresh_row, thresh_col] + threshold_MESH_lons = MESH_lons[thresh_row, thresh_col] + MESH_dict['full_day'] = calculate_distance(threshold_MESH_lats, threshold_MESH_lons, forecast_lat, + forecast_lon, m) + for hour in hours: MESH_dict['{0}'.format(hour)] = np.zeros(MESH_lons.shape) - - start_hour = (hour-12) - end_hour = start_hour+4 - hourly_MESH = MESH_var[start_hour:end_hour,:,:].max(axis=0) + + start_hour = (hour - 12) + end_hour = start_hour + 4 + hourly_MESH = MESH_var[start_hour:end_hour, :, :].max(axis=0) thresh_row, thresh_col = np.where(hourly_MESH >= threshold) - if len(thresh_row)<1: - continue - threshold_MESH_lats = MESH_lats[thresh_row,thresh_col] - threshold_MESH_lons = MESH_lons[thresh_row,thresh_col] - MESH_dict['{0}'.format(hour)] = calculate_distance(threshold_MESH_lats,threshold_MESH_lons,forecast_lat,forecast_lon,m) - - #Create netcdf file - out_filename = out_path+'{0}_{1}_mesh_mask.nc'.format(date,threshold) + if len(thresh_row) < 1: + continue + threshold_MESH_lats = MESH_lats[thresh_row, thresh_col] + threshold_MESH_lons = MESH_lons[thresh_row, thresh_col] + MESH_dict['{0}'.format(hour)] = calculate_distance(threshold_MESH_lats, threshold_MESH_lons, + forecast_lat, forecast_lon, m) + + # Create netcdf file + out_filename = out_path + '{0}_{1}_mesh_mask.nc'.format(date, threshold) out_file = Dataset(out_filename, "w") out_file.createDimension("x", MESH_lons.shape[0]) out_file.createDimension("y", MESH_lons.shape[1]) out_file.createVariable("Longitude", "f4", ("x", "y")) - out_file.createVariable("Latitude", "f4",("x", "y")) + out_file.createVariable("Latitude", "f4", ("x", "y")) out_file.createVariable("24_Hour_All_12z_12z", "f4", ("x", "y")) out_file.createVariable("4_Hour_All_17z_21z", "f4", ("x", "y")) out_file.createVariable("4_Hour_All_19z_23z", "f4", ("x", "y")) out_file.createVariable("4_Hour_All_21z_25z", "f4", ("x", "y")) - out_file.variables["Longitude"][:,:] = MESH_lons - out_file.variables["Latitude"][:,:] = MESH_lats - out_file.variables["24_Hour_All_12z_12z"][:,:] = MESH_dict['full_day'] - out_file.variables["4_Hour_All_17z_21z"][:,:] = MESH_dict['17'] - out_file.variables["4_Hour_All_19z_23z"][:,:] = MESH_dict['19'] - out_file.variables["4_Hour_All_21z_25z"][:,:] = MESH_dict['21'] + out_file.variables["Longitude"][:, :] = MESH_lons + out_file.variables["Latitude"][:, :] = MESH_lats + out_file.variables["24_Hour_All_12z_12z"][:, :] = MESH_dict['full_day'] + out_file.variables["4_Hour_All_17z_21z"][:, :] = MESH_dict['17'] + out_file.variables["4_Hour_All_19z_23z"][:, :] = MESH_dict['19'] + out_file.variables["4_Hour_All_21z_25z"][:, :] = MESH_dict['21'] out_file.close() print("Writing to " + out_filename) print() - + return -def calculate_distance(obs_lat,obs_lon,forecast_lat,forecast_lon,basemap_obj): + +def calculate_distance(obs_lat, obs_lon, forecast_lat, forecast_lon, proj_obj): """ Calculate the difference between forecast data points and observed data. 
Returns: Binary array where ones are within a 30km radius """ - x, y = basemap_obj(obs_lon, obs_lat) + x, y = proj_obj(obs_lon, obs_lat) mask_array = np.zeros(forecast_lat.shape) for index, point in enumerate(obs_lat): - lat_diff = (y[index]-forecast_lat)**2.0 - lon_diff = (x[index]-forecast_lon)**2.0 - total_dist = np.sqrt(lat_diff+lon_diff) + lat_diff = (y[index] - forecast_lat) ** 2.0 + lon_diff = (x[index] - forecast_lon) ** 2.0 + total_dist = np.sqrt(lat_diff + lon_diff) row, col = np.where(total_dist < 30000.0) - mask_array[row,col] =+ 1.0 + mask_array[row, col] = + 1.0 return mask_array + if __name__ == "__main__": - main() + main() diff --git a/hagelslag/util/munkres.py b/hagelslag/util/munkres.py index 0168aeb..d1b218a 100644 --- a/hagelslag/util/munkres.py +++ b/hagelslag/util/munkres.py @@ -276,25 +276,26 @@ def permute(a, results): # Imports # --------------------------------------------------------------------------- -import sys import copy +import sys # --------------------------------------------------------------------------- # Exports # --------------------------------------------------------------------------- -__all__ = ['Munkres', 'make_cost_matrix'] +__all__ = ['Munkres', 'make_cost_matrix'] # --------------------------------------------------------------------------- # Globals # --------------------------------------------------------------------------- # Info about the module -__version__ = "1.0.7" -__author__ = "Brian Clapper, bmc@clapper.org" -__url__ = "http://software.clapper.org/munkres/" +__version__ = "1.0.7" +__author__ = "Brian Clapper, bmc@clapper.org" +__url__ = "http://software.clapper.org/munkres/" __copyright__ = "(c) 2008 Brian M. Clapper" -__license__ = "BSD-style license" +__license__ = "BSD-style license" + # --------------------------------------------------------------------------- # Classes @@ -326,7 +327,6 @@ def make_cost_matrix(profit_matrix, inversion_function): """ return make_cost_matrix(profit_matrix, inversion_function) - def pad_matrix(self, matrix, pad_value=0): """ Pad a possibly non-square matrix to make it square. 
@@ -398,12 +398,12 @@ def compute(self, cost_matrix): done = False step = 1 - steps = { 1 : self.__step1, - 2 : self.__step2, - 3 : self.__step3, - 4 : self.__step4, - 5 : self.__step5, - 6 : self.__step6 } + steps = {1: self.__step1, + 2: self.__step2, + 3: self.__step3, + 4: self.__step4, + 5: self.__step5, + 6: self.__step6} while not done: try: @@ -482,7 +482,7 @@ def __step3(self): count += 1 if count >= n: - step = 7 # done + step = 7 # done else: step = 4 @@ -542,14 +542,14 @@ def __step5(self): if row >= 0: count += 1 path[count][0] = row - path[count][1] = path[count-1][1] + path[count][1] = path[count - 1][1] else: done = True if not done: col = self.__find_prime_in_row(path[count][0]) count += 1 - path[count][0] = path[count-1][0] + path[count][0] = path[count - 1][0] path[count][1] = col self.__convert_path(path, count) @@ -649,7 +649,7 @@ def __find_prime_in_row(self, row): return col def __convert_path(self, path, count): - for i in range(count+1): + for i in range(count + 1): if self.marked[path[i][0]][path[i][1]] == 1: self.marked[path[i][0]][path[i][1]] = 0 else: @@ -668,6 +668,7 @@ def __erase_primes(self): if self.marked[i][j] == 2: self.marked[i][j] = 0 + # --------------------------------------------------------------------------- # Functions # --------------------------------------------------------------------------- @@ -707,6 +708,7 @@ def make_cost_matrix(profit_matrix, inversion_function): cost_matrix.append([inversion_function(value) for value in row]) return cost_matrix + def print_matrix(matrix, msg=None): """ Convenience function: Displays the contents of a matrix of integers. @@ -740,6 +742,7 @@ def print_matrix(matrix, msg=None): sep = ', ' sys.stdout.write(']\n') + # --------------------------------------------------------------------------- # Main # --------------------------------------------------------------------------- @@ -759,17 +762,16 @@ def print_matrix(matrix, msg=None): [300, 225, 300, 3]], 452), # expected cost - # Square - ([[10, 10, 8], - [9, 8, 1], - [9, 7, 4]], + ([[10, 10, 8], + [9, 8, 1], + [9, 7, 4]], 18), # Rectangular variant - ([[10, 10, 8, 11], - [9, 8, 1, 1], - [9, 7, 4, 10]], + ([[10, 10, 8, 11], + [9, 8, 1, 1], + [9, 7, 4, 10]], 15)] m = Munkres() diff --git a/hagelslag/util/output_tree_ensembles.py b/hagelslag/util/output_tree_ensembles.py index c54a978..051a2c8 100644 --- a/hagelslag/util/output_tree_ensembles.py +++ b/hagelslag/util/output_tree_ensembles.py @@ -2,9 +2,8 @@ """ Read a scikit-learn tree ensemble object and output the object into a human-readable text format. 
""" -import pickle import argparse - +import pickle __author__ = "David John Gagne " __copyright__ = "Copyright 2015, David John Gagne" @@ -61,10 +60,10 @@ def output_tree_ensemble(tree_ensemble_obj, output_filename, attribute_names=Non for t, tree in enumerate(tree_ensemble_obj.estimators_): print("Writing Tree {0:d}".format(t)) out_file = open(output_filename + ".{0:d}.tree", "w") - #out_file.write("Tree {0:d}\n".format(t)) + # out_file.write("Tree {0:d}\n".format(t)) tree_str = print_tree_recursive(tree.tree_, 0, attribute_names) out_file.write(tree_str) - #out_file.write("\n") + # out_file.write("\n") out_file.close() return @@ -115,5 +114,6 @@ def print_tree_recursive(tree_obj, node_index, attribute_names=None): tree_str += print_tree_recursive(tree_obj, tree_obj.children_right[node_index], attribute_names) return tree_str + if __name__ == "__main__": main() diff --git a/hagelslag/util/show_importance_ranks.py b/hagelslag/util/show_importance_ranks.py index 67f4253..c32e79f 100644 --- a/hagelslag/util/show_importance_ranks.py +++ b/hagelslag/util/show_importance_ranks.py @@ -1,7 +1,9 @@ +import argparse import pickle + import numpy as np + from hagelslag.util.Config import Config -import argparse def main(): diff --git a/hagelslag/util/storm_patch_center_coords.py b/hagelslag/util/storm_patch_center_coords.py index 7fd0abf..971e39f 100644 --- a/hagelslag/util/storm_patch_center_coords.py +++ b/hagelslag/util/storm_patch_center_coords.py @@ -1,10 +1,10 @@ -import numpy as np -import pandas as pd -from netCDF4 import Dataset, num2date -from glob import glob import argparse +from glob import glob from os.path import join +import pandas as pd +from netCDF4 import Dataset, num2date + def main(): parser = argparse.ArgumentParser() @@ -17,6 +17,7 @@ def main(): patch_center_frame = pd.concat(patch_center_list, ignore_index=True) patch_center_frame.to_csv(args.out, index=False) + def get_storm_centers(patch_file): patch_info = patch_file.split("/")[-1][:-3].split("_") run_date = pd.Timestamp(patch_info[-3][:-2]) @@ -24,16 +25,18 @@ def get_storm_centers(patch_file): patch_data = None print(run_date, member) with Dataset(patch_file) as patch_set: - num_patches = patch_set.variables["p"].size - lons = patch_set.variables["longitude"][:, 32, 32] - lats = patch_set.variables["latitude"][:, 32, 32] - valid_date = num2date(patch_set.variables["valid_date"][:], patch_set.variables["valid_date"].units) - patch_data = pd.DataFrame(dict(longitude=lons, - latitude=lats, - member=[member] * num_patches, - run_date=[run_date] * num_patches, - valid_date=valid_date), columns=["run_date", "member", "valid_date", "longitude", "latitude"]) + num_patches = patch_set.variables["p"].size + lons = patch_set.variables["longitude"][:, 32, 32] + lats = patch_set.variables["latitude"][:, 32, 32] + valid_date = num2date(patch_set.variables["valid_date"][:], patch_set.variables["valid_date"].units) + patch_data = pd.DataFrame(dict(longitude=lons, + latitude=lats, + member=[member] * num_patches, + run_date=[run_date] * num_patches, + valid_date=valid_date), + columns=["run_date", "member", "valid_date", "longitude", "latitude"]) return patch_data + if __name__ == "__main__": main() diff --git a/hagelslag/util/test_size_distributions.py b/hagelslag/util/test_size_distributions.py index 7ef8249..527f1e8 100644 --- a/hagelslag/util/test_size_distributions.py +++ b/hagelslag/util/test_size_distributions.py @@ -1,11 +1,13 @@ import json +import os +from glob import glob from multiprocessing import Pool -from scipy.stats 
import kstest, gamma + +import matplotlib.pyplot as plt import numpy as np import pandas as pd -from glob import glob -import matplotlib.pyplot as plt -import os +from scipy.stats import kstest, gamma + def main(): json_path = "/sharp/djgagne/track_data_spring2015_unique_json/" @@ -22,11 +24,12 @@ def main(): ks_total_frame.to_csv("ks_results_2015.csv", index_label="Id") return + def run_kstests(json_path, run_date, member): try: full_path = json_path + "/{0}/{1}/mesh_*.json".format(run_date, member) json_files = sorted(glob(full_path)) - ks_results = {"id":[], "ks":[]} + ks_results = {"id": [], "ks": []} for json_file in json_files: js = open(json_file) mesh_track = json.load(js) @@ -37,25 +40,27 @@ def run_kstests(json_path, run_date, member): ts = np.array(mesh_obj["properties"]["timesteps"]) mask = np.array(mesh_obj["properties"]["masks"]) vals = ts[mask == 1] - gdist = gamma.fit(vals, floc=vals.min()-0.1) + gdist = gamma.fit(vals, floc=vals.min() - 0.1) sig = kstest(vals, gamma(*gdist).cdf) ks_results["id"].append(step_id) ks_results["ks"].append(sig) if sig[1] < 0.01: - print(step_id,) - print(sig[1],gdist) + print(step_id, ) + print(sig[1], gdist) print(np.sort(vals)) - plt.figure(figsize=(8,8)) + plt.figure(figsize=(8, 8)) plt.pcolormesh(ts, alpha=0.5, cmap="YlOrRd", vmin=0, vmax=100) - pc = plt.pcolormesh(np.ma.array(ts, mask=mask==0), cmap="YlOrRd", vmin=0, vmax=100) + pc = plt.pcolormesh(np.ma.array(ts, mask=mask == 0), cmap="YlOrRd", vmin=0, vmax=100) plt.title(step_id) plt.colorbar(pc) plt.savefig(step_id + ".png", bbox_inches="tight", dpi=150) plt.close() - ks_frame = pd.DataFrame(ks_results["ks"], index=ks_results["id"],columns=["D", "p-val"]) + ks_frame = pd.DataFrame(ks_results["ks"], index=ks_results["id"], columns=["D", "p-val"]) print(ks_frame.shape[0]) except Exception as e: raise e return ks_frame + + if __name__ == "__main__": main() diff --git a/hagelslag/util/testing_sector.py b/hagelslag/util/testing_sector.py index a656a1d..45c5e46 100644 --- a/hagelslag/util/testing_sector.py +++ b/hagelslag/util/testing_sector.py @@ -1,94 +1,97 @@ -import numpy as np -from create_sector_grid_data import * from datetime import datetime - -from hagelslag.processing.ObjectMatcher import shifted_centroid_distance, start_time_distance -from hagelslag.processing.ObjectMatcher import centroid_distance, time_distance -import os -import pandas as pd + import numpy as np -from datetime import datetime +import pandas as pd +from create_sector_grid_data import * +from hagelslag.processing.ObjectMatcher import centroid_distance, time_distance +from hagelslag.processing.ObjectMatcher import shifted_centroid_distance -#date_index = pd.DatetimeIndex([pd.Timestamp.utcnow().strftime("%Y%m%d")]) +# date_index = pd.DatetimeIndex([pd.Timestamp.utcnow().strftime("%Y%m%d")]) work_path = "/ai-hail/aburke/2018_HREFv2_data/" -scratch_path= "/hail/aburke/testing_weights/figuring_out_files/" -ensemble_members = ['nam_00','nam_12','arw_00','arw_12','nssl_00','nssl_12','nmmb_00','nmmb_12'] +scratch_path = "/hail/aburke/testing_weights/figuring_out_files/" +ensemble_members = ['nam_00', 'nam_12', 'arw_00', 'arw_12', 'nssl_00', 'nssl_12', 'nmmb_00', 'nmmb_12'] -config=dict(dates=None, - start_hour=12, - end_hour=36, - watershed_variable="MAXUVV", - ensemble_name="HREFv2", - ensemble_members=ensemble_members, - model_path=work_path+'symbolic_hrefv2_data', - model_watershed_params=(8, 1, 80, 100, 60), - size_filter=12, - gaussian_window=2, - mrms_path=work_path+"MRMS/", - mrms_variable="MESH_Max_60min_00.50", - 
mrms_watershed_params=(19, 1, 100, 100, 75), - object_matcher_params=([shifted_centroid_distance], np.array([1.0]), +config = dict(dates=None, + start_hour=12, + end_hour=36, + watershed_variable="MAXUVV", + ensemble_name="HREFv2", + ensemble_members=ensemble_members, + model_path=work_path + 'symbolic_hrefv2_data', + model_watershed_params=(8, 1, 80, 100, 60), + size_filter=12, + gaussian_window=2, + mrms_path=work_path + "MRMS/", + mrms_variable="MESH_Max_60min_00.50", + mrms_watershed_params=(19, 1, 100, 100, 75), + object_matcher_params=([shifted_centroid_distance], np.array([1.0]), np.array([24000])), - - track_matcher_params=([centroid_distance, time_distance], - np.array([80000, 2])), - - storm_variables=['MAXUVV','Storm relative helicity_3000', 'Storm relative helicity_1000', - 'MAXREF','MXUPHL_5000','MAXDVV'], - potential_variables=['Precipitable water_0','Temperature_1000','Dew point temperature_1000','Geopotential Height_500','Temperature_500', - 'Dew point temperature_500','U component of wind_500','V component of wind_500', - 'Geopotential Height_700','Temperature_700', 'Dew point temperature_700','U component of wind_700', - 'V component of wind_700','Geopotential Height_850','Temperature_850', 'Dew point temperature_850', - 'U component of wind_850','V component of wind_850','MAXUW', 'MAXVW', - 'Surface lifted index','Convective available potential energy_0','Convective inhibition_0'], - - tendency_variables=[], - shape_variables=["area", "eccentricity", "major_axis_length", "minor_axis_length", "orientation", + track_matcher_params=([centroid_distance, time_distance], + np.array([80000, 2])), + + storm_variables=['MAXUVV', 'Storm relative helicity_3000', 'Storm relative helicity_1000', + 'MAXREF', 'MXUPHL_5000', 'MAXDVV'], + + potential_variables=['Precipitable water_0', 'Temperature_1000', 'Dew point temperature_1000', + 'Geopotential Height_500', 'Temperature_500', + 'Dew point temperature_500', 'U component of wind_500', 'V component of wind_500', + 'Geopotential Height_700', 'Temperature_700', 'Dew point temperature_700', + 'U component of wind_700', + 'V component of wind_700', 'Geopotential Height_850', 'Temperature_850', + 'Dew point temperature_850', + 'U component of wind_850', 'V component of wind_850', 'MAXUW', 'MAXVW', + 'Surface lifted index', 'Convective available potential energy_0', + 'Convective inhibition_0'], + + tendency_variables=[], + shape_variables=["area", "eccentricity", "major_axis_length", "minor_axis_length", "orientation", "extent"], - variable_statistics=["mean", "max", "min", "std", "skew", + variable_statistics=["mean", "max", "min", "std", "skew", "percentile_10", "percentile_50", "percentile_90"], - csv_path=scratch_path+"track_data_spring2018_MAXUVV_closest_csv/", - geojson_path=scratch_path+"track_data_spring2018_MAXUVV_closest_json/", - nc_path=scratch_path+"track_data_spring2018_MAXUVV_patch_nc/", - unique_matches=True, - patch_radius=16, - closest_matches=True, - match_steps=True, - train=False, - single_step=True, - label_type="gamma", - model_map_file="/hail/aburke/hagelslag/mapfiles/hrefv2_2018_map.txt", - mask_file="/hail/aburke/hagelslag/mapfiles/hrefv2_us_mask.nc", - run_date_format="%Y%m%d-%H%M", - sector_csv_outpath=scratch_path+'track_data_spring2018_MAXUVV_closest_csv/', - sector_nc_outpath=scratch_path+'track_data_spring2018_MAXUVV_patch_nc/', - sector_mapfile="/hail/aburke/hagelslag/mapfiles/hrefv2_sectors/C_2018_map.txt" - ) + csv_path=scratch_path + "track_data_spring2018_MAXUVV_closest_csv/", + geojson_path=scratch_path 
+ "track_data_spring2018_MAXUVV_closest_json/", + nc_path=scratch_path + "track_data_spring2018_MAXUVV_patch_nc/", + unique_matches=True, + patch_radius=16, + closest_matches=True, + match_steps=True, + train=False, + single_step=True, + label_type="gamma", + model_map_file="/hail/aburke/hagelslag/mapfiles/hrefv2_2018_map.txt", + mask_file="/hail/aburke/hagelslag/mapfiles/hrefv2_us_mask.nc", + run_date_format="%Y%m%d-%H%M", + sector_csv_outpath=scratch_path + 'track_data_spring2018_MAXUVV_closest_csv/', + sector_nc_outpath=scratch_path + 'track_data_spring2018_MAXUVV_patch_nc/', + sector_mapfile="/hail/aburke/hagelslag/mapfiles/hrefv2_sectors/C_2018_map.txt" + ) -sector='E' +sector = 'E' date_index = pd.DatetimeIndex(start="2018-05-01T00:00", end="2018-06-02T00:00", freq="1D") sector_mapfile = "/hail/aburke/hagelslag/mapfiles/hrefv2_sectors/{0}_2018_map.txt".format(sector) -ensemble_name="HREFv2" -member='arw_00' -run_date = datetime(2018,7,1) -run_date_format="%Y%m%d-%H%M" -csv_path='/hail/aburke/HREF_Scripts_and_Data/hwt_2018/track_data_spring2018_MAXUVV_closest_csv/' -nc_path='/hail/aburke/HREF_Scripts_and_Data/hwt_2018/track_data_spring2018_MAXUVV_patch_nc/' -sector_csv_outpath="/hail/aburke/testing_weights/sector_trained/{0}/track_data_spring2018_MAXUVV_closest_csv/".format(sector) -sector_nc_outpath="/hail/aburke/testing_weights/sector_trained/{0}/track_data_spring2018_MAXUVV_patch_nc/".format(sector) -patch_radius=16 +ensemble_name = "HREFv2" +member = 'arw_00' +run_date = datetime(2018, 7, 1) +run_date_format = "%Y%m%d-%H%M" +csv_path = '/hail/aburke/HREF_Scripts_and_Data/hwt_2018/track_data_spring2018_MAXUVV_closest_csv/' +nc_path = '/hail/aburke/HREF_Scripts_and_Data/hwt_2018/track_data_spring2018_MAXUVV_patch_nc/' +sector_csv_outpath = "/hail/aburke/testing_weights/sector_trained/{0}/track_data_spring2018_MAXUVV_closest_csv/".format( + sector) +sector_nc_outpath = "/hail/aburke/testing_weights/sector_trained/{0}/track_data_spring2018_MAXUVV_patch_nc/".format( + sector) +patch_radius = 16 for run_date in date_index: for member in ensemble_members: sector = SectorProcessor(sector_mapfile, - ensemble_name,member, - run_date,run_date_format) + ensemble_name, member, + run_date, run_date_format) - sector.output_sector_netcdf(nc_path,sector_nc_outpath,patch_radius,config) + sector.output_sector_netcdf(nc_path, sector_nc_outpath, patch_radius, config) - #for key in ['track_step', 'track_total']: + # for key in ['track_step', 'track_total']: # sector.output_sector_csv(csv_path,key,sector_csv_outpath) diff --git a/setup.py b/setup.py index 07508cd..5d63b04 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,7 @@ -from setuptools import setup import os +from setuptools import setup + classifiers = ['Development Status :: 4 - Beta', 'Intended Audience :: Science/Research', 'License :: OSI Approved :: MIT License', @@ -32,11 +33,11 @@ license="MIT", url="https://github.com/djgagne/hagelslag", packages=["hagelslag", "hagelslag.data", "hagelslag.processing", "hagelslag.evaluation", "hagelslag.util"], - scripts=["bin/hsdata", "bin/hsforecast", "bin/hseval", "bin/hsfileoutput", "bin/hsplotter", - "bin/hswrf3d", "bin/hsstation", "bin/hsncarpatch", "bin/hscalibration"], - data_files=[("mapfiles", ["mapfiles/ssef2013.map", - "mapfiles/ssef2014.map", - "mapfiles/ssef2015.map", + scripts=["bin/hsdata", "bin/hsforecast", "bin/hseval", "bin/hsfileoutput", "bin/hsplotter", + "bin/hswrf3d", "bin/hsstation", "bin/hsncarpatch", "bin/hscalibration"], + data_files=[("mapfiles", ["mapfiles/ssef2013.map", + 
"mapfiles/ssef2014.map", + "mapfiles/ssef2015.map", "mapfiles/ncar_grib_table.txt", "mapfiles/hrrr_map_2016.txt", "mapfiles/ncar_ensemble_map_2015.txt", From b994dfcb7381c1d155a204bef97306bbbdc863b1 Mon Sep 17 00:00:00 2001 From: David John Gagne Date: Thu, 30 Sep 2021 11:24:14 -0600 Subject: [PATCH 15/15] Addressing flake8 errors. --- .travis.yml | 15 -- demos/obj_tracking.py | 23 +-- doc/conf.py | 131 +++++++++--------- hagelslag/data/GribModelGrid.py | 4 +- hagelslag/processing/STObject.py | 2 +- hagelslag/processing/TrackProcessing.py | 9 +- hagelslag/util/lsr_calibration_dataset.py | 2 +- .../util/mrms_mesh_calibration_dataset.py | 6 +- hagelslag/util/testing_sector.py | 2 +- requirements.txt | 2 +- 10 files changed, 93 insertions(+), 103 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index ec3da2d..0000000 --- a/.travis.yml +++ /dev/null @@ -1,15 +0,0 @@ -language: python -env: - - PYTHON_VERSION=3.8 IPYTHON_KERNEL=python3 -before_install: - - wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - - sh Miniconda3-latest-Linux-x86_64.sh -b -p /home/travis/miniconda - - export PATH=/home/travis/miniconda/bin:$PATH -install: - - conda env create -f environment.yml - - source activate hagelslag - - echo `which python` -script: - - pytest -notifications: - email: true diff --git a/demos/obj_tracking.py b/demos/obj_tracking.py index 3b20a2a..18181ca 100755 --- a/demos/obj_tracking.py +++ b/demos/obj_tracking.py @@ -18,10 +18,12 @@ import numpy as np from scipy.ndimage import gaussian_filter, find_objects -from hagelslag.data import ModelOutput -from hagelslag.processing import STObject +from hagelslag.data.ModelOutput import ModelOutput +from hagelslag.processing.STObject import STObject from hagelslag.processing.EnhancedWatershedSegmenter import EnhancedWatershed from hagelslag.processing.ObjectMatcher import ObjectMatcher, closest_distance +from netCDF4 import Dataset + # In[2]: @@ -49,7 +51,8 @@ # data_increment (int): quantization interval. Use 1 if you don't want to quantize # max_thresh (int): values greater than maxThresh are treated as the maximum threshold # size_threshold_pixels (int): clusters smaller than this threshold are ignored. -# delta (int): maximum number of data increments the cluster is allowed to range over. Larger d results in clusters over larger scales. +# delta (int): maximum number of data increments the cluster is allowed to range over. Larger d results in +# clusters over larger scales. # From ahij's config file. 
if field == "MAX_UPDRAFT_HELICITY" or field == "UP_HELI_MAX03": @@ -61,11 +64,10 @@ levels = params['min_thresh'] * np.arange(1, 8) levels = np.append(levels, params['min_thresh'] * 15) model_watershed_params = ( -params['min_thresh'], params['step'], params['max_thresh'], params["max_size"], params["delta"]) + params['min_thresh'], params['step'], params['max_thresh'], params["max_size"], params["delta"]) end_date = start_date + timedelta(hours=0) -from netCDF4 import Dataset model_grid = ModelOutput(ensemble_name, member, @@ -149,15 +151,16 @@ def get_forecast_objects(model_grid, ew_params, min_size, gaussian_window): num_slices = len(obj_slices) model_objects.append([]) if num_slices > 0: - fig, ax = plt.subplots() - t = plt.contourf(model_grid.lon, model_grid.lat, hour_labels, np.arange(0, num_slices + 1) + 0.5, + plt.subplots() + plt.contourf(model_grid.lon, model_grid.lat, hour_labels, np.arange(0, num_slices + 1) + 0.5, extend="max", cmap="Set1", latlon=True, title=str(run_date) + " " + field + " " + str(h)) - ret = plt.savefig(odir + "enh_watershed_ex/ew{0:02d}.png".format(h)) + plt.savefig(odir + "enh_watershed_ex/ew{0:02d}.png".format(h)) for s, sl in enumerate(obj_slices): model_objects[-1].append(STObject(model_grid.data[h][sl], # np.where(hour_labels[sl] > 0, 1, 0), # For some objects (especially long, diagonal ones), the rectangular - # slice encompasses part of other objects (i.e. non-zero elements of slice). + # slice encompasses part of other objects + # (i.e. non-zero elements of slice). # We don't want them in our mask. np.where(hour_labels[sl] == s + 1, 1, 0), model_grid.x[sl], @@ -198,7 +201,7 @@ def track_forecast_objects(input_model_objects, model_grid, object_matcher): elif len(past_time_objs) > 0 and len(model_objects[h]) > 0: assignments = object_matcher.match_objects(past_time_objs, model_objects[h], h - 1, h) print("assignments:", assignments) - unpaired = range(len(model_objects[h])) + unpaired = list(range(len(model_objects[h]))) for pair in assignments: past_time_objs[pair[0]].extend(model_objects[h][pair[1]]) unpaired.remove(pair[1]) diff --git a/doc/conf.py b/doc/conf.py index 42c9f08..fca195e 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -12,28 +12,30 @@ # All configuration values have a default; values that are commented out # serve to show the default. 
-import sys import os -import shlex +import sys + from mock import Mock as MagicMock class Mock(MagicMock): @classmethod def __getattr__(cls, name): - return Mock() + return Mock() -MOCK_MODULES = ['numpy', 'scipy', 'pandas', 'skimage', 'netCDF4', 'basemap', 'matplotlib', "pyproj", "Nio", "scipy.spatial", - "pygrib", "mpl_toolkits", "mpl_toolkits.axes_grid1", "mpl_toolkits.basemap", "mpl_toolkits.basemap.pyproj", +MOCK_MODULES = ['numpy', 'scipy', 'pandas', 'skimage', 'netCDF4', 'basemap', 'matplotlib', "pyproj", "Nio", + "scipy.spatial", + "pygrib", "mpl_toolkits", "mpl_toolkits.axes_grid1", "mpl_toolkits.basemap", + "mpl_toolkits.basemap.pyproj", "sklearn", 'skimage.morphology', "scipy.ndimage", "matplotlib.pyplot", "scipy.stats", "scipy.signal", - "skimage.measure", "skimage.segmentation", "scipy.interpolate", "skimage.draw", - "mpl_toolkits.axes_grid1.inset_locator", "glue", "sklearn.linear_model", "glue.viewers", "glue.viewers.custom", + "skimage.measure", "skimage.segmentation", "scipy.interpolate", "skimage.draw", + "mpl_toolkits.axes_grid1.inset_locator", "glue", "sklearn.linear_model", "glue.viewers", + "glue.viewers.custom", "glue.viewers.custom.qt", "glue.core", "shapefile", "ncepgrib2", "glue.config", "sklearn.decomposition", "sklearn.model_selection", "arrow"] sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) - # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. @@ -42,7 +44,7 @@ def __getattr__(cls, name): # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom @@ -64,14 +66,14 @@ def __getattr__(cls, name): source_suffix = '.rst' # The encoding of source files. -#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. master_doc = 'index' # General information about the project. project = u'hagelslag' -copyright = u'2017, David John Gagne II' +copyright = u'2017-2021, David John Gagne II' author = u'David John Gagne II' # The version info for the project you're documenting, acts as replacement for @@ -92,9 +94,9 @@ def __getattr__(cls, name): # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -102,32 +104,31 @@ def __getattr__(cls, name): # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. 
-#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False +# keep_warnings = False # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False - # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for @@ -140,23 +141,23 @@ def __getattr__(cls, name): html_theme_options = {'sidebarwidth': 375} # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, @@ -166,62 +167,62 @@ def __getattr__(cls, name): # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. -#html_extra_path = [] +# html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True +# html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). 
-#html_file_suffix = None +# html_file_suffix = None # Language to be used for generating the HTML full-text search index. # Sphinx supports the following languages: # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' -#html_search_language = 'en' +# html_search_language = 'en' # A dictionary with options for the search language support, empty by default. # Now only 'ja' uses this config value -#html_search_options = {'type': 'default'} +# html_search_options = {'type': 'default'} # The name of a javascript file (relative to the configuration directory) that # implements a search results scorer. If empty, the default will be used. -#html_search_scorer = 'scorer.js' +# html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. htmlhelp_basename = 'hagelslagdoc' @@ -229,46 +230,46 @@ def __getattr__(cls, name): # -- Options for LaTeX output --------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', + # The paper size ('letterpaper' or 'a4paper'). + # 'papersize': 'letterpaper', -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', + # The font size ('10pt', '11pt' or '12pt'). + # 'pointsize': '10pt', -# Additional stuff for the LaTeX preamble. -#'preamble': '', + # Additional stuff for the LaTeX preamble. + # 'preamble': '', -# Latex figure (float) alignment -#'figure_align': 'htbp', + # Latex figure (float) alignment + # 'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'hagelslag.tex', u'hagelslag Documentation', - u'David John Gagne II', 'manual'), + (master_doc, 'hagelslag.tex', u'hagelslag Documentation', + u'David John Gagne II', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output --------------------------------------- @@ -281,7 +282,7 @@ def __getattr__(cls, name): ] # If true, show URL addresses after external links. -#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------- @@ -290,19 +291,19 @@ def __getattr__(cls, name): # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'hagelslag', u'hagelslag Documentation', - author, 'hagelslag', 'One line description of project.', - 'Miscellaneous'), + (master_doc, 'hagelslag', u'hagelslag Documentation', + author, 'hagelslag', 'One line description of project.', + 'Miscellaneous'), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. 
-#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False +# texinfo_no_detailmenu = False diff --git a/hagelslag/data/GribModelGrid.py b/hagelslag/data/GribModelGrid.py index 7fe78e6..ee6af35 100644 --- a/hagelslag/data/GribModelGrid.py +++ b/hagelslag/data/GribModelGrid.py @@ -59,7 +59,7 @@ def __enter__(self): def format_grib_name(self, selected_variable): """ Assigns name to grib2 message number with name 'unknown'. Names based on NOAA grib2 abbreviations. - + Names: 197: RETOP: Echo Top 198: MAXREF: Hourly Maximum of Simulated Reflectivity at 1 km AGL @@ -71,7 +71,7 @@ def format_grib_name(self, selected_variable): 221: MAXDVV: Hourly Maximum of Downward Vertical Velocity in the lowest 400hPa 222: MAXUW: U Component of Hourly Maximum 10m Wind Speed 223: MAXVW: V Component of Hourly Maximum 10m Wind Speed - + Args: selected_variable(str): Name of selected variable for loading Returns: diff --git a/hagelslag/processing/STObject.py b/hagelslag/processing/STObject.py index f846d54..aa7f2fe 100644 --- a/hagelslag/processing/STObject.py +++ b/hagelslag/processing/STObject.py @@ -154,7 +154,7 @@ def trajectory(self): def get_corner(self, time): """ - Gets the corner array indices of the STObject at a given time that corresponds + Gets the corner array indices of the STObject at a given time that corresponds to the upper left corner of the bounding box for the STObject. Args: diff --git a/hagelslag/processing/TrackProcessing.py b/hagelslag/processing/TrackProcessing.py index 489d6e0..7fa4ed4 100644 --- a/hagelslag/processing/TrackProcessing.py +++ b/hagelslag/processing/TrackProcessing.py @@ -547,12 +547,13 @@ def match_single_track_dist(model_track, obs_track): def match_hail_size_step_distributions(self, model_tracks, obs_tracks, track_pairings): """ - Given a matching set of observed tracks for each model track, + Given a matching set of observed tracks for each model track, combine the hail size values and create + an observed hail size distribution. 
Args: - model_tracks: - obs_tracks: - track_pairings: + model_tracks: List of STObjects + obs_tracks: List of STObjects + track_pairings: Returns: diff --git a/hagelslag/util/lsr_calibration_dataset.py b/hagelslag/util/lsr_calibration_dataset.py index 9872482..b627c63 100644 --- a/hagelslag/util/lsr_calibration_dataset.py +++ b/hagelslag/util/lsr_calibration_dataset.py @@ -5,7 +5,7 @@ import pyproj from netCDF4 import Dataset -from make_proj_grids import * +from make_proj_grids import read_ncar_map_file, make_proj_grids def main(): diff --git a/hagelslag/util/mrms_mesh_calibration_dataset.py b/hagelslag/util/mrms_mesh_calibration_dataset.py index bb30a19..bbf133a 100644 --- a/hagelslag/util/mrms_mesh_calibration_dataset.py +++ b/hagelslag/util/mrms_mesh_calibration_dataset.py @@ -19,9 +19,9 @@ def main(): parser.add_argument("-a", "--mask_file", required=True, help="Path to the ensemble mask file") args = parser.parse_args() if args.end_date: - run_dates = pd.DatetimeIndex(start=args.start_date, end=args.end_date, freq='1D').strftime("%y%m%d") + run_dates = pd.date_range(start=args.start_date, end=args.end_date, freq='1D').strftime("%y%m%d") else: - run_dates = pd.DatetimeIndex(start=args.start_date, end=args.start_date, freq='1D').strftime("%y%m%d") + run_dates = pd.date_range(start=args.start_date, end=args.start_date, freq='1D').strftime("%y%m%d") data_path = args.data_path out_path = args.out_path mapfile = args.map_file @@ -31,7 +31,7 @@ def main(): return -def MESH_verification_data(data_path, maskfile, mapfile, run_dates, out_path, hours=[17, 19, 21]): +def MESH_verification_data(data_path, maskfile, mapfile, run_dates, out_path, hours=(17, 19, 21)): """ Calculate 40 km halos around MESH values greater than a threshold value """ diff --git a/hagelslag/util/testing_sector.py b/hagelslag/util/testing_sector.py index 45c5e46..61247d7 100644 --- a/hagelslag/util/testing_sector.py +++ b/hagelslag/util/testing_sector.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd -from create_sector_grid_data import * +from create_sector_grid_data import SectorProcessor from hagelslag.processing.ObjectMatcher import centroid_distance, time_distance from hagelslag.processing.ObjectMatcher import shifted_centroid_distance diff --git a/requirements.txt b/requirements.txt index e63baa4..2aecb80 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,4 @@ s3fs xarray sphinx pytest - +mock
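
The calibration scripts patched above now build their map projection with pyproj.Proj(proj_dict) rather than mpl_toolkits.basemap.Basemap, and call the projection object directly to turn longitude/latitude into projected x/y coordinates in meters. The sketch below shows that usage pattern in isolation; the Lambert conformal parameters and sample points are illustrative assumptions, not values read from an actual hagelslag map file.

    import numpy as np
    from pyproj import Proj

    # Illustrative Lambert conformal parameters; the patched scripts read these
    # from a map file via read_ncar_map_file() instead of hard-coding them.
    proj_dict = {"proj": "lcc", "lon_0": -101.0, "lat_0": 38.5,
                 "lat_1": 32.0, "lat_2": 46.0, "R": 6371229.0}
    m = Proj(proj_dict)

    # Calling the Proj object converts lon/lat in degrees to x/y in meters,
    # mirroring x1, y1 = m(ML_forecast_lons, ML_forecast_lats) in the patch.
    lons = np.array([-100.0, -97.5, -95.0])
    lats = np.array([35.0, 36.2, 34.8])
    x, y = m(lons, lats)

    # Distances can then be taken directly in projected meters, as in the
    # 30 km radius test (total_dist < 30000.0) inside calculate_distance().
    dist = np.sqrt((x - x[0]) ** 2 + (y - y[0]) ** 2)
    within_30km = dist < 30000.0

The same cleanup pass swaps pd.DatetimeIndex(start=..., end=..., freq=...) for pd.date_range, the supported way to build a fixed-frequency date index in current pandas. A minimal equivalent call, with illustrative dates:

    import pandas as pd
    run_dates = pd.date_range(start="2018-05-01", end="2018-05-03", freq="1D").strftime("%y%m%d")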