Skip to content

Commit 817a570

Browse files
authored
Merge pull request #36 from charlie-becker/HRRR_Zarr
HRRR-Zarr Streaming
2 parents 2506198 + 5be2910 commit 817a570

File tree

5 files changed

+166
-0
lines changed

5 files changed

+166
-0
lines changed

bin/hsdata

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,11 @@ def process_ensemble_member(run_date, member, config):
8383
print("Starting", run_date, member)
8484
start_date = run_date + timedelta(hours=config.start_hour)
8585
end_date = run_date + timedelta(hours=config.end_hour)
86+
87+
if config.ensemble_name == "HRRR-ZARR":
88+
if hasattr(config, "HRRR_alt_end_hour") and run_date.hour in config.HRRR_alt_run_hours:
89+
end_date = run_date + timedelta(hours=config.HRRR_alt_end_hour)
90+
8691
if hasattr(config, "mask_file"):
8792
mask_file = config.mask_file
8893
else:

config/HRRR_AWS_Stream.config

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/usr/bin/env python
import numpy as np
import pandas as pd
from hagelslag.processing.ObjectMatcher import shifted_centroid_distance
from hagelslag.processing.ObjectMatcher import centroid_distance, time_distance

# NOTE: HRRR variables must be listed in the following format:
# {HRRR_VARIABLE_NAME}-{HRRR_level}
# For example, Composite Reflectivity (REFC), which is at the (entire_atmosphere) level,
# would be listed as 'REFC-entire_atmosphere'

# 'ensemble_name' must be listed as 'HRRR-ZARR'
# 'model_path' must be "hrrrzarr/sfc/"
# 'end_hour' must be no more than n-1 number of forecast hours in the model

# To support model runs with different forecast lengths, use 'HRRR_alt_end_hour' for an alternative
# forecast length that is used for each model run hour listed in 'HRRR_alt_run_hours'. If neither is
# provided, 'end_hour' will be used for all model runs.

## output path
scratch_path = "/glade/scratch/dgagne/HRRR_objects/HRRR_AWS_realtime_test_060421/"

# Historical runs
#date_index = pd.date_range(start='2021-03-15', end='2021-03-22', freq='1H', closed='left', tz='UTC').to_pydatetime()

# Real Time runs
# Use pd.Timedelta to correspond with delay in data availability from hour that script is submitted
date_index = pd.DatetimeIndex([pd.Timestamp.utcnow().strftime("%Y-%m-%d-%H")]) - pd.Timedelta(hours=3)

ensemble_members = ['oper']

config = dict(dates=date_index,
              start_hour=1,
              end_hour=17,
              HRRR_alt_end_hour=47,
              # BUG FIX: key renamed from 'HRRR_alt_hours' to 'HRRR_alt_run_hours' --
              # bin/hsdata reads config.HRRR_alt_run_hours, so the old key name made
              # the extended-forecast branch raise AttributeError for the 00/06/12/18z runs.
              HRRR_alt_run_hours=[0, 6, 12, 18],
              watershed_variable="REFC-entire_atmosphere",
              ensemble_name="HRRR-ZARR",
              ensemble_members=ensemble_members,
              model_path="hrrrzarr/sfc/",
              segmentation_approach="hyst",
              model_watershed_params=(35, 50),
              size_filter=12,
              gaussian_window=1,
              mrms_path=None,
              mrms_variable="MESH_Max_60min_00.50",
              mrms_watershed_params=(13, 1, 125, 100, 100),
              object_matcher_params=([shifted_centroid_distance], np.array([1.0]),
                                     np.array([24000])),
              track_matcher_params=([centroid_distance, time_distance],
                                    np.array([80000, 2])),
              storm_variables=["REFC-entire_atmosphere", "MXUPHL_1hr_max_fcst-5000_2000m_above_ground"],
              potential_variables=[],
              tendency_variables=[],
              shape_variables=["area", "eccentricity", "major_axis_length", "minor_axis_length", "orientation"],
              variable_statistics=["mean", "max", "min"],
              csv_path=scratch_path + "track_data_hrrr_3km_csv_refl/",
              geojson_path=scratch_path + "track_data_hrrr_3km_json_refl/",
              nc_path=scratch_path + "track_data_hrrr_3km_nc_refl/",
              patch_radius=48,
              unique_matches=True,
              closest_matches=True,
              match_steps=True,
              train=False,
              single_step=True,
              label_type="gamma",
              model_map_file="/glade/u/home/dgagne/hagelslag/mapfiles/hrrr_map_2016.txt",
              mask_file=None)

hagelslag/data/HRRRZarrModelGrid.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import numpy as np
2+
import pandas as pd
3+
from hagelslag.data.ZarrModelGrid import ZarrModelGrid
4+
from os.path import join
5+
6+
7+
class HRRRZarrModelGrid(ZarrModelGrid):
    """
    Reader for a single HRRR model run stored as Zarr on AWS.

    Thin adapter over ZarrModelGrid: it only reorders the constructor
    arguments into the parent's (path-first) signature.

    Args:
        run_date: Initialization time of the model run (ISO string or datetime).
        variable: Variable name in "{HRRR_VARIABLE_NAME}-{HRRR_level}" form.
        start_date: First forecast timestep to extract.
        end_date: Last forecast timestep to extract.
        path (str): Base path of the AWS bucket.
        frequency (str): Spacing between model time steps (default "1H").
    """

    def __init__(self, run_date, variable, start_date, end_date, path, frequency="1H"):
        # BUG FIX: forward `frequency` to the parent. The original dropped it,
        # so any non-default value was silently replaced by the parent's "1H".
        # The parent also normalizes and stores run/start/end dates, variable,
        # and path itself, so the duplicate pre-assignments were removed.
        super(HRRRZarrModelGrid, self).__init__(path, run_date, start_date,
                                                end_date, variable, frequency)

hagelslag/data/ModelOutput.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ def load_data(self):
173173
self.path)
174174
self.data, self.units = mg.load_data()
175175
mg.close()
176+
176177
elif self.ensemble_name.upper() == "NCARSTORM":
177178
mg = NCARStormEventModelGrid(self.run_date,
178179
self.variable,
@@ -181,6 +182,14 @@ def load_data(self):
181182
self.path)
182183
self.data, self.units = mg.load_data()
183184
mg.close()
185+
186+
elif self.ensemble_name.upper() == "HRRR-ZARR":
187+
mg = HRRRZarrModelGrid(self.run_date,
188+
self.variable,
189+
self.start_date,
190+
self.end_date,
191+
self.path)
192+
self.data, self.units = mg.load_data()
184193
else:
185194
print(self.ensemble_name + " not supported.")
186195

hagelslag/data/ZarrModelGrid.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import numpy as np
2+
import pandas as pd
3+
from pandas import date_range
4+
from os.path import exists, join
5+
import s3fs
6+
import xarray as xr
7+
from datetime import timedelta
8+
9+
10+
class ZarrModelGrid(object):
    """
    Base class for reading 2D model output grids from HRRR Zarr data streamed off of AWS.

    Given an AWS bucket name, loads the values of a single variable from a model run.
    Supports model output in Zarr format.

    Attributes:
        path (str): Base path for the AWS bucket.
        run_date (ISO date string or datetime.datetime object): Initialization time of the model run.
        start_date (ISO date string or datetime.datetime object): Date of the first timestep extracted.
        end_date (ISO date string or datetime.datetime object): Date of the last timestep extracted.
        frequency (str): Spacing between model time steps.
        valid_dates (pd.DatetimeIndex): All model timesteps from start_date to end_date.
        forecast_hours (array-like of int): Forecast lead time in whole hours for each valid timestep.
    """

    def __init__(self,
                 path,
                 run_date,
                 start_date,
                 end_date,
                 variable,
                 frequency="1H"):
        self.path = path
        self.variable = variable
        self.run_date = pd.to_datetime(run_date)
        self.start_date = pd.to_datetime(start_date)
        self.end_date = pd.to_datetime(end_date)
        self.frequency = frequency
        self.valid_dates = date_range(start=self.start_date,
                                      end=self.end_date,
                                      freq=self.frequency)
        # Lead time of each valid timestep in whole hours. Timedelta floor
        # division is used instead of astype("timedelta64[h]"), which was
        # removed for non-nanosecond units in pandas 2.x.
        self.forecast_hours = (self.valid_dates - self.run_date) // pd.Timedelta(hours=1)

    def load_data(self):
        """
        Stream the configured variable for this model run from the AWS Zarr store.

        Returns:
            tuple: (float32 numpy array of the variable's values,
                    units string, "" when the dataset has no units attribute).
        """
        # self.variable is "{NAME}-{level}"; split into locals rather than
        # reassigning self.variable (the original mutated it, so a second
        # load_data call raised IndexError on the already-stripped name).
        var_name, level = self.variable.split("-", 1)
        fs = s3fs.S3FileSystem(anon=True)
        run_date_str = self.run_date.strftime("%Y%m%d")
        run_hour = self.run_date.strftime("%H")  # model cycle hour, e.g. "06"
        path = join(self.path, run_date_str,
                    f"{run_date_str}_{run_hour}z_fcst.zarr", level, var_name, level)
        files = [s3fs.S3Map(root=path, s3=fs, check=False)]

        ds = xr.open_mfdataset(files, engine="zarr").load()
        array = ds[var_name].values.astype("float32")
        units = ds[var_name].attrs["units"] if hasattr(ds[var_name], "units") else ""
        return array, units

0 commit comments

Comments
 (0)