
Commit d01fd7a

Add support for more EAMxx variables (#880)

Co-authored-by: Tom Vo <[email protected]>

1 parent 70ecf94

File tree: 19 files changed, +659 / -71 lines


.github/workflows/build_workflow.yml

Lines changed: 1 addition & 1 deletion

@@ -51,7 +51,7 @@ jobs:
     strategy:
       matrix:
         python-version: ["3.9", "3.10", "3.11", "3.12"]
-      container:
+    container:
       image: ghcr.io/e3sm-project/containers-e3sm-diags-test-data:e3sm-diags-test-data-0.0.2
     steps:
       - id: skip_check
Lines changed: 85 additions & 0 deletions

"""
This script is used to debug the bottleneck issue in the reference u variable.
"""

# %%
import timeit

import xarray as xr
import xcdat as xc  # needed for the xc.open_dataset()/xc.open_mfdataset() calls below

# Perlmutter
# ----------
# filepaths = [
#     "/global/cfs/cdirs/e3sm/diagnostics/observations/Atm/time-series/ERA5/ua_197901_201912.nc"
# ]

# LCRC
# -----
filepaths = [
    "/lcrc/group/e3sm/diagnostics/observations/Atm/time-series/ERA5/ua_197901_201912.nc"
]
time_slice = slice("1996-01-15", "1997-01-15", None)

# %%
# Test case 1 - OPEN_MFDATASET() + "ua" dataset (76 GB) + subsetting + `.load()`
# Result: `.load()` hangs when using `open_mfdataset`.
# ------------------------------------------------------------------------------
ds_ua_omfd = xr.open_mfdataset(
    filepaths,
    decode_times=True,
    use_cftime=True,
    coords="minimal",
    compat="override",
)
ds_ua_omfd_sub = ds_ua_omfd.sel(time=time_slice)

# %%
start_time = timeit.default_timer()
ds_ua_omfd_sub.load()
elapsed = timeit.default_timer() - start_time
print(f"Time taken to load ds_ua_omfd_sub: {elapsed} seconds")

# %%
# Test case 2 - OPEN_DATASET() + "ua" dataset (76 GB) + subsetting + `.load()`
# Result: `.load()` works fine when using `open_dataset`.
# ------------------------------------------------------------------------------
ds_ua_od = xc.open_dataset(
    filepaths[0],
    add_bounds=["X", "Y", "T"],
    decode_times=True,
    use_cftime=True,
    # coords="minimal",
    # compat="override",
)
ds_ua_od_sub = ds_ua_od.sel(time=time_slice)

# %%
start_time = timeit.default_timer()
ds_ua_od_sub.load()
elapsed = timeit.default_timer() - start_time
print(f"Time taken to load ds_ua_od_sub: {elapsed} seconds")

# %%
# Test case 3 - OPEN_MFDATASET() + "pr" dataset (2 GB) + subsetting + `.load()`
# Result: `ds.load()` works fine with the pr variable, but not with the ua variable.
# Notes: pr is a 3D variable (time, lat, lon); ua is a 4D variable (time, plev, lat, lon).
# ------------------------------------------------------------------------------
filepaths_pr = [
    "/global/cfs/cdirs/e3sm/diagnostics/observations/Atm/time-series/ERA5/pr_197901_201912.nc"
]
ds_pr = xc.open_mfdataset(
    filepaths_pr,
    add_bounds=["X", "Y", "T"],
    decode_times=True,
    use_cftime=True,
    coords="minimal",
    compat="override",
)

# %%
# The pr dataset is ~2 GB without subsetting, so there is no need to subset.
start_time = timeit.default_timer()
ds_pr.load()
elapsed = timeit.default_timer() - start_time
print(f"Time taken to load ds_pr: {elapsed} seconds")
# %%
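
A possible follow-up experiment, not part of this commit: by default, `open_mfdataset` turns each input file into a single dask chunk, so the 76 GB `ua` file becomes one enormous chunk that must be read in full before the subset can materialize. Passing an explicit `chunks` argument is one way to test that theory. A minimal sketch; the chunk size is an illustrative assumption, not a tuned value.

# Sketch: reopen the ua file with explicit dask chunks along time.
import xarray as xr

filepaths = [
    "/lcrc/group/e3sm/diagnostics/observations/Atm/time-series/ERA5/ua_197901_201912.nc"
]

ds_ua_chunked = xr.open_mfdataset(
    filepaths,
    chunks={"time": 12},  # assumed chunk size; tune per machine
    decode_times=True,
    use_cftime=True,
    coords="minimal",
    compat="override",
)
ds_ua_chunked_sub = ds_ua_chunked.sel(time=slice("1996-01-15", "1997-01-15"))
ds_ua_chunked_sub.load()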
Lines changed: 13 additions & 0 deletions

[#]
sets = ["lat_lon"]
case_id = "ERA5"
variables = ["U"]
ref_name = "ERA5"
reference_name = "ERA5 Reanalysis"
seasons = ["ANN", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "DJF", "MAM", "JJA", "SON"]
plevs = [850.0]
test_colormap = "PiYG_r"
reference_colormap = "PiYG_r"
contour_levels = [-20, -15, -10, -8, -5, -3, -1, 1, 3, 5, 8, 10, 15, 20]
diff_levels = [-8, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4, 5, 6, 8]
regrid_method = "bilinear"
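
These cfg files are plain INI-style text whose values are Python literals. For a quick look outside of e3sm_diags (which has its own parameter parser), a minimal standard-library read could look like the sketch below; the `ast.literal_eval` step is an assumption about the value syntax, and the path matches the `cfg_path` used in the run script that follows.

# Sketch: inspect the cfg with the standard library (e3sm_diags uses its
# own parser; this is only for a quick look at the values).
import ast
import configparser

parser = configparser.ConfigParser()
parser.read("auxiliary_tools/cdat_regression_testing/892-bottleneck/run_script.cfg")

for section in parser.sections():
    for key, raw in parser[section].items():
        # Values like ["lat_lon"], [850.0], and "bilinear" are Python literals.
        print(f"{key} = {ast.literal_eval(raw)!r}")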
Lines changed: 39 additions & 0 deletions

import os
import sys

from e3sm_diags.parameter.core_parameter import CoreParameter
from e3sm_diags.run import runner

param = CoreParameter()


param.reference_data_path = (
    "/global/cfs/cdirs/e3sm/diagnostics/observations/Atm/time-series"
)
param.test_data_path = "/global/cfs/cdirs/e3sm/chengzhu/eamxx/post/data/rgr"
param.test_name = "eamxx_decadal"
param.seasons = ["ANN"]
# param.save_netcdf = True

param.ref_timeseries_input = True
# Years to slice the ref data; base this off the years in the filenames.
param.ref_start_yr = "1996"
param.ref_end_yr = "1996"

prefix = "/global/cfs/cdirs/e3sm/www/cdat-migration-fy24/892-bottleneck"
param.results_dir = os.path.join(prefix, "eamxx_decadal_1996_1107_edv3")

cfg_path = "auxiliary_tools/cdat_regression_testing/892-bottleneck/run_script.cfg"
sys.argv.extend(["--diags", cfg_path])

runner.sets_to_run = [
    "lat_lon",
    "zonal_mean_xy",
    "zonal_mean_2d",
    "zonal_mean_2d_stratosphere",
    "polar",
    "cosp_histogram",
    "meridional_mean_2d",
    "annual_cycle_zonal_mean",
]

runner.run_diags([param])
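
Note that extending `sys.argv` with `["--diags", cfg_path]` makes running this script directly equivalent to invoking it as `python run_script.py --diags <cfg_path>`: the runner picks up the diagnostics definitions from the cfg, while the `CoreParameter` object supplies the data paths and run settings.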
Lines changed: 20 additions & 0 deletions

# %%
import timeit

import xarray as xr

filepaths = [
    "/lcrc/group/e3sm/diagnostics/observations/Atm/time-series/ERA5/ua_197901_201912.nc"
]

ds = xr.open_mfdataset(filepaths)

ds_sub = ds.sel(time=slice("1996-01-15", "1997-01-15", None))

# %%
start_time = timeit.default_timer()
ds_sub.ua.load()
elapsed = timeit.default_timer() - start_time
print(f"Time taken to load ds_sub.ua: {elapsed} seconds")

# %%
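
Before calling `.load()`, the dask layout of the subset can be inspected to see how much work a single load implies. A short sketch using standard xarray/dask attributes; the interpretation (one file-sized chunk being the bottleneck) is a working assumption.

# Sketch: inspect the subset's dask layout before loading.
import xarray as xr

ds = xr.open_mfdataset(
    "/lcrc/group/e3sm/diagnostics/observations/Atm/time-series/ERA5/ua_197901_201912.nc"
)
ds_sub = ds.sel(time=slice("1996-01-15", "1997-01-15"))

print(ds_sub.ua.chunks)          # chunk sizes per dimension
print(ds_sub.ua.data.numblocks)  # number of dask blocks per dimension
print(ds_sub.ua.nbytes / 1e9, "GB")  # in-memory size of the subset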
Lines changed: 48 additions & 0 deletions

# %%
import timeit

import dask.array as da
import numpy as np
import pandas as pd
import xarray as xr

# %%
# Define the dimensions.
time = 12
plev = 37
lat = 721
lon = 1440

# Create the data array using dask.
data = da.random.random(size=(time, plev, lat, lon), chunks=(12, 37, 721, 1440)).astype(
    np.float32
)

# Create the coordinates.
times = pd.date_range("2000-01-01", periods=time)
plevs = np.linspace(100000, 10, plev)
lats = np.linspace(-90, 90, lat)
lons = np.linspace(0, 360, lon, endpoint=False)

# Create the dataset and write it out to a file.
ds = xr.Dataset(
    {"data": (["time", "plev", "lat", "lon"], data)},
    coords={"time": times, "plev": plevs, "lat": lats, "lon": lons},
)
# %%
ds.to_netcdf("dask_bottleneck.nc")

# %%
# Open the dataset.
ds_open = xr.open_mfdataset("dask_bottleneck.nc")

# %%
# Load the opened dataset into memory and time it.
start_time = timeit.default_timer()
ds_open.load()
end_time = timeit.default_timer()

print(f"Time taken to load the dataset: {end_time - start_time} seconds")


# %%
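
To mirror test cases 1 and 2 from the first debug script against this synthetic file, both open paths can be timed side by side. A sketch; results will vary by machine and filesystem.

# Sketch: time .load() on the same synthetic file via both openers.
import timeit

import xarray as xr

for opener in (xr.open_dataset, xr.open_mfdataset):
    ds_cmp = opener("dask_bottleneck.nc")
    start = timeit.default_timer()
    ds_cmp.load()
    print(f"{opener.__name__}: {timeit.default_timer() - start:.2f} seconds")
    ds_cmp.close()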
