This repository was archived by the owner on Apr 22, 2025. It is now read-only.

Commit de47e5d

DAS-1891 - Aggregated time dimensions only include input values.
1 parent: 82d8a59

File tree: 6 files changed (+112, -28 lines changed)


CHANGELOG.md (+7)

@@ -1,3 +1,10 @@
+## v1.2.0
+### 2023-07-28
+
+* DAS-1891 - Update temporal aggregation to return output temporal dimensions
+  only containing values that map to values in input granules, rather than
+  producing a regular grid.
+
 ## v1.1.1
 ### 2023-02-17
 
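To illustrate the behaviour described in this changelog entry, below is a minimal, self-contained sketch (hypothetical time values and variable names, not code from this repository) contrasting the new input-values-only aggregation with the regular grid produced by earlier versions. `numpy.unique` is the mechanism referenced in the unit tests later in this commit.

```python
import numpy as np

# Hypothetical temporal dimension values (hours since a common epoch) from
# two input granules, with a missing granule between hours 2 and 5:
granule_times = [np.array([0.0, 1.0, 2.0]), np.array([5.0, 6.0, 7.0])]

# New behaviour: the output temporal dimension contains only input values,
# so the gap between 2 and 5 is preserved (an irregular dimension).
all_input_values = np.unique(np.concatenate(granule_times))
print(all_input_values)  # [0. 1. 2. 5. 6. 7.]

# Previous behaviour (for comparison): a regular grid spanning the full
# range at the finest input spacing, leaving fill-value slices in the gaps.
step = np.min(np.diff(all_input_values))
regular_grid = np.arange(all_input_values[0], all_input_values[-1] + step, step)
print(regular_grid)      # [0. 1. 2. 3. 4. 5. 6. 7.]
```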

docs/NetCDF-to-Zarr-Example-Usage.ipynb (+2, -2)

@@ -298,7 +298,7 @@
     "\n",
     "**Figure 2:** Left: A gridded science variable as represented in six separate NetCDF-4 input GPM/IMERG granules. These have dimensions (1, 3600, 1800). Right: The stacked variable as saved in the output Zarr store. This has dimensions (6, 3600, 1800).\n",
     "\n",
-    "The temporal aggregation will always produce a regular grid. If there are gaps in data coverage, then the temporal dimension will include a value for the missing data, but that slice will be populated with fill values.\n",
+    "The temporal aggregation will produce an output temporal dimension that only includes the temporal dimension values of the input granules. If the input time values are not all evenly spaced, potentially due to a missing granule, then the output temporal dimension will have gaps, and be irregular.\n",
     "\n",
     "### The temporal aggregation request:\n",
     "\n",
@@ -403,7 +403,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.9.7"
+  "version": "3.9.12"
  }
 },
 "nbformat": 4,

harmony_netcdf_to_zarr/convert.py (+3, -2)

@@ -354,7 +354,8 @@ class method instantiates a dataset that uses the `ProcessSynchronizer`
             shape=aggregated_shape,
             chunks=tuple(chunks),
             dtype=netcdf_variable.dtype,
-            fill_value=fill_value)
+            fill_value=fill_value
+        )

         if resolved_variable_name not in aggregated_dimensions:
             # For a non-aggregated dimension, insert input granule data
@@ -564,7 +565,7 @@ def compute_chunksize(shape: Union[tuple, list],
            the regenerated new zarr chunks
    """
    # convert compressed_chunksize_byte to integer if it's a str
-    if type(compressed_chunksize_byte) == str:
+    if isinstance(compressed_chunksize_byte, str):
        try:
            (value, unit) = findall(
                r'^\s*([\d.]+)\s*(Ki|Mi|Gi)\s*$', compressed_chunksize_byte
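For context on the `compute_chunksize` change above, here is a hedged sketch of the kind of string handling implied by the regular expression in this hunk. The helper name `parse_chunksize_bytes` and the binary multipliers are assumptions for illustration; they are not taken from this repository.

```python
from re import findall


def parse_chunksize_bytes(compressed_chunksize_byte) -> int:
    """Hypothetical helper: convert a '<number> <Ki|Mi|Gi>' string to bytes."""
    if isinstance(compressed_chunksize_byte, str):
        matches = findall(r'^\s*([\d.]+)\s*(Ki|Mi|Gi)\s*$',
                          compressed_chunksize_byte)
        if not matches:
            raise ValueError('Unrecognised chunk size string')

        value, unit = matches[0]
        # Assumed binary multipliers for the recognised unit suffixes:
        multipliers = {'Ki': 2 ** 10, 'Mi': 2 ** 20, 'Gi': 2 ** 30}
        return int(float(value) * multipliers[unit])

    return int(compressed_chunksize_byte)


print(parse_chunksize_bytes('100 Mi'))  # 104857600
```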

harmony_netcdf_to_zarr/mosaic_utilities.py (+23, -6)

@@ -162,7 +162,7 @@ def __init__(self, input_paths: List[str]):
        self._map_input_dimensions()

        if len(self.input_paths) > 1:
-            # Only calculate regular, aggregated dimensions for multiple inputs
+            # Only calculate aggregated dimensions for multiple inputs
            self._aggregate_output_dimensions()

    def _map_input_dimensions(self):
@@ -263,8 +263,22 @@ def _get_temporal_output_dimension(self,
                                       dimension_name: str) -> DimensionInformation:
        """ Find the units metadata attribute for the input granule with the
            earliest epoch. Apply this epoch to the temporal data in all
-            granules, to place them with respect to a common epoch. Then use
-            generate an output dimension grid.
+            granules, to place them with respect to a common epoch.
+
+            This method now only returns an aggregated array of input values.
+            Previously, it would calculate a regular grid that had evenly
+            spaced pixels, and contained all input values. Several collections
+            have monthly granules that produce grids with hourly or finer
+            resolution, and so caused significant performance issues and
+            created very sparsely populated Zarr stores.
+
+            To reimplement uniform gridding, replace the call to
+            `self._get_dimension_bounds` and the return statement with:
+
+            ```
+            return self._get_output_dimension(dimension_name, all_input_values,
+                                              output_dimension_units)
+            ```

        """
        dimension_units = [dimension_input.units
@@ -281,9 +295,12 @@ def _get_temporal_output_dimension(self,
                dtype=list(dimension_inputs.values())[0].get_values().dtype
            )
        )
-
-        return self._get_output_dimension(dimension_name, all_input_values,
-                                          output_dimension_units)
+        bounds_path, bounds_values = self._get_dimension_bounds(
+            dimension_name, all_input_values
+        )
+        return DimensionInformation(dimension_name, all_input_values,
+                                    output_dimension_units, bounds_path,
+                                    bounds_values)

    def _get_output_dimension(self, dimension_name: str,
                              input_dimension_values: np.ndarray,
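The sketch below is a simplified, standalone illustration of the behaviour `_get_temporal_output_dimension` now implements: offset each granule's time values onto the earliest epoch and return only the aggregated input values. The function name and the epoch parsing are illustrative assumptions rather than the repository's implementation, and the second granule's epoch is a hypothetical value chosen to match the gap asserted in the MERRA unit test below; the `numpy.unique` step is the behaviour noted in the new test docstring.

```python
from datetime import datetime

import numpy as np


def aggregate_temporal_values(granule_times: dict) -> tuple:
    """Simplified stand-in for the aggregation in _get_temporal_output_dimension.

    `granule_times` maps a 'minutes since <ISO time>' units string to that
    granule's time values. The earliest epoch becomes the common epoch, every
    granule's values are offset onto it, and only input values are returned.
    """
    epochs = {units: datetime.fromisoformat(units.replace('minutes since ', ''))
              for units in granule_times}
    common_units = min(epochs, key=epochs.get)
    common_epoch = epochs[common_units]

    offset_values = [
        values + (epochs[units] - common_epoch).total_seconds() / 60.0
        for units, values in granule_times.items()
    ]

    # Only values present in the inputs appear in the output dimension;
    # numpy.unique also sorts them, so unordered input granules are handled.
    return common_units, np.unique(np.concatenate(offset_values))


units, values = aggregate_temporal_values({
    'minutes since 2020-01-03T00:30:00': np.linspace(0, 23 * 60, 24),
    'minutes since 2020-01-05T00:30:00': np.linspace(0, 23 * 60, 24),
})
print(units)   # minutes since 2020-01-03T00:30:00
print(values)  # 0, 60, ..., 1380, then 2880, ..., 4260 (a 24-hour gap remains)
```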

tests/unit/test_mosaic_utilities.py (+76, -17)

@@ -243,8 +243,9 @@ def test_dimensions_mapping_output_merra(self, mock_dataset):

            * Continuous granules (e.g., all output dimension values map to an
              input value).
-            * Discontinous granules (e.g., there is intervening space in the
-              output temporal dimension).
+            * Discontinuous granules (e.g., there is intervening space in the
+              output temporal dimension). This will have gaps in the output
+              temporal dimension.

        """
        merra_time_values = np.linspace(0, 1380, 24)
@@ -358,8 +359,13 @@ def test_dimensions_mapping_output_merra(self, mock_dataset):
        # Check the output time has correct values and units.
        self.assertEqual(merra_mapping.output_dimensions['/time'].units,
                         'minutes since 2020-01-03T00:30:00')
+
+        # Expected time values are 24 consecutive hours, then a gap of 24
+        # hours, before another 24 consecutive hourly values.
+        expected_time_values = np.append(np.linspace(0, 23 * 60, 24),
+                                         np.linspace(48 * 60, 71 * 60, 24))
        assert_array_equal(merra_mapping.output_dimensions['/time'].values,
-                           np.linspace(0, 4260, 72))  # 72 values of consecutive hours
+                           expected_time_values)

        # Check none of the output dimensions have bounds information, as
        # none of the inputs did.
@@ -385,21 +391,23 @@ def test_dimensions_mapping_output_gpm(self, mock_dataset):
            * Continuous granules (e.g., all output dimension values map to an
              input value).
            * Discontinous granules (e.g., there is intervening space in the
-              output temporal dimension).
+              output temporal dimension). This test will now assume those gaps
+              are persisted as the service will no longer attempt to create a
+              regular grid.

        """
-        expected_output_time_values = np.linspace(0, 432000, 6) # Daily data
+        continuous_time_values = np.linspace(0, 432000, 6) # Daily data
        dataset_one = self.generate_netcdf_input(
            'gpm_one.nc4', self.lat_data, self.lon_data,
-            np.array([expected_output_time_values[0]]), self.temporal_units
+            np.array([continuous_time_values[0]]), self.temporal_units
        )
        dataset_two = self.generate_netcdf_input(
            'gpm_two.nc4', self.lat_data, self.lon_data,
-            np.array([expected_output_time_values[2]]), self.temporal_units
+            np.array([continuous_time_values[2]]), self.temporal_units
        )
        dataset_three = self.generate_netcdf_input(
            'gpm_three.nc4', self.lat_data, self.lon_data,
-            np.array([expected_output_time_values[5]]), self.temporal_units
+            np.array([continuous_time_values[5]]), self.temporal_units
        )

        mock_dataset.side_effect = [dataset_one, dataset_two, dataset_three]
@@ -438,10 +446,15 @@ def test_dimensions_mapping_output_gpm(self, mock_dataset):
        """

        # Check the output time has correct values and units.
+        expected_discontinuous_time_values = np.array([
+            continuous_time_values[0],
+            continuous_time_values[2],
+            continuous_time_values[5]
+        ])
        self.assertEqual(gpm_mapping.output_dimensions['/time'].units,
                         self.temporal_units)
        assert_array_equal(gpm_mapping.output_dimensions['/time'].values,
-                           expected_output_time_values)
+                           expected_discontinuous_time_values)

        # Check none of the output dimensions have bounds information, as
        # none of the inputs did.
@@ -454,6 +467,55 @@ def test_dimensions_mapping_output_gpm(self, mock_dataset):
        self.assertIsNone(gpm_mapping.output_dimensions['/time'].bounds_values)
        self.assertIsNone(gpm_mapping.output_dimensions['/time'].bounds_path)

+    @patch('harmony_netcdf_to_zarr.mosaic_utilities.Dataset')
+    def test_dimensions_mapping_unordered_granules(self, mock_dataset):
+        """ Test that the `DimensionsMapping.output_dimensions` mapping is
+            correctly instantiated from known input data. This specific test
+            targets data like GPM/IMERG, where the spatial dimensions are the
+            same in each granule, the temporal dimension epochs are the same,
+            but the temporal dimension values vary between granules.
+
+            This specific test ensures that the output temporal dimension will
+            be correctly ordered, even if the input granules are not. This is
+            achieved by the behaviour of `numpy.unique`.
+
+        """
+        expected_output_time_values = np.linspace(0, 172800, 3) # Daily data
+        dataset_one = self.generate_netcdf_input(
+            'gpm_one.nc4', self.lat_data, self.lon_data,
+            np.array([expected_output_time_values[0]]), self.temporal_units
+        )
+        dataset_two = self.generate_netcdf_input(
+            'gpm_two.nc4', self.lat_data, self.lon_data,
+            np.array([expected_output_time_values[1]]), self.temporal_units
+        )
+        dataset_three = self.generate_netcdf_input(
+            'gpm_three.nc4', self.lat_data, self.lon_data,
+            np.array([expected_output_time_values[2]]), self.temporal_units
+        )
+
+        mock_dataset.side_effect = [dataset_one, dataset_two, dataset_three]
+        gpm_mapping = DimensionsMapping(['gpm_three.nc4', 'gpm_one.nc4',
+                                         'gpm_two.nc4'])
+
+        # Check the expected dimensions are in the output mapping.
+        # Note: aggregation of non-temporal dimensions has been disabled
+        # as the Swath Projector can have values with slight rounding
+        # errors in their output grid dimensions.
+        self.assertSetEqual(set(gpm_mapping.output_dimensions.keys()),
+                            {'/time'})
+
+        # Check the output time has correct values and units.
+        self.assertEqual(gpm_mapping.output_dimensions['/time'].units,
+                         self.temporal_units)
+        assert_array_equal(gpm_mapping.output_dimensions['/time'].values,
+                           expected_output_time_values)
+
+        # Check none of the output dimensions have bounds information, as
+        # none of the inputs did.
+        self.assertIsNone(gpm_mapping.output_dimensions['/time'].bounds_values)
+        self.assertIsNone(gpm_mapping.output_dimensions['/time'].bounds_path)
+
    @patch('harmony_netcdf_to_zarr.mosaic_utilities.Dataset')
    def test_dimensions_mapping_output_spatial(self, mock_dataset):
        """ Test that the `DimensionsMapping.output_dimensions` mapping is
@@ -538,9 +600,9 @@ def test_dimensions_mapping_bounds(self, mock_dataset):
            * All output dimension values map to an input dimension value
              and therefore all output bounds values can be copied from the
              input data
-            * There are output dimension values that do not map to input
-              dimension values (due to gaps in coverage) and the corresponding
-              bounds values for those gaps must be calculated.
+            * The inputs are discontinuous, and so the outputs will also be
+              discontinuous. (Note, previously, the gaps would be filled to
+              form a regularly sampled dimension)

        """
        dimension_data_one = np.linspace(0, 2, 3)
@@ -591,7 +653,7 @@ def test_dimensions_mapping_bounds(self, mock_dataset):
            if dataset.isopen():
                dataset.close()

-        with self.subTest('Some output dimension values are in coverage gaps'):
+        with self.subTest('Discontinuous input granules'):
            dataset_one = self.generate_netcdf_with_bounds('bounds_three.nc4',
                                                           'dim',
                                                           dimension_data_one,
@@ -617,15 +679,12 @@ def test_dimensions_mapping_bounds(self, mock_dataset):
                                         [2.5, 3.5],
                                         [3.5, 4.5],
                                         [4.5, 5.5],
-                                         [5.5, 6.5],
-                                         [6.5, 7.5],
-                                         [7.5, 8.5],
                                         [8.5, 9.5],
                                         [9.5, 10.5],
                                         [10.5, 11.5]])

            assert_array_equal(mapping.output_dimensions['/dim'].values,
-                               np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]))
+                               np.array([0, 1, 2, 3, 4, 5, 9, 10, 11]))
            self.assertEqual(mapping.output_dimensions['/dim'].bounds_path,
                             '/dim_bnds')
            assert_array_equal(mapping.output_dimensions['/dim'].bounds_values,
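As a small worked check of the expectation in the final subtest above, the sketch below shows how bounds of width one line up with the retained, discontinuous dimension values. Building the bounds with `numpy.column_stack` is purely illustrative; the test itself asserts literal arrays.

```python
import numpy as np

# Discontinuous dimension values, as asserted in the bounds subtest:
values = np.array([0, 1, 2, 3, 4, 5, 9, 10, 11])

# Bounds of width one centred on each retained value; rows for the missing
# values 6, 7 and 8 are simply absent, rather than synthesised as before.
bounds = np.column_stack((values - 0.5, values + 0.5))
print(bounds[4:7])
# [[ 3.5  4.5]
#  [ 4.5  5.5]
#  [ 8.5  9.5]]
```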

version.txt (+1, -1)

@@ -1 +1 @@
-1.1.1
+1.2.0
