This repository was archived by the owner on Apr 22, 2025. It is now read-only.

Commit de47e5d

DAS-1891 - Aggregated time dimensions only include input values.
1 parent: 82d8a59

File tree: 6 files changed (+112, -28 lines changed)


CHANGELOG.md (+7)

@@ -1,3 +1,10 @@
+## v1.2.0
+### 2023-07-28
+
+* DAS-1891 - Update temporal aggregation to return output temporal dimensions
+  only containing values that map to values in input granules, rather than
+  producing a regular grid.
+
 ## v1.1.1
 ### 2023-02-17
 
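To illustrate the behaviour described in this changelog entry, below is a minimal, self-contained sketch (hypothetical time values and variable names, not code from this repository) contrasting the new input-values-only aggregation with the regular grid produced by earlier versions. `numpy.unique` is the mechanism referenced in the unit tests later in this commit.

```python
import numpy as np

# Hypothetical temporal dimension values (hours since a common epoch) from
# two input granules, with a missing granule between hours 2 and 5:
granule_times = [np.array([0.0, 1.0, 2.0]), np.array([5.0, 6.0, 7.0])]

# New behaviour: the output temporal dimension contains only input values,
# so the gap between 2 and 5 is preserved (an irregular dimension).
all_input_values = np.unique(np.concatenate(granule_times))
print(all_input_values)  # [0. 1. 2. 5. 6. 7.]

# Previous behaviour (for comparison): a regular grid spanning the full
# range at the finest input spacing, leaving fill-value slices in the gaps.
step = np.min(np.diff(all_input_values))
regular_grid = np.arange(all_input_values[0], all_input_values[-1] + step, step)
print(regular_grid)      # [0. 1. 2. 3. 4. 5. 6. 7.]
```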

docs/NetCDF-to-Zarr-Example-Usage.ipynb (+2, -2)

@@ -298,7 +298,7 @@
     "\n",
     "**Figure 2:** Left: A gridded science variable as represented in six separate NetCDF-4 input GPM/IMERG granules. These have dimensions (1, 3600, 1800). Right: The stacked variable as saved in the output Zarr store. This has dimensions (6, 3600, 1800).\n",
     "\n",
-    "The temporal aggregation will always produce a regular grid. If there are gaps in data coverage, then the temporal dimension will include a value for the missing data, but that slice will be populated with fill values.\n",
+    "The temporal aggregation will produce an output temporal dimension that only includes the temporal dimension values of the input granules. If the input time values are not all evenly spaced, potentially due to a missing granule, then the output temporal dimension will have gaps, and be irregular.\n",
     "\n",
     "### The temporal aggregation request:\n",
     "\n",
@@ -403,7 +403,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.9.7"
+  "version": "3.9.12"
  }
 },
 "nbformat": 4,

harmony_netcdf_to_zarr/convert.py (+3, -2)

@@ -354,7 +354,8 @@ class method instantiates a dataset that uses the `ProcessSynchronizer`
             shape=aggregated_shape,
             chunks=tuple(chunks),
             dtype=netcdf_variable.dtype,
-            fill_value=fill_value)
+            fill_value=fill_value
+        )

         if resolved_variable_name not in aggregated_dimensions:
             # For a non-aggregated dimension, insert input granule data
@@ -564,7 +565,7 @@ def compute_chunksize(shape: Union[tuple, list],
            the regenerated new zarr chunks
    """
    # convert compressed_chunksize_byte to integer if it's a str
-    if type(compressed_chunksize_byte) == str:
+    if isinstance(compressed_chunksize_byte, str):
        try:
            (value, unit) = findall(
                r'^\s*([\d.]+)\s*(Ki|Mi|Gi)\s*$', compressed_chunksize_byte
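For context on the `compute_chunksize` change above, here is a hedged sketch of the kind of string handling implied by the regular expression in this hunk. The helper name `parse_chunksize_bytes` and the binary multipliers are assumptions for illustration; they are not taken from this repository.

```python
from re import findall


def parse_chunksize_bytes(compressed_chunksize_byte) -> int:
    """Hypothetical helper: convert a '<number> <Ki|Mi|Gi>' string to bytes."""
    if isinstance(compressed_chunksize_byte, str):
        matches = findall(r'^\s*([\d.]+)\s*(Ki|Mi|Gi)\s*$',
                          compressed_chunksize_byte)
        if not matches:
            raise ValueError('Unrecognised chunk size string')

        value, unit = matches[0]
        # Assumed binary multipliers for the recognised unit suffixes:
        multipliers = {'Ki': 2 ** 10, 'Mi': 2 ** 20, 'Gi': 2 ** 30}
        return int(float(value) * multipliers[unit])

    return int(compressed_chunksize_byte)


print(parse_chunksize_bytes('100 Mi'))  # 104857600
```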

harmony_netcdf_to_zarr/mosaic_utilities.py (+23, -6)

@@ -162,7 +162,7 @@ def __init__(self, input_paths: List[str]):
        self._map_input_dimensions()

        if len(self.input_paths) > 1:
-            # Only calculate regular, aggregated dimensions for multiple inputs
+            # Only calculate aggregated dimensions for multiple inputs
            self._aggregate_output_dimensions()

    def _map_input_dimensions(self):
@@ -263,8 +263,22 @@ def _get_temporal_output_dimension(self,
                                       dimension_name: str) -> DimensionInformation:
        """ Find the units metadata attribute for the input granule with the
            earliest epoch. Apply this epoch to the temporal data in all
-            granules, to place them with respect to a common epoch. Then use
-            generate an output dimension grid.
+            granules, to place them with respect to a common epoch.
+
+            This method now only returns an aggregated array of input values.
+            Previously, it would calculate a regular grid that had evenly
+            spaced pixels, and contained all input values. Several collections
+            have monthly granules that produce grids with hourly or finer
+            resolution, and so caused significant performance issues and
+            created very sparsely populated Zarr stores.
+
+            To reimplement uniform gridding, replace the call to
+            `self._get_dimension_bounds` and the return statement with:
+
+            ```
+            return self._get_output_dimension(dimension_name, all_input_values,
+                                              output_dimension_units)
+            ```

        """
        dimension_units = [dimension_input.units
@@ -281,9 +295,12 @@ def _get_temporal_output_dimension(self,
                dtype=list(dimension_inputs.values())[0].get_values().dtype
            )
        )
-
-        return self._get_output_dimension(dimension_name, all_input_values,
-                                          output_dimension_units)
+        bounds_path, bounds_values = self._get_dimension_bounds(
+            dimension_name, all_input_values
+        )
+        return DimensionInformation(dimension_name, all_input_values,
+                                    output_dimension_units, bounds_path,
+                                    bounds_values)

    def _get_output_dimension(self, dimension_name: str,
                              input_dimension_values: np.ndarray,
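The sketch below is a simplified, standalone illustration of the behaviour `_get_temporal_output_dimension` now implements: offset each granule's time values onto the earliest epoch and return only the aggregated input values. The function name and the epoch parsing are illustrative assumptions rather than the repository's implementation, and the second granule's epoch is a hypothetical value chosen to match the gap asserted in the MERRA unit test below; the `numpy.unique` step is the behaviour noted in the new test docstring.

```python
from datetime import datetime

import numpy as np


def aggregate_temporal_values(granule_times: dict) -> tuple:
    """Simplified stand-in for the aggregation in _get_temporal_output_dimension.

    `granule_times` maps a 'minutes since <ISO time>' units string to that
    granule's time values. The earliest epoch becomes the common epoch, every
    granule's values are offset onto it, and only input values are returned.
    """
    epochs = {units: datetime.fromisoformat(units.replace('minutes since ', ''))
              for units in granule_times}
    common_units = min(epochs, key=epochs.get)
    common_epoch = epochs[common_units]

    offset_values = [
        values + (epochs[units] - common_epoch).total_seconds() / 60.0
        for units, values in granule_times.items()
    ]

    # Only values present in the inputs appear in the output dimension;
    # numpy.unique also sorts them, so unordered input granules are handled.
    return common_units, np.unique(np.concatenate(offset_values))


units, values = aggregate_temporal_values({
    'minutes since 2020-01-03T00:30:00': np.linspace(0, 23 * 60, 24),
    'minutes since 2020-01-05T00:30:00': np.linspace(0, 23 * 60, 24),
})
print(units)   # minutes since 2020-01-03T00:30:00
print(values)  # 0, 60, ..., 1380, then 2880, ..., 4260 (a 24-hour gap remains)
```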

tests/unit/test_mosaic_utilities.py (+76, -17)

@@ -243,8 +243,9 @@ def test_dimensions_mapping_output_merra(self, mock_dataset):

            * Continuous granules (e.g., all output dimension values map to an
              input value).
-            * Discontinous granules (e.g., there is intervening space in the
-              output temporal dimension).
+            * Discontinuous granules (e.g., there is intervening space in the
+              output temporal dimension). This will have gaps in the output
+              temporal dimension.

        """
        merra_time_values = np.linspace(0, 1380, 24)
@@ -358,8 +359,13 @@ def test_dimensions_mapping_output_merra(self, mock_dataset):
        # Check the output time has correct values and units.
        self.assertEqual(merra_mapping.output_dimensions['/time'].units,
                         'minutes since 2020-01-03T00:30:00')
+
+        # Expected time values are 24 consecutive hours, then a gap of 24
+        # hours, before another 24 consecutive hourly values.
+        expected_time_values = np.append(np.linspace(0, 23 * 60, 24),
+                                         np.linspace(48 * 60, 71 * 60, 24))
        assert_array_equal(merra_mapping.output_dimensions['/time'].values,
-                           np.linspace(0, 4260, 72))  # 72 values of consecutive hours
+                           expected_time_values)

        # Check none of the output dimensions have bounds information, as
        # none of the inputs did.
@@ -385,21 +391,23 @@ def test_dimensions_mapping_output_gpm(self, mock_dataset):
            * Continuous granules (e.g., all output dimension values map to an
              input value).
            * Discontinous granules (e.g., there is intervening space in the
-              output temporal dimension).
+              output temporal dimension). This test will now assume those gaps
+              are persisted as the service will no longer attempt to create a
+              regular grid.

        """
-        expected_output_time_values = np.linspace(0, 432000, 6) # Daily data
+        continuous_time_values = np.linspace(0, 432000, 6) # Daily data
        dataset_one = self.generate_netcdf_input(
            'gpm_one.nc4', self.lat_data, self.lon_data,
-            np.array([expected_output_time_values[0]]), self.temporal_units
+            np.array([continuous_time_values[0]]), self.temporal_units
        )
        dataset_two = self.generate_netcdf_input(
            'gpm_two.nc4', self.lat_data, self.lon_data,
-            np.array([expected_output_time_values[2]]), self.temporal_units
+            np.array([continuous_time_values[2]]), self.temporal_units
        )
        dataset_three = self.generate_netcdf_input(
            'gpm_three.nc4', self.lat_data, self.lon_data,
-            np.array([expected_output_time_values[5]]), self.temporal_units
+            np.array([continuous_time_values[5]]), self.temporal_units
        )

        mock_dataset.side_effect = [dataset_one, dataset_two, dataset_three]
@@ -438,10 +446,15 @@ def test_dimensions_mapping_output_gpm(self, mock_dataset):
        """

        # Check the output time has correct values and units.
+        expected_discontinuous_time_values = np.array([
+            continuous_time_values[0],
+            continuous_time_values[2],
+            continuous_time_values[5]
+        ])
        self.assertEqual(gpm_mapping.output_dimensions['/time'].units,
                         self.temporal_units)
        assert_array_equal(gpm_mapping.output_dimensions['/time'].values,
-                           expected_output_time_values)
+                           expected_discontinuous_time_values)

        # Check none of the output dimensions have bounds information, as
        # none of the inputs did.
@@ -454,6 +467,55 @@ def test_dimensions_mapping_output_gpm(self, mock_dataset):
        self.assertIsNone(gpm_mapping.output_dimensions['/time'].bounds_values)
        self.assertIsNone(gpm_mapping.output_dimensions['/time'].bounds_path)

+    @patch('harmony_netcdf_to_zarr.mosaic_utilities.Dataset')
+    def test_dimensions_mapping_unordered_granules(self, mock_dataset):
+        """ Test that the `DimensionsMapping.output_dimensions` mapping is
+            correctly instantiated from known input data. This specific test
+            targets data like GPM/IMERG, where the spatial dimensions are the
+            same in each granule, the temporal dimension epochs are the same,
+            but the temporal dimension values vary between granules.
+
+            This specific test ensures that the output temporal dimension will
+            be correctly ordered, even if the input granules are not. This is
+            achieved by the behaviour of `numpy.unique`.
+
+        """
+        expected_output_time_values = np.linspace(0, 172800, 3) # Daily data
+        dataset_one = self.generate_netcdf_input(
+            'gpm_one.nc4', self.lat_data, self.lon_data,
+            np.array([expected_output_time_values[0]]), self.temporal_units
+        )
+        dataset_two = self.generate_netcdf_input(
+            'gpm_two.nc4', self.lat_data, self.lon_data,
+            np.array([expected_output_time_values[1]]), self.temporal_units
+        )
+        dataset_three = self.generate_netcdf_input(
+            'gpm_three.nc4', self.lat_data, self.lon_data,
+            np.array([expected_output_time_values[2]]), self.temporal_units
+        )
+
+        mock_dataset.side_effect = [dataset_one, dataset_two, dataset_three]
+        gpm_mapping = DimensionsMapping(['gpm_three.nc4', 'gpm_one.nc4',
+                                         'gpm_two.nc4'])
+
+        # Check the expected dimensions are in the output mapping.
+        # Note: aggregation of non-temporal dimensions has been disabled
+        # as the Swath Projector can have values with slight rounding
+        # errors in their output grid dimensions.
+        self.assertSetEqual(set(gpm_mapping.output_dimensions.keys()),
+                            {'/time'})
+
+        # Check the output time has correct values and units.
+        self.assertEqual(gpm_mapping.output_dimensions['/time'].units,
+                         self.temporal_units)
+        assert_array_equal(gpm_mapping.output_dimensions['/time'].values,
+                           expected_output_time_values)
+
+        # Check none of the output dimensions have bounds information, as
+        # none of the inputs did.
+        self.assertIsNone(gpm_mapping.output_dimensions['/time'].bounds_values)
+        self.assertIsNone(gpm_mapping.output_dimensions['/time'].bounds_path)
+
    @patch('harmony_netcdf_to_zarr.mosaic_utilities.Dataset')
    def test_dimensions_mapping_output_spatial(self, mock_dataset):
        """ Test that the `DimensionsMapping.output_dimensions` mapping is
@@ -538,9 +600,9 @@ def test_dimensions_mapping_bounds(self, mock_dataset):
            * All output dimension values map to an input dimension value
              and therefore all output bounds values can be copied from the
              input data
-            * There are output dimension values that do not map to input
-              dimension values (due to gaps in coverage) and the corresponding
-              bounds values for those gaps must be calculated.
+            * The inputs are discontinuous, and so the outputs will also be
+              discontinuous. (Note, previously, the gaps would be filled to
+              form a regularly sampled dimension)

        """
        dimension_data_one = np.linspace(0, 2, 3)
@@ -591,7 +653,7 @@ def test_dimensions_mapping_bounds(self, mock_dataset):
            if dataset.isopen():
                dataset.close()

-        with self.subTest('Some output dimension values are in coverage gaps'):
+        with self.subTest('Discontinuous input granules'):
            dataset_one = self.generate_netcdf_with_bounds('bounds_three.nc4',
                                                           'dim',
                                                           dimension_data_one,
@@ -617,15 +679,12 @@ def test_dimensions_mapping_bounds(self, mock_dataset):
                                         [2.5, 3.5],
                                         [3.5, 4.5],
                                         [4.5, 5.5],
-                                         [5.5, 6.5],
-                                         [6.5, 7.5],
-                                         [7.5, 8.5],
                                         [8.5, 9.5],
                                         [9.5, 10.5],
                                         [10.5, 11.5]])

            assert_array_equal(mapping.output_dimensions['/dim'].values,
-                               np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]))
+                               np.array([0, 1, 2, 3, 4, 5, 9, 10, 11]))
            self.assertEqual(mapping.output_dimensions['/dim'].bounds_path,
                             '/dim_bnds')
            assert_array_equal(mapping.output_dimensions['/dim'].bounds_values,
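As a small worked check of the expectation in the final subtest above, the sketch below shows how bounds of width one line up with the retained, discontinuous dimension values. Building the bounds with `numpy.column_stack` is purely illustrative; the test itself asserts literal arrays.

```python
import numpy as np

# Discontinuous dimension values, as asserted in the bounds subtest:
values = np.array([0, 1, 2, 3, 4, 5, 9, 10, 11])

# Bounds of width one centred on each retained value; rows for the missing
# values 6, 7 and 8 are simply absent, rather than synthesised as before.
bounds = np.column_stack((values - 0.5, values + 0.5))
print(bounds[4:7])
# [[ 3.5  4.5]
#  [ 4.5  5.5]
#  [ 8.5  9.5]]
```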

version.txt (+1, -1)

@@ -1 +1 @@
-1.1.1
+1.2.0
