diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index ab4b8da..c7d35ee 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -13,5 +13,6 @@ A short description of the changes in this PR. * [ ] Jira ticket acceptance criteria met. * [ ] `CHANGELOG.md` updated to include high level summary of PR changes. * [ ] `docker/service_version.txt` updated if publishing a release. +* [ ] Fix version [harmony-metadata-annotator-X.Y.Z] added to Jira ticket if publishing a release. * [ ] Tests added/updated and passing. * [ ] Documentation updated (if needed). diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b76c12..c36f549 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [v1.5.0] - 2025-10-13 + +### Changed + +- Adds ability to exclude variables from the output using earthdata-varinfo configuration. +- Adds configuration entries to exclude SMAP L3 string variables. +- Changes xarray engine from default netcdf4 to h5netcdf. 
+ ## [v1.4.0] - 2025-09-30 ### Changed diff --git a/docker/service_version.txt b/docker/service_version.txt index 88c5fb8..bc80560 100644 --- a/docker/service_version.txt +++ b/docker/service_version.txt @@ -1 +1 @@ -1.4.0 +1.5.0 diff --git a/metadata_annotator/annotate.py b/metadata_annotator/annotate.py index c8d15de..38891b6 100644 --- a/metadata_annotator/annotate.py +++ b/metadata_annotator/annotate.py @@ -46,8 +46,12 @@ def annotate_granule( config_file=varinfo_config_file, ) - if len(granule_varinfo.cf_config.metadata_overrides): - # There are metadata overrides applicable to the granule's collection: + if ( + len(granule_varinfo.cf_config.metadata_overrides) + or granule_varinfo.cf_config.excluded_science_variables + ): + # There are metadata overrides or excluded variables + # applicable to the granule's collection: amend_in_file_metadata(input_file_name, output_file_name, granule_varinfo) else: # There are no updates required, so copy the input file as-is: @@ -60,10 +64,10 @@ def amend_in_file_metadata( """Update metadata attributes according to known rules. First, identify the variables or groups needing to be updated, or variables - that need to be created. Next create any missing, attribute only, variables. - Update the metadata attributes of all variables listed in overrides, or - removing any attributes with an overriding value of None. Lastly, update - the `history` global attribute. + that need to be created. Then, delete any variables that are configured to be + excluded. Next create any missing, attribute only, variables. Update the metadata + attributes of all variables listed in overrides, or removing any attributes with an + overriding value of None. Lastly, update the `history` global attribute. 
When opening the file as a DataTree, attempts to decode times, coordinates and other CF-Convention metadata are disabled, to allow updates to be made @@ -74,7 +78,7 @@ def amend_in_file_metadata( items_to_update, variables_to_create = get_matching_groups_and_variables( granule_varinfo, ) - + variables_to_delete = get_variables_to_delete(granule_varinfo) with xr.open_datatree( input_file_name, decode_times=False, @@ -83,7 +87,15 @@ def amend_in_file_metadata( concat_characters=True, use_cftime=False, mask_and_scale=False, + engine='h5netcdf', ) as datatree: + # Delete the excluded variables from the datatree and remove them from + # the set of items to update + for variable in variables_to_delete: + if variable in items_to_update: + items_to_update.remove(variable) + delete_variable(datatree, variable) + # Update all pre-existing variables or groups with metadata overrides including # dimension renaming where applicable. update_group_and_variable_attributes(datatree, items_to_update, granule_varinfo) @@ -117,7 +129,7 @@ def amend_in_file_metadata( # whole `xarray.DataTree` in one operation. Making this write variables # and group separately reduces the memory usage, but makes the # operation slower. 
(See Harmony SMAP L2 Gridder implementation) - datatree.to_netcdf(output_file_name) + datatree.to_netcdf(output_file_name, engine='h5netcdf') def get_matching_groups_and_variables( @@ -510,3 +522,26 @@ def get_referenced_variables( ) return referenced_variables + + +def get_variables_to_delete( + var_info: VarInfoFromNetCDF4, +) -> list[str]: + """Returns a list of variables to delete identified by VarInfo configuration.""" + var_list = var_info.get_all_variables() + return [var for var in var_list if is_excluded_science_variable(var_info, var)] + + +def is_excluded_science_variable(var_info: VarInfoFromNetCDF4, var: str) -> bool: + """Returns True if variable is explicitly excluded by VarInfo configuration.""" + exclusions_pattern = re.compile( + '|'.join(var_info.cf_config.excluded_science_variables) + ) + return var_info.variable_is_excluded(var, exclusions_pattern) + + +def delete_variable(datatree: xr.DataTree, full_variable_path: str) -> None: + """Delete a variable from the DataTree (a path without '/' is in the root).""" + parent_group, _, variable_name = full_variable_path.rpartition('/') + node = datatree[parent_group] if parent_group else datatree + del node[variable_name] diff --git a/metadata_annotator/earthdata_varinfo_config.json b/metadata_annotator/earthdata_varinfo_config.json index 6f815c5..fd32798 100644 --- a/metadata_annotator/earthdata_varinfo_config.json +++ b/metadata_annotator/earthdata_varinfo_config.json @@ -16,6 +16,26 @@ "Mission": { "SPL[1234].+": "SMAP" }, + "ExcludedScienceVariables": [ + { + "Applicability": { + "Mission": "SMAP" + }, + "VariablePattern": [ + "/.*time_utc.*" + ] + }, + { + "Applicability": { + "Mission": "SMAP", + "ShortNamePath": "SPL3FTA" + }, + "VariablePattern": [ + "/Freeze_Thaw_Retrieval_Data/freeze_reference_date", + "/Freeze_Thaw_Retrieval_Data/thaw_reference_date" + ] + } + ], "MetadataOverrides": [ { "Applicability": { @@ -1915,7 +1935,7 @@ ], "_Description": "SMAP L3 data are HDF5 and without dimension settings. 
Overrides here define the dimensions, a useful reference name, and critically, the dimension order." }, - { + { "Applicability": { "Mission": "SMAP", "ShortNamePath": "SPL2SMAP_S", diff --git a/requirements.txt b/requirements.txt index 12ea1ca..7f27740 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ earthdata-varinfo ~= 3.0.2 harmony-service-lib ~= 2.5.0 netCDF4 ~= 1.6.5 xarray == 2025.9.0 +h5netcdf ~= 1.6.4 diff --git a/tests/conftest.py b/tests/conftest.py index 26098e7..260ab61 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -394,3 +394,79 @@ def sample_varinfo_test04( return VarInfoFromNetCDF4( sample_netcdf4_file, config_file=varinfo_config_file, short_name='TEST04' ) + + +@fixture(scope='function') +def sample_netcdf4_file_test05(temp_dir) -> str: + """Create a sample NetCDF-4 file for testing excluding variables.""" + file_name = path_join(temp_dir, 'test_input_05.nc') + + sample_datatree = xr.DataTree( + xr.Dataset( + data_vars={ + 'string_time_utc_seconds': xr.DataArray(['time1', 'time2', 'time3']), + 'string_time_seconds': xr.DataArray(np.ones((3, 3))), + }, + ) + ) + + sample_datatree['/sub_group'] = xr.Dataset( + data_vars={ + 'string_time_utc_seconds': xr.DataArray(['time1', 'time2', 'time3']), + 'string_time_seconds': xr.DataArray(np.ones((3, 3))), + }, + ) + + sample_datatree['/sub_group/nested_group'] = xr.Dataset( + data_vars={ + 'string_time_utc_seconds': xr.DataArray(['time1', 'time2', 'time3']), + 'string_time_seconds': xr.DataArray(np.ones((3, 3))), + }, + ) + + sample_datatree.to_netcdf(file_name, encoding=None) + return file_name + + +@fixture(scope='function') +def expected_output_netcdf4_file_test05(temp_dir) -> str: + """Create the expected output NetCDF-4 file for testing excluding variables. + + The generated file omits the 'string_time_utc_seconds' variable from each group. + This ensures that the metadata annotator correctly excludes these variables + from its output during testing. 
+ """ + file_name = path_join(temp_dir, 'test_expected_output_05.nc') + + sample_datatree = xr.DataTree( + xr.Dataset( + data_vars={ + 'string_time_seconds': xr.DataArray(np.ones((3, 3))), + }, + ) + ) + + sample_datatree['/sub_group'] = xr.Dataset( + data_vars={ + 'string_time_seconds': xr.DataArray(np.ones((3, 3))), + }, + ) + + sample_datatree['/sub_group/nested_group'] = xr.Dataset( + data_vars={ + 'string_time_seconds': xr.DataArray(np.ones((3, 3))), + }, + ) + + sample_datatree.to_netcdf(file_name, encoding=None) + return file_name + + +@fixture(scope='function') +def sample_varinfo_test05( + sample_netcdf4_file_test05, varinfo_config_file +) -> VarInfoFromNetCDF4: + """Create sample VarInfoFromNetCDF4 instance.""" + return VarInfoFromNetCDF4( + sample_netcdf4_file_test05, config_file=varinfo_config_file, short_name='TEST05' + ) diff --git a/tests/data/earthdata_varinfo_test_config.json b/tests/data/earthdata_varinfo_test_config.json index 6aa1262..6dd6699 100644 --- a/tests/data/earthdata_varinfo_test_config.json +++ b/tests/data/earthdata_varinfo_test_config.json @@ -15,6 +15,17 @@ "Mission": { "TEST\\d{2}": "TEST_MISSION" }, + "ExcludedScienceVariables": [ + { + "Applicability": { + "Mission": "TEST_MISSION", + "ShortNamePath": "TEST05" + }, + "VariablePattern": [ + "/.*time_utc.*" + ] + } + ], "MetadataOverrides": [ { "Applicability": { diff --git a/tests/unit/test_annotate.py b/tests/unit/test_annotate.py index a619141..ecaceb6 100644 --- a/tests/unit/test_annotate.py +++ b/tests/unit/test_annotate.py @@ -11,6 +11,7 @@ from metadata_annotator.annotate import ( annotate_granule, create_new_variable, + delete_variable, get_dimension_variables, get_geotransform_config, get_grid_start_index, @@ -19,7 +20,9 @@ get_spatial_dimension_type, get_spatial_dimension_variables, get_start_index_from_row_col_variable, + get_variables_to_delete, is_exact_path, + is_excluded_science_variable, is_temporary_attribute, update_dimension_names, update_dimension_variable_attributes, @@ 
-250,6 +253,31 @@ def test_annotate_granule_no_changes( assert results_datatree.identical(expected_datatree) +def test_annotate_granule_variable_exclusions_only( + sample_netcdf4_file_test05, + expected_output_netcdf4_file_test05, + temp_output_file_path, + varinfo_config_file, + mocker, +): + """Confirm that a granule with only variable exclusion configuration is updated.""" + _ = mocker.patch('metadata_annotator.annotate.update_history_metadata') + annotate_granule( + sample_netcdf4_file_test05, + temp_output_file_path, + varinfo_config_file, + 'TEST05', + ) + + with ( + xr.open_datatree( + expected_output_netcdf4_file_test05, decode_times=False + ) as expected_datatree, + xr.open_datatree(temp_output_file_path, decode_times=False) as results_datatree, + ): + assert results_datatree.identical(expected_datatree) + + def test_annotate_granule_with_dimension_variable_updates(temp_output_file_path): """Confirm that a granule has all metadata updated as expected. @@ -716,3 +744,65 @@ def test_update_dimension_variables(sample_netcdf4_file_test04, sample_varinfo_t assert np.allclose( test_datatree['sub_group'].dataset['y'], expected_y_values ) + + +def test_get_variables_to_delete(sample_varinfo_test05): + """Ensure correct list of variables to delete is obtained.""" + expected_result = set( + [ + '/string_time_utc_seconds', + '/sub_group/string_time_utc_seconds', + '/sub_group/nested_group/string_time_utc_seconds', + ] + ) + assert set(get_variables_to_delete(sample_varinfo_test05)) == expected_result + + +def test_is_excluded_science_variable(sample_varinfo_test05): + """Ensure excluded science variables are determined correctly.""" + assert is_excluded_science_variable( + sample_varinfo_test05, '/string_time_utc_seconds' + ) + assert is_excluded_science_variable( + sample_varinfo_test05, '/sub_group/string_time_utc_seconds' + ) + assert is_excluded_science_variable( + sample_varinfo_test05, '/sub_group/nested_group/string_time_utc_seconds' + ) + assert not 
is_excluded_science_variable( + sample_varinfo_test05, '/string_time_seconds' + ) + assert not is_excluded_science_variable( + sample_varinfo_test05, '/sub_group/string_time_seconds' + ) + assert not is_excluded_science_variable( + sample_varinfo_test05, '/sub_group/nested_group/string_time_seconds' + ) + + +@pytest.mark.parametrize( + 'group_name', + [ + '/', + '/sub_group', + '/sub_group/nested_group', + ], +) +def test_delete_variable(sample_netcdf4_file_test05, group_name): + """Ensure correct variables are deleted from the datatree.""" + with xr.open_datatree( + sample_netcdf4_file_test05, decode_times=False + ) as test_datatree: + variable_name = 'string_time_utc_seconds' + if group_name == '/': + full_path = f'/{variable_name}' + else: + full_path = f'{group_name}/{variable_name}' + + # check that variable is in datatree before deletion + assert variable_name in test_datatree[group_name].ds.data_vars + initial_var_count = len(test_datatree[group_name].ds.data_vars) + + delete_variable(test_datatree, full_path) + assert variable_name not in test_datatree[group_name].ds.data_vars + assert len(test_datatree[group_name].ds.data_vars) == initial_var_count - 1