Skip to content

Commit 7ec3b33

Browse files
authored
DAS-2418: Add exclusions for SMAP L3 string variables (#33)
* DAS-2418: Add exclusions for SMAP L3 string variables * DAS-2418: Use h5netcdf engine for xarray reads and write * DAS-2418: Add checkbox to PR template for fix version * DAS-2418: Minor updates for PR comments * DAS-2418: Update time_utc test variables to be strings
1 parent f78c981 commit 7ec3b33

9 files changed

Lines changed: 252 additions & 10 deletions

File tree

.github/pull_request_template.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,6 @@ A short description of the changes in this PR.
1313
* [ ] Jira ticket acceptance criteria met.
1414
* [ ] `CHANGELOG.md` updated to include high level summary of PR changes.
1515
* [ ] `docker/service_version.txt` updated if publishing a release.
16+
* [ ] Fix version [harmony-metadata-annotator-X.Y.Z] added to Jira ticket if publishing a release.
1617
* [ ] Tests added/updated and passing.
1718
* [ ] Documentation updated (if needed).

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [v1.5.0] - 2025-10-13
9+
10+
### Changed
11+
12+
- Adds ability to exclude variables from the output using earthdata-varinfo configuration.
13+
- Adds configuration entries to exclude SMAP L3 string variables.
14+
- Changes xarray engine from default netcdf4 to h5netcdf.
15+
816
## [v1.4.0] - 2025-09-30
917

1018
### Changed

docker/service_version.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.4.0
1+
1.5.0

metadata_annotator/annotate.py

Lines changed: 43 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,12 @@ def annotate_granule(
4646
config_file=varinfo_config_file,
4747
)
4848

49-
if len(granule_varinfo.cf_config.metadata_overrides):
50-
# There are metadata overrides applicable to the granule's collection:
49+
if (
50+
len(granule_varinfo.cf_config.metadata_overrides)
51+
or granule_varinfo.cf_config.excluded_science_variables
52+
):
53+
# There are metadata overrides or excluded variables
54+
# applicable to the granule's collection:
5155
amend_in_file_metadata(input_file_name, output_file_name, granule_varinfo)
5256
else:
5357
# There are no updates required, so copy the input file as-is:
@@ -60,10 +64,10 @@ def amend_in_file_metadata(
6064
"""Update metadata attributes according to known rules.
6165
6266
First, identify the variables or groups needing to be updated, or variables
63-
that need to be created. Next create any missing, attribute only, variables.
64-
Update the metadata attributes of all variables listed in overrides, or
65-
removing any attributes with an overriding value of None. Lastly, update
66-
the `history` global attribute.
67+
that need to be created. Then, delete any variables that are configured to be
68+
excluded. Next create any missing, attribute only, variables. Update the metadata
69+
attributes of all variables listed in overrides, or removing any attributes with an
70+
overriding value of None. Lastly, update the `history` global attribute.
6771
6872
When opening the file as a DataTree, attempts to decode times, coordinates
6973
and other CF-Convention metadata are disabled, to allow updates to be made
@@ -74,7 +78,7 @@ def amend_in_file_metadata(
7478
items_to_update, variables_to_create = get_matching_groups_and_variables(
7579
granule_varinfo,
7680
)
77-
81+
variables_to_delete = get_variables_to_delete(granule_varinfo)
7882
with xr.open_datatree(
7983
input_file_name,
8084
decode_times=False,
@@ -83,7 +87,15 @@ def amend_in_file_metadata(
8387
concat_characters=True,
8488
use_cftime=False,
8589
mask_and_scale=False,
90+
engine='h5netcdf',
8691
) as datatree:
92+
# Delete the excluded variables from the datatree and remove them from
93+
# the set of items to update
94+
for variable in variables_to_delete:
95+
if variable in items_to_update:
96+
items_to_update.remove(variable)
97+
delete_variable(datatree, variable)
98+
8799
# Update all pre-existing variables or groups with metadata overrides including
88100
# dimension renaming where applicable.
89101
update_group_and_variable_attributes(datatree, items_to_update, granule_varinfo)
@@ -117,7 +129,7 @@ def amend_in_file_metadata(
117129
# whole `xarray.DataTree` in one operation. Making this write variables
118130
# and group separately reduces the memory usage, but makes the
119131
# operation slower. (See Harmony SMAP L2 Gridder implementation)
120-
datatree.to_netcdf(output_file_name)
132+
datatree.to_netcdf(output_file_name, engine='h5netcdf')
121133

122134

123135
def get_matching_groups_and_variables(
@@ -510,3 +522,26 @@ def get_referenced_variables(
510522
)
511523

512524
return referenced_variables
525+
526+
527+
def get_variables_to_delete(
    var_info: VarInfoFromNetCDF4,
) -> list[str]:
    """Collect the variables that the VarInfo configuration marks as excluded.

    Every variable known to `var_info` is checked with
    `is_excluded_science_variable`; those for which the check is true are
    returned, in the order they are yielded by `var_info.get_all_variables()`.

    """
    excluded_paths = []

    for variable_path in var_info.get_all_variables():
        if is_excluded_science_variable(var_info, variable_path):
            excluded_paths.append(variable_path)

    return excluded_paths
533+
534+
535+
def is_excluded_science_variable(var_info: VarInfoFromNetCDF4, var) -> bool:
    """Returns True if variable is explicitly excluded by VarInfo configuration.

    Args:
        var_info: Object exposing `cf_config.excluded_science_variables` (an
            iterable of regular expression strings) and a
            `variable_is_excluded(variable, pattern)` method.
        var: Full path of the variable to check, e.g. `/group/variable`.

    Returns:
        False when no exclusion patterns are configured, otherwise the result
        of `var_info.variable_is_excluded` for the combined pattern.

    """
    exclusions = var_info.cf_config.excluded_science_variables

    if not exclusions:
        # Joining zero patterns produces an empty regular expression, which
        # matches every string. Without this guard a collection with no
        # configured exclusions could have every variable treated as excluded.
        return False

    exclusions_pattern = re.compile('|'.join(exclusions))
    return var_info.variable_is_excluded(var, exclusions_pattern)
541+
542+
543+
def delete_variable(datatree, full_variable_path: str) -> None:
    """Delete a variable from the DataTree.

    Args:
        datatree: Tree that can be indexed by group path to obtain a node, and
            whose nodes support item deletion by variable name.
        full_variable_path: Path of the variable to remove, for example
            `/variable` or `/group/nested_group/variable`. A bare name with no
            `/` is treated as a variable in the root of the tree.

    Any lookup error raised by the tree for an unknown group or variable is
    propagated unchanged.

    """
    # `rpartition` always yields three parts, so a path without a separator
    # resolves to the root group instead of failing to unpack.
    parent_group, _, variable_name = full_variable_path.rpartition('/')

    # An empty parent means the variable lives directly in the root node.
    node = datatree[parent_group] if parent_group else datatree
    del node[variable_name]

metadata_annotator/earthdata_varinfo_config.json

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,26 @@
1616
"Mission": {
1717
"SPL[1234].+": "SMAP"
1818
},
19+
"ExcludedScienceVariables": [
20+
{
21+
"Applicability": {
22+
"Mission": "SMAP"
23+
},
24+
"VariablePattern": [
25+
"/.*time_utc.*"
26+
]
27+
},
28+
{
29+
"Applicability": {
30+
"Mission": "SMAP",
31+
"ShortNamePath": "SPL3FTA"
32+
},
33+
"VariablePattern": [
34+
"/Freeze_Thaw_Retrieval_Data/freeze_reference_date",
35+
"/Freeze_Thaw_Retrieval_Data/thaw_reference_date"
36+
]
37+
}
38+
],
1939
"MetadataOverrides": [
2040
{
2141
"Applicability": {
@@ -1915,7 +1935,7 @@
19151935
],
19161936
"_Description": "SMAP L3 data are HDF5 and without dimension settings. Overrides here define the dimensions, a useful reference name, and critically, the dimension order."
19171937
},
1918-
{
1938+
{
19191939
"Applicability": {
19201940
"Mission": "SMAP",
19211941
"ShortNamePath": "SPL2SMAP_S",

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ earthdata-varinfo ~= 3.0.2
44
harmony-service-lib ~= 2.5.0
55
netCDF4 ~= 1.6.5
66
xarray == 2025.9.0
7+
h5netcdf ~= 1.6.4

tests/conftest.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,3 +394,79 @@ def sample_varinfo_test04(
394394
return VarInfoFromNetCDF4(
395395
sample_netcdf4_file, config_file=varinfo_config_file, short_name='TEST04'
396396
)
397+
398+
399+
@fixture(scope='function')
def sample_netcdf4_file_test05(temp_dir) -> str:
    """Write a NetCDF-4 input file used to test variable exclusion.

    The root group, `/sub_group` and `/sub_group/nested_group` each hold the
    same two variables: `string_time_utc_seconds` (three strings) and
    `string_time_seconds` (a 3 x 3 array of ones). The path of the written
    file is returned.

    """

    def build_group_dataset() -> xr.Dataset:
        """Build a new dataset holding the two variables used in each group."""
        return xr.Dataset(
            data_vars={
                'string_time_utc_seconds': xr.DataArray(['time1', 'time2', 'time3']),
                'string_time_seconds': xr.DataArray(np.ones((3, 3))),
            },
        )

    input_datatree = xr.DataTree(build_group_dataset())

    for group_path in ('/sub_group', '/sub_group/nested_group'):
        input_datatree[group_path] = build_group_dataset()

    input_path = path_join(temp_dir, 'test_input_05.nc')
    input_datatree.to_netcdf(input_path, encoding=None)
    return input_path
429+
430+
431+
@fixture(scope='function')
def expected_output_netcdf4_file_test05(temp_dir) -> str:
    """Create the NetCDF-4 file expected after variable exclusion.

    The generated file omits the 'string_time_utc_seconds' variable from each
    group, leaving only 'string_time_seconds' in the root group, `/sub_group`
    and `/sub_group/nested_group`. It is the reference that the metadata
    annotator output is compared against.

    The file is written under its own name. Reusing the input fixture's
    'test_input_05.nc' would overwrite the input whenever both fixtures share
    the same temporary directory, so the annotator would be given a file that
    already lacks the excluded variables and the comparison would prove
    nothing.

    """
    file_name = path_join(temp_dir, 'test_expected_output_05.nc')

    sample_datatree = xr.DataTree(
        xr.Dataset(
            data_vars={
                'string_time_seconds': xr.DataArray(np.ones((3, 3))),
            },
        )
    )

    sample_datatree['/sub_group'] = xr.Dataset(
        data_vars={
            'string_time_seconds': xr.DataArray(np.ones((3, 3))),
        },
    )

    sample_datatree['/sub_group/nested_group'] = xr.Dataset(
        data_vars={
            'string_time_seconds': xr.DataArray(np.ones((3, 3))),
        },
    )

    sample_datatree.to_netcdf(file_name, encoding=None)
    return file_name
463+
464+
465+
@fixture(scope='function')
def sample_varinfo_test05(
    sample_netcdf4_file_test05, varinfo_config_file
) -> VarInfoFromNetCDF4:
    """Build a VarInfoFromNetCDF4 for the TEST05 sample input file.

    The instance is created from the file written by
    `sample_netcdf4_file_test05`, using the test configuration file and the
    `TEST05` short name.

    """
    test05_varinfo = VarInfoFromNetCDF4(
        sample_netcdf4_file_test05,
        config_file=varinfo_config_file,
        short_name='TEST05',
    )
    return test05_varinfo

tests/data/earthdata_varinfo_test_config.json

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,17 @@
1515
"Mission": {
1616
"TEST\\d{2}": "TEST_MISSION"
1717
},
18+
"ExcludedScienceVariables": [
19+
{
20+
"Applicability": {
21+
"Mission": "TEST_MISSION",
22+
"ShortNamePath": "TEST05"
23+
},
24+
"VariablePattern": [
25+
"/.*time_utc.*"
26+
]
27+
}
28+
],
1829
"MetadataOverrides": [
1930
{
2031
"Applicability": {

tests/unit/test_annotate.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from metadata_annotator.annotate import (
1212
annotate_granule,
1313
create_new_variable,
14+
delete_variable,
1415
get_dimension_variables,
1516
get_geotransform_config,
1617
get_grid_start_index,
@@ -19,7 +20,9 @@
1920
get_spatial_dimension_type,
2021
get_spatial_dimension_variables,
2122
get_start_index_from_row_col_variable,
23+
get_variables_to_delete,
2224
is_exact_path,
25+
is_excluded_science_variable,
2326
is_temporary_attribute,
2427
update_dimension_names,
2528
update_dimension_variable_attributes,
@@ -250,6 +253,31 @@ def test_annotate_granule_no_changes(
250253
assert results_datatree.identical(expected_datatree)
251254

252255

256+
def test_annotate_granule_variable_exclusions_only(
    sample_netcdf4_file_test05,
    expected_output_netcdf4_file_test05,
    temp_output_file_path,
    varinfo_config_file,
    mocker,
):
    """Check the output when only variable exclusions are configured.

    The TEST05 input is annotated and the resulting file must be identical to
    the expected-output fixture.

    """
    # `update_history_metadata` is replaced with a mock, presumably so the
    # output is not changed by a history entry - confirm against that function.
    mocker.patch('metadata_annotator.annotate.update_history_metadata')

    annotate_granule(
        sample_netcdf4_file_test05,
        temp_output_file_path,
        varinfo_config_file,
        'TEST05',
    )

    with xr.open_datatree(
        expected_output_netcdf4_file_test05, decode_times=False
    ) as expected_datatree:
        with xr.open_datatree(
            temp_output_file_path, decode_times=False
        ) as results_datatree:
            assert results_datatree.identical(expected_datatree)
279+
280+
253281
def test_annotate_granule_with_dimension_variable_updates(temp_output_file_path):
254282
"""Confirm that a granule has all metadata updated as expected.
255283
@@ -716,3 +744,65 @@ def test_update_dimension_variables(sample_netcdf4_file_test04, sample_varinfo_t
716744
assert np.allclose(
717745
test_datatree['sub_group'].dataset['y'], expected_y_values
718746
)
747+
748+
749+
def test_get_variables_to_delete(sample_varinfo_test05):
    """Check that exactly the `time_utc` variables are selected for deletion.

    The result is compared as a set, so the order of the returned list is not
    part of the assertion.

    """
    variables_to_delete = get_variables_to_delete(sample_varinfo_test05)

    assert set(variables_to_delete) == {
        '/string_time_utc_seconds',
        '/sub_group/string_time_utc_seconds',
        '/sub_group/nested_group/string_time_utc_seconds',
    }
759+
760+
761+
def test_is_excluded_science_variable(sample_varinfo_test05):
    """Ensure excluded science variables are determined correctly.

    Variables whose path matches the TEST05 exclusion pattern must be reported
    as excluded in every group, and the remaining variable in each of those
    groups must not be. All paths use the groups created by the TEST05 sample
    file fixture (root, `/sub_group` and `/sub_group/nested_group`).

    """
    excluded_paths = (
        '/string_time_utc_seconds',
        '/sub_group/string_time_utc_seconds',
        '/sub_group/nested_group/string_time_utc_seconds',
    )
    retained_paths = (
        '/string_time_seconds',
        '/sub_group/string_time_seconds',
        '/sub_group/nested_group/string_time_seconds',
    )

    for variable_path in excluded_paths:
        assert is_excluded_science_variable(
            sample_varinfo_test05, variable_path
        ), f'{variable_path} should be excluded'

    for variable_path in retained_paths:
        assert not is_excluded_science_variable(
            sample_varinfo_test05, variable_path
        ), f'{variable_path} should not be excluded'
781+
782+
783+
@pytest.mark.parametrize(
    'group_name',
    ['/', '/sub_group', '/sub_group/nested_group'],
)
def test_delete_variable(sample_netcdf4_file_test05, group_name):
    """Check that `delete_variable` removes one variable from the given group.

    For each group, the variable must be present before the call, absent
    afterwards, and the group's data variable count must drop by exactly one.

    """
    variable_name = 'string_time_utc_seconds'

    # The root group is '/', so it must not contribute a second separator.
    full_path = (
        f'/{variable_name}' if group_name == '/' else f'{group_name}/{variable_name}'
    )

    with xr.open_datatree(sample_netcdf4_file_test05, decode_times=False) as tree:
        # The variable must exist beforehand for the deletion to be meaningful.
        assert variable_name in tree[group_name].ds.data_vars
        count_before = len(tree[group_name].ds.data_vars)

        delete_variable(tree, full_path)

        remaining_variables = tree[group_name].ds.data_vars
        assert variable_name not in remaining_variables
        assert len(remaining_variables) == count_before - 1

0 commit comments

Comments
 (0)