Merge pull request #144 from Deltares/DEI-260-handle-NaN

wschoonveld · web-flow · commit 2a188e538b80 · 2025-09-09T11:02:07.000+02:00
DEI-260: handle NaN in CombineResultsRule and TimeAggregationRule
diff --git a/README.md b/README.md
@@ -139,7 +139,7 @@ INFO     -  [10:44:34] Serving on http://127.0.0.1:8000/
 In this case the docs are available on http://127.0.0.1:8000/ or http://localhost:8000
 
 For each release a version of documentation is available at: 
-[deltares.github.io/D-EcoImpact/](deltares.github.io/D-EcoImpact/)
+[deltares.github.io/D-EcoImpact/](https://deltares.github.io/D-EcoImpact/)
 
 ## Add acceptance tests
 
diff --git a/decoimpact/business/entities/rules/combine_results_rule.py b/decoimpact/business/entities/rules/combine_results_rule.py
@@ -34,16 +34,23 @@ def __init__(
         name: str,
         input_variable_names: List[str],
         operation_type: MultiArrayOperationType,
+        ignore_nan: bool = False,
     ):
         super().__init__(name, input_variable_names)
         self._operation_type: MultiArrayOperationType = operation_type
+        self._ignore_nan = ignore_nan
         self._operations = self._create_operations()
 
     @property
     def operation_type(self) -> MultiArrayOperationType:
         """Name of the rule"""
         return self._operation_type
 
+    @property
+    def ignore_nan(self) -> bool:
+        """Indicates if NaN values should be ignored in the calculations"""
+        return self._ignore_nan
+
     def validate(self, logger: ILogger) -> bool:
         if self._operation_type not in self._operations:
 
@@ -89,6 +96,19 @@ def execute(
         return result_variable
 
     def _create_operations(self) -> dict[MultiArrayOperationType, Callable]:
+        if self.ignore_nan:
+            return {
+                MultiArrayOperationType.MULTIPLY: lambda npa: _np.prod(npa, axis=0),
+                MultiArrayOperationType.MIN: lambda npa: _np.nanmin(npa, axis=0),
+                MultiArrayOperationType.MAX: lambda npa: _np.nanmax(npa, axis=0),
+                MultiArrayOperationType.AVERAGE: lambda npa: _np.nanmean(npa, axis=0),
+                MultiArrayOperationType.MEDIAN: lambda npa: _np.nanmedian(npa, axis=0),
+                MultiArrayOperationType.ADD: lambda npa: _np.nansum(npa, axis=0),
+                MultiArrayOperationType.SUBTRACT: lambda npa: _np.subtract(
+                    npa[0], _np.nansum(npa[1:], axis=0)
+                ),
+            }
+        # and if ignore_nan is False:
         return {
             MultiArrayOperationType.MULTIPLY: lambda npa: _np.prod(npa, axis=0),
             MultiArrayOperationType.MIN: lambda npa: _np.min(npa, axis=0),
@@ -101,7 +121,7 @@ def _create_operations(self) -> dict[MultiArrayOperationType, Callable]:
             ),
         }
 
-    def _check_dimensions(self, np_arrays: List[_np.array]) -> bool:
+    def _check_dimensions(self, np_arrays: List[_np.ndarray]) -> bool:
         """Brief check if all the arrays to be combined have the
            same size/dimension/length
         Args:
diff --git a/decoimpact/business/entities/rules/time_aggregation_rule.py b/decoimpact/business/entities/rules/time_aggregation_rule.py
@@ -67,19 +67,22 @@ def execute(self, value_array: _xr.DataArray, logger: ILogger) -> _xr.DataArray:
         """
         settings = self._settings
         if settings.operation_type is TimeOperationType.COUNT_PERIODS:
-            # Check if all values in a COUNT_PERIODS value array are either 0 or 1
-            compare_values = (value_array == 0) | (value_array == 1)
+            # Check if all values in a COUNT_PERIODS value array
+            #  are either 0 or 1 or NaN
+            compare_values = (
+                (value_array == 0) | (value_array == 1) | _np.isnan(value_array)
+            )
             check_values = _xr.where(compare_values, True, False)
             if False in check_values:
                 raise ValueError(
                     "The value array for the time aggregation rule with operation type"
-                    " COUNT_PERIODS should only contain the values 0 and 1."
+                    " COUNT_PERIODS should only contain the values 0 and 1 (or NaN)."
                 )
 
         dim_name = get_dict_element(settings.time_scale, settings.time_scale_mapping)
 
         time_dim_name = get_time_dimension_name(value_array, logger)
-        aggregated_values = value_array.resample({time_dim_name: dim_name})
+        aggregated_values = value_array.resample({time_dim_name: dim_name}, skipna=True)
 
         result = self._perform_operation(aggregated_values)
         # create a new aggregated time dimension based on original time dimension
@@ -164,7 +167,7 @@ def count_groups(self, elem):
         """
         # in case of an example array with 5 values [1,1,0,1,0]:
         # subtract last 4 values from the first 4 values: [1,0,1,0] - [1,1,0,1]:
-        # (the result of this example differences: [0,-1,1,0])
+        # (the result of this example differences: [0,-1,1,-1])
         differences = _np.diff(elem)
         # First add the first element of the array to the difference array (as this
         # could also indicate a beginning of a group or not and the diff is calculated
@@ -208,6 +211,7 @@ def analyze_groups(self, elem, axis):
         Returns:
             array: array with the analyzed periods, with the same dimensions as elem
         """
+        # Determine the number of axes in the array
         no_axis = len(_np.shape(elem))
 
         # The reduce function that calls this analyze_groups function should be reduces
@@ -224,6 +228,10 @@ def analyze_groups(self, elem, axis):
 
         #  in case of 1 dimension:
         if no_axis == 1:
+            # remove NaN values from the array (these are to be ignored)
+            elem = elem[~_np.isnan(elem)]
+            if len(elem) == 0:
+                return 0
             if self.settings.operation_type is TimeOperationType.COUNT_PERIODS:
                 group_result = self.count_groups(elem)
             elif self.settings.operation_type is TimeOperationType.MAX_DURATION_PERIODS:
diff --git a/decoimpact/business/workflow/model_builder.py b/decoimpact/business/workflow/model_builder.py
@@ -159,6 +159,7 @@ def _create_rule(rule_data: IRuleData) -> IRule:
                 rule_data.name,
                 rule_data.input_variable_names,
                 MultiArrayOperationType[rule_data.operation_type],
+                rule_data.ignore_nan
             )
         elif isinstance(rule_data, IResponseCurveRuleData):
             rule = ResponseCurveRule(
diff --git a/decoimpact/data/api/i_combine_results_rule_data.py b/decoimpact/data/api/i_combine_results_rule_data.py
@@ -30,3 +30,8 @@ def input_variable_names(self) -> List[str]:
     @abstractmethod
     def operation_type(self) -> str:
         """Property for the operation_type"""
+
+    @property
+    @abstractmethod
+    def ignore_nan(self) -> bool:
+        """Property for the ignore_nan flag"""
diff --git a/decoimpact/data/entities/combine_results_rule_data.py b/decoimpact/data/entities/combine_results_rule_data.py
@@ -21,10 +21,12 @@
 class CombineResultsRuleData(ICombineResultsRuleData, RuleData):
     """Class for storing data related to combine results rule"""
 
-    def __init__(self, name: str, input_variable_names: List[str], operation_type: str):
+    def __init__(self, name: str, input_variable_names: List[str],
+                 operation_type: str, ignore_nan: bool = False):
         super().__init__(name)
         self._input_variable_names = input_variable_names
         self._operation_type = operation_type
+        self._ignore_nan = ignore_nan
 
     @property
     def input_variable_names(self) -> List[str]:
@@ -35,3 +37,8 @@ def input_variable_names(self) -> List[str]:
     def operation_type(self) -> str:
         """Name of the input variable"""
         return self._operation_type
+
+    @property
+    def ignore_nan(self) -> bool:
+        """Property for the ignore_nan flag"""
+        return self._ignore_nan
diff --git a/decoimpact/data/parsers/parser_combine_results_rule.py b/decoimpact/data/parsers/parser_combine_results_rule.py
@@ -48,7 +48,10 @@ def parse_dict(self, dictionary: Dict[str, Any], logger: ILogger) -> IRuleData:
         if not description:
             description = ""
 
-        rule_data = CombineResultsRuleData(name, input_variable_names, operation_type)
+        ignore_nan = get_dict_element("ignore_nan", dictionary, False)
+
+        rule_data = CombineResultsRuleData(name, input_variable_names,
+                                           operation_type, ignore_nan)
         rule_data.output_variable = output_variable_name
         rule_data.description = description
 
diff --git a/docs/manual/rules/combine_results_rule.md b/docs/manual/rules/combine_results_rule.md
@@ -8,11 +8,13 @@ FORMAT
       operation: <statistic_opperation_applied>
       input_variables: [<list with_input_variable_names>]
       output_variable: <one_output_variable_name>
+      ignore_nan: <boolean>
 ```
 
-The combine results rule combines the output of two or more variables to one output variable. The way this data is combined depends on the operation chosen. This could be used for adding mutual exclusive results (e.g., habitat suitability based on flow velocity and water depth) or asses difference between results (e.g., waterlevel and bathymetry to get the water depth).The rule operates one or multiple  3D variables or 2D variables, independent of the time axes, as long as these all have the same dimensions and returns a single 3D or 2D result, either with time axis, depending on input.
+The combine results rule combines the output of two or more variables into one output variable. The way this data is combined depends on the operation chosen. This could be used for adding mutually exclusive results (e.g., habitat suitability based on flow velocity and water depth) or assessing the difference between results (e.g., water level and bathymetry to get the water depth). The rule operates on one or multiple 3D or 2D variables, independent of the time axes, as long as these all have the same dimensions, and returns a single 3D or 2D result, with or without time axis, depending on the input.
 
 Operations available: Add, Subtract, Multiply, Average, Median, Min and Max
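+The parameter ignore_nan is optional and has a default value of False. When this parameter is set to True, empty values (NaN) are ignored for all operations except Multiply, which still returns NaN when any input is NaN. For Subtract, only NaN values in the variables being subtracted are ignored; a NaN in the first input variable still results in NaN.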
 
 The rule needs to be applied to an existing 2D/3D variables with or without time axis. A new 2D/3D variable with or without time axis is created when the rule is executed.
 
@@ -26,6 +28,7 @@ The rule needs to be applied to an existing 2D/3D variables with or without time
       operation: subtract
       input_variables: ["water_level","water_depth"]
       output_variable: bathymetry_time
+      ignore_nan: True
 
 ```
 
diff --git a/docs/manual/rules/time_aggregation_rule.md b/docs/manual/rules/time_aggregation_rule.md
@@ -33,7 +33,7 @@ The rule needs to be applied to an existing 2D/3D variable with time axis. A new
 
 Period statistics: Time aggregation rule with COUNT_PERIODS, AVG_DURATION_PERIODS, MIN_DURATION_PERIODS and MAX_DURATION_PERIODS
 
-When the operation type period statistics is used, the user needs to make sure that the input data is always consisting of only 1 and 0. If there is no such layer, the user can make a combination of for example the classification rule together with the time aggregation rule. For example, water depth can be used to check whether the cells are dry or not (this can be done with a classification rule) and with the COUNT_PERIODS operation type in the time aggregation rule the number of consecutive periods within a year or month can be calculated (nr). AVG_DURATION_PERIODS, MIN_DURATION_PERIODS and MAX_DURATION_PERIODS take the respective statistic of the duration for those consecutive periods (duration).
+When the operation type period statistics is used, the user needs to make sure that the input data consists only of the values 1 and 0. If there is no such layer, the user can make a combination of for example the classification rule together with the time aggregation rule. For example, water depth can be used to check whether the cells are dry or not (this can be done with a classification rule) and with the COUNT_PERIODS operation type in the time aggregation rule the number of consecutive periods within a year or month can be calculated (nr). AVG_DURATION_PERIODS, MIN_DURATION_PERIODS and MAX_DURATION_PERIODS take the respective statistic of the duration for those consecutive periods (duration). Empty values (NaN) are also allowed and are ignored. If a series being aggregated contains only empty values, the result of the aggregation is 0.
 
 
 
diff --git a/tests/business/entities/rules/test_combine_results_rule.py b/tests/business/entities/rules/test_combine_results_rule.py
@@ -99,7 +99,7 @@ def test_execute_error_combine_results_rule_different_shapes():
 
     # Arrange & Act
     rule = CombineResultsRule(
-        "test", ["foo_data", "hello_data"], MultiArrayOperationType.MULTIPLY
+        "test", ["foo_data", "hello_data"], MultiArrayOperationType.MULTIPLY, False
     )
     value_array = {
         "foo_data": _xr.DataArray([[1, 2], [3, 4]]),
@@ -150,6 +150,76 @@ def test_all_operations_combine_results_rule(
     _xr.testing.assert_equal(obtained_result, _xr.DataArray(expected_result))
 
 
+@pytest.mark.parametrize(
+    "operation, expected_result",
+    [
+        (MultiArrayOperationType.MIN, [_np.nan, 5, 3]),
+        (MultiArrayOperationType.MAX, [_np.nan, 12, 24]),
+        (MultiArrayOperationType.MULTIPLY, [_np.nan, 420, 432]),
+        (MultiArrayOperationType.AVERAGE, [_np.nan, 8, 11]),
+        (MultiArrayOperationType.MEDIAN, [_np.nan, 7, 6]),
+        (MultiArrayOperationType.ADD, [_np.nan, 24, 33]),
+        (MultiArrayOperationType.SUBTRACT, [_np.nan, -10, -27]),
+    ],
+)
+def test_all_operations_incl_nan(
+    operation: MultiArrayOperationType, expected_result: List[float]
+):
+    """Test the outcome of each operand for the combine results rule"""
+    # Arrange
+    logger = Mock(ILogger)
+    dict_vars = {
+        "var1_name": _xr.DataArray([20, 7, 3]),
+        "var2_name": _xr.DataArray([4, 5, 6]),
+        "var3_name": _xr.DataArray([_np.nan, 12, 24]),
+    }
+
+    # Act
+    rule = CombineResultsRule(
+        "test_name",
+        ["var1_name", "var2_name", "var3_name"],
+        operation,
+    )
+    obtained_result = rule.execute(dict_vars, logger)
+
+    # Assert
+    _xr.testing.assert_equal(obtained_result, _xr.DataArray(expected_result))
+
+
+@pytest.mark.parametrize(
+    "operation, expected_result",
+    [
+        (MultiArrayOperationType.MIN, [4, 5, 3]),
+        (MultiArrayOperationType.MAX, [20, 12, 24]),
+        (MultiArrayOperationType.MULTIPLY, [_np.nan, 420, 432]),
+        (MultiArrayOperationType.AVERAGE, [12, 8, 11]),
+        (MultiArrayOperationType.MEDIAN, [12, 7, 6]),
+        (MultiArrayOperationType.ADD, [24, 24, 33]),
+        (MultiArrayOperationType.SUBTRACT, [16, -10, -27]),
+    ],
+)
+def test_all_operations_ignore_nan(
+    operation: MultiArrayOperationType, expected_result: List[float]
+):
+    """Test the outcome of each operand for the combine results rule"""
+    # Arrange
+    logger = Mock(ILogger)
+    dict_vars = {
+        "var1_name": _xr.DataArray([20, 7, 3]),
+        "var2_name": _xr.DataArray([4, 5, 6]),
+        "var3_name": _xr.DataArray([_np.nan, 12, 24]),
+    }
+
+    # Act
+    rule = CombineResultsRule(
+        "test_name", ["var1_name", "var2_name", "var3_name"], operation, ignore_nan=True
+    )
+    obtained_result = rule.execute(dict_vars, logger)
+
+    # Assert
+    _xr.testing.assert_equal(obtained_result, _xr.DataArray(expected_result))
+
+
 def test_dims_present_in_result():
     """Test that the dims metadata of the result is equal to the one of the first xarray used."""
     # Arrange
diff --git a/tests/business/entities/rules/test_time_aggregation_rule_analyze_periods.py b/tests/business/entities/rules/test_time_aggregation_rule_analyze_periods.py

Original file line number	Diff line number	Diff line change
`@@ -159,6 +159,7 @@ def _create_rule(rule_data: IRuleData) -> IRule:`
`159`	`159`	`rule_data.name,`
`160`	`160`	`rule_data.input_variable_names,`
`161`	`161`	`MultiArrayOperationType[rule_data.operation_type],`
	`162`	`+ rule_data.ignore_nan`
`162`	`163`	`)`
`163`	`164`	`elif isinstance(rule_data, IResponseCurveRuleData):`
`164`	`165`	`rule = ResponseCurveRule(`
Original file line number	Diff line number	Diff line change
`@@ -33,7 +33,7 @@ The rule needs to be applied to an existing 2D/3D variable with time axis. A new`
`33`	`33`
`34`	`34`	`Period statistics: Time aggregation rule with COUNT_PERIODS, AVG_DURATION_PERIODS, MIN_DURATION_PERIODS and MAX_DURATION_PERIODS`
`35`	`35`
`36`		-When the operation type period statistics is used, the user needs to make sure that the input data is always consisting of only 1 and 0. If there is no such layer, the user can make a combination of for example the classification rule together with the time aggregation rule. For example, water depth can be used to check whether the cells are dry or not (this can be done with a classification rule) and with the COUNT_PERIODS operation type in the time aggregation rule the number of consecutive periods within a year or month can be calculated (nr). AVG_DURATION_PERIODS, MIN_DURATION_PERIODS and MAX_DURATION_PERIODS take the respective statistic of the duration for those consecutive periods (duration).
	`36`	+When the operation type period statistics is used, the user needs to make sure that the input data consists only of the values 1 and 0. If there is no such layer, the user can make a combination of for example the classification rule together with the time aggregation rule. For example, water depth can be used to check whether the cells are dry or not (this can be done with a classification rule) and with the COUNT_PERIODS operation type in the time aggregation rule the number of consecutive periods within a year or month can be calculated (nr). AVG_DURATION_PERIODS, MIN_DURATION_PERIODS and MAX_DURATION_PERIODS take the respective statistic of the duration for those consecutive periods (duration). Empty values (NaN) are also allowed and are ignored. If a series being aggregated contains only empty values, the result of the aggregation is 0.
`37`	`37`
`38`	`38`
`39`	`39`