
Commit 28aada1

Merge branch 'master' into ablation
2 parents 9eaa367 + b917b2a

7 files changed: +160, -33 lines

captum/_utils/typing.py (+1)

```diff
@@ -24,6 +24,7 @@
 TupleOrTensorOrBoolGeneric = TypeVar(
     "TupleOrTensorOrBoolGeneric", Tuple[Tensor, ...], Tensor, bool
 )
+PassThroughOutputType = TypeVar("PassThroughOutputType")
 ModuleOrModuleList = TypeVar("ModuleOrModuleList", Module, List[Module])
 TargetType = Union[None, int, Tuple[int, ...], Tensor, List[Tuple[int, ...]], List[int]]
 BaselineTupleType = Union[None, Tuple[Union[Tensor, int, float], ...]]
```
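The new `PassThroughOutputType` is an unconstrained `TypeVar`, so a helper annotated with it is declared to return exactly the type it receives. A minimal sketch of the pattern (the `passthrough` function below is illustrative, not part of the commit):

```python
from typing import TypeVar

PassThroughOutputType = TypeVar("PassThroughOutputType")

def passthrough(value: PassThroughOutputType) -> PassThroughOutputType:
    # An unconstrained TypeVar binds to the argument's type, so a type
    # checker infers passthrough(1) -> int and passthrough("a") -> str.
    return value
```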

captum/attr/_core/feature_ablation.py (+38, -18)

```diff
@@ -3,7 +3,18 @@
 # pyre-strict
 
 import math
-from typing import Any, Callable, cast, Generator, List, Optional, Tuple, TypeVar, Union
+from typing import (
+    Any,
+    Callable,
+    cast,
+    Dict,
+    Generator,
+    List,
+    Optional,
+    Tuple,
+    TypeVar,
+    Union,
+)
 
 import torch
 from captum._utils.common import (
@@ -465,13 +476,21 @@ def _attribute_with_cross_tensor_feature_masks(
         attrib_type: dtype,
         **kwargs: Any,
     ) -> Tuple[List[Tensor], List[Tensor]]:
+        feature_idx_to_tensor_idx: Dict[int, List[int]] = {}
+        for i, mask in enumerate(formatted_feature_mask):
+            for feature_idx in torch.unique(mask):
+                if feature_idx.item() not in feature_idx_to_tensor_idx:
+                    feature_idx_to_tensor_idx[feature_idx.item()] = []
+                feature_idx_to_tensor_idx[feature_idx.item()].append(i)
+
         for (
             current_inputs,
             current_mask,
         ) in self._ablation_generator(
             formatted_inputs,
             baselines,
             formatted_feature_mask,
+            feature_idx_to_tensor_idx,
             **kwargs,
         ):
             # modified_eval has (n_feature_perturbed * n_outputs) elements
@@ -511,27 +530,28 @@ def _ablation_generator(
         inputs: Tuple[Tensor, ...],
         baselines: BaselineType,
         input_mask: Tuple[Tensor, ...],
+        feature_idx_to_tensor_idx: Dict[int, List[int]],
         **kwargs: Any,
     ) -> Generator[
         Tuple[
             Tuple[Tensor, ...],
-            Tuple[Tensor, ...],
+            Tuple[Optional[Tensor], ...],
         ],
         None,
         None,
     ]:
-        unique_feature_ids = torch.unique(
-            torch.cat([mask.flatten() for mask in input_mask])
-        ).tolist()
-
         if isinstance(baselines, torch.Tensor):
             baselines = baselines.reshape((1,) + tuple(baselines.shape))
 
         # Process one feature per time, rather than processing every input tensor
-        for feature_idx in unique_feature_ids:
+        for feature_idx in feature_idx_to_tensor_idx.keys():
             ablated_inputs, current_masks = (
                 self._construct_ablated_input_across_tensors(
-                    inputs, input_mask, baselines, feature_idx
+                    inputs,
+                    input_mask,
+                    baselines,
+                    feature_idx,
+                    feature_idx_to_tensor_idx[feature_idx],
                 )
             )
             yield ablated_inputs, current_masks
@@ -542,18 +562,17 @@ def _construct_ablated_input_across_tensors(
         input_mask: Tuple[Tensor, ...],
         baselines: BaselineType,
         feature_idx: int,
-    ) -> Tuple[Tuple[Tensor, ...], Tuple[Tensor, ...]]:
+        tensor_idxs: List[int],
+    ) -> Tuple[Tuple[Tensor, ...], Tuple[Optional[Tensor], ...]]:
 
         ablated_inputs = []
-        current_masks = []
+        current_masks: List[Optional[Tensor]] = []
         for i, input_tensor in enumerate(inputs):
-            mask = input_mask[i]
-            tensor_mask = mask == feature_idx
-            if not tensor_mask.any():
+            if i not in tensor_idxs:
                 ablated_inputs.append(input_tensor)
-                current_masks.append(torch.zeros_like(tensor_mask))
+                current_masks.append(None)
                 continue
-            tensor_mask = tensor_mask.to(input_tensor.device).long()
+            tensor_mask = (input_mask[i] == feature_idx).to(input_tensor.device).long()
             baseline = baselines[i] if isinstance(baselines, tuple) else baselines
             if isinstance(baseline, torch.Tensor):
                 baseline = baseline.reshape(
@@ -1173,7 +1192,7 @@ def _process_ablated_out(
     def _process_ablated_out_full(
         self,
         modified_eval: Tensor,
-        current_mask: Tuple[Tensor, ...],
+        current_mask: Tuple[Optional[Tensor], ...],
         flattened_initial_eval: Tensor,
         inputs: TensorOrTupleOfTensorsGeneric,
         n_outputs: int,
@@ -1195,9 +1214,10 @@ def _process_ablated_out_full(
 
         if self.use_weights:
             for weight, mask in zip(weights, current_mask):
-                weight += mask.float().sum(dim=0)
+                if mask is not None:
+                    weight += mask.float().sum(dim=0)
         for i, mask in enumerate(current_mask):
-            if inputs[i].numel() == 0:
+            if mask is None or inputs[i].numel() == 0:
                 continue
             eval_diff = eval_diff.reshape(
                 eval_diff_shape + (inputs[i].dim() - 1) * (1,)
```
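The heart of this change is the precomputed `feature_idx_to_tensor_idx` map: the masks are scanned once up front, so the per-feature loop no longer compares every tensor's mask against every feature id. A standalone sketch of the same bookkeeping (using `collections.defaultdict` for brevity, where the commit builds the dict by hand):

```python
from collections import defaultdict
from typing import Dict, List, Tuple

import torch
from torch import Tensor

def build_feature_idx_to_tensor_idx(
    feature_mask: Tuple[Tensor, ...],
) -> Dict[int, List[int]]:
    # Map each feature id to the indices of the input tensors whose mask
    # contains it, so ablation can skip tensors unrelated to a feature.
    mapping: Dict[int, List[int]] = defaultdict(list)
    for tensor_idx, mask in enumerate(feature_mask):
        for feature_idx in torch.unique(mask).tolist():
            mapping[feature_idx].append(tensor_idx)
    return dict(mapping)

# Feature 0 appears only in the first tensor, feature 1 in both, 2 in the second.
masks = (torch.tensor([[0, 1]]), torch.tensor([[1, 2]]))
print(build_feature_idx_to_tensor_idx(masks))  # {0: [0], 1: [0, 1], 2: [1]}
```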

captum/attr/_core/feature_permutation.py (+13, -8)

```diff
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 # pyre-strict
-from typing import Any, Callable, Optional, Tuple, Union
+from typing import Any, Callable, List, Optional, Tuple, Union
 
 import torch
 from captum._utils.typing import BaselineType, TargetType, TensorOrTupleOfTensorsGeneric
@@ -26,15 +26,15 @@ def _permute_feature(x: Tensor, feature_mask: Tensor) -> Tensor:
 
 
 def _permute_features_across_tensors(
-    inputs: Tuple[Tensor, ...], feature_masks: Tuple[Tensor, ...]
+    inputs: Tuple[Tensor, ...], feature_masks: Tuple[Optional[Tensor], ...]
 ) -> Tuple[Tensor, ...]:
     """
     Permutes features across multiple input tensors using the corresponding
    feature masks.
     """
     permuted_outputs = []
     for input_tensor, feature_mask in zip(inputs, feature_masks):
-        if not feature_mask.any():
+        if feature_mask is None or not feature_mask.any():
             permuted_outputs.append(input_tensor)
             continue
         n = input_tensor.size(0)
@@ -103,7 +103,7 @@ def __init__(
         forward_func: Callable[..., Union[int, float, Tensor, Future[Tensor]]],
         perm_func: Callable[[Tensor, Tensor], Tensor] = _permute_feature,
         perm_func_cross_tensor: Callable[
-            [Tuple[Tensor, ...], Tuple[Tensor, ...]], Tuple[Tensor, ...]
+            [Tuple[Tensor, ...], Tuple[Optional[Tensor], ...]], Tuple[Tensor, ...]
         ] = _permute_features_across_tensors,
     ) -> None:
         r"""
@@ -392,9 +392,14 @@ def _construct_ablated_input_across_tensors(
         input_mask: Tuple[Tensor, ...],
         baselines: BaselineType,
         feature_idx: int,
-    ) -> Tuple[Tuple[Tensor, ...], Tuple[Tensor, ...]]:
-        feature_masks = tuple(
-            (mask == feature_idx).to(inputs[0].device) for mask in input_mask
-        )
+        tensor_idxs: List[int],
+    ) -> Tuple[Tuple[Tensor, ...], Tuple[Optional[Tensor], ...]]:
+        current_masks: List[Optional[Tensor]] = []
+        for i, mask in enumerate(input_mask):
+            if i in tensor_idxs:
+                current_masks.append((mask == feature_idx).to(inputs[0].device))
+            else:
+                current_masks.append(None)
+        feature_masks = tuple(current_masks)
         permuted_outputs = self.perm_func_cross_tensor(inputs, feature_masks)
         return permuted_outputs, feature_masks
```
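Downstream of that map, tensors that do not contain the current feature now get a `None` mask rather than an all-`False` one, and the permutation helper passes them through untouched. A simplified sketch of that skip logic (standalone, with `torch.where` standing in for the library's masked permutation):

```python
from typing import Optional, Tuple

import torch
from torch import Tensor

def permute_masked_features(
    inputs: Tuple[Tensor, ...], feature_masks: Tuple[Optional[Tensor], ...]
) -> Tuple[Tensor, ...]:
    # A None mask means "this tensor does not contain the current feature",
    # so the tensor is returned unchanged instead of being permuted.
    outputs = []
    for input_tensor, mask in zip(inputs, feature_masks):
        if mask is None or not mask.any():
            outputs.append(input_tensor)
            continue
        perm = torch.randperm(input_tensor.size(0))
        # Where the mask is set, take values from a batch-permuted copy.
        outputs.append(torch.where(mask.bool(), input_tensor[perm], input_tensor))
    return tuple(outputs)
```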

captum/attr/_utils/common.py (+1, -1)

```diff
@@ -364,7 +364,7 @@ def _find_output_mode_and_verify(
                 "returns a scalar."
             )
     else:
-        agg_output_mode = False
+        agg_output_mode = perturbations_per_eval == 1
         if not allow_multi_outputs:
             assert (
                 isinstance(initial_eval, torch.Tensor) and initial_eval[0].numel() == 1
```
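A hedged reading of this one-line fix: an output that is not one scalar per example can only be interpreted as a single aggregate evaluation, and that interpretation is only safe when one perturbation is evaluated per forward call; batching several perturbations would make such an expanded output ambiguous. A toy illustration of the distinction (standalone, not the Captum API; the shapes follow the new Shapley test below):

```python
import torch

num_examples = 2          # batch size fed to the model
perturbations_per_eval = 1

# A forward output with 3 elements for a 2-example batch is not
# per-example scores, so it must be one aggregate (multi-output) eval.
initial_eval = torch.tensor([10.0, 12.0, 13.0])
per_example = initial_eval.numel() == num_examples * perturbations_per_eval

# Mirrors the fixed line `agg_output_mode = perturbations_per_eval == 1`:
agg_output_mode = (not per_example) and perturbations_per_eval == 1
print(agg_output_mode)  # True
```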

captum/testing/helpers/basic_models.py (+77, -1)

```diff
@@ -7,6 +7,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from captum._utils.typing import PassThroughOutputType
 from torch import Tensor
 from torch.futures import Future
 
@@ -417,6 +418,76 @@ def forward(self, input1, input2, input3=None):
         return self.linear2(self.relu(self.linear1(embeddings))).sum(1)
 
 
+class GradientUnsupportedLayerOutput(nn.Module):
+    """
+    This layer is used to test the case where the model returns a layer that
+    is not supported by the gradient computation.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    @no_type_check
+    def forward(
+        self, unsupported_layer_output: PassThroughOutputType
+    ) -> PassThroughOutputType:
+        return unsupported_layer_output
+
+
+class BasicModel_GradientLayerAttribution(nn.Module):
+    def __init__(
+        self,
+        inplace: bool = False,
+        unsupported_layer_output: PassThroughOutputType = None,
+    ) -> None:
+        super().__init__()
+        # Linear 0 is simply identity transform
+        self.unsupported_layer_output = unsupported_layer_output
+        self.linear0 = nn.Linear(3, 3)
+        self.linear0.weight = nn.Parameter(torch.eye(3))
+        self.linear0.bias = nn.Parameter(torch.zeros(3))
+        self.linear1 = nn.Linear(3, 4)
+        self.linear1.weight = nn.Parameter(torch.ones(4, 3))
+        self.linear1.bias = nn.Parameter(torch.tensor([-10.0, 1.0, 1.0, 1.0]))
+
+        self.linear1_alt = nn.Linear(3, 4)
+        self.linear1_alt.weight = nn.Parameter(torch.ones(4, 3))
+        self.linear1_alt.bias = nn.Parameter(torch.tensor([-10.0, 1.0, 1.0, 1.0]))
+
+        self.relu = nn.ReLU(inplace=inplace)
+        self.relu_alt = nn.ReLU(inplace=False)
+        self.unsupportedLayer = GradientUnsupportedLayerOutput()
+
+        self.linear2 = nn.Linear(4, 2)
+        self.linear2.weight = nn.Parameter(torch.ones(2, 4))
+        self.linear2.bias = nn.Parameter(torch.tensor([-1.0, 1.0]))
+
+        self.linear3 = nn.Linear(4, 2)
+        self.linear3.weight = nn.Parameter(torch.ones(2, 4))
+        self.linear3.bias = nn.Parameter(torch.tensor([-1.0, 1.0]))
+
+    @no_type_check
+    def forward(self, x: Tensor, add_input: Optional[Tensor] = None) -> Tensor:
+        input = x if add_input is None else x + add_input
+        lin0_out = self.linear0(input)
+        lin1_out = self.linear1(lin0_out)
+        lin1_out_alt = self.linear1_alt(lin0_out)
+
+        if self.unsupported_layer_output is not None:
+            self.unsupportedLayer(self.unsupported_layer_output)
+            # unsupportedLayer is unused in the forward func.
+        self.relu_alt(
+            lin1_out_alt
+        )  # relu_alt's output is supported but it's unused in the forward func.
+
+        relu_out = self.relu(lin1_out)
+        lin2_out = self.linear2(relu_out)
+
+        lin3_out = self.linear3(lin1_out_alt).to(torch.int64)
+
+        return torch.cat((lin2_out, lin3_out), dim=1)
+
+
 class MultiRelu(nn.Module):
     def __init__(self, inplace: bool = False) -> None:
         super().__init__()
@@ -429,7 +500,11 @@ def forward(self, arg1: Tensor, arg2: Tensor) -> Tuple[Tensor, Tensor]:
 
 
 class BasicModel_MultiLayer(nn.Module):
-    def __init__(self, inplace: bool = False, multi_input_module: bool = False) -> None:
+    def __init__(
+        self,
+        inplace: bool = False,
+        multi_input_module: bool = False,
+    ) -> None:
         super().__init__()
         # Linear 0 is simply identity transform
         self.multi_input_module = multi_input_module
@@ -461,6 +536,7 @@ def forward(
         input = x if add_input is None else x + add_input
         lin0_out = self.linear0(input)
         lin1_out = self.linear1(lin0_out)
+
         if self.multi_input_module:
             relu_out1, relu_out2 = self.multi_relu(lin1_out, self.linear1_alt(input))
             relu_out = relu_out1 + relu_out2
```
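A brief usage sketch of the new helper model (assuming the definitions above; the output width is 4 because `lin2_out` and `lin3_out` each contribute two columns):

```python
import torch

# Default path: the gradient-unsupported layer is never invoked.
model = BasicModel_GradientLayerAttribution()
print(model(torch.randn(2, 3)).shape)  # torch.Size([2, 4])

# Passing a non-None unsupported_layer_output routes it through
# GradientUnsupportedLayerOutput inside forward, exercising the
# gradient-unsupported code path in layer attribution tests.
model = BasicModel_GradientLayerAttribution(unsupported_layer_output=torch.zeros(1))
_ = model(torch.randn(2, 3))
```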

tests/attr/test_data_parallel.py (+1, -1)

```diff
@@ -41,7 +41,7 @@
 """
 
 # Distributed Data Parallel env setup
-os.environ["MASTER_ADDR"] = "127.0.0.1"
+os.environ["MASTER_ADDR"] = "localhost"
 os.environ["MASTER_PORT"] = "29500"
 dist.init_process_group(backend="gloo", rank=0, world_size=1)
 
```

tests/attr/test_shapley.py (+29, -4)

```diff
@@ -806,6 +806,30 @@ def func_future(*inp):
             lambda *inp: func_to_use(*inp), use_future=use_future
         )
 
+    @parameterized.expand([True, False])
+    def test_mutli_inp_shapley_batch_scalar_tensor_expanded(self, use_future) -> None:
+        def func(*inp):
+            sum_val = torch.sum(net(*inp)).item()
+            return torch.tensor([sum_val, sum_val + 2.0, sum_val + 3.0])
+
+        def func_future(*inp):
+            temp = net_fut(*inp)
+            temp.wait()
+            sum_val = torch.sum(temp.value()).item()
+            fut = Future()
+            fut.set_result(torch.tensor([sum_val, sum_val + 2.0, sum_val + 3.0]))
+            return fut
+
+        if use_future:
+            net_fut = BasicModel_MultiLayer_MultiInput_with_Future()
+            func_to_use = func_future
+        else:
+            net = BasicModel_MultiLayer_MultiInput()
+            func_to_use = func
+        self._multi_input_batch_scalar_shapley_assert(
+            lambda *inp: func_to_use(*inp), use_future=use_future, expanded_output=True
+        )
+
     @unittest.mock.patch("sys.stderr", new_callable=io.StringIO)
     def test_shapley_sampling_with_show_progress(self, mock_stderr) -> None:
         net = BasicModel_MultiLayer()
@@ -947,18 +971,19 @@ def _single_int_input_multi_sample_batch_scalar_shapley_assert(
         )
 
     def _multi_input_batch_scalar_shapley_assert(
-        self, func: Callable, use_future: bool = False
+        self, func: Callable, use_future: bool = False, expanded_output: bool = False
     ) -> None:
         inp1 = torch.tensor([[23.0, 100.0, 0.0], [20.0, 50.0, 30.0]])
         inp2 = torch.tensor([[20.0, 50.0, 30.0], [0.0, 100.0, 0.0]])
         inp3 = torch.tensor([[0.0, 100.0, 10.0], [20.0, 10.0, 13.0]])
         mask1 = torch.tensor([[1, 1, 1]])
         mask2 = torch.tensor([[0, 1, 2]])
         mask3 = torch.tensor([[0, 1, 2]])
+        out_mult = 3 if expanded_output else 1
         expected = (
-            [[3850.6666, 3850.6666, 3850.6666]],
-            [[306.6666, 3850.6666, 410.6666]],
-            [[306.6666, 3850.6666, 410.6666]],
+            [[3850.6666, 3850.6666, 3850.6666]] * out_mult,
+            [[306.6666, 3850.6666, 410.6666]] * out_mult,
+            [[306.6666, 3850.6666, 410.6666]] * out_mult,
         )
         if use_future:
             self._shapley_test_assert_future(
```
