diff --git a/captum/attr/_core/deep_lift.py b/captum/attr/_core/deep_lift.py index 6b8def255f..a2f020f9d2 100644 --- a/captum/attr/_core/deep_lift.py +++ b/captum/attr/_core/deep_lift.py @@ -145,7 +145,7 @@ def attribute( # type: ignore ) -> Union[ TensorOrTupleOfTensorsGeneric, Tuple[TensorOrTupleOfTensorsGeneric, Tensor] ]: - r"""" + r""" Args: inputs (tensor or tuple of tensors): Input for which @@ -165,23 +165,25 @@ def attribute( # type: ignore Baselines can be provided as: - a single tensor, if inputs is a single tensor, with - exactly the same dimensions as inputs or the first - dimension is one and the remaining dimensions match - with inputs. + exactly the same dimensions as inputs or the first + dimension is one and the remaining dimensions match + with inputs. - a single scalar, if inputs is a single tensor, which will - be broadcasted for each input value in input tensor. + be broadcasted for each input value in input tensor. - a tuple of tensors or scalars, the baseline corresponding - to each tensor in the inputs' tuple can be: - - either a tensor with matching dimensions to - corresponding tensor in the inputs' tuple - or the first dimension is one and the remaining - dimensions match with the corresponding - input tensor. - - or a scalar, corresponding to a tensor in the - inputs' tuple. This scalar value is broadcasted - for corresponding input tensor. + to each tensor in the inputs' tuple can be: + + - either a tensor with matching dimensions to + corresponding tensor in the inputs' tuple + or the first dimension is one and the remaining + dimensions match with the corresponding + input tensor. + + - or a scalar, corresponding to a tensor in the + inputs' tuple. This scalar value is broadcasted + for corresponding input tensor. In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. @@ -195,21 +197,21 @@ def attribute( # type: ignore For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. Default: None additional_forward_args (any, optional): If the forward function @@ -232,9 +234,11 @@ def attribute( # type: ignore computing final attribution scores. 
This function can take at least one and at most three arguments with the following signature: - - custom_attribution_func(multipliers) - - custom_attribution_func(multipliers, inputs) - - custom_attribution_func(multipliers, inputs, baselines) + + - custom_attribution_func(multipliers) + - custom_attribution_func(multipliers, inputs) + - custom_attribution_func(multipliers, inputs, baselines) + In case this function is not provided, we use the default logic defined as: multipliers * (inputs - baselines) It is assumed that all input arguments, `multipliers`, @@ -608,21 +612,21 @@ def attribute( # type: ignore corresponding references. Baselines can be provided as: - a single tensor, if inputs is a single tensor, with - the first dimension equal to the number of examples - in the baselines' distribution. The remaining dimensions - must match with input tensor's dimension starting from - the second dimension. + the first dimension equal to the number of examples + in the baselines' distribution. The remaining dimensions + must match with input tensor's dimension starting from + the second dimension. - a tuple of tensors, if inputs is a tuple of tensors, - with the first dimension of any tensor inside the tuple - equal to the number of examples in the baseline's - distribution. The remaining dimensions must match - the dimensions of the corresponding input tensor - starting from the second dimension. + with the first dimension of any tensor inside the tuple + equal to the number of examples in the baseline's + distribution. The remaining dimensions must match + the dimensions of the corresponding input tensor + starting from the second dimension. - callable function, optionally takes `inputs` as an - argument and either returns a single tensor - or a tuple of those. + argument and either returns a single tensor + or a tuple of those. It is recommended that the number of samples in the baselines' tensors is larger than one. @@ -634,21 +638,21 @@ def attribute( # type: ignore For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. Default: None additional_forward_args (any, optional): If the forward function @@ -671,9 +675,11 @@ def attribute( # type: ignore computing final attribution scores. 
This function can take at least one and at most three arguments with the following signature: - - custom_attribution_func(multipliers) - - custom_attribution_func(multipliers, inputs) - - custom_attribution_func(multipliers, inputs, baselines) + + - custom_attribution_func(multipliers) + - custom_attribution_func(multipliers, inputs) + - custom_attribution_func(multipliers, inputs, baselines) + In case this function is not provided we use the default logic defined as: multipliers * (inputs - baselines) It is assumed that all input arguments, `multipliers`, diff --git a/captum/attr/_core/feature_ablation.py b/captum/attr/_core/feature_ablation.py index e886034542..b69d8f957b 100644 --- a/captum/attr/_core/feature_ablation.py +++ b/captum/attr/_core/feature_ablation.py @@ -60,132 +60,139 @@ def attribute( perturbations_per_eval: int = 1, **kwargs: Any ) -> TensorOrTupleOfTensorsGeneric: - r"""" + r""" Args: - inputs (tensor or tuple of tensors): Input for which ablation - attributions are computed. If forward_func takes a single - tensor as input, a single input tensor should be provided. - If forward_func takes multiple tensors as input, a tuple - of the input tensors should be provided. It is assumed - that for all given input tensors, dimension 0 corresponds - to the number of examples (aka batch size), and if - multiple input tensors are provided, the examples must - be aligned appropriately. - baselines (scalar, tensor, tuple of scalars or tensors, optional): - Baselines define reference value which replaces each - feature when ablated. - Baselines can be provided as: - - a single tensor, if inputs is a single tensor, with - exactly the same dimensions as inputs or - broadcastable to match the dimensions of inputs - - a single scalar, if inputs is a single tensor, which will - be broadcasted for each input value in input tensor. - - a tuple of tensors or scalars, the baseline corresponding - to each tensor in the inputs' tuple can be: - - either a tensor with - exactly the same dimensions as inputs or - broadcastable to match the dimensions of inputs - - or a scalar, corresponding to a tensor in the - inputs' tuple. This scalar value is broadcasted - for corresponding input tensor. - In the cases when `baselines` is not provided, we internally - use zero scalar corresponding to each input tensor. - Default: None - target (int, tuple, tensor or list, optional): Output indices for - which difference is computed (for classification cases, - this is usually the target class). - If the network returns a scalar value per example, - no target index is necessary. - For general 2D outputs, targets can be either: - - - a single integer or a tensor containing a single - integer, which is applied to all input examples - - - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. - - For outputs with > 2 dimensions, targets can be either: - - - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. - - - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. - - Default: None - additional_forward_args (any, optional): If the forward function - requires additional arguments other than the inputs for - which attributions should not be computed, this argument - can be provided. 
It must be either a single additional - argument of a Tensor or arbitrary (non-tuple) type or a - tuple containing multiple additional arguments including - tensors or any arbitrary python types. These arguments - are provided to forward_func in order following the - arguments in inputs. - For a tensor, the first dimension of the tensor must - correspond to the number of examples. For all other types, - the given argument is used for all forward evaluations. - Note that attributions are not computed with respect - to these arguments. - Default: None - feature_mask (tensor or tuple of tensors, optional): - feature_mask defines a mask for the input, grouping - features which should be ablated together. feature_mask - should contain the same number of tensors as inputs. - Each tensor should - be the same size as the corresponding input or - broadcastable to match the input tensor. Each tensor - should contain integers in the range 0 to num_features - - 1, and indices corresponding to the same feature should - have the same value. - Note that features within each input tensor are ablated - independently (not across tensors). - If the forward function returns a single scalar per batch, - we enforce that the first dimension of each mask must be 1, - since attributions are returned batch-wise rather than per - example, so the attributions must correspond to the - same features (indices) in each input example. - If None, then a feature mask is constructed which assigns - each scalar within a tensor as a separate feature, which - is ablated independently. - Default: None - perturbations_per_eval (int, optional): Allows ablation of multiple - features to be processed simultaneously in one call to - forward_fn. - Each forward pass will contain a maximum of - perturbations_per_eval * #examples samples. - For DataParallel models, each batch is split among the - available devices, so evaluations on each available - device contain at most - (perturbations_per_eval * #examples) / num_devices - samples. - If the forward function returns a single scalar per batch, - perturbations_per_eval must be set to 1. - Default: 1 - **kwargs (Any, optional): Any additional arguments used by child - classes of FeatureAblation (such as Occlusion) to construct - ablations. These arguments are ignored when using - FeatureAblation directly. - Default: None + inputs (tensor or tuple of tensors): Input for which ablation + attributions are computed. If forward_func takes a single + tensor as input, a single input tensor should be provided. + If forward_func takes multiple tensors as input, a tuple + of the input tensors should be provided. It is assumed + that for all given input tensors, dimension 0 corresponds + to the number of examples (aka batch size), and if + multiple input tensors are provided, the examples must + be aligned appropriately. + baselines (scalar, tensor, tuple of scalars or tensors, optional): + Baselines define reference value which replaces each + feature when ablated. + Baselines can be provided as: + + - a single tensor, if inputs is a single tensor, with + exactly the same dimensions as inputs or + broadcastable to match the dimensions of inputs + + - a single scalar, if inputs is a single tensor, which will + be broadcasted for each input value in input tensor. 
+ + - a tuple of tensors or scalars, the baseline corresponding + to each tensor in the inputs' tuple can be: + + - either a tensor with matching dimensions to + corresponding tensor in the inputs' tuple + or the first dimension is one and the remaining + dimensions match with the corresponding + input tensor. + + - or a scalar, corresponding to a tensor in the + inputs' tuple. This scalar value is broadcasted + for corresponding input tensor. + In the cases when `baselines` is not provided, we internally + use zero scalar corresponding to each input tensor. + Default: None + target (int, tuple, tensor or list, optional): Output indices for + which gradients are computed (for classification cases, + this is usually the target class). + If the network returns a scalar value per example, + no target index is necessary. + For general 2D outputs, targets can be either: + + - a single integer or a tensor containing a single + integer, which is applied to all input examples + + - a list of integers or a 1D tensor, with length matching + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. + + For outputs with > 2 dimensions, targets can be either: + + - A single tuple, which contains #output_dims - 1 + elements. This target index is applied to all examples. + + - A list of tuples with length equal to the number of + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. + + Default: None + additional_forward_args (any, optional): If the forward function + requires additional arguments other than the inputs for + which attributions should not be computed, this argument + can be provided. It must be either a single additional + argument of a Tensor or arbitrary (non-tuple) type or a + tuple containing multiple additional arguments including + tensors or any arbitrary python types. These arguments + are provided to forward_func in order following the + arguments in inputs. + For a tensor, the first dimension of the tensor must + correspond to the number of examples. For all other types, + the given argument is used for all forward evaluations. + Note that attributions are not computed with respect + to these arguments. + Default: None + feature_mask (tensor or tuple of tensors, optional): + feature_mask defines a mask for the input, grouping + features which should be ablated together. feature_mask + should contain the same number of tensors as inputs. + Each tensor should + be the same size as the corresponding input or + broadcastable to match the input tensor. Each tensor + should contain integers in the range 0 to num_features + - 1, and indices corresponding to the same feature should + have the same value. + Note that features within each input tensor are ablated + independently (not across tensors). + If the forward function returns a single scalar per batch, + we enforce that the first dimension of each mask must be 1, + since attributions are returned batch-wise rather than per + example, so the attributions must correspond to the + same features (indices) in each input example. + If None, then a feature mask is constructed which assigns + each scalar within a tensor as a separate feature, which + is ablated independently. + Default: None + perturbations_per_eval (int, optional): Allows ablation of multiple + features to be processed simultaneously in one call to + forward_fn. 
+ Each forward pass will contain a maximum of + perturbations_per_eval * #examples samples. + For DataParallel models, each batch is split among the + available devices, so evaluations on each available + device contain at most + (perturbations_per_eval * #examples) / num_devices + samples. + If the forward function returns a single scalar per batch, + perturbations_per_eval must be set to 1. + Default: 1 + **kwargs (Any, optional): Any additional arguments used by child + classes of FeatureAblation (such as Occlusion) to construct + ablations. These arguments are ignored when using + FeatureAblation directly. + Default: None Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): - The attributions with respect to each input feature. - If the forward function returns - a scalar value per example, attributions will be - the same size as the provided inputs, with each value - providing the attribution of the corresponding input index. - If the forward function returns a scalar per batch, then - attribution tensor(s) will have first dimension 1 and - the remaining dimensions will match the input. - If a single tensor is provided as inputs, a single tensor is - returned. If a tuple is provided for inputs, a tuple of - corresponding sized tensors is returned. + *tensor* or tuple of *tensors* of **attributions**: + - **attributions** (*tensor* or tuple of *tensors*): + The attributions with respect to each input feature. + If the forward function returns + a scalar value per example, attributions will be + the same size as the provided inputs, with each value + providing the attribution of the corresponding input index. + If the forward function returns a scalar per batch, then + attribution tensor(s) will have first dimension 1 and + the remaining dimensions will match the input. + If a single tensor is provided as inputs, a single tensor is + returned. If a tuple of tensors is provided for inputs, a + tuple of corresponding sized tensors is returned. Examples:: diff --git a/captum/attr/_core/feature_permutation.py b/captum/attr/_core/feature_permutation.py index 923a9ea9ba..c72b77a166 100644 --- a/captum/attr/_core/feature_permutation.py +++ b/captum/attr/_core/feature_permutation.py @@ -24,27 +24,49 @@ def _permute_feature(x: Tensor, feature_mask: Tensor) -> Tensor: class FeaturePermutation(FeatureAblation): r""" - This attribution method essentially implements the permutation feature - importance algorithm, as described here: - https://christophm.github.io/interpretable-ml-book/feature-importance.html + A perturbation based approach to compute attribution, which + takes each input feature, permutes the feature values within a batch, + and computes the difference between original and shuffled outputs for + the given batch. This difference signifies the feature importance + for the permuted feature. 
- A basic tl;dr of the algorithm is: + Example pseudocode for the algorithm is as follows:: - perm_feature_importance(batch): - importance = dict() - baseline_error = error_metric(model(batch), batch_labels) - for each feature: - permute this feature across the batch - error = error_metric(model(permuted_batch), batch_labels) - importance[feature] = baseline_error - error - "un-permute" the feature across the batch + perm_feature_importance(batch): + importance = dict() + baseline_error = error_metric(model(batch), batch_labels) + for each feature: + permute this feature across the batch + error = error_metric(model(permuted_batch), batch_labels) + importance[feature] = baseline_error - error + "un-permute" the feature across the batch - return importance + return importance It should be noted that the `error_metric` must be called in the - `forward_func`. You do not need to provide an error metric, e.g. you + `forward_func`. You do not need to have an error metric, e.g. you could simply return the logits (the model output), but this may or may not provide a meaningful attribution. + + This method, unlike other attribution methods, requires a batch + of examples to compute attributions and cannot be performed on a single example. + + By default, each scalar value within + each input tensor is taken as a feature and shuffled independently. Passing + a feature mask, allows grouping features to be shuffled together. + Each input scalar in the group will be given the same attribution value + equal to the change in target as a result of shuffling the entire feature + group. + + The forward function can either return a scalar per example, or a single + scalar for the full batch. If a single scalar is returned for the batch, + `perturbations_per_eval` must be 1, and the returned attributions will have + first dimension 1, corresponding to feature importance across all + examples in the batch. + + More information can be found in the permutation feature + importance algorithm description here: + https://christophm.github.io/interpretable-ml-book/feature-importance.html """ def __init__(self, forward_func: Callable, perm_func: Callable = _permute_feature): @@ -53,11 +75,11 @@ def __init__(self, forward_func: Callable, perm_func: Callable = _permute_featur forward_func (callable): The forward function of the model or any modification of it - perm_func (callable): A function that accepts a batch of inputs and - a feature mask, and "permutes" the feature across the batch. - NOTE: one obviously does not have to perform a permutation. - See `_permute_feature` as an example on how to implement - your own permutation function. + perm_func (callable, optional): A function that accepts a batch of + inputs and a feature mask, and "permutes" the feature using + feature mask across the batch. This defaults to a function + which applies a random permutation, this argument only needs + to be provided if a custom permutation behavior is desired. Default: `_permute_feature` """ FeatureAblation.__init__(self, forward_func=forward_func) @@ -100,21 +122,21 @@ def attribute( # type: ignore For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). 
Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. Default: None additional_forward_args (any, optional): If the forward function @@ -169,6 +191,58 @@ def attribute( # type: ignore ablations. These arguments are ignored when using FeatureAblation directly. Default: None + + Returns: + *tensor* or tuple of *tensors* of **attributions**: + - **attributions** (*tensor* or tuple of *tensors*): + The attributions with respect to each input feature. + If the forward function returns + a scalar value per example, attributions will be + the same size as the provided inputs, with each value + providing the attribution of the corresponding input index. + If the forward function returns a scalar per batch, then + attribution tensor(s) will have first dimension 1 and + the remaining dimensions will match the input. + If a single tensor is provided as inputs, a single tensor is + returned. If a tuple of tensors is provided for inputs, + a tuple of corresponding sized tensors is returned. + + + Examples:: + + >>> # SimpleClassifier takes a single input tensor of size Nx4x4, + >>> # and returns an Nx3 tensor of class probabilities. + >>> net = SimpleClassifier() + >>> # Generating random input with size 10 x 4 x 4 + >>> input = torch.randn(10, 4, 4) + >>> # Defining FeaturePermutation interpreter + >>> feature_perm = FeaturePermutation(net) + >>> # Computes permutation attribution, shuffling each of the 16 + >>> # scalar input independently. + >>> attr = feature_perm.attribute(input, target=1) + + >>> # Alternatively, we may want to permute features in groups, e.g. + >>> # grouping each 2x2 square of the inputs and shuffling them together. + >>> # This can be done by creating a feature mask as follows, which + >>> # defines the feature groups, e.g.: + >>> # +---+---+---+---+ + >>> # | 0 | 0 | 1 | 1 | + >>> # +---+---+---+---+ + >>> # | 0 | 0 | 1 | 1 | + >>> # +---+---+---+---+ + >>> # | 2 | 2 | 3 | 3 | + >>> # +---+---+---+---+ + >>> # | 2 | 2 | 3 | 3 | + >>> # +---+---+---+---+ + >>> # With this mask, all inputs with the same value are shuffled + >>> # simultaneously, and the attribution for each input in the same + >>> # group (0, 1, 2, and 3) per example are the same. + >>> # The attributions can be calculated as follows: + >>> # feature mask has dimensions 1 x 4 x 4 + >>> feature_mask = torch.tensor([[[0,0,1,1],[0,0,1,1], + >>> [2,2,3,3],[2,2,3,3]]]) + >>> attr = feature_perm.attribute(input, target=1, + >>> feature_mask=feature_mask) """ return FeatureAblation.attribute( self, diff --git a/captum/attr/_core/gradient_shap.py b/captum/attr/_core/gradient_shap.py index 99f1b32e40..6afc3eccec 100644 --- a/captum/attr/_core/gradient_shap.py +++ b/captum/attr/_core/gradient_shap.py @@ -128,21 +128,21 @@ def attribute( is computed and can be provided as: - a single tensor, if inputs is a single tensor, with - the first dimension equal to the number of examples - in the baselines' distribution. 
The remaining dimensions - must match with input tensor's dimension starting from - the second dimension. + the first dimension equal to the number of examples + in the baselines' distribution. The remaining dimensions + must match with input tensor's dimension starting from + the second dimension. - a tuple of tensors, if inputs is a tuple of tensors, - with the first dimension of any tensor inside the tuple - equal to the number of examples in the baseline's - distribution. The remaining dimensions must match - the dimensions of the corresponding input tensor - starting from the second dimension. + with the first dimension of any tensor inside the tuple + equal to the number of examples in the baseline's + distribution. The remaining dimensions must match + the dimensions of the corresponding input tensor + starting from the second dimension. - callable function, optionally takes `inputs` as an - argument and either returns a single tensor - or a tuple of those. + argument and either returns a single tensor + or a tuple of those. It is recommended that the number of samples in the baselines' tensors is larger than one. @@ -167,21 +167,21 @@ def attribute( For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. Default: None additional_forward_args (any, optional): If the forward function diff --git a/captum/attr/_core/guided_backprop_deconvnet.py b/captum/attr/_core/guided_backprop_deconvnet.py index 623b5829f3..4f3b9c0578 100644 --- a/captum/attr/_core/guided_backprop_deconvnet.py +++ b/captum/attr/_core/guided_backprop_deconvnet.py @@ -36,7 +36,7 @@ def attribute( target: TargetType = None, additional_forward_args: Any = None, ) -> TensorOrTupleOfTensorsGeneric: - r"""" + r""" Computes attribution by overriding relu gradients. Based on constructor flag use_relu_grad_output, performs either GuidedBackpropagation if False and Deconvolution if True. This class is the parent class of both these @@ -125,7 +125,7 @@ def attribute( target: TargetType = None, additional_forward_args: Any = None, ) -> TensorOrTupleOfTensorsGeneric: - r"""" + r""" Args: inputs (tensor or tuple of tensors): Input for which @@ -145,21 +145,21 @@ def attribute( For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). 
Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. Default: None additional_forward_args (any, optional): If the forward function @@ -231,7 +231,7 @@ def attribute( target: TargetType = None, additional_forward_args: Any = None, ) -> TensorOrTupleOfTensorsGeneric: - r"""" + r""" Args: inputs (tensor or tuple of tensors): Input for which @@ -251,21 +251,21 @@ def attribute( For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. Default: None additional_forward_args (any, optional): If the forward function diff --git a/captum/attr/_core/guided_grad_cam.py b/captum/attr/_core/guided_grad_cam.py index d7e105969a..f2f2845dfd 100644 --- a/captum/attr/_core/guided_grad_cam.py +++ b/captum/attr/_core/guided_grad_cam.py @@ -49,7 +49,7 @@ def __init__( self, model: Module, layer: Module, device_ids: Union[None, List[int]] = None ) -> None: r""" - Args + Args: model (nn.Module): The reference to PyTorch model instance. layer (torch.nn.Module): Layer for which GradCAM attributions are computed. @@ -92,21 +92,21 @@ def attribute( For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. 
- A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. Default: None additional_forward_args (any, optional): If the forward function diff --git a/captum/attr/_core/input_x_gradient.py b/captum/attr/_core/input_x_gradient.py index 6246f84162..7e6ba9c348 100644 --- a/captum/attr/_core/input_x_gradient.py +++ b/captum/attr/_core/input_x_gradient.py @@ -29,7 +29,7 @@ def attribute( target: TargetType = None, additional_forward_args: Any = None, ) -> TensorOrTupleOfTensorsGeneric: - r"""" + r""" Args: inputs (tensor or tuple of tensors): Input for which @@ -49,21 +49,21 @@ def attribute( For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. Default: None additional_forward_args (any, optional): If the forward function diff --git a/captum/attr/_core/integrated_gradients.py b/captum/attr/_core/integrated_gradients.py index 5a26183adb..02a8031bbb 100644 --- a/captum/attr/_core/integrated_gradients.py +++ b/captum/attr/_core/integrated_gradients.py @@ -122,24 +122,25 @@ def attribute( # type: ignore is computed and can be provided as: - a single tensor, if inputs is a single tensor, with - exactly the same dimensions as inputs or the first - dimension is one and the remaining dimensions match - with inputs. + exactly the same dimensions as inputs or the first + dimension is one and the remaining dimensions match + with inputs. - a single scalar, if inputs is a single tensor, which will - be broadcasted for each input value in input tensor. + be broadcasted for each input value in input tensor. - a tuple of tensors or scalars, the baseline corresponding - to each tensor in the inputs' tuple can be: - - either a tensor with matching dimensions to - corresponding tensor in the inputs' tuple - or the first dimension is one and the remaining - dimensions match with the corresponding - input tensor. - - or a scalar, corresponding to a tensor in the - inputs' tuple. This scalar value is broadcasted - for corresponding input tensor. + to each tensor in the inputs' tuple can be: + - either a tensor with matching dimensions to + corresponding tensor in the inputs' tuple + or the first dimension is one and the remaining + dimensions match with the corresponding + input tensor. 
+ + - or a scalar, corresponding to a tensor in the + inputs' tuple. This scalar value is broadcasted + for corresponding input tensor. In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. @@ -152,21 +153,21 @@ def attribute( # type: ignore For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. Default: None additional_forward_args (any, optional): If the forward function @@ -204,9 +205,9 @@ def attribute( # type: ignore Default: None return_convergence_delta (bool, optional): Indicates whether to return convergence delta or not. If `return_convergence_delta` - is set to True convergence delta will be returned in - a tuple following attributions. - Default: False + is set to True convergence delta will be returned in + a tuple following attributions. + Default: False Returns: **attributions** or 2-element tuple of **attributions**, **delta**: - **attributions** (*tensor* or tuple of *tensors*): diff --git a/captum/attr/_core/layer/grad_cam.py b/captum/attr/_core/layer/grad_cam.py index f62473020e..d9b0b73abd 100644 --- a/captum/attr/_core/layer/grad_cam.py +++ b/captum/attr/_core/layer/grad_cam.py @@ -61,7 +61,7 @@ def __init__( device_ids: Union[None, List[int]] = None, ) -> None: r""" - Args + Args: forward_func (callable): The forward function of the model or any modification of it @@ -105,21 +105,21 @@ def attribute( For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. 
Default: None additional_forward_args (any, optional): If the forward function diff --git a/captum/attr/_core/layer/internal_influence.py b/captum/attr/_core/layer/internal_influence.py index d294551e4c..db141a0d86 100644 --- a/captum/attr/_core/layer/internal_influence.py +++ b/captum/attr/_core/layer/internal_influence.py @@ -88,23 +88,25 @@ def attribute( is computed and can be provided as: - a single tensor, if inputs is a single tensor, with - exactly the same dimensions as inputs or the first - dimension is one and the remaining dimensions match - with inputs. + exactly the same dimensions as inputs or the first + dimension is one and the remaining dimensions match + with inputs. - a single scalar, if inputs is a single tensor, which will - be broadcasted for each input value in input tensor. + be broadcasted for each input value in input tensor. - a tuple of tensors or scalars, the baseline corresponding - to each tensor in the inputs' tuple can be: - - either a tensor with matching dimensions to - corresponding tensor in the inputs' tuple - or the first dimension is one and the remaining - dimensions match with the corresponding - input tensor. - - or a scalar, corresponding to a tensor in the - inputs' tuple. This scalar value is broadcasted - for corresponding input tensor. + to each tensor in the inputs' tuple can be: + + - either a tensor with matching dimensions to + corresponding tensor in the inputs' tuple + or the first dimension is one and the remaining + dimensions match with the corresponding + input tensor. + + - or a scalar, corresponding to a tensor in the + inputs' tuple. This scalar value is broadcasted + for corresponding input tensor. In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. @@ -118,21 +120,21 @@ def attribute( For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. Default: None additional_forward_args (any, optional): If the forward function diff --git a/captum/attr/_core/layer/layer_conductance.py b/captum/attr/_core/layer/layer_conductance.py index ad3284877e..3f350a491f 100644 --- a/captum/attr/_core/layer/layer_conductance.py +++ b/captum/attr/_core/layer/layer_conductance.py @@ -130,24 +130,25 @@ def attribute( is computed and can be provided as: - a single tensor, if inputs is a single tensor, with - exactly the same dimensions as inputs or the first - dimension is one and the remaining dimensions match - with inputs. 
+ exactly the same dimensions as inputs or the first + dimension is one and the remaining dimensions match + with inputs. - a single scalar, if inputs is a single tensor, which will - be broadcasted for each input value in input tensor. + be broadcasted for each input value in input tensor. - a tuple of tensors or scalars, the baseline corresponding - to each tensor in the inputs' tuple can be: - - either a tensor with matching dimensions to - corresponding tensor in the inputs' tuple - or the first dimension is one and the remaining - dimensions match with the corresponding - input tensor. - - or a scalar, corresponding to a tensor in the - inputs' tuple. This scalar value is broadcasted - for corresponding input tensor. + to each tensor in the inputs' tuple can be: + - either a tensor with matching dimensions to + corresponding tensor in the inputs' tuple + or the first dimension is one and the remaining + dimensions match with the corresponding + input tensor. + + - or a scalar, corresponding to a tensor in the + inputs' tuple. This scalar value is broadcasted + for corresponding input tensor. In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. @@ -160,21 +161,21 @@ def attribute( For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. Default: None additional_forward_args (any, optional): If the forward function diff --git a/captum/attr/_core/layer/layer_deep_lift.py b/captum/attr/_core/layer/layer_deep_lift.py index 2267800319..8e85d243ae 100644 --- a/captum/attr/_core/layer/layer_deep_lift.py +++ b/captum/attr/_core/layer/layer_deep_lift.py @@ -118,7 +118,7 @@ def attribute( ) -> Union[ Tensor, Tuple[Tensor, ...], Tuple[Union[Tensor, Tuple[Tensor, ...]], Tensor], ]: - r"""" + r""" Args: inputs (tensor or tuple of tensors): Input for which layer @@ -138,24 +138,25 @@ def attribute( Baselines can be provided as: - a single tensor, if inputs is a single tensor, with - exactly the same dimensions as inputs or the first - dimension is one and the remaining dimensions match - with inputs. + exactly the same dimensions as inputs or the first + dimension is one and the remaining dimensions match + with inputs. - a single scalar, if inputs is a single tensor, which will - be broadcasted for each input value in input tensor. + be broadcasted for each input value in input tensor. 
- a tuple of tensors or scalars, the baseline corresponding - to each tensor in the inputs' tuple can be: - - either a tensor with matching dimensions to - corresponding tensor in the inputs' tuple - or the first dimension is one and the remaining - dimensions match with the corresponding - input tensor. - - or a scalar, corresponding to a tensor in the - inputs' tuple. This scalar value is broadcasted - for corresponding input tensor. + to each tensor in the inputs' tuple can be: + + - either a tensor with matching dimensions to + corresponding tensor in the inputs' tuple + or the first dimension is one and the remaining + dimensions match with the corresponding + input tensor. + - or a scalar, corresponding to a tensor in the + inputs' tuple. This scalar value is broadcasted + for corresponding input tensor. In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. @@ -168,21 +169,22 @@ def attribute( For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. + Default: None additional_forward_args (any, optional): If the forward function requires additional arguments other than the inputs for @@ -215,9 +217,11 @@ def attribute( computing final attribution scores. This function can take at least one and at most three arguments with the following signature: - - custom_attribution_func(multipliers) - - custom_attribution_func(multipliers, inputs) - - custom_attribution_func(multipliers, inputs, baselines) + + - custom_attribution_func(multipliers) + - custom_attribution_func(multipliers, inputs) + - custom_attribution_func(multipliers, inputs, baselines) + In case this function is not provided, we use the default logic defined as: multipliers * (inputs - baselines) It is assumed that all input arguments, `multipliers`, @@ -436,21 +440,21 @@ def attribute( corresponding references. Baselines can be provided as: - a single tensor, if inputs is a single tensor, with - the first dimension equal to the number of examples - in the baselines' distribution. The remaining dimensions - must match with input tensor's dimension starting from - the second dimension. + the first dimension equal to the number of examples + in the baselines' distribution. The remaining dimensions + must match with input tensor's dimension starting from + the second dimension. - a tuple of tensors, if inputs is a tuple of tensors, - with the first dimension of any tensor inside the tuple - equal to the number of examples in the baseline's - distribution. 
The remaining dimensions must match - the dimensions of the corresponding input tensor - starting from the second dimension. + with the first dimension of any tensor inside the tuple + equal to the number of examples in the baseline's + distribution. The remaining dimensions must match + the dimensions of the corresponding input tensor + starting from the second dimension. - callable function, optionally takes `inputs` as an - argument and either returns a single tensor - or a tuple of those. + argument and either returns a single tensor + or a tuple of those. It is recommended that the number of samples in the baselines' tensors is larger than one. @@ -462,21 +466,21 @@ def attribute( For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. Default: None additional_forward_args (any, optional): If the forward function @@ -509,9 +513,11 @@ def attribute( computing final attribution scores. This function can take at least one and at most three arguments with the following signature: - - custom_attribution_func(multipliers) - - custom_attribution_func(multipliers, inputs) - - custom_attribution_func(multipliers, inputs, baselines) + + - custom_attribution_func(multipliers) + - custom_attribution_func(multipliers, inputs) + - custom_attribution_func(multipliers, inputs, baselines) + In case this function is not provided, we use the default logic defined as: multipliers * (inputs - baselines) It is assumed that all input arguments, `multipliers`, diff --git a/captum/attr/_core/layer/layer_feature_ablation.py b/captum/attr/_core/layer/layer_feature_ablation.py index 7c5ad388a7..cd962964db 100644 --- a/captum/attr/_core/layer/layer_feature_ablation.py +++ b/captum/attr/_core/layer/layer_feature_ablation.py @@ -94,28 +94,28 @@ def attribute( use zero as the baseline for each neuron. Default: None target (int, tuple, tensor or list, optional): Output indices for - which difference is computed (for classification cases, + which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, no target index is necessary. For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). 
Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. Default: None additional_forward_args (any, optional): If the forward function diff --git a/captum/attr/_core/layer/layer_gradient_shap.py b/captum/attr/_core/layer/layer_gradient_shap.py index 1a84c4a5f5..1fc229c721 100644 --- a/captum/attr/_core/layer/layer_gradient_shap.py +++ b/captum/attr/_core/layer/layer_gradient_shap.py @@ -142,21 +142,21 @@ def attribute( is computed and can be provided as: - a single tensor, if inputs is a single tensor, with - the first dimension equal to the number of examples - in the baselines' distribution. The remaining dimensions - must match with input tensor's dimension starting from - the second dimension. + the first dimension equal to the number of examples + in the baselines' distribution. The remaining dimensions + must match with input tensor's dimension starting from + the second dimension. - a tuple of tensors, if inputs is a tuple of tensors, - with the first dimension of any tensor inside the tuple - equal to the number of examples in the baseline's - distribution. The remaining dimensions must match - the dimensions of the corresponding input tensor - starting from the second dimension. + with the first dimension of any tensor inside the tuple + equal to the number of examples in the baseline's + distribution. The remaining dimensions must match + the dimensions of the corresponding input tensor + starting from the second dimension. - callable function, optionally takes `inputs` as an - argument and either returns a single tensor - or a tuple of those. + argument and either returns a single tensor + or a tuple of those. It is recommended that the number of samples in the baselines' tensors is larger than one. @@ -181,21 +181,21 @@ def attribute( For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. 
Default: None additional_forward_args (any, optional): If the forward function diff --git a/captum/attr/_core/layer/layer_gradient_x_activation.py b/captum/attr/_core/layer/layer_gradient_x_activation.py index ef51808a8a..1037b110ae 100644 --- a/captum/attr/_core/layer/layer_gradient_x_activation.py +++ b/captum/attr/_core/layer/layer_gradient_x_activation.py @@ -76,21 +76,21 @@ def attribute( For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. Default: None additional_forward_args (any, optional): If the forward function diff --git a/captum/attr/_core/layer/layer_integrated_gradients.py b/captum/attr/_core/layer/layer_integrated_gradients.py index 56e0327b08..cced8a9542 100644 --- a/captum/attr/_core/layer/layer_integrated_gradients.py +++ b/captum/attr/_core/layer/layer_integrated_gradients.py @@ -140,23 +140,23 @@ def attribute( is computed and can be provided as: - a single tensor, if inputs is a single tensor, with - exactly the same dimensions as inputs or the first - dimension is one and the remaining dimensions match - with inputs. + exactly the same dimensions as inputs or the first + dimension is one and the remaining dimensions match + with inputs. - a single scalar, if inputs is a single tensor, which will - be broadcasted for each input value in input tensor. + be broadcasted for each input value in input tensor. - a tuple of tensors or scalars, the baseline corresponding - to each tensor in the inputs' tuple can be: + to each tensor in the inputs' tuple can be: - either a tensor with matching dimensions to - corresponding tensor in the inputs' tuple - or the first dimension is one and the remaining - dimensions match with the corresponding - input tensor. + corresponding tensor in the inputs' tuple + or the first dimension is one and the remaining + dimensions match with the corresponding + input tensor. - or a scalar, corresponding to a tensor in the - inputs' tuple. This scalar value is broadcasted - for corresponding input tensor. + inputs' tuple. This scalar value is broadcasted + for corresponding input tensor. In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. @@ -170,21 +170,21 @@ def attribute( For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). 
Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. Default: None additional_forward_args (any, optional): If the forward function diff --git a/captum/attr/_core/neuron/neuron_conductance.py b/captum/attr/_core/neuron/neuron_conductance.py index 5cecbbfe49..1fae4c0789 100644 --- a/captum/attr/_core/neuron/neuron_conductance.py +++ b/captum/attr/_core/neuron/neuron_conductance.py @@ -101,23 +101,25 @@ def attribute( is computed and can be provided as: - a single tensor, if inputs is a single tensor, with - exactly the same dimensions as inputs or the first - dimension is one and the remaining dimensions match - with inputs. + exactly the same dimensions as inputs or the first + dimension is one and the remaining dimensions match + with inputs. - a single scalar, if inputs is a single tensor, which will - be broadcasted for each input value in input tensor. + be broadcasted for each input value in input tensor. - a tuple of tensors or scalars, the baseline corresponding - to each tensor in the inputs' tuple can be: - - either a tensor with matching dimensions to - corresponding tensor in the inputs' tuple - or the first dimension is one and the remaining - dimensions match with the corresponding - input tensor. - - or a scalar, corresponding to a tensor in the - inputs' tuple. This scalar value is broadcasted - for corresponding input tensor. + to each tensor in the inputs' tuple can be: + + - either a tensor with matching dimensions to + corresponding tensor in the inputs' tuple + or the first dimension is one and the remaining + dimensions match with the corresponding + input tensor. + + - or a scalar, corresponding to a tensor in the + inputs' tuple. This scalar value is broadcasted + for corresponding input tensor. In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. @@ -131,21 +133,21 @@ def attribute( For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. 
+ examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. Default: None additional_forward_args (any, optional): If the forward function diff --git a/captum/attr/_core/neuron/neuron_deep_lift.py b/captum/attr/_core/neuron/neuron_deep_lift.py index 0d7bd1b0ca..52e47e6148 100644 --- a/captum/attr/_core/neuron/neuron_deep_lift.py +++ b/captum/attr/_core/neuron/neuron_deep_lift.py @@ -64,7 +64,7 @@ def attribute( attribute_to_neuron_input: bool = False, custom_attribution_func: Union[None, Callable[..., Tuple[Tensor, ...]]] = None, ) -> TensorOrTupleOfTensorsGeneric: - r"""" + r""" Args: inputs (tensor or tuple of tensors): Input for which layer @@ -91,23 +91,25 @@ def attribute( Baselines can be provided as: - a single tensor, if inputs is a single tensor, with - exactly the same dimensions as inputs or the first - dimension is one and the remaining dimensions match - with inputs. + exactly the same dimensions as inputs or the first + dimension is one and the remaining dimensions match + with inputs. - a single scalar, if inputs is a single tensor, which will - be broadcasted for each input value in input tensor. + be broadcasted for each input value in input tensor. - a tuple of tensors or scalars, the baseline corresponding - to each tensor in the inputs' tuple can be: - - either a tensor with matching dimensions to - corresponding tensor in the inputs' tuple - or the first dimension is one and the remaining - dimensions match with the corresponding - input tensor. - - or a scalar, corresponding to a tensor in the - inputs' tuple. This scalar value is broadcasted - for corresponding input tensor. + to each tensor in the inputs' tuple can be: + + - either a tensor with matching dimensions to + corresponding tensor in the inputs' tuple + or the first dimension is one and the remaining + dimensions match with the corresponding + input tensor. + + - or a scalar, corresponding to a tensor in the + inputs' tuple. This scalar value is broadcasted + for corresponding input tensor. In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. @@ -139,9 +141,11 @@ def attribute( computing final attribution scores. This function can take at least one and at most three arguments with the following signature: - - custom_attribution_func(multipliers) - - custom_attribution_func(multipliers, inputs) - - custom_attribution_func(multipliers, inputs, baselines) + + - custom_attribution_func(multipliers) + - custom_attribution_func(multipliers, inputs) + - custom_attribution_func(multipliers, inputs, baselines) + In case this function is not provided, we use the default logic defined as: multipliers * (inputs - baselines) It is assumed that all input arguments, `multipliers`, @@ -264,21 +268,21 @@ def attribute( corresponding references. Baselines can be provided as: - a single tensor, if inputs is a single tensor, with - the first dimension equal to the number of examples - in the baselines' distribution. The remaining dimensions - must match with input tensor's dimension starting from - the second dimension. + the first dimension equal to the number of examples + in the baselines' distribution. The remaining dimensions + must match with input tensor's dimension starting from + the second dimension. 
- a tuple of tensors, if inputs is a tuple of tensors, - with the first dimension of any tensor inside the tuple - equal to the number of examples in the baseline's - distribution. The remaining dimensions must match - the dimensions of the corresponding input tensor - starting from the second dimension. + with the first dimension of any tensor inside the tuple + equal to the number of examples in the baseline's + distribution. The remaining dimensions must match + the dimensions of the corresponding input tensor + starting from the second dimension. - callable function, optionally takes `inputs` as an - argument and either returns a single tensor - or a tuple of those. + argument and either returns a single tensor + or a tuple of those. It is recommended that the number of samples in the baselines' tensors is larger than one. @@ -308,9 +312,11 @@ def attribute( computing final attribution scores. This function can take at least one and at most three arguments with the following signature: - - custom_attribution_func(multipliers) - - custom_attribution_func(multipliers, inputs) - - custom_attribution_func(multipliers, inputs, baselines) + + - custom_attribution_func(multipliers) + - custom_attribution_func(multipliers, inputs) + - custom_attribution_func(multipliers, inputs, baselines) + In case this function is not provided, we use the default logic defined as: multipliers * (inputs - baselines) It is assumed that all input arguments, `multipliers`, diff --git a/captum/attr/_core/neuron/neuron_feature_ablation.py b/captum/attr/_core/neuron/neuron_feature_ablation.py index 481e5b7ade..02d4a78a05 100644 --- a/captum/attr/_core/neuron/neuron_feature_ablation.py +++ b/captum/attr/_core/neuron/neuron_feature_ablation.py @@ -85,19 +85,26 @@ def attribute( Baselines define reference value which replaces each feature when ablated. Baselines can be provided as: + - a single tensor, if inputs is a single tensor, with - exactly the same dimensions as inputs or - broadcastable to match the dimensions of inputs + exactly the same dimensions as inputs or + broadcastable to match the dimensions of inputs + - a single scalar, if inputs is a single tensor, which will - be broadcasted for each input value in input tensor. + be broadcasted for each input value in input tensor. + - a tuple of tensors or scalars, the baseline corresponding - to each tensor in the inputs' tuple can be: - - either a tensor with - exactly the same dimensions as inputs or - broadcastable to match the dimensions of inputs - - or a scalar, corresponding to a tensor in the - inputs' tuple. This scalar value is broadcasted - for corresponding input tensor. + to each tensor in the inputs' tuple can be: + + - either a tensor with matching dimensions to + corresponding tensor in the inputs' tuple + or the first dimension is one and the remaining + dimensions match with the corresponding + input tensor. + + - or a scalar, corresponding to a tensor in the + inputs' tuple. This scalar value is broadcasted + for corresponding input tensor. In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. 
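A minimal, hedged sketch of a callable with the three-argument `custom_attribution_func` signature listed in the neuron_deep_lift.py hunk above; it simply reproduces the documented default logic, multipliers * (inputs - baselines). The function name is illustrative only:

    >>> def my_attribution_func(multipliers, inputs, baselines):
    >>>     # all three arguments arrive as tuples of tensors of equal length
    >>>     return tuple(
    >>>         mult * (inp - base)
    >>>         for mult, inp, base in zip(multipliers, inputs, baselines)
    >>>     )
    >>> # then passed to a DeepLift-based method via
    >>> # attribute(..., custom_attribution_func=my_attribution_func)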
Default: None diff --git a/captum/attr/_core/neuron/neuron_gradient_shap.py b/captum/attr/_core/neuron/neuron_gradient_shap.py index fdeef0a620..3e029da79d 100644 --- a/captum/attr/_core/neuron/neuron_gradient_shap.py +++ b/captum/attr/_core/neuron/neuron_gradient_shap.py @@ -109,21 +109,21 @@ def attribute( is computed and can be provided as: - a single tensor, if inputs is a single tensor, with - the first dimension equal to the number of examples - in the baselines' distribution. The remaining dimensions - must match with input tensor's dimension starting from - the second dimension. + the first dimension equal to the number of examples + in the baselines' distribution. The remaining dimensions + must match with input tensor's dimension starting from + the second dimension. - a tuple of tensors, if inputs is a tuple of tensors, - with the first dimension of any tensor inside the tuple - equal to the number of examples in the baseline's - distribution. The remaining dimensions must match - the dimensions of the corresponding input tensor - starting from the second dimension. + with the first dimension of any tensor inside the tuple + equal to the number of examples in the baseline's + distribution. The remaining dimensions must match + the dimensions of the corresponding input tensor + starting from the second dimension. - callable function, optionally takes `inputs` as an - argument and either returns a single tensor - or a tuple of those. + argument and either returns a single tensor + or a tuple of those. It is recommended that the number of samples in the baselines' tensors is larger than one. diff --git a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py index 6c71c2ea0e..f10eca6d29 100644 --- a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py +++ b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py @@ -13,7 +13,7 @@ class NeuronDeconvolution(NeuronAttribution, GradientAttribution): r""" Computes attribution of the given neuron using deconvolution. Deconvolution computes the gradient of the target output with - respect to the input, but gradients of ReLU functions are overriden so + respect to the input, but gradients of ReLU functions are overridden so that the gradient of the ReLU input is simply computed taking ReLU of the output gradient, essentially only propagating non-negative gradients (without dependence on the sign of the ReLU input). @@ -25,7 +25,7 @@ class NeuronDeconvolution(NeuronAttribution, GradientAttribution): Warning: Ensure that all ReLU operations in the forward function of the given model are performed using a module (nn.module.ReLU). - If nn.functional.ReLU is used, gradients are not overriden appropriately. + If nn.functional.ReLU is used, gradients are not overridden appropriately. """ def __init__( @@ -61,7 +61,7 @@ def attribute( additional_forward_args: Any = None, attribute_to_neuron_input: bool = False, ) -> TensorOrTupleOfTensorsGeneric: - r"""" + r""" Args: inputs (tensor or tuple of tensors): Input for which @@ -142,7 +142,7 @@ class NeuronGuidedBackprop(NeuronAttribution, GradientAttribution): r""" Computes attribution of the given neuron using guided backpropagation. Guided backpropagation computes the gradient of the target neuron - with respect to the input, but gradients of ReLU functions are overriden + with respect to the input, but gradients of ReLU functions are overridden so that only non-negative gradients are backpropagated. 
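To make the ReLU warning above concrete, here is a hedged sketch of a model whose ReLUs are nn.Module instances (so their gradients can be overridden) used with NeuronDeconvolution; the toy network and its layer names are illustrative, not part of this diff:

    >>> import torch
    >>> import torch.nn as nn
    >>> from captum.attr import NeuronDeconvolution
    >>> class ToyNet(nn.Module):
    >>>     def __init__(self):
    >>>         super().__init__()
    >>>         self.lin1 = nn.Linear(10, 20)
    >>>         self.relu = nn.ReLU()  # module form, not nn.functional.relu
    >>>         self.lin2 = nn.Linear(20, 5)
    >>>     def forward(self, x):
    >>>         return self.lin2(self.relu(self.lin1(x)))
    >>> net = ToyNet()
    >>> deconv = NeuronDeconvolution(net, net.lin2)
    >>> # second positional argument selects neuron 3 of net.lin2
    >>> attributions = deconv.attribute(torch.randn(2, 10), 3)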
More details regarding the guided backpropagation algorithm can be found @@ -151,7 +151,7 @@ class NeuronGuidedBackprop(NeuronAttribution, GradientAttribution): Warning: Ensure that all ReLU operations in the forward function of the given model are performed using a module (nn.module.ReLU). - If nn.functional.ReLU is used, gradients are not overriden appropriately. + If nn.functional.ReLU is used, gradients are not overridden appropriately. """ def __init__( @@ -184,7 +184,7 @@ def attribute( additional_forward_args: Any = None, attribute_to_neuron_input: bool = False, ) -> TensorOrTupleOfTensorsGeneric: - r"""" + r""" Args: inputs (tensor or tuple of tensors): Input for which diff --git a/captum/attr/_core/neuron/neuron_integrated_gradients.py b/captum/attr/_core/neuron/neuron_integrated_gradients.py index 5989ee7a14..c4ee7f38d6 100644 --- a/captum/attr/_core/neuron/neuron_integrated_gradients.py +++ b/captum/attr/_core/neuron/neuron_integrated_gradients.py @@ -87,23 +87,25 @@ def attribute( Baselines can be provided as: - a single tensor, if inputs is a single tensor, with - exactly the same dimensions as inputs or the first - dimension is one and the remaining dimensions match - with inputs. + exactly the same dimensions as inputs or the first + dimension is one and the remaining dimensions match + with inputs. - a single scalar, if inputs is a single tensor, which will - be broadcasted for each input value in input tensor. + be broadcasted for each input value in input tensor. - a tuple of tensors or scalars, the baseline corresponding - to each tensor in the inputs' tuple can be: - - either a tensor with matching dimensions to - corresponding tensor in the inputs' tuple - or the first dimension is one and the remaining - dimensions match with the corresponding - input tensor. - - or a scalar, corresponding to a tensor in the - inputs' tuple. This scalar value is broadcasted - for corresponding input tensor. + to each tensor in the inputs' tuple can be: + + - either a tensor with matching dimensions to + corresponding tensor in the inputs' tuple + or the first dimension is one and the remaining + dimensions match with the corresponding + input tensor. + + - or a scalar, corresponding to a tensor in the + inputs' tuple. This scalar value is broadcasted + for corresponding input tensor. In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. diff --git a/captum/attr/_core/noise_tunnel.py b/captum/attr/_core/noise_tunnel.py index 777049f16a..298f3ea0d1 100644 --- a/captum/attr/_core/noise_tunnel.py +++ b/captum/attr/_core/noise_tunnel.py @@ -58,9 +58,10 @@ class NoiseTunnel(Attribution): def __init__(self, attribution_method: Attribution) -> None: r""" - attribution_method (Attribution): An instance of any attribution algorithm - of type `Attribution`. E.g. Integrated Gradients, - Conductance or Saliency. + Args: + attribution_method (Attribution): An instance of any attribution algorithm + of type `Attribution`. E.g. Integrated Gradients, + Conductance or Saliency. 
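For the NoiseTunnel constructor documented above, a hedged usage sketch (`ImageClassifier` is a hypothetical model, and the keyword names are assumed to match the version of the API this diff targets):

    >>> import torch
    >>> from captum.attr import IntegratedGradients, NoiseTunnel
    >>> net = ImageClassifier()  # hypothetical model
    >>> ig = IntegratedGradients(net)
    >>> nt = NoiseTunnel(ig)  # any Attribution instance can be wrapped
    >>> inputs = torch.randn(4, 3, 32, 32)
    >>> # smoothgrad averages attributions over n_samples noisy copies of inputs
    >>> attributions = nt.attribute(
    >>>     inputs, nt_type='smoothgrad', n_samples=10, stdevs=0.2, target=3
    >>> )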
""" self.attribution_method = attribution_method self.is_delta_supported = self.attribution_method.has_convergence_delta() diff --git a/captum/attr/_core/occlusion.py b/captum/attr/_core/occlusion.py index 3b8eb7bc6e..0544e57151 100644 --- a/captum/attr/_core/occlusion.py +++ b/captum/attr/_core/occlusion.py @@ -57,7 +57,7 @@ def attribute( # type: ignore additional_forward_args: Any = None, perturbations_per_eval: int = 1, ) -> TensorOrTupleOfTensorsGeneric: - r"""" + r""" Args: inputs (tensor or tuple of tensors): Input for which occlusion @@ -102,19 +102,26 @@ def attribute( # type: ignore Baselines define reference value which replaces each feature when occluded. Baselines can be provided as: + - a single tensor, if inputs is a single tensor, with - exactly the same dimensions as inputs or - broadcastable to match the dimensions of inputs + exactly the same dimensions as inputs or + broadcastable to match the dimensions of inputs + - a single scalar, if inputs is a single tensor, which will - be broadcasted for each input value in input tensor. + be broadcasted for each input value in input tensor. + - a tuple of tensors or scalars, the baseline corresponding - to each tensor in the inputs' tuple can be: - - either a tensor with - exactly the same dimensions as inputs or - broadcastable to match the dimensions of inputs - - or a scalar, corresponding to a tensor in the - inputs' tuple. This scalar value is broadcasted - for corresponding input tensor. + to each tensor in the inputs' tuple can be: + + - either a tensor with matching dimensions to + corresponding tensor in the inputs' tuple + or the first dimension is one and the remaining + dimensions match with the corresponding + input tensor. + + - or a scalar, corresponding to a tensor in the + inputs' tuple. This scalar value is broadcasted + for corresponding input tensor. In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None @@ -126,21 +133,21 @@ def attribute( # type: ignore For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. 
Default: None additional_forward_args (any, optional): If the forward function diff --git a/captum/attr/_core/saliency.py b/captum/attr/_core/saliency.py index b679358b1a..d6d48565bf 100644 --- a/captum/attr/_core/saliency.py +++ b/captum/attr/_core/saliency.py @@ -38,70 +38,70 @@ def attribute( abs: bool = True, additional_forward_args: Any = None, ) -> TensorOrTupleOfTensorsGeneric: - r"""" + r""" Args: - inputs (tensor or tuple of tensors): Input for which integrated - gradients are computed. If forward_func takes a single - tensor as input, a single input tensor should be provided. - If forward_func takes multiple tensors as input, a tuple - of the input tensors should be provided. It is assumed - that for all given input tensors, dimension 0 corresponds - to the number of examples (aka batch size), and if - multiple input tensors are provided, the examples must - be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for - which gradients are computed (for classification cases, - this is usually the target class). - If the network returns a scalar value per example, - no target index is necessary. - For general 2D outputs, targets can be either: - - - a single integer or a tensor containing a single - integer, which is applied to all input examples - - - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. - - For outputs with > 2 dimensions, targets can be either: - - - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. - - - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. - - Default: None - abs (bool, optional): Returns absolute value of gradients if set - to True, otherwise returns the (signed) gradients if - False. - Defalut: True - additional_forward_args (any, optional): If the forward function - requires additional arguments other than the inputs for - which attributions should not be computed, this argument - can be provided. It must be either a single additional - argument of a Tensor or arbitrary (non-tuple) type or a - tuple containing multiple additional arguments including - tensors or any arbitrary python types. These arguments - are provided to forward_func in order following the - arguments in inputs. - Note that attributions are not computed with respect - to these arguments. - Default: None + inputs (tensor or tuple of tensors): Input for which integrated + gradients are computed. If forward_func takes a single + tensor as input, a single input tensor should be provided. + If forward_func takes multiple tensors as input, a tuple + of the input tensors should be provided. It is assumed + that for all given input tensors, dimension 0 corresponds + to the number of examples (aka batch size), and if + multiple input tensors are provided, the examples must + be aligned appropriately. + target (int, tuple, tensor or list, optional): Output indices for + which gradients are computed (for classification cases, + this is usually the target class). + If the network returns a scalar value per example, + no target index is necessary. 
+ For general 2D outputs, targets can be either: + + - a single integer or a tensor containing a single + integer, which is applied to all input examples + + - a list of integers or a 1D tensor, with length matching + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. + + For outputs with > 2 dimensions, targets can be either: + + - A single tuple, which contains #output_dims - 1 + elements. This target index is applied to all examples. + + - A list of tuples with length equal to the number of + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. + + Default: None + abs (bool, optional): Returns absolute value of gradients if set + to True, otherwise returns the (signed) gradients if + False. + Default: True + additional_forward_args (any, optional): If the forward function + requires additional arguments other than the inputs for + which attributions should not be computed, this argument + can be provided. It must be either a single additional + argument of a Tensor or arbitrary (non-tuple) type or a + tuple containing multiple additional arguments including + tensors or any arbitrary python types. These arguments + are provided to forward_func in order following the + arguments in inputs. + Note that attributions are not computed with respect + to these arguments. + Default: None Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): - The gradients with respect to each input feature. - Attributions will always be - the same size as the provided inputs, with each value - providing the attribution of the corresponding input index. - If a single tensor is provided as inputs, a single tensor is - returned. If a tuple is provided for inputs, a tuple of - corresponding sized tensors is returned. + *tensor* or tuple of *tensors* of **attributions**: + - **attributions** (*tensor* or tuple of *tensors*): + The gradients with respect to each input feature. + Attributions will always be + the same size as the provided inputs, with each value + providing the attribution of the corresponding input index. + If a single tensor is provided as inputs, a single tensor is + returned. If a tuple is provided for inputs, a tuple of + corresponding sized tensors is returned. Examples:: diff --git a/captum/attr/_core/shapley_value.py b/captum/attr/_core/shapley_value.py index 630ebc56d3..6225bb0226 100644 --- a/captum/attr/_core/shapley_value.py +++ b/captum/attr/_core/shapley_value.py @@ -103,22 +103,27 @@ def attribute( Baselines define reference value which replaces each feature when ablated. Baselines can be provided as: + - a single tensor, if inputs is a single tensor, with - exactly the same dimensions as inputs or the first - dimension is one and the remaining dimensions match - with inputs. + exactly the same dimensions as inputs or the first + dimension is one and the remaining dimensions match + with inputs. + - a single scalar, if inputs is a single tensor, which will - be broadcasted for each input value in input tensor. + be broadcasted for each input value in input tensor. 
+ - a tuple of tensors or scalars, the baseline corresponding - to each tensor in the inputs' tuple can be: - - either a tensor with matching dimensions to - corresponding tensor in the inputs' tuple - or the first dimension is one and the remaining - dimensions match with the corresponding - input tensor. - - or a scalar, corresponding to a tensor in the - inputs' tuple. This scalar value is broadcasted - for corresponding input tensor. + to each tensor in the inputs' tuple can be: + + - either a tensor with matching dimensions to + corresponding tensor in the inputs' tuple + or the first dimension is one and the remaining + dimensions match with the corresponding + input tensor. + + - or a scalar, corresponding to a tensor in the + inputs' tuple. This scalar value is broadcasted + for corresponding input tensor. In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None @@ -130,21 +135,21 @@ def attribute( For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. Default: None additional_forward_args (any, optional): If the forward function @@ -533,22 +538,27 @@ def attribute( Baselines define reference value which replaces each feature when ablated. Baselines can be provided as: + - a single tensor, if inputs is a single tensor, with - exactly the same dimensions as inputs or the first - dimension is one and the remaining dimensions match - with inputs. + exactly the same dimensions as inputs or the first + dimension is one and the remaining dimensions match + with inputs. + - a single scalar, if inputs is a single tensor, which will - be broadcasted for each input value in input tensor. + be broadcasted for each input value in input tensor. + - a tuple of tensors or scalars, the baseline corresponding - to each tensor in the inputs' tuple can be: - - either a tensor with matching dimensions to - corresponding tensor in the inputs' tuple - or the first dimension is one and the remaining - dimensions match with the corresponding - input tensor. - - or a scalar, corresponding to a tensor in the - inputs' tuple. This scalar value is broadcasted - for corresponding input tensor. + to each tensor in the inputs' tuple can be: + + - either a tensor with matching dimensions to + corresponding tensor in the inputs' tuple + or the first dimension is one and the remaining + dimensions match with the corresponding + input tensor. + + - or a scalar, corresponding to a tensor in the + inputs' tuple. 
This scalar value is broadcasted + for corresponding input tensor. In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None @@ -560,21 +570,21 @@ def attribute( For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. Default: None additional_forward_args (any, optional): If the forward function diff --git a/captum/attr/_utils/attribution.py b/captum/attr/_utils/attribution.py index 2f8a6806e8..1dfdd0883d 100644 --- a/captum/attr/_utils/attribution.py +++ b/captum/attr/_utils/attribution.py @@ -203,21 +203,21 @@ def compute_convergence_delta( For general 2D outputs, targets can be either: - a single integer or a tensor containing a single - integer, which is applied to all input examples + integer, which is applied to all input examples - a list of integers or a 1D tensor, with length matching - the number of examples in inputs (dim 0). Each integer - is applied as the target for the corresponding example. + the number of examples in inputs (dim 0). Each integer + is applied as the target for the corresponding example. For outputs with > 2 dimensions, targets can be either: - A single tuple, which contains #output_dims - 1 - elements. This target index is applied to all examples. + elements. This target index is applied to all examples. - A list of tuples with length equal to the number of - examples in inputs (dim 0), and each tuple containing - #output_dims - 1 elements. Each tuple is applied as the - target for the corresponding example. + examples in inputs (dim 0), and each tuple containing + #output_dims - 1 elements. Each tuple is applied as the + target for the corresponding example. Default: None additional_forward_args (any, optional): If the forward function diff --git a/captum/attr/_utils/visualization.py b/captum/attr/_utils/visualization.py index 8f0b52965c..fa7d6b9c42 100644 --- a/captum/attr/_utils/visualization.py +++ b/captum/attr/_utils/visualization.py @@ -117,26 +117,35 @@ def visualize_image_attr( Default: None method (string, optional): Chosen method for visualizing attribution. Supported options are: - 1. `heat_map` - Display heat map of chosen attributions - 2. `blended_heat_map` - Overlay heat map over greyscale - version of original image. Parameter alpha_overlay - corresponds to alpha of heat map. - 3. `original_image` - Only display original image. - 4. `masked_image` - Mask image (pixel-wise multiply) - by normalized attribution values. - 5. 
`alpha_scaling` - Sets alpha channel of each pixel - to be equal to normalized attribution value. + + 1. `heat_map` - Display heat map of chosen attributions + + 2. `blended_heat_map` - Overlay heat map over greyscale + version of original image. Parameter alpha_overlay + corresponds to alpha of heat map. + + 3. `original_image` - Only display original image. + + 4. `masked_image` - Mask image (pixel-wise multiply) + by normalized attribution values. + + 5. `alpha_scaling` - Sets alpha channel of each pixel + to be equal to normalized attribution value. Default: `heat_map` sign (string, optional): Chosen sign of attributions to visualize. Supported - options are: - 1. `positive` - Displays only positive pixel attributions. - 2. `absolute_value` - Displays absolute value of - attributions. - 3. `negative` - Displays only negative pixel attributions. - 4. `all` - Displays both positive and negative attribution - values. This is not supported for `masked_image` or - `alpha_scaling` modes, since signed information cannot - be represented in these modes. + options are: + + 1. `positive` - Displays only positive pixel attributions. + + 2. `absolute_value` - Displays absolute value of + attributions. + + 3. `negative` - Displays only negative pixel attributions. + + 4. `all` - Displays both positive and negative attribution + values. This is not supported for `masked_image` or + `alpha_scaling` modes, since signed information cannot + be represented in these modes. Default: `absolute_value` plt_fig_axis (tuple, optional): Tuple of matplotlib.pyplot.figure and axis on which to visualize. If None is provided, then a new figure diff --git a/docs/algorithms_comparison_matrix.md b/docs/algorithms_comparison_matrix.md index 4eea8c33ef..12af58a5fa 100644 --- a/docs/algorithms_comparison_matrix.md +++ b/docs/algorithms_comparison_matrix.md @@ -167,15 +167,6 @@ Please, scroll to the right for more details. No (Internally in our implementation permuted features for each batch are treated as baselines) Assigns an importance score to each input feature based on the magnitude changes in model output or loss when those features are permuted based on input feature mask. - - Perturbation - Any traditional or neural network model. - O(#examples * #features * #perturbations_per_eval) - Forward - #examples * #features - No (Internally in our implementation permuted features for each batch are treated as baselines) - Assigns an importance score to each input feature based on the magnitude changes in model output or loss when those features are permuted based on input feature mask. - Occlusion Perturbation diff --git a/sphinx/source/attribution.rst b/sphinx/source/attribution.rst index 5c79136c5c..e148c5b39e 100644 --- a/sphinx/source/attribution.rst +++ b/sphinx/source/attribution.rst @@ -13,3 +13,5 @@ Attribution deconvolution feature_ablation occlusion + feature_permutation + shapley_value_sampling diff --git a/sphinx/source/feature_permutation.rst b/sphinx/source/feature_permutation.rst new file mode 100644 index 0000000000..d58f625aee --- /dev/null +++ b/sphinx/source/feature_permutation.rst @@ -0,0 +1,5 @@ +Feature Permutation +========= + +.. autoclass:: captum.attr.FeaturePermutation + :members: diff --git a/sphinx/source/shapley_value_sampling.rst b/sphinx/source/shapley_value_sampling.rst new file mode 100644 index 0000000000..c998125af9 --- /dev/null +++ b/sphinx/source/shapley_value_sampling.rst @@ -0,0 +1,7 @@ +Shapley Value Sampling +========= + +.. 
autoclass:: captum.attr.ShapleyValueSampling + :members: +.. autoclass:: captum.attr.ShapleyValues + :members: diff --git a/website/pages/tutorials/index.js b/website/pages/tutorials/index.js index 3f1856bf0f..5f6acc93ca 100644 --- a/website/pages/tutorials/index.js +++ b/website/pages/tutorials/index.js @@ -83,16 +83,27 @@ class TutorialHome extends React.Component { Using Captum and Integrated Gradients we interpret the output of several test questions and analyze the attribution scores of the text and visual parts of the model. Find the tutorial here. -

[master only] Interpreting question answering with BERT:

+

Interpreting question answering with BERT:

This tutorial demonstrates how to use Captum to interpret a BERT model for question answering. We use a pre-trained model from Hugging Face fine-tuned on the SQUAD dataset and show how to use hooks to examine and better understand embeddings, sub-embeddings, BERT, and attention layers. Find the tutorial here. +

Interpreting a regression model of Boston house prices:

+ To demonstrate interpreting regression models, we have chosen to look at the Boston house prices dataset. + Using Captum and a variety of attribution methods, we evaluate feature importance as well as internal attribution to understand + how the network behaves. Find the tutorial here. +

Getting Started with Captum Insights:

This tutorial demonstrates how to use Captum Insights for a vision model in a notebook setting. A simple pretrained torchvision CNN model is loaded and then used on the CIFAR dataset. Captum Insights is then loaded to visualize the interpretation of specific examples. Find the tutorial here. + +

Using Captum Insights with multimodal models (VQA):

+ This tutorial demonstrates how to use Captum Insights for visualizing attributions of a multimodal model, particularly an open + source Visual Question Answering (VQA) model. + Find the tutorial here. +

diff --git a/website/tutorials.json b/website/tutorials.json index a7c45dbaa4..9d72d7753c 100644 --- a/website/tutorials.json +++ b/website/tutorials.json @@ -25,7 +25,11 @@ }, { "id": "Bert_SQUAD_Interpret", - "title": "[master only] Interpreting question answering with BERT" + "title": "Interpreting question answering with BERT" + }, + { + "id": "House_Prices_Regression_Interpret", + "title": "Interpreting a regression model of Boston house prices" }, { "id": "CIFAR_TorchVision_Captum_Insights",