@@ -1,5 +1,5 @@
import tempfile
-from typing import Callable, List, Tuple, Union
+from typing import Callable, List, Optional, Tuple

import torch

@@ -27,8 +27,9 @@
    generate_assymetric_matrix_given_eigenvalues,
    generate_symmetric_matrix_given_eigenvalues,
    get_random_model_and_data,
+    GPU_SETTING_LIST,
+    is_gpu,
    UnpackDataset,
-    USE_GPU_LIST,
)
from torch import Tensor
from torch.utils.data import DataLoader
@@ -229,6 +230,17 @@ def _param_matmul(params: Tuple[Tensor]):
            "max",
        )

+    # TODO: for some unknown reason, this test and the test below do not work
+    # on the `cuda_data_parallel` setting. We need to investigate why.
+    # Use a local version of the setting list for these two tests for now,
+    # since we have changed the default setting list to include all options.
+    # (This is also used in many other tests, which also need to be unified later.)
+    gpu_setting_list = (
+        ["", "cuda"]
+        if torch.cuda.is_available() and torch.cuda.device_count() != 0
+        else [""]
+    )
+
    @parameterized.expand(
        [
            (
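For context, the two helpers imported above, `GPU_SETTING_LIST` and `is_gpu`, are defined in the shared test utilities and are not shown in this diff. Below is a minimal sketch of what they presumably provide, assuming the setting strings `""`, `"cuda"`, and `"cuda_dataparallel"` that the comparisons in this file check against; the exact bodies are an assumption, not the actual definitions:

```python
from typing import List, Optional

import torch

# Presumed default setting list: all options when a GPU is present, CPU only otherwise.
GPU_SETTING_LIST: List[str] = (
    ["", "cuda", "cuda_dataparallel"]
    if torch.cuda.is_available() and torch.cuda.device_count() != 0
    else [""]
)


def is_gpu(gpu_setting: Optional[str]) -> bool:
    # Any non-empty setting implies the model and data are moved to GPU.
    return gpu_setting in ("cuda", "cuda_dataparallel")
```

Under this reading, `gpu_setting` replaces the old boolean-or-string `use_gpu` with a single optional string, which is why the signatures below change from `Union[bool, str]` to `Optional[str]`.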
@@ -237,17 +249,17 @@ def _param_matmul(params: Tuple[Tensor]):
                delta,
                mode,
                unpack_inputs,
-                use_gpu,
+                gpu_setting,
            )
-            for use_gpu in USE_GPU_LIST
+            for gpu_setting in gpu_setting_list
            for (influence_constructor_1, influence_constructor_2, delta) in [
                # compare implementations, when considering only 1 layer
                (
                    DataInfluenceConstructor(
                        NaiveInfluenceFunction,
                        layers=(
                            ["module.linear1"]
-                            if use_gpu == "cuda_dataparallel"
+                            if gpu_setting == "cuda_dataparallel"
                            else ["linear1"]
                        ),
                        projection_dim=5,
@@ -258,7 +270,7 @@ def _param_matmul(params: Tuple[Tensor]):
                        ArnoldiInfluenceFunction,
                        layers=(
                            ["module.linear1"]
-                            if use_gpu == "cuda_dataparallel"
+                            if gpu_setting == "cuda_dataparallel"
                            else ["linear1"]
                        ),
                        arnoldi_dim=50,
@@ -314,7 +326,7 @@ def test_compare_implementations_trained_NN_model_and_data(
        delta: float,
        mode: str,
        unpack_inputs: bool,
-        use_gpu: Union[bool, str],
+        gpu_setting: Optional[str],
    ) -> None:
        """
        this compares 2 influence implementations on a trained 2-layer NN model.
@@ -329,14 +341,15 @@ def test_compare_implementations_trained_NN_model_and_data(
            delta,
            mode,
            unpack_inputs,
-            use_gpu,
+            gpu_setting,
        )

    # this compares `ArnoldiInfluenceFunction` and `NaiveInfluenceFunction` on randomly
    # generated data. because these implementations are numerically equivalent, we
    # can also compare the intermediate quantities. we do not compare with
    # `NaiveInfluence` because on randomly generated data, it is not comparable,
    # conceptually, with the other implementations, due to numerical issues.
+
    @parameterized.expand(
        [
            (
@@ -345,16 +358,16 @@ def test_compare_implementations_trained_NN_model_and_data(
                delta,
                mode,
                unpack_inputs,
-                use_gpu,
+                gpu_setting,
            )
-            for use_gpu in USE_GPU_LIST
+            for gpu_setting in gpu_setting_list
            for (influence_constructor_1, influence_constructor_2, delta) in [
                (
                    DataInfluenceConstructor(
                        NaiveInfluenceFunction,
                        layers=(
                            ["module.linear1"]
-                            if use_gpu == "cuda_dataparallel"
+                            if gpu_setting == "cuda_dataparallel"
                            else ["linear1"]
                        ),
                        show_progress=False,
@@ -364,7 +377,7 @@ def test_compare_implementations_trained_NN_model_and_data(
                        ArnoldiInfluenceFunction,
                        layers=(
                            ["module.linear1"]
-                            if use_gpu == "cuda_dataparallel"
+                            if gpu_setting == "cuda_dataparallel"
                            else ["linear1"]
                        ),
                        show_progress=False,
@@ -397,7 +410,7 @@ def test_compare_implementations_random_model_and_data(
        delta: float,
        mode: str,
        unpack_inputs: bool,
-        use_gpu: Union[bool, str],
+        gpu_setting: Optional[str],
    ) -> None:
        """
        this compares 2 influence implementations on a trained 2-layer NN model.
@@ -412,7 +425,7 @@ def test_compare_implementations_random_model_and_data(
            delta,
            mode,
            unpack_inputs,
-            use_gpu,
+            gpu_setting,
        )

    def _test_compare_implementations(
@@ -423,7 +436,7 @@ def _test_compare_implementations(
        delta: float,
        mode: str,
        unpack_inputs: bool,
-        use_gpu: Union[bool, str],
+        gpu_setting: Optional[str],
    ) -> None:
        """
        checks that 2 implementations of `InfluenceFunctionBase` return the same
@@ -444,13 +457,14 @@ def _test_compare_implementations(
            tmpdir,
            unpack_inputs,
            return_test_data=True,
-            use_gpu=use_gpu,
+            gpu_setting=gpu_setting,
            return_hessian_data=True,
            model_type=model_type,
        )

        train_dataset = DataLoader(train_dataset, batch_size=5)

+        use_gpu = is_gpu(gpu_setting)
        hessian_dataset = (
            ExplicitDataset(hessian_samples, hessian_labels, use_gpu)
            if not unpack_inputs
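A note on the layer names used throughout: `torch.nn.DataParallel` stores the wrapped network under a `module` attribute, so named layers gain a `module.` prefix, which is why the `cuda_dataparallel` setting selects `["module.linear1"]` rather than `["linear1"]`. A hedged sketch of how the settings might map to model placement (hypothetical helper for illustration, not part of the test utilities):

```python
from typing import Optional

import torch


def _place_model(model: torch.nn.Module, gpu_setting: Optional[str]) -> torch.nn.Module:
    # Hypothetical illustration of the three settings used in this diff.
    if gpu_setting == "cuda_dataparallel":
        # DataParallel wraps the model, so "linear1" becomes "module.linear1".
        return torch.nn.DataParallel(model.cuda())
    if gpu_setting == "cuda":
        return model.cuda()
    # "" or None: keep the model on CPU.
    return model
```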