
Commit bd46225

DistributedManager cleanup and kNN cuml/scipy hotfixes (#1182)
* Enable a "soft" check path for check_min_version that returns False instead of raising an exception. This lets knn import without crashing if cuml isn't installed.
* Use the soft fail for scipy too.
* Remove the barrier from DistributedManager cleanup. It is still available opt-in if it's ever needed, but by default it's off.
* Finish the cleanup of knn. New logic:
  - Soft check of the cuml and scipy installs.
  - If the backend is explicitly set to cuml or scipy but the package is not installed, it errors loudly.
  - If the backend is "auto", it selects cuml/scipy when available and falls back to torch when not.
* Add a test path for CPU knn when scipy is not installed. Document the hard_fail parameter in check_min_version.
1 parent 4287bc7 commit bd46225
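
As a rough sketch of the behavior described in the commit message above (not part of the commit itself, and assuming a physicsnemo install that includes this change), the soft check and the "auto" fallback can be exercised like this:

import torch

from physicsnemo.utils.neighbors import knn
from physicsnemo.utils.version_check import check_min_version

# Soft check: returns False instead of raising if scipy is missing or too old.
scipy_ok = check_min_version("scipy", "1.7.0", hard_fail=False)
print(f"scipy available: {scipy_ok}")

points = torch.randn(100, 3)
queries = torch.randn(10, 3)

# On CPU, "auto" picks scipy when available and silently falls back to the
# pure-torch implementation when it is not.
indices, distances = knn(points, queries, k=4, backend="auto")

Explicitly requesting backend="scipy" or backend="cuml" without the package installed is still expected to fail loudly, per the new logic.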

File tree

6 files changed: +65 additions, -15 deletions


physicsnemo/distributed/manager.py

Lines changed: 13 additions & 6 deletions
@@ -800,18 +800,25 @@ def create_groups_from_config(

     @atexit.register
     @staticmethod
-    def cleanup():
-        """Clean up distributed group and singleton"""
+    def cleanup(barrier: bool = False):
+        """Clean up distributed group and singleton
+
+        Parameters
+        ----------
+        barrier : bool, optional
+            Whether to use a global barrier before destroying the process group, by default False
+        """
         # Destroying group.WORLD is enough for all process groups to get destroyed
         if (
             "_is_initialized" in DistributedManager._shared_state
             and DistributedManager._shared_state["_is_initialized"]
             and "_distributed" in DistributedManager._shared_state
             and DistributedManager._shared_state["_distributed"]
         ):
-            if torch.cuda.is_available():
-                dist.barrier(device_ids=[DistributedManager().local_rank])
-            else:
-                dist.barrier()
+            if barrier:
+                if torch.cuda.is_available():
+                    dist.barrier(device_ids=[DistributedManager().local_rank])
+                else:
+                    dist.barrier()
             dist.destroy_process_group()
             DistributedManager._shared_state = {}
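
With the barrier now opt-in, the atexit-registered cleanup no longer synchronizes ranks by default. A minimal sketch (not from the commit, and assuming the usual DistributedManager.initialize() entry point) of explicitly requesting the barrier at shutdown:

from physicsnemo.distributed import DistributedManager

# Standard setup for a distributed run.
DistributedManager.initialize()
dm = DistributedManager()

# ... distributed training / inference work ...

# Opt back into the global barrier so all ranks synchronize before the
# process group is destroyed; the default (barrier=False) skips it.
DistributedManager.cleanup(barrier=True)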

physicsnemo/utils/neighbors/knn/_cuml_impl.py

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@

 from physicsnemo.utils.version_check import check_min_version

-CUML_AVAILABLE = check_min_version("cuml", "24.0.0")
+CUML_AVAILABLE = check_min_version("cuml", "24.0.0", hard_fail=False)

 if CUML_AVAILABLE:
     import cuml

physicsnemo/utils/neighbors/knn/_scipy_impl.py

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@

 from physicsnemo.utils.version_check import check_min_version

-SCIPY_AVAILABLE = check_min_version("scipy", "1.7.0")
+SCIPY_AVAILABLE = check_min_version("scipy", "1.7.0", hard_fail=False)

 if SCIPY_AVAILABLE:
     from scipy.spatial import KDTree

physicsnemo/utils/neighbors/knn/knn.py

Lines changed: 10 additions & 2 deletions
@@ -18,7 +18,9 @@

 import torch

+from ._cuml_impl import CUML_AVAILABLE
 from ._cuml_impl import knn_impl as knn_cuml
+from ._scipy_impl import SCIPY_AVAILABLE
 from ._scipy_impl import knn_impl as knn_scipy
 from ._torch_impl import knn_impl as knn_torch

@@ -72,9 +74,15 @@ def knn(

     if backend == "auto":
         if points.is_cuda:
-            backend = "cuml"
+            if CUML_AVAILABLE:
+                backend = "cuml"
+            else:
+                backend = "torch"
         else:
-            backend = "scipy"
+            if SCIPY_AVAILABLE:
+                backend = "scipy"
+            else:
+                backend = "torch"

     # Cuml foes not support bfloat16:
     # Autocast to float32:
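
To illustrate the new resolution rules, a hedged sketch (not code from the commit; it assumes a CPU-only environment where scipy may or may not be installed):

import torch

from physicsnemo.utils.neighbors import knn

points = torch.randn(256, 3)  # CPU tensors
queries = torch.randn(32, 3)

# "auto" resolves to scipy on CPU when SCIPY_AVAILABLE is True, and to the
# torch implementation otherwise, so this call works either way.
idx, dist = knn(points, queries, k=8, backend="auto")

# An explicitly requested backend is not silently replaced; if scipy is not
# installed this is expected to raise, so the call is guarded here.
try:
    idx, dist = knn(points, queries, k=8, backend="scipy")
except Exception as err:  # exact exception type depends on the scipy wrapper
    print(f"scipy backend unavailable: {err}")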

physicsnemo/utils/version_check.py

Lines changed: 13 additions & 4 deletions
@@ -37,7 +37,10 @@


 def check_min_version(
-    package_name: str, min_version: str, error_msg: Optional[str] = None
+    package_name: str,
+    min_version: str,
+    error_msg: Optional[str] = None,
+    hard_fail: bool = True,
 ) -> bool:
     """
     Check if an installed package meets the minimum version requirement.
@@ -46,7 +49,7 @@ def check_min_version(
         package_name: Name of the package to check
         min_version: Minimum required version string (e.g. '2.6.0')
         error_msg: Optional custom error message
-
+        hard_fail: Whether to raise an ImportError if the version requirement is not met
     Returns:
         True if version requirement is met

@@ -57,14 +60,20 @@ def check_min_version(
         package = importlib.import_module(package_name)
         package_version = getattr(package, "__version__", "0.0.0")
     except ImportError:
-        raise ImportError(f"Package {package_name} is required but not installed.")
+        if hard_fail:
+            raise ImportError(f"Package {package_name} is required but not installed.")
+        else:
+            return False

     if version.parse(package_version) < version.parse(min_version):
         msg = (
             error_msg
             or f"{package_name} version {min_version} or higher is required, but found {package_version}"
         )
-        raise ImportError(msg)
+        if hard_fail:
+            raise ImportError(msg)
+        else:
+            return False

     return True
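
With hard_fail=False, check_min_version doubles as a quiet availability probe, which is how the cuml and scipy wrappers above use it. A small sketch (the package name some_optional_pkg is a placeholder, not a real physicsnemo dependency):

from physicsnemo.utils.version_check import check_min_version

# Default behavior: raise ImportError if the package is missing or too old.
check_min_version("torch", "2.0.0")

# Soft behavior: return False instead of raising, suitable for module-level flags.
SOME_PKG_AVAILABLE = check_min_version("some_optional_pkg", "1.0.0", hard_fail=False)

if SOME_PKG_AVAILABLE:
    import some_optional_pkg  # guarded import, mirroring the cuml/scipy wrappers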

test/utils/neighbors/test_knn.py

Lines changed: 27 additions & 1 deletion
@@ -20,6 +20,7 @@
 from physicsnemo.utils.neighbors import knn
 from physicsnemo.utils.neighbors.knn._cuml_impl import knn_impl as knn_cuml
 from physicsnemo.utils.neighbors.knn._scipy_impl import knn_impl as knn_scipy
+from physicsnemo.utils.version_check import check_min_version


 @pytest.mark.parametrize("device", ["cpu", "cuda"])
@@ -33,6 +34,15 @@ def test_knn(device: str, k: int, backend: str, dtype: torch.dtype):
     Basic test for KNN functionality.
     We use a predictable grid of points to ensure the results are valid.
     """
+
+    if backend == "cuml":
+        if not check_min_version("cuml", "24.0.0", hard_fail=False):
+            pytest.skip("cuml not available")
+
+    elif backend == "scipy":
+        if not check_min_version("scipy", "1.7.0", hard_fail=False):
+            pytest.skip("scipy not available")
+
     # Skip cuml tests on CPU as it's not supported
     if backend == "cuml" and device == "cpu":
         pytest.skip("cuml backend not supported on CPU")
@@ -102,12 +112,17 @@ def test_knn_torch_compile_no_graph_break(device):
     queries = torch.randn(13, 3, device=device)
     k = 5

+    if not check_min_version("cuml", "24.0.0", hard_fail=False):
+        backend = "torch"
+    else:
+        backend = "auto"
+
     def search_fn(points, queries):
         return knn(
             points,
             queries,
             k=k,
-            backend="auto",
+            backend=backend,
         )

     # Run both and compare outputs
@@ -133,8 +148,12 @@ def test_opcheck(device):
     k = 5

     if device == "cuda":
+        if not check_min_version("cuml", "24.0.0", hard_fail=False):
+            pytest.skip("cuml not available")
         op = knn_cuml
     else:
+        if not check_min_version("scipy", "1.7.0", hard_fail=False):
+            pytest.skip("scipy not available")
         op = knn_scipy

     torch.library.opcheck(op, args=(points, queries, k))
@@ -146,6 +165,13 @@ def test_knn_comparison(device):
     queries = torch.randn(21, 3, device=device)
     k = 5

+    if not check_min_version("cuml", "24.0.0", hard_fail=False):
+        if device == "cuda":
+            pytest.skip("cuml not available")
+    if not check_min_version("scipy", "1.7.0", hard_fail=False):
+        if device == "cuda":
+            pytest.skip("scipy not available")
+
     if device == "cuda":
         indices_cuml, distances_A = knn(points, queries, k, backend="cuml")
         indices_torch, distances_B = knn(points, queries, k, backend="torch")
