4 | 4 | import numpy as np |
5 | 5 | import pytest |
6 | 6 |
| 7 | +from sklearn._config import config_context |
7 | 8 | from sklearn.metrics.cluster import ( |
8 | 9 | adjusted_mutual_info_score, |
9 | 10 | adjusted_rand_score, |
18 | 19 | silhouette_score, |
19 | 20 | v_measure_score, |
20 | 21 | ) |
21 | | -from sklearn.utils._testing import assert_allclose |
| 22 | +from sklearn.utils._array_api import ( |
| 23 | + _atol_for_type, |
| 24 | + _convert_to_numpy, |
| 25 | + _get_namespace_device_dtype_ids, |
| 26 | + yield_namespace_device_dtype_combinations, |
| 27 | +) |
| 28 | +from sklearn.utils._testing import _array_api_for_tests, assert_allclose |
22 | 29 |
23 | 30 | # Dictionaries of metrics |
24 | 31 | # ------------------------ |
@@ -232,3 +239,88 @@ def test_returned_value_consistency(name): |
232 | 239 |
233 | 240 | assert isinstance(score, float) |
234 | 241 | assert not isinstance(score, (np.float64, np.float32)) |
| 242 | + |
| 243 | + |
| 244 | +def check_array_api_metric( |
| 245 | + metric, array_namespace, device, dtype_name, a_np, b_np, **metric_kwargs |
| 246 | +): |
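| | +    # Check that `metric` evaluated on `array_namespace` inputs allocated on |
| | +    # `device` matches the reference result computed on the numpy inputs, |
| | +    # both with array API dispatch disabled (implicit conversion to numpy) |
| | +    # and enabled. |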
| 247 | + xp = _array_api_for_tests(array_namespace, device) |
| 248 | + |
| 249 | + a_xp = xp.asarray(a_np, device=device) |
| 250 | + b_xp = xp.asarray(b_np, device=device) |
| 251 | + |
| 252 | + metric_np = metric(a_np, b_np, **metric_kwargs) |
| 253 | + |
| 254 | + # When array API dispatch is disabled and np.asarray works (for example with |
| 255 | + # PyTorch on a CPU device), calling the metric function with such |
| 256 | + # numpy-compatible inputs should work (albeit by implicitly converting to |
| 257 | + # numpy arrays instead of dispatching to the array library). |
| 258 | + try: |
| 259 | + np.asarray(a_xp) |
| 260 | + np.asarray(b_xp) |
| 261 | + numpy_as_array_works = True |
| 262 | + except (TypeError, RuntimeError, ValueError): |
| 263 | + # PyTorch with CUDA device and CuPy raise TypeError consistently. |
| 264 | + # array-api-strict chose to raise RuntimeError instead. NumPy raises |
| 265 | + # a ValueError if the `__array__` dunder does not return an array. |
| 266 | + # Exception type may need to be updated in the future for other libraries. |
| 267 | + numpy_as_array_works = False |
| 268 | + |
| 269 | + def _check_metric_matches(metric_a, metric_b, convert_a=False): |
| 270 | + if convert_a: |
| 271 | + metric_a = _convert_to_numpy(xp.asarray(metric_a), xp) |
| 272 | + assert_allclose(metric_a, metric_b, atol=_atol_for_type(dtype_name)) |
| 273 | + |
| 274 | + if numpy_as_array_works: |
| 275 | + metric_xp = metric(a_xp, b_xp, **metric_kwargs) |
| 276 | + |
| 277 | + # With array API dispatch disabled, the metric call should fall back to an |
| 278 | + # implicit conversion to numpy and match the pure numpy result: |
| 279 | + _check_metric_matches(metric_xp, metric_np) |
| 280 | + |
| 281 | + metric_xp_mixed_1 = metric(a_np, b_xp, **metric_kwargs) |
| 282 | + _check_metric_matches(metric_xp_mixed_1, metric_np) |
| 283 | + |
| 284 | + metric_xp_mixed_2 = metric(a_xp, b_np, **metric_kwargs) |
| 285 | + _check_metric_matches(metric_xp_mixed_2, metric_np) |
| 286 | + |
| 287 | + with config_context(array_api_dispatch=True): |
| 288 | + metric_xp = metric(a_xp, b_xp, **metric_kwargs) |
| 289 | + _check_metric_matches(metric_xp, metric_np, convert_a=True) |
| 290 | + |
| 291 | + |
| 292 | +def check_array_api_unsupervised_metric(metric, array_namespace, device, dtype_name): |
| 293 | + y_pred = np.array([1, 0, 1, 0, 1, 1, 0]) |
| 294 | + X = np.random.RandomState(0).randint(10, size=(7, 10)) |
| 295 | + |
| 296 | + check_array_api_metric( |
| 297 | + metric, |
| 298 | + array_namespace, |
| 299 | + device, |
| 300 | + dtype_name, |
| 301 | + a_np=X, |
| 302 | + b_np=y_pred, |
| 303 | + ) |
| 304 | + |
| 305 | + |
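| | +# Registry mapping each metric to the checkers that validate its array API |
| | +# support; extending array API coverage to a new metric means adding an |
| | +# entry here. |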
| 306 | +array_api_metric_checkers = { |
| 307 | + calinski_harabasz_score: [ |
| 308 | + check_array_api_unsupervised_metric, |
| 309 | + ] |
| 310 | +} |
| 311 | + |
| 312 | + |
| 313 | +def yield_metric_checker_combinations(metric_checkers=array_api_metric_checkers): |
| 314 | + for metric, checkers in metric_checkers.items(): |
| 315 | + for checker in checkers: |
| 316 | + yield metric, checker |
| 317 | + |
| 318 | + |
| 319 | +@pytest.mark.parametrize( |
| 320 | + "array_namespace, device, dtype_name", |
| 321 | + yield_namespace_device_dtype_combinations(), |
| 322 | + ids=_get_namespace_device_dtype_ids, |
| 323 | +) |
| 324 | +@pytest.mark.parametrize("metric, check_func", yield_metric_checker_combinations()) |
| 325 | +def test_array_api_compliance(metric, array_namespace, device, dtype_name, check_func): |
| 326 | + check_func(metric, array_namespace, device, dtype_name) |
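
For context, a minimal usage sketch of the behavior these tests exercise, assuming a scikit-learn build that includes this array API support for calinski_harabasz_score and the optional array-api-strict package:

    import array_api_strict as xp
    import numpy as np

    from sklearn import config_context
    from sklearn.metrics import calinski_harabasz_score

    # Small toy inputs mirroring the shapes used in the tests above.
    X = xp.asarray(np.random.RandomState(0).random_sample((7, 10)))
    labels = xp.asarray([1, 0, 1, 0, 1, 1, 0])

    # With dispatch enabled, the score is computed with the input's array
    # namespace instead of implicitly converting to numpy.
    with config_context(array_api_dispatch=True):
        score = calinski_harabasz_score(X, labels)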