MAINT Clean-up scipy<1.10 code (scikit-learn#32615)

DeaMariaLeon · lesteve · web-flow · commit d41f7340dca3 · 2025-10-31T15:54:00.000Z
Co-authored-by: Loïc Estève <loic.esteve@ymail.com>
diff --git a/sklearn/conftest.py b/sklearn/conftest.py
@@ -14,6 +14,7 @@
 import numpy as np
 import pytest
 from _pytest.doctest import DoctestItem
+from scipy.datasets import face
 from threadpoolctl import threadpool_limits
 
 from sklearn._min_dependencies import PYTEST_MIN_VERSION
@@ -56,24 +57,16 @@
         f" should have pytest >= {PYTEST_MIN_VERSION} installed."
     )
 
-scipy_datasets_require_network = sp_version >= parse_version("1.10")
-
 
 def raccoon_face_or_skip():
-    # SciPy >= 1.10 requires network to access to get data
-    if scipy_datasets_require_network:
-        run_network_tests = environ.get("SKLEARN_SKIP_NETWORK_TESTS", "1") == "0"
-        if not run_network_tests:
-            raise SkipTest("test is enabled when SKLEARN_SKIP_NETWORK_TESTS=0")
-
-        try:
-            import pooch  # noqa: F401
-        except ImportError:
-            raise SkipTest("test requires pooch to be installed")
-
-        from scipy.datasets import face
-    else:
-        from scipy.misc import face
+    # SciPy requires network access to get data
+    run_network_tests = environ.get("SKLEARN_SKIP_NETWORK_TESTS", "1") == "0"
+    if not run_network_tests:
+        raise SkipTest("test is enabled when SKLEARN_SKIP_NETWORK_TESTS=0")
+    try:
+        import pooch  # noqa: F401
+    except ImportError:
+        raise SkipTest("test requires pooch to be installed")
 
     return face(gray=True)
 
@@ -91,8 +84,7 @@ def raccoon_face_or_skip():
     "fetch_species_distributions_fxt": fetch_species_distributions,
 }
 
-if scipy_datasets_require_network:
-    dataset_fetchers["raccoon_face_fxt"] = raccoon_face_or_skip
+dataset_fetchers["raccoon_face_fxt"] = raccoon_face_or_skip
 
 _SKIP32_MARK = pytest.mark.skipif(
     environ.get("SKLEARN_RUN_FLOAT32_TESTS", "0") != "1",
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py
@@ -35,7 +35,6 @@
     validate_params,
 )
 from sklearn.utils.extmath import _incremental_mean_and_var, row_norms
-from sklearn.utils.fixes import _yeojohnson_lambda
 from sklearn.utils.sparsefuncs import (
     incr_mean_variance_axis,
     inplace_column_scale,
@@ -3595,8 +3594,8 @@ def _neg_log_likelihood(lmbda):
         # the computation of lambda is influenced by NaNs so we need to
         # get rid of them
         x = x[~np.isnan(x)]
-
-        return _yeojohnson_lambda(_neg_log_likelihood, x)
+        _, lmbda = stats.yeojohnson(x, lmbda=None)
+        return lmbda
 
     def _check_input(self, X, in_fit, check_positive=False, check_shape=False):
         """Validate the input before fit and transform.
diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py
@@ -29,7 +29,6 @@
 )
 from sklearn.utils._mask import _get_mask
 from sklearn.utils._param_validation import Interval, StrOptions
-from sklearn.utils.fixes import parse_version, sp_version
 from sklearn.utils.stats import _weighted_percentile
 from sklearn.utils.validation import (
     FLOAT_DTYPES,
@@ -460,23 +459,6 @@ def transform(self, X):
                 # edge case: deal with empty matrix
                 XP = sparse.csr_matrix((n_samples, 0), dtype=X.dtype)
             else:
-                # `scipy.sparse.hstack` breaks in scipy<1.9.2
-                # when `n_output_features_ > max_int32`
-                all_int32 = all(mat.indices.dtype == np.int32 for mat in to_stack)
-                if (
-                    sp_version < parse_version("1.9.2")
-                    and self.n_output_features_ > max_int32
-                    and all_int32
-                ):
-                    raise ValueError(  # pragma: no cover
-                        "In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`"
-                        " produces negative columns when:\n1. The output shape contains"
-                        " `n_cols` too large to be represented by a 32bit signed"
-                        " integer.\n2. All sub-matrices to be stacked have indices of"
-                        " dtype `np.int32`.\nTo avoid this error, either use a version"
-                        " of scipy `>=1.9.2` or alter the `PolynomialFeatures`"
-                        " transformer to produce fewer than 2^31 output features"
-                    )
                 XP = sparse.hstack(to_stack, dtype=X.dtype, format="csr")
         elif sparse.issparse(X) and X.format == "csc" and self._max_degree < 4:
             return self.transform(X.tocsr()).tocsc()
@@ -1022,27 +1004,14 @@ def transform(self, X):
         n_splines = self.bsplines_[0].c.shape[1]
         degree = self.degree
 
-        # TODO: Remove this condition, once scipy 1.10 is the minimum version.
-        #       Only scipy >= 1.10 supports design_matrix(.., extrapolate=..).
-        #       The default (implicit in scipy < 1.10) is extrapolate=False.
-        scipy_1_10 = sp_version >= parse_version("1.10.0")
-        # Note: self.bsplines_[0].extrapolate is True for extrapolation in
-        # ["periodic", "continue"]
-        if scipy_1_10:
-            use_sparse = self.sparse_output
-            kwargs_extrapolate = {"extrapolate": self.bsplines_[0].extrapolate}
-        else:
-            use_sparse = self.sparse_output and not self.bsplines_[0].extrapolate
-            kwargs_extrapolate = dict()
-
         # Note that scipy BSpline returns float64 arrays and converts input
         # x=X[:, i] to c-contiguous float64.
         n_out = self.n_features_out_ + n_features * (1 - self.include_bias)
         if X.dtype in FLOAT_DTYPES:
             dtype = X.dtype
         else:
             dtype = np.float64
-        if use_sparse:
+        if self.sparse_output:
             output_list = []
         else:
             XBS = np.zeros((n_samples, n_out), dtype=dtype, order=self.order)
@@ -1071,7 +1040,7 @@ def transform(self, X):
                 else:  # self.extrapolation in ("continue", "error")
                     x = X[:, feature_idx]
 
-                if use_sparse:
+                if self.sparse_output:
                     # We replace the nan values in the input column by some
                     # arbitrary, in-range, numerical value since
                     # BSpline.design_matrix() would otherwise raise on any nan
@@ -1093,8 +1062,11 @@ def transform(self, X):
                     elif nan_row_indices.shape[0] > 0:
                         x = x.copy()  # avoid mutation of input data
                         x[nan_row_indices] = np.nanmin(x)
+
+                    # Note: self.bsplines_[0].extrapolate is True for extrapolation in
+                    # ["periodic", "continue"]
                     XBS_sparse = BSpline.design_matrix(
-                        x, spl.t, spl.k, **kwargs_extrapolate
+                        x, spl.t, spl.k, self.bsplines_[0].extrapolate
                     )
 
                     if self.extrapolation == "periodic":
@@ -1122,7 +1094,7 @@ def transform(self, X):
                         XBS[
                             nan_row_indices, output_feature_idx : output_feature_idx + 1
                         ] = 0
-                    if use_sparse:
+                    if self.sparse_output:
                         XBS_sparse = XBS
 
             else:  # extrapolation in ("constant", "linear")
@@ -1135,7 +1107,7 @@ def transform(self, X):
                     X[:, feature_idx] <= xmax
                 )
 
-                if use_sparse:
+                if self.sparse_output:
                     outside_range_mask = ~inside_range_mask
                     x = X[:, feature_idx].copy()
                     # Set to some arbitrary value within the range of values
@@ -1162,7 +1134,7 @@ def transform(self, X):
             # 'continue' is already returned as is by scipy BSplines
             if self.extrapolation == "error":
                 has_nan_output_values = False
-                if use_sparse:
+                if self.sparse_output:
                     # Early convert to CSR as the sparsity structure of this
                     # block should not change anymore. This is needed to be able
                     # to safely assume that `.data` is a 1D array.
@@ -1187,7 +1159,7 @@ def transform(self, X):
 
                 below_xmin_mask = X[:, feature_idx] < xmin
                 if np.any(below_xmin_mask):
-                    if use_sparse:
+                    if self.sparse_output:
                         # Note: See comment about SparseEfficiencyWarning above.
                         XBS_sparse = XBS_sparse.tolil()
                         XBS_sparse[below_xmin_mask, :degree] = f_min[:degree]
@@ -1202,7 +1174,7 @@ def transform(self, X):
 
                 above_xmax_mask = X[:, feature_idx] > xmax
                 if np.any(above_xmax_mask):
-                    if use_sparse:
+                    if self.sparse_output:
                         # Note: See comment about SparseEfficiencyWarning above.
                         XBS_sparse = XBS_sparse.tolil()
                         XBS_sparse[above_xmax_mask, -degree:] = f_max[-degree:]
@@ -1235,7 +1207,7 @@ def transform(self, X):
                             f_min[j]
                             + (X[below_xmin_mask, feature_idx] - xmin) * fp_min[j]
                         )
-                        if use_sparse:
+                        if self.sparse_output:
                             # Note: See comment about SparseEfficiencyWarning above.
                             XBS_sparse = XBS_sparse.tolil()
                             XBS_sparse[below_xmin_mask, j] = linear_extr
@@ -1251,7 +1223,7 @@ def transform(self, X):
                             f_max[k]
                             + (X[above_xmax_mask, feature_idx] - xmax) * fp_max[k]
                         )
-                        if use_sparse:
+                        if self.sparse_output:
                             # Note: See comment about SparseEfficiencyWarning above.
                             XBS_sparse = XBS_sparse.tolil()
                             XBS_sparse[above_xmax_mask, k : k + 1] = linear_extr[
@@ -1262,38 +1234,12 @@ def transform(self, X):
                                 linear_extr
                             )
 
-            if use_sparse:
+            if self.sparse_output:
                 XBS_sparse = XBS_sparse.tocsr()
                 output_list.append(XBS_sparse)
 
-        if use_sparse:
-            # TODO: Remove this conditional error when the minimum supported version of
-            # SciPy is 1.9.2
-            # `scipy.sparse.hstack` breaks in scipy<1.9.2
-            # when `n_features_out_ > max_int32`
-            max_int32 = np.iinfo(np.int32).max
-            all_int32 = True
-            for mat in output_list:
-                all_int32 &= mat.indices.dtype == np.int32
-            if (
-                sp_version < parse_version("1.9.2")
-                and self.n_features_out_ > max_int32
-                and all_int32
-            ):
-                raise ValueError(
-                    "In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`"
-                    " produces negative columns when:\n1. The output shape contains"
-                    " `n_cols` too large to be represented by a 32bit signed"
-                    " integer.\n. All sub-matrices to be stacked have indices of"
-                    " dtype `np.int32`.\nTo avoid this error, either use a version"
-                    " of scipy `>=1.9.2` or alter the `SplineTransformer`"
-                    " transformer to produce fewer than 2^31 output features"
-                )
+        if self.sparse_output:
             XBS = sparse.hstack(output_list, format="csr")
-        elif self.sparse_output:
-            # TODO: Remove conversion to csr, once scipy 1.10 is the minimum version:
-            # Adjust format of XBS to sparse, for scipy versions < 1.10.0:
-            XBS = sparse.csr_matrix(XBS)
 
         if self.include_bias:
             return XBS
diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py
@@ -36,8 +36,6 @@
 from sklearn.utils.fixes import (
     CSC_CONTAINERS,
     CSR_CONTAINERS,
-    parse_version,
-    sp_version,
 )
 
 
@@ -1196,21 +1194,6 @@ def test_csr_polynomial_expansion_index_overflow(
             pf.fit(X)
         return
 
-    # When `n_features>=65535`, `scipy.sparse.hstack` may not use the right
-    # dtype for representing indices and indptr if `n_features` is still
-    # small enough so that each block matrix's indices and indptr arrays
-    # can be represented with `np.int32`. We test `n_features==65535`
-    # since it is guaranteed to run into this bug.
-    if (
-        sp_version < parse_version("1.9.2")
-        and n_features == 65535
-        and degree == 2
-        and not interaction_only
-    ):  # pragma: no cover
-        msg = r"In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`"
-        with pytest.raises(ValueError, match=msg):
-            X_trans = pf.fit_transform(X)
-        return
     X_trans = pf.fit_transform(X)
 
     expected_dtype = np.int64 if num_combinations > np.iinfo(np.int32).max else np.int32
diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
@@ -14,7 +14,6 @@
 import scipy
 import scipy.sparse.linalg
 import scipy.stats
-from scipy import optimize
 
 try:
     import pandas as pd
@@ -81,38 +80,6 @@ def _sparse_linalg_cg(A, b, **kwargs):
         return scipy.sparse.linalg.cg(A, b, **kwargs)
 
 
-# TODO : remove this when required minimum version of SciPy >= 1.9.0
-def _yeojohnson_lambda(_neg_log_likelihood, x):
-    """Estimate the optimal Yeo-Johnson transformation parameter (lambda).
-
-    This function provides a compatibility workaround for versions of SciPy
-    older than 1.9.0, where `scipy.stats.yeojohnson` did not return
-    the estimated lambda directly.
-
-    Parameters
-    ----------
-    _neg_log_likelihood : callable
-        A function that computes the negative log-likelihood of the Yeo-Johnson
-        transformation for a given lambda. Used only for SciPy versions < 1.9.0.
-
-    x : array-like
-        Input data to estimate the Yeo-Johnson transformation parameter.
-
-    Returns
-    -------
-    lmbda : float
-        The estimated lambda parameter for the Yeo-Johnson transformation.
-    """
-    min_scipy_version = "1.9.0"
-
-    if sp_version < parse_version(min_scipy_version):
-        # choosing bracket -2, 2 like for boxcox
-        return optimize.brent(_neg_log_likelihood, brack=(-2, 2))
-
-    _, lmbda = scipy.stats.yeojohnson(x, lmbda=None)
-    return lmbda
-
-
 # TODO: Fuse the modern implementations of _sparse_min_max and _sparse_nan_min_max
 # into the public min_max_axis function when SciPy 1.11 is the minimum supported
 # version and delete the backport in the else branch below.