Skip to content

Commit d41f734

Browse files
MAINT Clean-up scipy<1.10 code (scikit-learn#32615)
Co-authored-by: Loïc Estève <[email protected]>
1 parent ca39ad1 commit d41f734

File tree

5 files changed: +27 additions, -140 deletions

sklearn/conftest.py

Lines changed: 10 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@
1414
import numpy as np
1515
import pytest
1616
from _pytest.doctest import DoctestItem
17+
from scipy.datasets import face
1718
from threadpoolctl import threadpool_limits
1819

1920
from sklearn._min_dependencies import PYTEST_MIN_VERSION
@@ -56,24 +57,16 @@
5657
f" should have pytest >= {PYTEST_MIN_VERSION} installed."
5758
)
5859

59-
scipy_datasets_require_network = sp_version >= parse_version("1.10")
60-
6160

6261
def raccoon_face_or_skip():
63-
# SciPy >= 1.10 requires network to access to get data
64-
if scipy_datasets_require_network:
65-
run_network_tests = environ.get("SKLEARN_SKIP_NETWORK_TESTS", "1") == "0"
66-
if not run_network_tests:
67-
raise SkipTest("test is enabled when SKLEARN_SKIP_NETWORK_TESTS=0")
68-
69-
try:
70-
import pooch # noqa: F401
71-
except ImportError:
72-
raise SkipTest("test requires pooch to be installed")
73-
74-
from scipy.datasets import face
75-
else:
76-
from scipy.misc import face
62+
# SciPy requires network access to get data
63+
run_network_tests = environ.get("SKLEARN_SKIP_NETWORK_TESTS", "1") == "0"
64+
if not run_network_tests:
65+
raise SkipTest("test is enabled when SKLEARN_SKIP_NETWORK_TESTS=0")
66+
try:
67+
import pooch # noqa: F401
68+
except ImportError:
69+
raise SkipTest("test requires pooch to be installed")
7770

7871
return face(gray=True)
7972

@@ -91,8 +84,7 @@ def raccoon_face_or_skip():
9184
"fetch_species_distributions_fxt": fetch_species_distributions,
9285
}
9386

94-
if scipy_datasets_require_network:
95-
dataset_fetchers["raccoon_face_fxt"] = raccoon_face_or_skip
87+
dataset_fetchers["raccoon_face_fxt"] = raccoon_face_or_skip
9688

9789
_SKIP32_MARK = pytest.mark.skipif(
9890
environ.get("SKLEARN_RUN_FLOAT32_TESTS", "0") != "1",

sklearn/preprocessing/_data.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,6 @@
3535
validate_params,
3636
)
3737
from sklearn.utils.extmath import _incremental_mean_and_var, row_norms
38-
from sklearn.utils.fixes import _yeojohnson_lambda
3938
from sklearn.utils.sparsefuncs import (
4039
incr_mean_variance_axis,
4140
inplace_column_scale,
@@ -3595,8 +3594,8 @@ def _neg_log_likelihood(lmbda):
35953594
# the computation of lambda is influenced by NaNs so we need to
35963595
# get rid of them
35973596
x = x[~np.isnan(x)]
3598-
3599-
return _yeojohnson_lambda(_neg_log_likelihood, x)
3597+
_, lmbda = stats.yeojohnson(x, lmbda=None)
3598+
return lmbda
36003599

36013600
def _check_input(self, X, in_fit, check_positive=False, check_shape=False):
36023601
"""Validate the input before fit and transform.

sklearn/preprocessing/_polynomial.py

Lines changed: 15 additions & 69 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,6 @@
2929
)
3030
from sklearn.utils._mask import _get_mask
3131
from sklearn.utils._param_validation import Interval, StrOptions
32-
from sklearn.utils.fixes import parse_version, sp_version
3332
from sklearn.utils.stats import _weighted_percentile
3433
from sklearn.utils.validation import (
3534
FLOAT_DTYPES,
@@ -460,23 +459,6 @@ def transform(self, X):
460459
# edge case: deal with empty matrix
461460
XP = sparse.csr_matrix((n_samples, 0), dtype=X.dtype)
462461
else:
463-
# `scipy.sparse.hstack` breaks in scipy<1.9.2
464-
# when `n_output_features_ > max_int32`
465-
all_int32 = all(mat.indices.dtype == np.int32 for mat in to_stack)
466-
if (
467-
sp_version < parse_version("1.9.2")
468-
and self.n_output_features_ > max_int32
469-
and all_int32
470-
):
471-
raise ValueError( # pragma: no cover
472-
"In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`"
473-
" produces negative columns when:\n1. The output shape contains"
474-
" `n_cols` too large to be represented by a 32bit signed"
475-
" integer.\n2. All sub-matrices to be stacked have indices of"
476-
" dtype `np.int32`.\nTo avoid this error, either use a version"
477-
" of scipy `>=1.9.2` or alter the `PolynomialFeatures`"
478-
" transformer to produce fewer than 2^31 output features"
479-
)
480462
XP = sparse.hstack(to_stack, dtype=X.dtype, format="csr")
481463
elif sparse.issparse(X) and X.format == "csc" and self._max_degree < 4:
482464
return self.transform(X.tocsr()).tocsc()
@@ -1022,27 +1004,14 @@ def transform(self, X):
10221004
n_splines = self.bsplines_[0].c.shape[1]
10231005
degree = self.degree
10241006

1025-
# TODO: Remove this condition, once scipy 1.10 is the minimum version.
1026-
# Only scipy >= 1.10 supports design_matrix(.., extrapolate=..).
1027-
# The default (implicit in scipy < 1.10) is extrapolate=False.
1028-
scipy_1_10 = sp_version >= parse_version("1.10.0")
1029-
# Note: self.bsplines_[0].extrapolate is True for extrapolation in
1030-
# ["periodic", "continue"]
1031-
if scipy_1_10:
1032-
use_sparse = self.sparse_output
1033-
kwargs_extrapolate = {"extrapolate": self.bsplines_[0].extrapolate}
1034-
else:
1035-
use_sparse = self.sparse_output and not self.bsplines_[0].extrapolate
1036-
kwargs_extrapolate = dict()
1037-
10381007
# Note that scipy BSpline returns float64 arrays and converts input
10391008
# x=X[:, i] to c-contiguous float64.
10401009
n_out = self.n_features_out_ + n_features * (1 - self.include_bias)
10411010
if X.dtype in FLOAT_DTYPES:
10421011
dtype = X.dtype
10431012
else:
10441013
dtype = np.float64
1045-
if use_sparse:
1014+
if self.sparse_output:
10461015
output_list = []
10471016
else:
10481017
XBS = np.zeros((n_samples, n_out), dtype=dtype, order=self.order)
@@ -1071,7 +1040,7 @@ def transform(self, X):
10711040
else: # self.extrapolation in ("continue", "error")
10721041
x = X[:, feature_idx]
10731042

1074-
if use_sparse:
1043+
if self.sparse_output:
10751044
# We replace the nan values in the input column by some
10761045
# arbitrary, in-range, numerical value since
10771046
# BSpline.design_matrix() would otherwise raise on any nan
@@ -1093,8 +1062,11 @@ def transform(self, X):
10931062
elif nan_row_indices.shape[0] > 0:
10941063
x = x.copy() # avoid mutation of input data
10951064
x[nan_row_indices] = np.nanmin(x)
1065+
1066+
# Note: self.bsplines_[0].extrapolate is True for extrapolation in
1067+
# ["periodic", "continue"]
10961068
XBS_sparse = BSpline.design_matrix(
1097-
x, spl.t, spl.k, **kwargs_extrapolate
1069+
x, spl.t, spl.k, self.bsplines_[0].extrapolate
10981070
)
10991071

11001072
if self.extrapolation == "periodic":
@@ -1122,7 +1094,7 @@ def transform(self, X):
11221094
XBS[
11231095
nan_row_indices, output_feature_idx : output_feature_idx + 1
11241096
] = 0
1125-
if use_sparse:
1097+
if self.sparse_output:
11261098
XBS_sparse = XBS
11271099

11281100
else: # extrapolation in ("constant", "linear")
@@ -1135,7 +1107,7 @@ def transform(self, X):
11351107
X[:, feature_idx] <= xmax
11361108
)
11371109

1138-
if use_sparse:
1110+
if self.sparse_output:
11391111
outside_range_mask = ~inside_range_mask
11401112
x = X[:, feature_idx].copy()
11411113
# Set to some arbitrary value within the range of values
@@ -1162,7 +1134,7 @@ def transform(self, X):
11621134
# 'continue' is already returned as is by scipy BSplines
11631135
if self.extrapolation == "error":
11641136
has_nan_output_values = False
1165-
if use_sparse:
1137+
if self.sparse_output:
11661138
# Early convert to CSR as the sparsity structure of this
11671139
# block should not change anymore. This is needed to be able
11681140
# to safely assume that `.data` is a 1D array.
@@ -1187,7 +1159,7 @@ def transform(self, X):
11871159

11881160
below_xmin_mask = X[:, feature_idx] < xmin
11891161
if np.any(below_xmin_mask):
1190-
if use_sparse:
1162+
if self.sparse_output:
11911163
# Note: See comment about SparseEfficiencyWarning above.
11921164
XBS_sparse = XBS_sparse.tolil()
11931165
XBS_sparse[below_xmin_mask, :degree] = f_min[:degree]
@@ -1202,7 +1174,7 @@ def transform(self, X):
12021174

12031175
above_xmax_mask = X[:, feature_idx] > xmax
12041176
if np.any(above_xmax_mask):
1205-
if use_sparse:
1177+
if self.sparse_output:
12061178
# Note: See comment about SparseEfficiencyWarning above.
12071179
XBS_sparse = XBS_sparse.tolil()
12081180
XBS_sparse[above_xmax_mask, -degree:] = f_max[-degree:]
@@ -1235,7 +1207,7 @@ def transform(self, X):
12351207
f_min[j]
12361208
+ (X[below_xmin_mask, feature_idx] - xmin) * fp_min[j]
12371209
)
1238-
if use_sparse:
1210+
if self.sparse_output:
12391211
# Note: See comment about SparseEfficiencyWarning above.
12401212
XBS_sparse = XBS_sparse.tolil()
12411213
XBS_sparse[below_xmin_mask, j] = linear_extr
@@ -1251,7 +1223,7 @@ def transform(self, X):
12511223
f_max[k]
12521224
+ (X[above_xmax_mask, feature_idx] - xmax) * fp_max[k]
12531225
)
1254-
if use_sparse:
1226+
if self.sparse_output:
12551227
# Note: See comment about SparseEfficiencyWarning above.
12561228
XBS_sparse = XBS_sparse.tolil()
12571229
XBS_sparse[above_xmax_mask, k : k + 1] = linear_extr[
@@ -1262,38 +1234,12 @@ def transform(self, X):
12621234
linear_extr
12631235
)
12641236

1265-
if use_sparse:
1237+
if self.sparse_output:
12661238
XBS_sparse = XBS_sparse.tocsr()
12671239
output_list.append(XBS_sparse)
12681240

1269-
if use_sparse:
1270-
# TODO: Remove this conditional error when the minimum supported version of
1271-
# SciPy is 1.9.2
1272-
# `scipy.sparse.hstack` breaks in scipy<1.9.2
1273-
# when `n_features_out_ > max_int32`
1274-
max_int32 = np.iinfo(np.int32).max
1275-
all_int32 = True
1276-
for mat in output_list:
1277-
all_int32 &= mat.indices.dtype == np.int32
1278-
if (
1279-
sp_version < parse_version("1.9.2")
1280-
and self.n_features_out_ > max_int32
1281-
and all_int32
1282-
):
1283-
raise ValueError(
1284-
"In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`"
1285-
" produces negative columns when:\n1. The output shape contains"
1286-
" `n_cols` too large to be represented by a 32bit signed"
1287-
" integer.\n. All sub-matrices to be stacked have indices of"
1288-
" dtype `np.int32`.\nTo avoid this error, either use a version"
1289-
" of scipy `>=1.9.2` or alter the `SplineTransformer`"
1290-
" transformer to produce fewer than 2^31 output features"
1291-
)
1241+
if self.sparse_output:
12921242
XBS = sparse.hstack(output_list, format="csr")
1293-
elif self.sparse_output:
1294-
# TODO: Remove conversion to csr, once scipy 1.10 is the minimum version:
1295-
# Adjust format of XBS to sparse, for scipy versions < 1.10.0:
1296-
XBS = sparse.csr_matrix(XBS)
12971243

12981244
if self.include_bias:
12991245
return XBS

sklearn/preprocessing/tests/test_polynomial.py

Lines changed: 0 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -36,8 +36,6 @@
3636
from sklearn.utils.fixes import (
3737
CSC_CONTAINERS,
3838
CSR_CONTAINERS,
39-
parse_version,
40-
sp_version,
4139
)
4240

4341

@@ -1196,21 +1194,6 @@ def test_csr_polynomial_expansion_index_overflow(
11961194
pf.fit(X)
11971195
return
11981196

1199-
# When `n_features>=65535`, `scipy.sparse.hstack` may not use the right
1200-
# dtype for representing indices and indptr if `n_features` is still
1201-
# small enough so that each block matrix's indices and indptr arrays
1202-
# can be represented with `np.int32`. We test `n_features==65535`
1203-
# since it is guaranteed to run into this bug.
1204-
if (
1205-
sp_version < parse_version("1.9.2")
1206-
and n_features == 65535
1207-
and degree == 2
1208-
and not interaction_only
1209-
): # pragma: no cover
1210-
msg = r"In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`"
1211-
with pytest.raises(ValueError, match=msg):
1212-
X_trans = pf.fit_transform(X)
1213-
return
12141197
X_trans = pf.fit_transform(X)
12151198

12161199
expected_dtype = np.int64 if num_combinations > np.iinfo(np.int32).max else np.int32

sklearn/utils/fixes.py

Lines changed: 0 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,6 @@
1414
import scipy
1515
import scipy.sparse.linalg
1616
import scipy.stats
17-
from scipy import optimize
1817

1918
try:
2019
import pandas as pd
@@ -81,38 +80,6 @@ def _sparse_linalg_cg(A, b, **kwargs):
8180
return scipy.sparse.linalg.cg(A, b, **kwargs)
8281

8382

84-
# TODO : remove this when required minimum version of SciPy >= 1.9.0
85-
def _yeojohnson_lambda(_neg_log_likelihood, x):
86-
"""Estimate the optimal Yeo-Johnson transformation parameter (lambda).
87-
88-
This function provides a compatibility workaround for versions of SciPy
89-
older than 1.9.0, where `scipy.stats.yeojohnson` did not return
90-
the estimated lambda directly.
91-
92-
Parameters
93-
----------
94-
_neg_log_likelihood : callable
95-
A function that computes the negative log-likelihood of the Yeo-Johnson
96-
transformation for a given lambda. Used only for SciPy versions < 1.9.0.
97-
98-
x : array-like
99-
Input data to estimate the Yeo-Johnson transformation parameter.
100-
101-
Returns
102-
-------
103-
lmbda : float
104-
The estimated lambda parameter for the Yeo-Johnson transformation.
105-
"""
106-
min_scipy_version = "1.9.0"
107-
108-
if sp_version < parse_version(min_scipy_version):
109-
# choosing bracket -2, 2 like for boxcox
110-
return optimize.brent(_neg_log_likelihood, brack=(-2, 2))
111-
112-
_, lmbda = scipy.stats.yeojohnson(x, lmbda=None)
113-
return lmbda
114-
115-
11683
# TODO: Fuse the modern implementations of _sparse_min_max and _sparse_nan_min_max
11784
# into the public min_max_axis function when SciPy 1.11 is the minimum supported
11885
# version and delete the backport in the else branch below.

0 commit comments

Comments
 (0)