4 changes: 3 additions & 1 deletion .pre-commit-config.yaml
@@ -100,9 +100,11 @@ repos:
rev: v1.19.1
hooks:
- id: mypy
args: ["--config-file", "python-package/pyproject.toml", "python-package/"]
pass_filenames: false
verbose: true
entry: bash -c 'mypy --config-file=./python-package/pyproject.toml ./python-package || true'
additional_dependencies:
- matplotlib>=3.9.1
- pandas>=2.0
- pyarrow>=17.0
- scikit-learn>=1.5.2
41 changes: 31 additions & 10 deletions python-package/lightgbm/basic.py
@@ -143,6 +143,10 @@
scipy.sparse.spmatrix,
List[scipy.sparse.spmatrix],
]
_LGBM_PredictSparseReturnType = Union[
scipy.sparse.spmatrix,
List[scipy.sparse.spmatrix],
]
_LGBM_WeightType = Union[
List[float],
List[int],
@@ -1183,14 +1187,16 @@ def predict(
preds = np.loadtxt(f.name, dtype=np.float64)
nrow = preds.shape[0]
elif isinstance(data, scipy.sparse.csr_matrix):
preds, nrow = self.__pred_for_csr(
# TODO: remove 'type: ignore[assignment]' when https://github.com/microsoft/LightGBM/pull/6348 is resolved.
@jameslamb (Collaborator Author) commented:

After fixing the Booster.__pred_for_csr() and similar return type hints, # type: ignore comments like these are necessary to fix these mypy warnings:

basic.py:1190: error: Incompatible types in assignment (expression has type "Any | list[Any]", variable has type "ndarray[tuple[Any, ...], dtype[float64]]")  [assignment]
basic.py:1197: error: Incompatible types in assignment (expression has type "Any | list[Any]", variable has type "ndarray[tuple[Any, ...], dtype[float64]]")  [assignment]
basic.py:1234: error: Incompatible types in assignment (expression has type "Any | list[Any]", variable has type "ndarray[tuple[Any, ...], dtype[float64]]")  [assignment]

Those are necessary because mypy infers the type of preds from its first assignment and expects all later assignments to match it.

This is the type of complexity that can go away once #6348 is completed (I'm planning to return to that soon).
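
A minimal illustration of that first-assignment behavior (hypothetical names, not code from this PR):

from typing import List, Union

import numpy as np

def predict_like(as_list: bool) -> Union[np.ndarray, List[np.ndarray]]:
    # stand-in for a method that, like __pred_for_csr(), may return a list
    return [np.zeros(1)] if as_list else np.zeros(1)

preds = np.empty(3, dtype=np.float64)  # mypy pins 'preds' to ndarray here
preds = predict_like(True)  # error: Incompatible types in assignment  [assignment]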

preds, nrow = self.__pred_for_csr( # type: ignore[assignment]
csr=data,
start_iteration=start_iteration,
num_iteration=num_iteration,
predict_type=predict_type,
)
elif isinstance(data, scipy.sparse.csc_matrix):
preds, nrow = self.__pred_for_csc(
# TODO: remove 'type: ignore[assignment]' when https://github.com/microsoft/LightGBM/pull/6348 is resolved.
preds, nrow = self.__pred_for_csc( # type: ignore[assignment]
csc=data,
start_iteration=start_iteration,
num_iteration=num_iteration,
@@ -1227,7 +1233,8 @@ def predict(
csr = scipy.sparse.csr_matrix(data)
except BaseException as err:
raise TypeError(f"Cannot predict data for type {type(data).__name__}") from err
preds, nrow = self.__pred_for_csr(
# TODO: remove 'type: ignore[assignment]' when https://github.com/microsoft/LightGBM/pull/6348 is resolved.
preds, nrow = self.__pred_for_csr( # type: ignore[assignment]
csr=csr,
start_iteration=start_iteration,
num_iteration=num_iteration,
@@ -1245,6 +1252,7 @@

def __get_num_preds(
self,
*,
@jameslamb (Collaborator Author) commented on Jan 25, 2026:

Continuing the work I've been doing (e.g. #7111) to enforce more use of keyword-only arguments in internal functions, to make the data flow clearer.

Touching this because some calls to __get_num_preds() were implicated in mypy warnings.
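
For reference, a minimal sketch of the keyword-only pattern (illustrative signature, not the full one): every argument after the bare * must be passed by keyword.

def get_num_preds(*, start_iteration: int, num_iteration: int, nrow: int, predict_type: int) -> int:
    # hypothetical stand-in for __get_num_preds()
    return 0

get_num_preds(start_iteration=0, num_iteration=10, nrow=5, predict_type=0)  # OK
# get_num_preds(0, 10, 5, 0)  # TypeError: takes 0 positional arguments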

start_iteration: int,
num_iteration: int,
nrow: int,
@@ -1328,7 +1336,12 @@ def __pred_for_np2d(
sections = np.arange(_MAX_INT32, nrow, _MAX_INT32)
# __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal
n_preds = [
self.__get_num_preds(start_iteration, num_iteration, i, predict_type)
self.__get_num_preds(
start_iteration=start_iteration,
num_iteration=num_iteration,
nrow=int(i),
@jameslamb (Collaborator Author) commented:

This conversion to int() and the other one like it fix these mypy warnings:

basic.py:1331: error: Argument 3 to "__get_num_preds" of "_InnerPredictor" has incompatible type "signedinteger[_32Bit | _64Bit]"; expected "int"  [arg-type]
basic.py:1546: error: Argument 3 to "__get_num_preds" of "_InnerPredictor" has incompatible type "signedinteger[_32Bit | _64Bit]"; expected "int"  [arg-type]
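
A quick illustration of why the conversion is needed (sketch, not from the PR): iterating over np.diff() yields numpy scalar integers, not instances of the builtin int.

import numpy as np

i = np.diff([0, 5])[0]
print(type(i))             # e.g. <class 'numpy.int64'> -- a numpy signedinteger, platform-dependent
print(isinstance(i, int))  # False: numpy integers don't subclass the builtin int
print(type(int(i)))        # <class 'int'> -- satisfies the 'nrow: int' annotation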

predict_type=predict_type,
)
for i in np.diff([0] + list(sections) + [nrow])
]
n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum()
@@ -1364,7 +1377,7 @@ def __create_sparse_native(
indptr_type: int,
data_type: int,
is_csr: bool,
) -> Union[List[scipy.sparse.csc_matrix], List[scipy.sparse.csr_matrix]]:
) -> _LGBM_PredictSparseReturnType:
@jameslamb (Collaborator Author) commented on Jan 25, 2026:

This type was wrong.

The output isn't always a list:

if len(cs_output_matrices) == 1:
    return cs_output_matrices[0]

That change results in all the other similar _LGBM_PredictSparseReturnType changes in this file.
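
For context, a self-contained sketch of the corrected shape (the alias is from this PR; collapse() is a hypothetical stand-in for __create_sparse_native()):

from typing import List, Union

import scipy.sparse

_LGBM_PredictSparseReturnType = Union[
    scipy.sparse.spmatrix,
    List[scipy.sparse.spmatrix],
]

def collapse(cs_output_matrices: List[scipy.sparse.csr_matrix]) -> _LGBM_PredictSparseReturnType:
    # mirrors the early return quoted above: one matrix, not a 1-element list
    if len(cs_output_matrices) == 1:
        return cs_output_matrices[0]
    return cs_output_matrices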

# create numpy array from output arrays
data_indices_len = out_shape[0]
indptr_len = out_shape[1]
@@ -1472,7 +1485,7 @@ def __inner_predict_csr_sparse(
start_iteration: int,
num_iteration: int,
predict_type: int,
) -> Tuple[Union[List[scipy.sparse.csc_matrix], List[scipy.sparse.csr_matrix]], int]:
) -> Tuple[_LGBM_PredictSparseReturnType, int]:
ptr_indptr, type_ptr_indptr, __ = _c_int_array(csr.indptr)
ptr_data, type_ptr_data, _ = _c_float_array(csr.data)
csr_indices = csr.indices.astype(np.int32, copy=False)
@@ -1530,7 +1543,7 @@ def __pred_for_csr(
start_iteration: int,
num_iteration: int,
predict_type: int,
) -> Tuple[np.ndarray, int]:
) -> Tuple[_LGBM_PredictSparseReturnType, int]:
"""Predict for a CSR data."""
if predict_type == _C_API_PREDICT_CONTRIB:
return self.__inner_predict_csr_sparse(
Expand All @@ -1543,7 +1556,15 @@ def __pred_for_csr(
if nrow > _MAX_INT32:
sections = [0] + list(np.arange(_MAX_INT32, nrow, _MAX_INT32)) + [nrow]
# __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal
n_preds = [self.__get_num_preds(start_iteration, num_iteration, i, predict_type) for i in np.diff(sections)]
n_preds = [
self.__get_num_preds(
start_iteration=start_iteration,
num_iteration=num_iteration,
nrow=int(i),
predict_type=predict_type,
)
for i in np.diff(sections)
]
n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum()
preds = np.empty(sum(n_preds), dtype=np.float64)
for (start_idx, end_idx), (start_idx_pred, end_idx_pred) in zip(
@@ -1573,7 +1594,7 @@ def __inner_predict_sparse_csc(
start_iteration: int,
num_iteration: int,
predict_type: int,
) -> Tuple[Union[List[scipy.sparse.csc_matrix], List[scipy.sparse.csr_matrix]], int]:
) -> Tuple[_LGBM_PredictSparseReturnType, int]:
ptr_indptr, type_ptr_indptr, __ = _c_int_array(csc.indptr)
ptr_data, type_ptr_data, _ = _c_float_array(csc.data)
csc_indices = csc.indices.astype(np.int32, copy=False)
@@ -1631,7 +1652,7 @@ def __pred_for_csc(
start_iteration: int,
num_iteration: int,
predict_type: int,
) -> Tuple[np.ndarray, int]:
) -> Tuple[_LGBM_PredictSparseReturnType, int]:
"""Predict for a CSC data."""
nrow = csc.shape[0]
if nrow > _MAX_INT32:
15 changes: 9 additions & 6 deletions python-package/lightgbm/dask.py
@@ -62,7 +62,6 @@
_DaskMatrixLike = Union[dask_Array, dask_DataFrame]
_DaskVectorLike = Union[dask_Array, dask_Series]
_DaskPart = Union[np.ndarray, pd_DataFrame, pd_Series, ss.spmatrix]
_PredictionDtype = Union[Type[np.float32], Type[np.float64], Type[np.int32], Type[np.int64]]


class _RemoteSocket:
@@ -891,6 +890,15 @@ def _predict_part(

# dask.DataFrame.map_partitions() expects each call to return a pandas DataFrame or Series
if isinstance(part, pd_DataFrame):
# assert that 'result' is an array, only necessary because predict(..., pred_contrib=True) on
# sparse matrices returns a list.
#
# This can be removed when https://github.com/microsoft/LightGBM/pull/6348 is resolved.
error_msg = (
f"predict(X) for lightgbm.dask estimators should always return an array, not '{type(result)}', when X is a pandas Dataframe. "
"If you're seeing this message, it's a bug in lightgbm. Please report it at https://github.com/microsoft/LightGBM/issues."
)
assert hasattr(result, "shape"), error_msg
@jameslamb (Collaborator Author) commented:

Resolves this:

dask.py:894: error: Item "list[Any]" of "ndarray[tuple[Any, ...], dtype[Any]] | Any | list[Any]" has no attribute "shape"  [union-attr]

We know that predict() can only return a list if the input is a scipy sparse matrix, but mypy doesn't. It sees that a list is technically a possible output type, and correctly warns that a list doesn't have a .shape attribute.

This type of workaround can be removed when #6348 is completed.
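
A minimal sketch of the narrowing (illustrative function, not the PR's code); recent mypy releases narrow a Union based on hasattr() checks:

from typing import List, Union

import numpy as np

def ndim_of(result: Union[np.ndarray, List[np.ndarray]]) -> int:
    # without the assert, mypy reports:
    #   Item "list[Any]" of "..." has no attribute "shape"  [union-attr]
    assert hasattr(result, "shape")
    return len(result.shape)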

if len(result.shape) == 2:
result = pd_DataFrame(result, index=part.index)
else:
@@ -908,7 +916,6 @@ def _predict(
pred_proba: bool = False,
pred_leaf: bool = False,
pred_contrib: bool = False,
dtype: _PredictionDtype = np.float32,
@jameslamb (Collaborator Author) commented:

Started looking into all this dtype stuff in the Dask estimators because of this mypy warning:

dask.py:1293: error: Argument "dtype" to "_predict" has incompatible type "dtype[Any]"; expected "type[floating[_32Bit]] | type[float64] | type[signedinteger[_32Bit]] | type[signedinteger[_64Bit]]"  [arg-type]

**kwargs: Any,
) -> Union[dask_Array, List[dask_Array]]:
"""Inner predict routine.
@@ -927,8 +934,6 @@
Whether to predict leaf index.
pred_contrib : bool, optional (default=False)
Whether to predict feature contributions.
dtype : np.dtype, optional (default=np.float32)
Dtype of the output.
**kwargs
Other parameters passed to ``predict`` or ``predict_proba`` method.

@@ -1041,7 +1046,6 @@ def _extract(items: List[Any], i: int) -> Any:
predict_fn,
chunks=chunks,
meta=pred_row,
dtype=dtype,
@jameslamb (Collaborator Author) commented on Jan 25, 2026:

This dtype argument was brought over from dask-lightgbm (#3515).

The code looks like it's intended to allow setting the dtype of the output, but that's not how it works.

Passing dtype to map_blocks() does not change the dtype of the output. Consider the following:

import dask.array as da
import numpy as np
x = da.arange(6, chunks=3)

x.map_blocks(lambda x: x * 2).compute().dtype
# dtype('int64')

x.map_blocks(lambda x: x * 2, dtype=np.float64).compute().dtype
# dtype('int64')

Instead, it's just there to avoid Dask trying to infer the output dtype of whatever the function passed to map_blocks() returns.

See https://docs.dask.org/en/stable/_modules/dask/array/core.html#map_blocks

dtype : np.dtype, optional
    The dtype of the output array. It is recommended to provide this. If not provided, will be inferred by applying the function to a small set of fake data.

It should be safe to allow that type inference, because we're providing the meta input, which is the result of calling predict() on a single row of input.

pred_row = predict_fn(data_row)  # type: ignore[misc]

return data.map_blocks(
    predict_fn,
    chunks=chunks,
    meta=pred_row,
That's nice because it also avoids needing to encode the logic of which output dtypes match to which mix of input types and raw_score / pred_contrib / pred_leaf.
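
As a small supporting sketch (standalone, using only public dask.array APIs): providing meta lets Dask take the output metadata directly from it, with no inference step.

import dask.array as da
import numpy as np

x = da.arange(6, chunks=3)
meta = np.empty((0,), dtype=np.float64)
y = x.map_blocks(lambda b: b.astype(np.float64), meta=meta)
print(y.dtype)  # float64 -- taken from meta, no inference on fake data needed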

@jameslamb (Collaborator Author) commented:

If you have time, @jmoralez @ffineis, I'd appreciate it if one of you could double-check my claims in this comment.

**map_blocks_kwargs,
)
else:
@@ -1290,7 +1294,6 @@ def predict(
return _predict(
model=self.to_local(),
data=X,
dtype=self.classes_.dtype,
client=_get_dask_client(self.client),
raw_score=raw_score,
start_iteration=start_iteration,
60 changes: 60 additions & 0 deletions tests/python_package_test/test_dask.py
@@ -1576,6 +1576,66 @@ def test_predict_with_raw_score(task, output, cluster):
assert_eq(raw_predictions, pred_proba_raw)


@pytest.mark.parametrize("output", data_output)
@pytest.mark.parametrize("task", tasks)
def test_predict_returns_expected_dtypes(task, output, cluster):
if task == "ranking" and output == "scipy_csr_matrix":
pytest.skip("LGBMRanker is not currently tested on sparse matrices")

with Client(cluster) as client:
_, _, _, _, dX, dy, _, dg = _create_data(objective=task, output=output, group=None)

model_factory = task_to_dask_factory[task]
params = {
"client": client,
"n_estimators": 1,
"num_leaves": 2,
"time_out": 5,
"verbose": -1,
}
model = model_factory(**params)
model.fit(dX, dy, group=dg)

# use a small sub-sample (to keep the tests fast)
if output.startswith("dataframe"):
dX_sample = dX.sample(frac=0.001)
else:
dX_sample = dX[:1,]
dX_sample = dX_sample.persist()
@jameslamb (Collaborator Author) commented on lines +1599 to +1604:

In my local testing (macOS, dask==2024.11.2), this cut the total time for all new test cases here from 65s to around 10s.


# default predictions:
#
# * classification: int32 or int64
# * ranking: float64
# * regression: float64
#
preds = model.predict(dX_sample).compute()
if task.endswith("classification"):
# preds go through LabelEncoder.inverse_transform() and have the same
# dtype as model.classes_ (expected to be an integer type, but exact size
# varies across numpy versions and operating systems)
assert preds.dtype == model.classes_.dtype
assert preds.dtype in (np.int32, np.int64)
else:
assert preds.dtype == np.float64

# raw predictions: always float64
preds_raw = model.predict(dX_sample, raw_score=True).compute()
assert preds_raw.dtype == np.float64

# pred_contrib: always float64
if output.startswith("scipy"):
preds_contrib = [arr.compute() for arr in model.predict(dX_sample, pred_contrib=True)]
assert all(arr.dtype == np.float64 for arr in preds_contrib)
else:
preds_contrib = model.predict(dX_sample, pred_contrib=True).compute()
assert preds_contrib.dtype == np.float64

# pred_leaf: always int32
preds_leaves = model.predict(dX_sample, pred_leaf=True).compute()
assert preds_leaves.dtype == np.int32


@pytest.mark.parametrize("output", data_output)
@pytest.mark.parametrize("use_init_score", [False, True])
def test_predict_stump(output, use_init_score, cluster, rng):
Expand Down
39 changes: 36 additions & 3 deletions tests/python_package_test/test_sklearn.py
@@ -1897,7 +1897,7 @@ def test_predict_rejects_inputs_with_incorrect_number_of_features(predict_disabl
assert preds.shape[0] == y.shape[0]


def run_minimal_test(X_type, y_type, g_type, task, rng):
def _run_minimal_test(*, X_type, y_type, g_type, task, rng):
@jameslamb (Collaborator Author) commented:

Just a small cosmetic change: marking this internal and forcing the use of keyword arguments makes the calls a little stricter and clearer, in my opinion.

X, y, g = _create_data(task, n_samples=2_000)
weights = np.abs(rng.standard_normal(size=(y.shape[0],)))

@@ -1987,6 +1987,7 @@ def run_minimal_test(X_type, y_type, g_type, task, rng):
params_fit["eval_group"] = [g]
model.fit(**params_fit)

# --- prediction accuracy ---#
preds = model.predict(X)
if task == "binary-classification":
assert accuracy_score(y, preds) >= 0.99
Expand All @@ -1999,6 +2000,38 @@ def run_minimal_test(X_type, y_type, g_type, task, rng):
else:
raise ValueError(f"Unrecognized task: '{task}'")

# --- prediction dtypes ---#

# default predictions:
#
# * classification: int32 or int64
# * ranking: float64
# * regression: float64
#
if task.endswith("classification"):
# preds go through LabelEncoder.inverse_transform() and have the same
# dtype as model.classes_ (expected to be an integer type, but exact size
# varies across numpy versions and operating systems)
assert preds.dtype == model.classes_.dtype
assert preds.dtype in (np.int32, np.int64)
else:
assert preds.dtype == np.float64

# raw predictions: always float64
preds_raw = model.predict(X, raw_score=True)
assert preds_raw.dtype == np.float64

# pred_contrib: always float64
if X_type.startswith("scipy"):
assert all(arr.dtype == np.float64 for arr in model.predict(X, pred_contrib=True))
else:
preds_contrib = model.predict(X, pred_contrib=True)
assert preds_contrib.dtype == np.float64

# pred_leaf: always int32
preds_leaves = model.predict(X, pred_leaf=True)
assert preds_leaves.dtype == np.int32


@pytest.mark.parametrize("X_type", all_x_types)
@pytest.mark.parametrize("y_type", all_y_types)
@@ -2014,7 +2047,7 @@ def test_classification_and_regression_minimally_work_with_all_accepted_data_typ
if any(t.startswith("pa_") for t in [X_type, y_type]) and not PYARROW_INSTALLED:
pytest.skip("pyarrow is not installed")

run_minimal_test(X_type=X_type, y_type=y_type, g_type="numpy", task=task, rng=rng)
_run_minimal_test(X_type=X_type, y_type=y_type, g_type="numpy", task=task, rng=rng)


@pytest.mark.parametrize("X_type", all_x_types)
Expand All @@ -2031,7 +2064,7 @@ def test_ranking_minimally_works_with_all_accepted_data_types(
if any(t.startswith("pa_") for t in [X_type, y_type, g_type]) and not PYARROW_INSTALLED:
pytest.skip("pyarrow is not installed")

run_minimal_test(X_type=X_type, y_type=y_type, g_type=g_type, task="ranking", rng=rng)
_run_minimal_test(X_type=X_type, y_type=y_type, g_type=g_type, task="ranking", rng=rng)


def test_classifier_fit_detects_classes_every_time():