-
Notifications
You must be signed in to change notification settings - Fork 4k
[python-package] [dask] [ci] fix predict() type hints, enforce mypy in CI #7140
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 1 commit
f21e4b6
0965ed5
8e129cd
75a1bbd
fe7eb93
0fe8032
791da85
a6fd99b
8c761bb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -143,6 +143,10 @@ | |||||
| scipy.sparse.spmatrix, | ||||||
| List[scipy.sparse.spmatrix], | ||||||
| ] | ||||||
| _LGBM_PredictSparseReturnType = Union[ | ||||||
| scipy.sparse.spmatrix, | ||||||
| List[scipy.sparse.spmatrix], | ||||||
| ] | ||||||
| _LGBM_WeightType = Union[ | ||||||
| List[float], | ||||||
| List[int], | ||||||
|
|
@@ -1183,14 +1187,16 @@ def predict( | |||||
| preds = np.loadtxt(f.name, dtype=np.float64) | ||||||
| nrow = preds.shape[0] | ||||||
| elif isinstance(data, scipy.sparse.csr_matrix): | ||||||
| preds, nrow = self.__pred_for_csr( | ||||||
| # TODO: remove 'type: ignore[assignment]' when https://github.com/microsoft/LightGBM/pull/6348 is resolved. | ||||||
| preds, nrow = self.__pred_for_csr( # type: ignore[assignment] | ||||||
| csr=data, | ||||||
| start_iteration=start_iteration, | ||||||
| num_iteration=num_iteration, | ||||||
| predict_type=predict_type, | ||||||
| ) | ||||||
| elif isinstance(data, scipy.sparse.csc_matrix): | ||||||
| preds, nrow = self.__pred_for_csc( | ||||||
| # TODO: remove 'type: ignore[assignment]' when https://github.com/microsoft/LightGBM/pull/6348 is resolved. | ||||||
| preds, nrow = self.__pred_for_csc( # type: ignore[assignment] | ||||||
| csc=data, | ||||||
| start_iteration=start_iteration, | ||||||
| num_iteration=num_iteration, | ||||||
|
|
@@ -1227,7 +1233,8 @@ def predict( | |||||
| csr = scipy.sparse.csr_matrix(data) | ||||||
| except BaseException as err: | ||||||
| raise TypeError(f"Cannot predict data for type {type(data).__name__}") from err | ||||||
| preds, nrow = self.__pred_for_csr( | ||||||
| # TODO: remove 'type: ignore[assignment]' when https://github.com/microsoft/LightGBM/pull/6348 is resolved. | ||||||
| preds, nrow = self.__pred_for_csr( # type: ignore[assignment] | ||||||
| csr=csr, | ||||||
| start_iteration=start_iteration, | ||||||
| num_iteration=num_iteration, | ||||||
|
|
@@ -1245,6 +1252,7 @@ def predict( | |||||
|
|
||||||
| def __get_num_preds( | ||||||
| self, | ||||||
| *, | ||||||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Continuing the work I've been doing (e.g. #7111) to enforce more use of keyword-only arguments in internal functions, to make the data flow clearer. Touching this because some calls to |
||||||
| start_iteration: int, | ||||||
| num_iteration: int, | ||||||
| nrow: int, | ||||||
|
|
@@ -1328,7 +1336,12 @@ def __pred_for_np2d( | |||||
| sections = np.arange(_MAX_INT32, nrow, _MAX_INT32) | ||||||
| # __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal | ||||||
| n_preds = [ | ||||||
| self.__get_num_preds(start_iteration, num_iteration, i, predict_type) | ||||||
| self.__get_num_preds( | ||||||
| start_iteration=start_iteration, | ||||||
| num_iteration=num_iteration, | ||||||
| nrow=int(i), | ||||||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This cast to |
||||||
| predict_type=predict_type, | ||||||
| ) | ||||||
| for i in np.diff([0] + list(sections) + [nrow]) | ||||||
| ] | ||||||
| n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum() | ||||||
|
|
@@ -1364,7 +1377,7 @@ def __create_sparse_native( | |||||
| indptr_type: int, | ||||||
| data_type: int, | ||||||
| is_csr: bool, | ||||||
| ) -> Union[List[scipy.sparse.csc_matrix], List[scipy.sparse.csr_matrix]]: | ||||||
| ) -> _LGBM_PredictSparseReturnType: | ||||||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This type was wrong. The output isn't always a list: LightGBM/python-package/lightgbm/basic.py Lines 1415 to 1416 in 80ab6d3
That change results in all the other similar changes to |
||||||
| # create numpy array from output arrays | ||||||
| data_indices_len = out_shape[0] | ||||||
| indptr_len = out_shape[1] | ||||||
|
|
@@ -1472,7 +1485,7 @@ def __inner_predict_csr_sparse( | |||||
| start_iteration: int, | ||||||
| num_iteration: int, | ||||||
| predict_type: int, | ||||||
| ) -> Tuple[Union[List[scipy.sparse.csc_matrix], List[scipy.sparse.csr_matrix]], int]: | ||||||
| ) -> Tuple[_LGBM_PredictSparseReturnType, int]: | ||||||
| ptr_indptr, type_ptr_indptr, __ = _c_int_array(csr.indptr) | ||||||
| ptr_data, type_ptr_data, _ = _c_float_array(csr.data) | ||||||
| csr_indices = csr.indices.astype(np.int32, copy=False) | ||||||
|
|
@@ -1530,7 +1543,7 @@ def __pred_for_csr( | |||||
| start_iteration: int, | ||||||
| num_iteration: int, | ||||||
| predict_type: int, | ||||||
| ) -> Tuple[np.ndarray, int]: | ||||||
| ) -> Tuple[_LGBM_PredictSparseReturnType, int]: | ||||||
| """Predict for a CSR data.""" | ||||||
| if predict_type == _C_API_PREDICT_CONTRIB: | ||||||
| return self.__inner_predict_csr_sparse( | ||||||
|
|
@@ -1543,7 +1556,15 @@ def __pred_for_csr( | |||||
| if nrow > _MAX_INT32: | ||||||
| sections = [0] + list(np.arange(_MAX_INT32, nrow, _MAX_INT32)) + [nrow] | ||||||
| # __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal | ||||||
| n_preds = [self.__get_num_preds(start_iteration, num_iteration, i, predict_type) for i in np.diff(sections)] | ||||||
| n_preds = [ | ||||||
| self.__get_num_preds( | ||||||
| start_iteration=start_iteration, | ||||||
| num_iteration=num_iteration, | ||||||
| nrow=int(i), | ||||||
| predict_type=predict_type, | ||||||
| ) | ||||||
| for i in np.diff(sections) | ||||||
| ] | ||||||
| n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum() | ||||||
| preds = np.empty(sum(n_preds), dtype=np.float64) | ||||||
| for (start_idx, end_idx), (start_idx_pred, end_idx_pred) in zip( | ||||||
|
|
@@ -1573,7 +1594,7 @@ def __inner_predict_sparse_csc( | |||||
| start_iteration: int, | ||||||
| num_iteration: int, | ||||||
| predict_type: int, | ||||||
| ) -> Tuple[Union[List[scipy.sparse.csc_matrix], List[scipy.sparse.csr_matrix]], int]: | ||||||
| ) -> Tuple[_LGBM_PredictSparseReturnType, int]: | ||||||
| ptr_indptr, type_ptr_indptr, __ = _c_int_array(csc.indptr) | ||||||
| ptr_data, type_ptr_data, _ = _c_float_array(csc.data) | ||||||
| csc_indices = csc.indices.astype(np.int32, copy=False) | ||||||
|
|
@@ -1631,7 +1652,7 @@ def __pred_for_csc( | |||||
| start_iteration: int, | ||||||
| num_iteration: int, | ||||||
| predict_type: int, | ||||||
| ) -> Tuple[np.ndarray, int]: | ||||||
| ) -> Tuple[_LGBM_PredictSparseReturnType, int]: | ||||||
| """Predict for a CSC data.""" | ||||||
| nrow = csc.shape[0] | ||||||
| if nrow > _MAX_INT32: | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -62,7 +62,6 @@ | |||||||||||
| _DaskMatrixLike = Union[dask_Array, dask_DataFrame] | ||||||||||||
| _DaskVectorLike = Union[dask_Array, dask_Series] | ||||||||||||
| _DaskPart = Union[np.ndarray, pd_DataFrame, pd_Series, ss.spmatrix] | ||||||||||||
| _PredictionDtype = Union[Type[np.float32], Type[np.float64], Type[np.int32], Type[np.int64]] | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| class _RemoteSocket: | ||||||||||||
|
|
@@ -891,6 +890,15 @@ def _predict_part( | |||||||||||
|
|
||||||||||||
| # dask.DataFrame.map_partitions() expects each call to return a pandas DataFrame or Series | ||||||||||||
| if isinstance(part, pd_DataFrame): | ||||||||||||
| # assert that 'result' is an array, only necessary because predict(..., pred_contrib=True) on | ||||||||||||
| # sparse matrices returns a list. | ||||||||||||
| # | ||||||||||||
| # This can be removed when https://github.com/microsoft/LightGBM/pull/6348 is resolved. | ||||||||||||
| error_msg = ( | ||||||||||||
| f"predict(X) for lightgbm.dask estimators should always return an array, not '{type(result)}', when X is a pandas Dataframe. " | ||||||||||||
| "If you're seeing this message, it's a bug in lightgbm. Please report it at https://github.com/microsoft/LightGBM/issues." | ||||||||||||
| ) | ||||||||||||
| assert hasattr(result, "shape"), error_msg | ||||||||||||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Resolves this: We know that This type of workaround can be removed when #6348 is completed. |
||||||||||||
| if len(result.shape) == 2: | ||||||||||||
| result = pd_DataFrame(result, index=part.index) | ||||||||||||
| else: | ||||||||||||
|
|
@@ -908,7 +916,6 @@ def _predict( | |||||||||||
| pred_proba: bool = False, | ||||||||||||
| pred_leaf: bool = False, | ||||||||||||
| pred_contrib: bool = False, | ||||||||||||
| dtype: _PredictionDtype = np.float32, | ||||||||||||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Started looking into all this |
||||||||||||
| **kwargs: Any, | ||||||||||||
| ) -> Union[dask_Array, List[dask_Array]]: | ||||||||||||
| """Inner predict routine. | ||||||||||||
|
|
@@ -927,8 +934,6 @@ def _predict( | |||||||||||
| Whether to predict leaf index. | ||||||||||||
| pred_contrib : bool, optional (default=False) | ||||||||||||
| Whether to predict feature contributions. | ||||||||||||
| dtype : np.dtype, optional (default=np.float32) | ||||||||||||
| Dtype of the output. | ||||||||||||
| **kwargs | ||||||||||||
| Other parameters passed to ``predict`` or ``predict_proba`` method. | ||||||||||||
|
|
||||||||||||
|
|
@@ -1041,7 +1046,6 @@ def _extract(items: List[Any], i: int) -> Any: | |||||||||||
| predict_fn, | ||||||||||||
| chunks=chunks, | ||||||||||||
| meta=pred_row, | ||||||||||||
| dtype=dtype, | ||||||||||||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This The code looks like it's intended to allow setting the dtype of the output, but that's not how it works. Passing import dask.array as da
import numpy as np
x = da.arange(6, chunks=3)
x.map_blocks(lambda x: x * 2).compute().dtype
# dtype('int64')
x.map_blocks(lambda x: x * 2, dtype=np.float64).compute().dtype
# dtype('int64')Instead, it's just there to avoid Dask trying to infer the output dtype of whatever the function passed to See https://docs.dask.org/en/stable/_modules/dask/array/core.html#map_blocks
It should be safe to allow that type inference, because we're providing the LightGBM/python-package/lightgbm/dask.py Line 1033 in 80ab6d3
LightGBM/python-package/lightgbm/dask.py Lines 1040 to 1043 in 80ab6d3
That's nice because it also avoids needing to encode the logic of which output dtypes match to which mix of input types and
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||||||||||
| **map_blocks_kwargs, | ||||||||||||
| ) | ||||||||||||
| else: | ||||||||||||
|
|
@@ -1290,7 +1294,6 @@ def predict( | |||||||||||
| return _predict( | ||||||||||||
| model=self.to_local(), | ||||||||||||
| data=X, | ||||||||||||
| dtype=self.classes_.dtype, | ||||||||||||
| client=_get_dask_client(self.client), | ||||||||||||
| raw_score=raw_score, | ||||||||||||
| start_iteration=start_iteration, | ||||||||||||
|
|
||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1576,6 +1576,57 @@ def test_predict_with_raw_score(task, output, cluster): | |
| assert_eq(raw_predictions, pred_proba_raw) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("output", data_output) | ||
| @pytest.mark.parametrize("task", tasks) | ||
| def test_predict_returns_expected_dtypes(task, output, cluster): | ||
| if task == "ranking" and output == "scipy_csr_matrix": | ||
| pytest.skip("LGBMRanker is not currently tested on sparse matrices") | ||
|
|
||
| with Client(cluster) as client: | ||
| _, _, _, _, dX, dy, _, dg = _create_data(objective=task, output=output, group=None) | ||
|
|
||
| model_factory = task_to_dask_factory[task] | ||
| params = { | ||
| "client": client, | ||
| "n_estimators": 1, | ||
| "num_leaves": 2, | ||
| "time_out": 5, | ||
| "verbose": -1, | ||
| } | ||
| model = model_factory(**params) | ||
| model.fit(dX, dy, group=dg) | ||
|
|
||
| # use a small sub-sample (to keep the tests fast) | ||
| if output.startswith("dataframe"): | ||
| dX_sample = dX.sample(frac=0.001) | ||
| else: | ||
| dX_sample = dX[:1,] | ||
| dX_sample.persist() | ||
|
Comment on lines
+1599
to
+1604
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In my local testing (macOS, |
||
|
|
||
| # raw predictions are always float64 | ||
| raw_predictions = model.predict(dX_sample, raw_score=True).compute() | ||
| assert raw_predictions.dtype == np.float64 | ||
|
|
||
| # pred_contrib are always float64 | ||
| if output.startswith("scipy"): | ||
| preds_contrib = [arr.compute() for arr in model.predict(dX_sample, pred_contrib=True)] | ||
| assert all(arr.dtype == np.float64 for arr in preds_contrib) | ||
| else: | ||
| preds_contrib = model.predict(dX_sample, pred_contrib=True).compute() | ||
| assert preds_contrib.dtype == np.float64 | ||
|
|
||
| # pred_leaves are: | ||
| # | ||
| # - int64 for binary and multiclass classification | ||
| # - float64 for ranking and regression | ||
| # | ||
| preds_leaves = model.predict(dX_sample, pred_leaves=True).compute() | ||
| if task.endswith("classification"): | ||
| assert preds_leaves.dtype == np.int64 | ||
| else: | ||
| assert preds_leaves.dtype == np.float64 | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("output", data_output) | ||
| @pytest.mark.parametrize("use_init_score", [False, True]) | ||
| def test_predict_stump(output, use_init_score, cluster, rng): | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1897,7 +1897,7 @@ def test_predict_rejects_inputs_with_incorrect_number_of_features(predict_disabl | |
| assert preds.shape[0] == y.shape[0] | ||
|
|
||
|
|
||
| def run_minimal_test(X_type, y_type, g_type, task, rng): | ||
| def _run_minimal_test(*, X_type, y_type, g_type, task, rng): | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just a small cosmetic change...marking this internal and forcing the use of keyword arguments makes the calls a little stricter and clearer, in my opinion. |
||
| X, y, g = _create_data(task, n_samples=2_000) | ||
| weights = np.abs(rng.standard_normal(size=(y.shape[0],))) | ||
|
|
||
|
|
@@ -1987,6 +1987,7 @@ def run_minimal_test(X_type, y_type, g_type, task, rng): | |
| params_fit["eval_group"] = [g] | ||
| model.fit(**params_fit) | ||
|
|
||
| # --- prediction accuracy --# | ||
| preds = model.predict(X) | ||
| if task == "binary-classification": | ||
| assert accuracy_score(y, preds) >= 0.99 | ||
|
|
@@ -1999,6 +2000,27 @@ def run_minimal_test(X_type, y_type, g_type, task, rng): | |
| else: | ||
| raise ValueError(f"Unrecognized task: '{task}'") | ||
|
|
||
| # --- prediction dtypes ---# | ||
| # raw predictions are always float64 | ||
| assert model.predict(X, raw_score=True).dtype == np.float64 | ||
|
|
||
| # pred_contrib are always float64 | ||
| if X_type.startswith("scipy"): | ||
| assert all(arr.dtype == np.float64 for arr in model.predict(X, pred_contrib=True)) | ||
| else: | ||
| assert model.predict(X, pred_contrib=True).dtype == np.float64 | ||
|
|
||
| # pred_leaves are: | ||
| # | ||
| # - int64 for binary and multiclass classification | ||
| # - float64 for ranking and regression | ||
| # | ||
| preds_leaves = model.predict(X, pred_leaves=True) | ||
| if task.endswith("classification"): | ||
| assert preds_leaves.dtype == np.int64 | ||
| else: | ||
| assert preds_leaves.dtype == np.float64 | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("X_type", all_x_types) | ||
| @pytest.mark.parametrize("y_type", all_y_types) | ||
|
|
@@ -2014,7 +2036,7 @@ def test_classification_and_regression_minimally_work_with_all_accepted_data_typ | |
| if any(t.startswith("pa_") for t in [X_type, y_type]) and not PYARROW_INSTALLED: | ||
| pytest.skip("pyarrow is not installed") | ||
|
|
||
| run_minimal_test(X_type=X_type, y_type=y_type, g_type="numpy", task=task, rng=rng) | ||
| _run_minimal_test(X_type=X_type, y_type=y_type, g_type="numpy", task=task, rng=rng) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("X_type", all_x_types) | ||
|
|
@@ -2031,7 +2053,7 @@ def test_ranking_minimally_works_with_all_accepted_data_types( | |
| if any(t.startswith("pa_") for t in [X_type, y_type, g_type]) and not PYARROW_INSTALLED: | ||
| pytest.skip("pyarrow is not installed") | ||
|
|
||
| run_minimal_test(X_type=X_type, y_type=y_type, g_type=g_type, task="ranking", rng=rng) | ||
| _run_minimal_test(X_type=X_type, y_type=y_type, g_type=g_type, task="ranking", rng=rng) | ||
|
|
||
|
|
||
| def test_classifier_fit_detects_classes_every_time(): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After fixing the `Booster.__pred_for_csr()` and similar return type hints, `# type: ignore` comments like these are necessary to fix these `mypy` warnings.

Those are necessary because `mypy` assigns a type to `preds` the first time it's assigned.