From 4bfdc0188e95bf42b44202e7b86e631665b6f778 Mon Sep 17 00:00:00 2001 From: Jiwei Liu Date: Wed, 28 Oct 2020 20:23:10 -0700 Subject: [PATCH 1/4] fix is_dask_array_sparse --- dask_glm/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_glm/utils.py b/dask_glm/utils.py index 0fe3429..7c22464 100644 --- a/dask_glm/utils.py +++ b/dask_glm/utils.py @@ -121,7 +121,7 @@ def is_dask_array_sparse(X): """ Check using _meta if a dask array contains sparse arrays """ - return isinstance(X._meta, sparse.SparseArray) + return isinstance(X, da.Array) and isinstance(X._meta, sparse.SparseArray) @dispatch(np.ndarray) From 2ad455ff199dc26dcea2b4e699b1b28e50a7b579 Mon Sep 17 00:00:00 2001 From: Jiwei Liu Date: Wed, 28 Oct 2020 20:59:09 -0700 Subject: [PATCH 2/4] numpy works. cupy works except admm & lbfgs --- dask_glm/algorithms.py | 8 ++++---- dask_glm/utils.py | 17 ++++++++++++++--- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/dask_glm/algorithms.py b/dask_glm/algorithms.py index 1320e7b..45f9c2d 100644 --- a/dask_glm/algorithms.py +++ b/dask_glm/algorithms.py @@ -11,7 +11,7 @@ from scipy.optimize import fmin_l_bfgs_b -from dask_glm.utils import dot, normalize, scatter_array, get_distributed_client +from dask_glm.utils import dot, normalize, scatter_array, get_distributed_client, safe_zeros_like from dask_glm.families import Logistic from dask_glm.regularizers import Regularizer @@ -97,7 +97,7 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs): stepSize = 1.0 recalcRate = 10 backtrackMult = firstBacktrackMult - beta = np.zeros_like(X._meta, shape=p) + beta = safe_zeros_like(X, shape=p) for k in range(max_iter): # how necessary is this recalculation? @@ -161,7 +161,7 @@ def newton(X, y, max_iter=50, tol=1e-8, family=Logistic, **kwargs): """ gradient, hessian = family.gradient, family.hessian n, p = X.shape - beta = np.zeros_like(X._meta, shape=p) + beta = safe_zeros_like(X, shape=p) Xbeta = dot(X, beta) iter_count = 0 @@ -387,7 +387,7 @@ def proximal_grad(X, y, regularizer='l1', lamduh=0.1, family=Logistic, stepSize = 1.0 recalcRate = 10 backtrackMult = firstBacktrackMult - beta = np.zeros_like(X._meta, shape=p) + beta = safe_zeros_like(X, shape=p) regularizer = Regularizer.get(regularizer) for k in range(max_iter): diff --git a/dask_glm/utils.py b/dask_glm/utils.py index 7c22464..ebf0bc3 100644 --- a/dask_glm/utils.py +++ b/dask_glm/utils.py @@ -23,7 +23,7 @@ def normalize_inputs(X, y, *args, **kwargs): raise ValueError('Multiple constant columns detected!') mean[intercept_idx] = 0 std[intercept_idx] = 1 - mean = mean if len(intercept_idx[0]) else np.zeros_like(X._meta, shape=mean.shape) + mean = mean if len(intercept_idx[0]) else safe_zeros_like(X, shape=mean.shape) Xn = (X - mean) / std out = algo(Xn, y, *args, **kwargs).copy() i_adj = np.sum(out * mean / std) @@ -41,7 +41,7 @@ def sigmoid(x): @dispatch(object) def exp(A): - return A.exp() + return np.exp(A) @dispatch(float) @@ -91,7 +91,7 @@ def sign(A): @dispatch(object) def log1p(A): - return A.log1p() + return np.log1p(A) @dispatch(np.ndarray) @@ -149,6 +149,11 @@ def add_intercept(X): return X_i +@dispatch(object) +def add_intercept(X): + return np.concatenate([X, np.ones_like(X, shape=(X.shape[0], 1))], axis=1) + + def make_y(X, beta=np.array([1.5, -3]), chunks=2): n, p = X.shape z0 = X.dot(beta) @@ -205,3 +210,9 @@ def get_distributed_client(): return get_client() except ValueError: return None + + +def safe_zeros_like(X, shape): + if isinstance(X, da.Array): + return np.zeros_like(X._meta, shape=shape) + return np.zeros_like(X, shape=shape) From 9a8170c11fc253bccaf8cbcfea95dc2b97aaf649 Mon Sep 17 00:00:00 2001 From: Jiwei Liu Date: Wed, 28 Oct 2020 22:49:07 -0700 Subject: [PATCH 3/4] add one test for numpy input --- dask_glm/tests/test_estimators.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dask_glm/tests/test_estimators.py b/dask_glm/tests/test_estimators.py index d2212c4..9a19b9b 100644 --- a/dask_glm/tests/test_estimators.py +++ b/dask_glm/tests/test_estimators.py @@ -45,8 +45,13 @@ def test_pr_init(solver): @pytest.mark.parametrize('fit_intercept', [True, False]) @pytest.mark.parametrize('is_sparse', [True, False]) -def test_fit(fit_intercept, is_sparse): +@pytest.mark.parametrize('is_numpy', [True, False]) +def test_fit(fit_intercept, is_sparse, is_numpy): X, y = make_classification(n_samples=100, n_features=5, chunksize=10, is_sparse=is_sparse) + if is_numpy: + if is_sparse: + return + X, y = dask.compute(X, y) lr = LogisticRegression(fit_intercept=fit_intercept) lr.fit(X, y) lr.predict(X) From 1af0b03c5f1daf3ead17ecbde07ade18296ea51c Mon Sep 17 00:00:00 2001 From: Jiwei Liu Date: Wed, 28 Oct 2020 23:03:16 -0700 Subject: [PATCH 4/4] fix test_fit --- dask_glm/tests/test_estimators.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dask_glm/tests/test_estimators.py b/dask_glm/tests/test_estimators.py index 9a19b9b..fdded81 100644 --- a/dask_glm/tests/test_estimators.py +++ b/dask_glm/tests/test_estimators.py @@ -44,13 +44,13 @@ def test_pr_init(solver): @pytest.mark.parametrize('fit_intercept', [True, False]) -@pytest.mark.parametrize('is_sparse', [True, False]) -@pytest.mark.parametrize('is_numpy', [True, False]) +@pytest.mark.parametrize('is_sparse,is_numpy', [ + (True, False), + (False, False), + (False, True)]) def test_fit(fit_intercept, is_sparse, is_numpy): X, y = make_classification(n_samples=100, n_features=5, chunksize=10, is_sparse=is_sparse) if is_numpy: - if is_sparse: - return X, y = dask.compute(X, y) lr = LogisticRegression(fit_intercept=fit_intercept) lr.fit(X, y)