diff --git a/dask_glm/algorithms.py b/dask_glm/algorithms.py
index 1320e7b..45f9c2d 100644
--- a/dask_glm/algorithms.py
+++ b/dask_glm/algorithms.py
@@ -11,7 +11,7 @@
 from scipy.optimize import fmin_l_bfgs_b
 
-from dask_glm.utils import dot, normalize, scatter_array, get_distributed_client
+from dask_glm.utils import dot, normalize, scatter_array, get_distributed_client, safe_zeros_like
 from dask_glm.families import Logistic
 from dask_glm.regularizers import Regularizer
 
 
@@ -97,7 +97,7 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
     stepSize = 1.0
     recalcRate = 10
     backtrackMult = firstBacktrackMult
-    beta = np.zeros_like(X._meta, shape=p)
+    beta = safe_zeros_like(X, shape=p)
 
     for k in range(max_iter):
         # how necessary is this recalculation?
@@ -161,7 +161,7 @@ def newton(X, y, max_iter=50, tol=1e-8, family=Logistic, **kwargs):
     """
     gradient, hessian = family.gradient, family.hessian
     n, p = X.shape
-    beta = np.zeros_like(X._meta, shape=p)
+    beta = safe_zeros_like(X, shape=p)
     Xbeta = dot(X, beta)
 
     iter_count = 0
@@ -387,7 +387,7 @@ def proximal_grad(X, y, regularizer='l1', lamduh=0.1, family=Logistic,
     stepSize = 1.0
     recalcRate = 10
     backtrackMult = firstBacktrackMult
-    beta = np.zeros_like(X._meta, shape=p)
+    beta = safe_zeros_like(X, shape=p)
     regularizer = Regularizer.get(regularizer)
 
     for k in range(max_iter):
diff --git a/dask_glm/tests/test_estimators.py b/dask_glm/tests/test_estimators.py
index d2212c4..fdded81 100644
--- a/dask_glm/tests/test_estimators.py
+++ b/dask_glm/tests/test_estimators.py
@@ -44,9 +44,14 @@ def test_pr_init(solver):
 
 
 @pytest.mark.parametrize('fit_intercept', [True, False])
-@pytest.mark.parametrize('is_sparse', [True, False])
-def test_fit(fit_intercept, is_sparse):
+@pytest.mark.parametrize('is_sparse,is_numpy', [
+    (True, False),
+    (False, False),
+    (False, True)])
+def test_fit(fit_intercept, is_sparse, is_numpy):
     X, y = make_classification(n_samples=100, n_features=5, chunksize=10, is_sparse=is_sparse)
+    if is_numpy:
+        X, y = dask.compute(X, y)
     lr = LogisticRegression(fit_intercept=fit_intercept)
     lr.fit(X, y)
     lr.predict(X)
diff --git a/dask_glm/utils.py b/dask_glm/utils.py
index 0fe3429..ebf0bc3 100644
--- a/dask_glm/utils.py
+++ b/dask_glm/utils.py
@@ -23,7 +23,7 @@ def normalize_inputs(X, y, *args, **kwargs):
                raise ValueError('Multiple constant columns detected!')
            mean[intercept_idx] = 0
            std[intercept_idx] = 1
-            mean = mean if len(intercept_idx[0]) else np.zeros_like(X._meta, shape=mean.shape)
+            mean = mean if len(intercept_idx[0]) else safe_zeros_like(X, shape=mean.shape)
            Xn = (X - mean) / std
            out = algo(Xn, y, *args, **kwargs).copy()
            i_adj = np.sum(out * mean / std)
@@ -41,7 +41,7 @@ def sigmoid(x):
 
 @dispatch(object)
 def exp(A):
-    return A.exp()
+    return np.exp(A)
 
 
 @dispatch(float)
@@ -91,7 +91,7 @@ def sign(A):
 
 @dispatch(object)
 def log1p(A):
-    return A.log1p()
+    return np.log1p(A)
 
 
 @dispatch(np.ndarray)
@@ -121,7 +121,7 @@ def is_dask_array_sparse(X):
     """
     Check using _meta if a dask array contains sparse arrays
     """
-    return isinstance(X._meta, sparse.SparseArray)
+    return isinstance(X, da.Array) and isinstance(X._meta, sparse.SparseArray)
 
 
 @dispatch(np.ndarray)
@@ -149,6 +149,11 @@ def add_intercept(X):
     return X_i
 
 
+@dispatch(object)
+def add_intercept(X):
+    return np.concatenate([X, np.ones_like(X, shape=(X.shape[0], 1))], axis=1)
+
+
 def make_y(X, beta=np.array([1.5, -3]), chunks=2):
     n, p = X.shape
     z0 = X.dot(beta)
@@ -205,3 +210,9 @@ def get_distributed_client():
         return get_client()
     except ValueError:
         return None
+
+
+def safe_zeros_like(X, shape):
+    if isinstance(X, da.Array):
+        return np.zeros_like(X._meta, shape=shape)
+    return np.zeros_like(X, shape=shape)
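
For reference, a minimal standalone sketch (not part of the patch) of how the new safe_zeros_like helper is expected to behave for dask versus plain NumPy inputs; it assumes only that numpy and dask.array are installed.

# Sketch: safe_zeros_like picks the zeros constructor based on the input type,
# mirroring the helper added to dask_glm/utils.py in this patch.
import numpy as np
import dask.array as da


def safe_zeros_like(X, shape):
    # For dask arrays, build zeros from the _meta array so the result keeps
    # the backing array type (e.g. a sparse meta yields sparse zeros);
    # for anything else, fall back to np.zeros_like on the array itself.
    if isinstance(X, da.Array):
        return np.zeros_like(X._meta, shape=shape)
    return np.zeros_like(X, shape=shape)


X_np = np.ones((10, 3))
X_da = da.ones((10, 3), chunks=5)

print(safe_zeros_like(X_np, shape=3))  # array([0., 0., 0.])
print(safe_zeros_like(X_da, shape=3))  # array([0., 0., 0.]), built from X_da._meta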