From 78ef281968d7e0ad1441e0af30fec63760a5c497 Mon Sep 17 00:00:00 2001 From: Pavan Ramkumar Date: Sun, 16 Aug 2020 12:13:10 -0700 Subject: [PATCH 01/24] migrate gaussian distr to subclass of BaseDistribution --- pyglmnet/distributions.py | 80 +++++++++++++++++++++++ pyglmnet/metrics.py | 9 ++- pyglmnet/pyglmnet.py | 130 +++++++++++++++++++++++--------------- pyglmnet/utils.py | 10 ++- 4 files changed, 171 insertions(+), 58 deletions(-) create mode 100644 pyglmnet/distributions.py diff --git a/pyglmnet/distributions.py b/pyglmnet/distributions.py new file mode 100644 index 00000000..8a96c96e --- /dev/null +++ b/pyglmnet/distributions.py @@ -0,0 +1,80 @@ +"""Exponential family distributions for GLM estimators.""" + +from abc import ABC, abstractmethod +import numpy as np + + +class BaseDistribution(ABC): + """Base class for distributions.""" + + def __init__(self): + """init.""" + pass + + @abstractmethod + def mu(self, z): + """Inverse link function.""" + pass + + @abstractmethod + def grad_mu(self, z): + """Gradient of inverse link.""" + pass + + @abstractmethod + def log_likelihood(self, y, y_hat): + """Log L2-penalized likelihood.""" + pass + + def grad_log_likelihood(self): + """Gradient of L2-penalized log likelihood.""" + msg = (f"""Gradients of log likelihood are not specified for {self.__class__} distribution""") # noqa + raise NotImplementedError(msg) + + def gradhess_log_likelihood_1d(self): + """One-dimensional Gradient and Hessian of log likelihood.""" + msg = (f"""Gradients and Hessians of 1-d log likelihood are not specified for {self.__class__} distribution""") # noqa + raise NotImplementedError(msg) + + def _z(self, beta0, beta, X): + """Compute z to be passed through non-linearity.""" + return beta0 + np.dot(X, beta) + + +class Gaussian(BaseDistribution): + """Base class for distributions.""" + + def __init__(self): + """init.""" + pass + + def mu(self, z): + """Inverse link function.""" + mu = z + return mu + + def grad_mu(self, z): + """Gradient of inverse link.""" + grad_mu = np.ones_like(z) + return grad_mu + + def log_likelihood(self, y, y_hat): + """Log L2-penalized likelihood.""" + log_likelihood = -0.5 * np.sum((y - y_hat) ** 2) + return log_likelihood + + def grad_log_likelihood(self, X, y, beta0, beta): + """Gradient of L2-penalized log likelihood.""" + z = self._z(beta0, beta, X) + mu = self.mu(z) + grad_mu = self.grad_mu(z) + grad_beta0 = np.sum((mu - y) * grad_mu) + grad_beta = np.dot((mu - y).T, X * grad_mu[:, None]).T + return grad_beta0, grad_beta + + def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): + """One-dimensional Gradient and Hessian of log likelihood.""" + z = self._z(beta0, beta, X) + gk = np.sum((z - y) * xk) + hk = np.sum(xk * xk) + return gk, hk diff --git a/pyglmnet/metrics.py b/pyglmnet/metrics.py index 27bd1080..60225b9f 100644 --- a/pyglmnet/metrics.py +++ b/pyglmnet/metrics.py @@ -28,7 +28,8 @@ def deviance(y, yhat, distr, theta): else: LS = 0 - L1 = _logL(distr, y, yhat, theta=theta) + # L1 = _logL(distr, y, yhat, theta=theta) + L1 = distr.log_likelihood(y, yhat) score = -2 * (L1 - LS) return score @@ -60,8 +61,10 @@ def pseudo_R2(X, y, yhat, ynull_, distr, theta): else: LS = 0 - L0 = _logL(distr, y, ynull_, theta=theta) - L1 = _logL(distr, y, yhat, theta=theta) + # L0 = _logL(distr, y, ynull_, theta=theta) + # L1 = _logL(distr, y, yhat, theta=theta) + L0 = distr.log_likelihood(y, ynull_) + L1 = distr.log_likelihood(y, yhat) if distr in ['softplus', 'poisson', 'neg-binomial']: score = (1 - (LS - L1) / (LS - L0)) diff --git 
a/pyglmnet/pyglmnet.py b/pyglmnet/pyglmnet.py index 9e00a9c1..5a18950a 100644 --- a/pyglmnet/pyglmnet.py +++ b/pyglmnet/pyglmnet.py @@ -14,10 +14,22 @@ from .externals.sklearn.utils import check_random_state, check_array, check_X_y from .externals.sklearn.utils.validation import check_is_fitted +from .distributions import BaseDistribution, Gaussian + ALLOWED_DISTRS = ['gaussian', 'binomial', 'softplus', 'poisson', 'probit', 'gamma', 'neg-binomial'] +def _get_distr(distr): + if isinstance(distr, BaseDistribution): + return distr + + elif distr == "gaussian": + distr = Gaussian() + + return distr + + def _probit_g1(z, pdfz, cdfz, thresh=5): res = np.zeros_like(z) res[z < -thresh] = np.log(-pdfz[z < -thresh] / z[z < -thresh]) @@ -244,51 +256,59 @@ def _grad_L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, theta, beta, Tau = np.eye(beta.shape[0]) InvCov = np.dot(Tau.T, Tau) - z = _z(beta[0], beta[1:], X, fit_intercept) - mu = _mu(distr, z, eta, fit_intercept) - grad_mu = _grad_mu(distr, z, eta) - - grad_beta0 = 0. - if distr in ['poisson', 'softplus']: - if fit_intercept: - grad_beta0 = np.sum(grad_mu) - np.sum(y * grad_mu / mu) - grad_beta = ((np.dot(grad_mu.T, X) - - np.dot((y * grad_mu / mu).T, X)).T) - - elif distr == 'gaussian': - if fit_intercept: - grad_beta0 = np.sum((mu - y) * grad_mu) - grad_beta = np.dot((mu - y).T, X * grad_mu[:, None]).T - - elif distr == 'binomial': - if fit_intercept: - grad_beta0 = np.sum(mu - y) - grad_beta = np.dot((mu - y).T, X).T - - elif distr == 'probit': - grad_logl = (y * _probit_g3(z, grad_mu, mu) - - (1 - y) * _probit_g4(z, grad_mu, mu)) - if fit_intercept: - grad_beta0 = -np.sum(grad_logl) - grad_beta = -np.dot(grad_logl.T, X).T + # z = _z(beta[0], beta[1:], X, fit_intercept) + # mu = _mu(distr, z, eta, fit_intercept) + # grad_mu = _grad_mu(distr, z, eta) - elif distr == 'gamma': - nu = 1. - grad_logl = (y / mu ** 2 - 1 / mu) * grad_mu - if fit_intercept: - grad_beta0 = -nu * np.sum(grad_logl) - grad_beta = -nu * np.dot(grad_logl.T, X).T - - elif distr == 'neg-binomial': - partial_beta_0 = grad_mu * ((theta + y) / (mu + theta) - y / mu) - - if fit_intercept: - grad_beta0 = np.sum(partial_beta_0) - - grad_beta = np.dot(partial_beta_0.T, X) + if fit_intercept: + beta0_, beta_ = beta[0], beta[1:] + else: + beta0_, beta_ = 0., beta + grad_beta0, grad_beta = distr.grad_log_likelihood(X, y, beta0_, beta_) + grad_beta0 = grad_beta0 if fit_intercept else 0. + + # grad_beta0 = 0. + # if distr in ['poisson', 'softplus']: + # if fit_intercept: + # grad_beta0 = np.sum(grad_mu) - np.sum(y * grad_mu / mu) + # grad_beta = ((np.dot(grad_mu.T, X) - + # np.dot((y * grad_mu / mu).T, X)).T) + # + # elif distr == 'gaussian': + # if fit_intercept: + # grad_beta0 = np.sum((mu - y) * grad_mu) + # grad_beta = np.dot((mu - y).T, X * grad_mu[:, None]).T + # + # elif distr == 'binomial': + # if fit_intercept: + # grad_beta0 = np.sum(mu - y) + # grad_beta = np.dot((mu - y).T, X).T + # + # elif distr == 'probit': + # grad_logl = (y * _probit_g3(z, grad_mu, mu) - + # (1 - y) * _probit_g4(z, grad_mu, mu)) + # if fit_intercept: + # grad_beta0 = -np.sum(grad_logl) + # grad_beta = -np.dot(grad_logl.T, X).T + # + # elif distr == 'gamma': + # nu = 1. 
+ # grad_logl = (y / mu ** 2 - 1 / mu) * grad_mu + # if fit_intercept: + # grad_beta0 = -nu * np.sum(grad_logl) + # grad_beta = -nu * np.dot(grad_logl.T, X).T + # + # elif distr == 'neg-binomial': + # partial_beta_0 = grad_mu * ((theta + y) / (mu + theta) - y / mu) + # + # if fit_intercept: + # grad_beta0 = np.sum(partial_beta_0) + # + # grad_beta = np.dot(partial_beta_0.T, X) grad_beta0 *= 1. / n_samples grad_beta *= 1. / n_samples + if fit_intercept: grad_beta += reg_lambda * (1 - alpha) * np.dot(InvCov, beta[1:]) g = np.zeros((n_features + 1, )) @@ -300,7 +320,6 @@ def _grad_L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, theta, beta, return g - def _gradhess_logloss_1d(distr, xk, y, z, eta, theta, fit_intercept=True): """ Compute gradient (1st derivative) @@ -495,8 +514,8 @@ class GLM(BaseEstimator): Parameters ---------- - distr: str - distribution family can be one of the following + distr: str or object subclassed from BaseDistribution + if str, distribution family can be one of the following 'gaussian' | 'binomial' | 'poisson' | 'softplus' | 'probit' | 'gamma' | 'neg-binomial' default: 'poisson'. @@ -604,8 +623,7 @@ def __init__(self, distr='poisson', alpha=0.5, _check_params(distr=distr, max_iter=max_iter, fit_intercept=fit_intercept) - - self.distr = distr + self.distr = _get_distr(distr) self.alpha = alpha self.reg_lambda = reg_lambda self.Tau = Tau @@ -770,8 +788,11 @@ def _cdfast(self, X, y, ActiveSet, beta, rl, fit_intercept=True): xk = X[:, k] # Calculate grad and hess of log likelihood term - gk, hk = _gradhess_logloss_1d(self.distr, xk, y, z, self.eta, - self.theta, fit_intercept) + # gk, hk = _gradhess_logloss_1d(self.distr, xk, y, z, self.eta, + # self.theta, fit_intercept) + gk, hk = self.distr.gradhess_log_likelihood_1d(xk, y, z) + gk = 1. / n_samples * gk + hk = 1. / n_samples * hk # Add grad and hess of regularization term if self.Tau is None: @@ -998,8 +1019,11 @@ def predict(self, X): raise ValueError('Input data should be of type ndarray (got %s).' % type(X)) - yhat = _lmb(self.distr, self.beta0_, self.beta_, X, self.eta, - fit_intercept=True) + # yhat = _lmb(self.distr, self.beta0_, self.beta_, X, self.eta, + # fit_intercept=True) + + z = self.distr._z(self.beta0_, self.beta_, X) + yhat = self.distr.mu(z) if self.distr == 'binomial': yhat = (yhat > 0.5).astype(int) @@ -1028,8 +1052,10 @@ def _predict_proba(self, X): raise ValueError('Input data should be of type ndarray (got %s).' % type(X)) - yhat = _lmb(self.distr, self.beta0_, self.beta_, X, self.eta, - fit_intercept=True) + # yhat = _lmb(self.distr, self.beta0_, self.beta_, X, self.eta, + # fit_intercept=True) + z = self.distr._z(self.beta0_, self.beta_, X) + yhat = self.distr.mu(z) yhat = np.asarray(yhat) return yhat diff --git a/pyglmnet/utils.py b/pyglmnet/utils.py index ce8bc864..0f5c6b7d 100644 --- a/pyglmnet/utils.py +++ b/pyglmnet/utils.py @@ -7,6 +7,7 @@ import logging from scipy.special import log1p +from .distributions import BaseDistribution logger = logging.getLogger('pyglmnet') logger.addHandler(logging.StreamHandler()) @@ -113,9 +114,12 @@ def tikhonov_from_prior(prior_cov, n_samples, threshold=0.0001): def _check_params(distr, max_iter, fit_intercept): from .pyglmnet import ALLOWED_DISTRS - if distr not in ALLOWED_DISTRS: - raise ValueError('distr must be one of %s, Got ' - '%s' % (', '.join(ALLOWED_DISTRS), distr)) + err_msg = ('distr must be one of %s or a subclass of BaseDistribution. 
' + 'Got %s' % (', '.join(ALLOWED_DISTRS), distr)) + if isinstance(distr, str) and distr not in ALLOWED_DISTRS: + raise ValueError(err_msg) + if not isinstance(distr, str) and not isinstance(distr, BaseDistribution): + raise ValueError(err_msg) if not isinstance(max_iter, int): raise ValueError('max_iter must be of type int') From 5cf1a01e73fdc025725b289864d65cdf8caaf968 Mon Sep 17 00:00:00 2001 From: Pavan Ramkumar Date: Mon, 17 Aug 2020 23:56:25 -0700 Subject: [PATCH 02/24] add Poisson class --- pyglmnet/distributions.py | 58 ++++++++++++++++++++++++++++++++++++++- pyglmnet/metrics.py | 16 +++++++---- pyglmnet/pyglmnet.py | 10 +++++-- 3 files changed, 76 insertions(+), 8 deletions(-) diff --git a/pyglmnet/distributions.py b/pyglmnet/distributions.py index 8a96c96e..9e9f50d4 100644 --- a/pyglmnet/distributions.py +++ b/pyglmnet/distributions.py @@ -42,7 +42,7 @@ def _z(self, beta0, beta, X): class Gaussian(BaseDistribution): - """Base class for distributions.""" + """Class for Gaussian distribution.""" def __init__(self): """init.""" @@ -78,3 +78,59 @@ def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): gk = np.sum((z - y) * xk) hk = np.sum(xk * xk) return gk, hk + + +class Poisson(BaseDistribution): + """Class for Poisson distribution.""" + + def __init__(self): + """init.""" + self.eta = None + + def mu(self, z): + """Inverse link function.""" + mu = z.copy() + mu0 = (1 - self.eta) * np.exp(self.eta) + mu[z > self.eta] = z[z > self.eta] * np.exp(self.eta) + mu0 + mu[z <= self.eta] = np.exp(z[z <= self.eta]) + return mu + + def grad_mu(self, z): + """Gradient of inverse link.""" + grad_mu = z.copy() + grad_mu[z > self.eta] = \ + np.ones_like(z)[z > self.eta] * np.exp(self.eta) + grad_mu[z <= self.eta] = np.exp(z[z <= self.eta]) + return grad_mu + + def log_likelihood(self, y, y_hat): + """Log L2-penalized likelihood.""" + eps = np.spacing(1) + log_likelihood = np.sum(y * np.log(y_hat + eps) - y_hat) + return log_likelihood + + def grad_log_likelihood(self, X, y, beta0, beta): + """Gradient of L2-penalized log likelihood.""" + z = self._z(beta0, beta, X) + mu = self.mu(z) + grad_mu = self.grad_mu(z) + grad_beta0 = np.sum(grad_mu) - np.sum(y * grad_mu / mu) + grad_beta = ((np.dot(grad_mu.T, X) - + np.dot((y * grad_mu / mu).T, X)).T) + return grad_beta0, grad_beta + + def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): + """One-dimensional Gradient and Hessian of log likelihood.""" + z = self._z(beta0, beta, X) + mu = self.mu(z) + s = expit(z) + gk = np.sum((mu[z <= self.eta] - y[z <= self.eta]) * + xk[z <= self.eta]) + \ + np.exp(self.eta) * \ + np.sum((1 - y[z > self.eta] / mu[z > self.eta]) * + xk[z > self.eta]) + hk = np.sum(mu[z <= self.eta] * xk[z <= self.eta] ** 2) + \ + np.exp(self.eta) ** 2 * \ + np.sum(y[z > self.eta] / (mu[z > self.eta] ** 2) * + (xk[z > self.eta] ** 2)) + return gk, hk diff --git a/pyglmnet/metrics.py b/pyglmnet/metrics.py index 60225b9f..332296d1 100644 --- a/pyglmnet/metrics.py +++ b/pyglmnet/metrics.py @@ -2,6 +2,7 @@ import numpy as np from .pyglmnet import _logL +from .distributions import Poisson def deviance(y, yhat, distr, theta): @@ -23,8 +24,10 @@ def deviance(y, yhat, distr, theta): score : float Deviance of the predicted labels. 
""" - if distr in ['softplus', 'poisson', 'neg-binomial']: - LS = _logL(distr, y, y, theta=theta) + # if distr in ['softplus', 'poisson', 'neg-binomial']: + # LS = _logL(distr, y, y, theta=theta) + if isinstance(distr, Poisson): + LS = distr.log_likelihood(y, y) else: LS = 0 @@ -56,8 +59,10 @@ def pseudo_R2(X, y, yhat, ynull_, distr, theta): score : float Pseudo-R2 score. """ - if distr in ['softplus', 'poisson', 'neg-binomial']: - LS = _logL(distr, y, y, theta=theta) + # if distr in ['softplus', 'poisson', 'neg-binomial']: + # LS = _logL(distr, y, y, theta=theta) + if isinstance(distr, Poisson): + LS = distr.log_likelihood(y, y) else: LS = 0 @@ -66,7 +71,8 @@ def pseudo_R2(X, y, yhat, ynull_, distr, theta): L0 = distr.log_likelihood(y, ynull_) L1 = distr.log_likelihood(y, yhat) - if distr in ['softplus', 'poisson', 'neg-binomial']: + # if distr in ['softplus', 'poisson', 'neg-binomial']: + if isinstance(distr, Poisson): score = (1 - (LS - L1) / (LS - L0)) else: score = (1 - L1 / L0) diff --git a/pyglmnet/pyglmnet.py b/pyglmnet/pyglmnet.py index 5a18950a..0d34c3ca 100644 --- a/pyglmnet/pyglmnet.py +++ b/pyglmnet/pyglmnet.py @@ -14,7 +14,7 @@ from .externals.sklearn.utils import check_random_state, check_array, check_X_y from .externals.sklearn.utils.validation import check_is_fitted -from .distributions import BaseDistribution, Gaussian +from .distributions import BaseDistribution, Gaussian, Poisson ALLOWED_DISTRS = ['gaussian', 'binomial', 'softplus', 'poisson', 'probit', 'gamma', 'neg-binomial'] @@ -24,9 +24,12 @@ def _get_distr(distr): if isinstance(distr, BaseDistribution): return distr - elif distr == "gaussian": + elif distr == 'gaussian': distr = Gaussian() + elif distr == 'poisson': + distr = Poisson() + return distr @@ -624,6 +627,9 @@ def __init__(self, distr='poisson', alpha=0.5, _check_params(distr=distr, max_iter=max_iter, fit_intercept=fit_intercept) self.distr = _get_distr(distr) + if isinstance(self.distr, Poisson): + self.distr.eta = eta + print(self.distr.eta) self.alpha = alpha self.reg_lambda = reg_lambda self.Tau = Tau From 5592298fbeb34206875b6f767242b61232164a26 Mon Sep 17 00:00:00 2001 From: Pavan Ramkumar Date: Tue, 18 Aug 2020 00:15:53 -0700 Subject: [PATCH 03/24] add PoissonSoftplus class --- pyglmnet/distributions.py | 57 +++++++++++++++++++++++++++++++++++++++ pyglmnet/metrics.py | 8 +++--- pyglmnet/pyglmnet.py | 9 ++++--- pyglmnet/utils.py | 11 -------- 4 files changed, 67 insertions(+), 18 deletions(-) diff --git a/pyglmnet/distributions.py b/pyglmnet/distributions.py index 9e9f50d4..8ac5e974 100644 --- a/pyglmnet/distributions.py +++ b/pyglmnet/distributions.py @@ -2,6 +2,17 @@ from abc import ABC, abstractmethod import numpy as np +from scipy.special import expit, log1p + + +def softplus(z): + """Numerically stable version of log(1 + exp(z)).""" + # see stabilizing softplus: http://sachinashanbhag.blogspot.com/2014/05/numerically-approximation-of-log-1-expy.html # noqa + mu = z.copy() + mu[z > 35] = z[z > 35] + mu[z < -10] = np.exp(z[z < -10]) + mu[(z >= -10) & (z <= 35)] = log1p(np.exp(z[(z >= -10) & (z <= 35)])) + return mu class BaseDistribution(ABC): @@ -134,3 +145,49 @@ def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): np.sum(y[z > self.eta] / (mu[z > self.eta] ** 2) * (xk[z > self.eta] ** 2)) return gk, hk + + +class PoissonSoftplus(BaseDistribution): + """Class for Poisson distribution with softplus inverse link.""" + + def __init__(self): + """init.""" + pass + + def mu(self, z): + """Inverse link function.""" + mu = softplus(z) + return mu 
+ + def grad_mu(self, z): + """Gradient of inverse link.""" + grad_mu = expit(z) + return grad_mu + + def log_likelihood(self, y, y_hat): + """Log L2-penalized likelihood.""" + eps = np.spacing(1) + log_likelihood = np.sum(y * np.log(y_hat + eps) - y_hat) + return log_likelihood + + def grad_log_likelihood(self, X, y, beta0, beta): + """Gradient of L2-penalized log likelihood.""" + z = self._z(beta0, beta, X) + mu = self.mu(z) + grad_mu = self.grad_mu(z) + grad_beta0 = np.sum(grad_mu) - np.sum(y * grad_mu / mu) + grad_beta = ((np.dot(grad_mu.T, X) - + np.dot((y * grad_mu / mu).T, X)).T) + return grad_beta0, grad_beta + + def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): + """One-dimensional Gradient and Hessian of log likelihood.""" + z = self._z(beta0, beta, X) + mu = self.mu(z) + s = expit(z) + gk = np.sum(s * xk) - np.sum(y * s / mu * xk) + + grad_s = s * (1 - s) + grad_s_by_mu = grad_s / mu - s / (mu ** 2) + hk = np.sum(grad_s * xk ** 2) - np.sum(y * grad_s_by_mu * xk ** 2) + return gk, hk diff --git a/pyglmnet/metrics.py b/pyglmnet/metrics.py index 332296d1..c164a289 100644 --- a/pyglmnet/metrics.py +++ b/pyglmnet/metrics.py @@ -2,7 +2,7 @@ import numpy as np from .pyglmnet import _logL -from .distributions import Poisson +from .distributions import Poisson, PoissonSoftplus def deviance(y, yhat, distr, theta): @@ -26,7 +26,7 @@ def deviance(y, yhat, distr, theta): """ # if distr in ['softplus', 'poisson', 'neg-binomial']: # LS = _logL(distr, y, y, theta=theta) - if isinstance(distr, Poisson): + if isinstance(distr, (Poisson, PoissonSoftplus)): LS = distr.log_likelihood(y, y) else: LS = 0 @@ -61,7 +61,7 @@ def pseudo_R2(X, y, yhat, ynull_, distr, theta): """ # if distr in ['softplus', 'poisson', 'neg-binomial']: # LS = _logL(distr, y, y, theta=theta) - if isinstance(distr, Poisson): + if isinstance(distr, (Poisson, PoissonSoftplus)): LS = distr.log_likelihood(y, y) else: LS = 0 @@ -72,7 +72,7 @@ def pseudo_R2(X, y, yhat, ynull_, distr, theta): L1 = distr.log_likelihood(y, yhat) # if distr in ['softplus', 'poisson', 'neg-binomial']: - if isinstance(distr, Poisson): + if isinstance(distr, (Poisson, PoissonSoftplus)): score = (1 - (LS - L1) / (LS - L0)) else: score = (1 - L1 / L0) diff --git a/pyglmnet/pyglmnet.py b/pyglmnet/pyglmnet.py index 0d34c3ca..18b93bbf 100644 --- a/pyglmnet/pyglmnet.py +++ b/pyglmnet/pyglmnet.py @@ -8,13 +8,14 @@ from scipy.stats import norm from .utils import logger, set_log_level, _check_params, \ - _verbose_iterable, _tqdm_log, softplus + _verbose_iterable, _tqdm_log from .base import BaseEstimator, is_classifier, check_version from .externals.sklearn.utils import check_random_state, check_array, check_X_y from .externals.sklearn.utils.validation import check_is_fitted -from .distributions import BaseDistribution, Gaussian, Poisson +from .distributions import BaseDistribution, Gaussian, Poisson, \ + PoissonSoftplus, softplus ALLOWED_DISTRS = ['gaussian', 'binomial', 'softplus', 'poisson', 'probit', 'gamma', 'neg-binomial'] @@ -30,6 +31,9 @@ def _get_distr(distr): elif distr == 'poisson': distr = Poisson() + elif distr == 'softplus': + distr = PoissonSoftplus() + return distr @@ -629,7 +633,6 @@ def __init__(self, distr='poisson', alpha=0.5, self.distr = _get_distr(distr) if isinstance(self.distr, Poisson): self.distr.eta = eta - print(self.distr.eta) self.alpha = alpha self.reg_lambda = reg_lambda self.Tau = Tau diff --git a/pyglmnet/utils.py b/pyglmnet/utils.py index 0f5c6b7d..2940e151 100644 --- a/pyglmnet/utils.py +++ b/pyglmnet/utils.py @@ -5,7 
+5,6 @@ import numpy as np from copy import copy import logging -from scipy.special import log1p from .distributions import BaseDistribution @@ -22,16 +21,6 @@ has_tqdm = False -def softplus(z): - """Numerically stable version of log(1 + exp(z)).""" - # see stabilizing softplus: http://sachinashanbhag.blogspot.com/2014/05/numerically-approximation-of-log-1-expy.html # noqa - mu = z.copy() - mu[z > 35] = z[z > 35] - mu[z < -10] = np.exp(z[z < -10]) - mu[(z >= -10) & (z <= 35)] = log1p(np.exp(z[(z >= -10) & (z <= 35)])) - return mu - - def softmax(w): """Softmax function of given array of number w. From f53e5f39e6b2e14df2e046053da7f4446d8f73ea Mon Sep 17 00:00:00 2001 From: Pavan Ramkumar Date: Tue, 18 Aug 2020 00:28:07 -0700 Subject: [PATCH 04/24] add NegBinomialSoftplus class --- pyglmnet/distributions.py | 56 ++++++++++++++++++++++++++++++++++++++- pyglmnet/metrics.py | 8 +++--- pyglmnet/pyglmnet.py | 7 ++++- 3 files changed, 65 insertions(+), 6 deletions(-) diff --git a/pyglmnet/distributions.py b/pyglmnet/distributions.py index 8ac5e974..91da2fa5 100644 --- a/pyglmnet/distributions.py +++ b/pyglmnet/distributions.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod import numpy as np -from scipy.special import expit, log1p +from scipy.special import expit, log1p, loggamma def softplus(z): @@ -191,3 +191,57 @@ def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): grad_s_by_mu = grad_s / mu - s / (mu ** 2) hk = np.sum(grad_s * xk ** 2) - np.sum(y * grad_s_by_mu * xk ** 2) return gk, hk + + +class NegBinomialSoftplus(BaseDistribution): + """Class for Negative binomial distribution with softplus inverse link.""" + + def __init__(self): + """init.""" + self.theta = None + + def mu(self, z): + """Inverse link function.""" + mu = softplus(z) + return mu + + def grad_mu(self, z): + """Gradient of inverse link.""" + grad_mu = expit(z) + return grad_mu + + def log_likelihood(self, y, y_hat): + """Log L2-penalized likelihood.""" + theta = self.theta + log_likelihood = \ + np.sum(loggamma(y + theta) - loggamma(theta) - loggamma(y + 1) + + theta * np.log(theta) + y * np.log(y_hat) - (theta + y) * + np.log(y_hat + theta)) + return log_likelihood + + def grad_log_likelihood(self, X, y, beta0, beta): + """Gradient of L2-penalized log likelihood.""" + z = self._z(beta0, beta, X) + mu = self.mu(z) + grad_mu = self.grad_mu(z) + theta = self.theta + partial_beta_0 = grad_mu * ((theta + y) / (mu + theta) - y / mu) + grad_beta0 = np.sum(partial_beta_0) + grad_beta = np.dot(partial_beta_0.T, X) + return grad_beta0, grad_beta + + def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): + """One-dimensional Gradient and Hessian of log likelihood.""" + z = self._z(beta0, beta, X) + mu = self.mu(z) + grad_mu = expit(z) + hess_mu = np.exp(-z) * expit(z) ** 2 + + gradient_beta_j = -grad_mu * (y / mu - (y + theta) / (mu + theta)) + partial_beta_0_1 = hess_mu * (y / mu - (y + theta) / (mu + theta)) + partial_beta_0_2 = grad_mu ** 2 * \ + ((y + theta) / (mu + theta) ** 2 - y / mu ** 2) + partial_beta_0 = -(partial_beta_0_1 + partial_beta_0_2) + gk = np.dot(gradient_beta_j.T, xk) + hk = np.dot(partial_beta_0.T, xk ** 2) + return gk, hk diff --git a/pyglmnet/metrics.py b/pyglmnet/metrics.py index c164a289..28c018cc 100644 --- a/pyglmnet/metrics.py +++ b/pyglmnet/metrics.py @@ -2,7 +2,7 @@ import numpy as np from .pyglmnet import _logL -from .distributions import Poisson, PoissonSoftplus +from .distributions import Poisson, PoissonSoftplus, NegBinomialSoftplus def deviance(y, yhat, distr, theta): @@ -26,7 +26,7 @@ 
def deviance(y, yhat, distr, theta): """ # if distr in ['softplus', 'poisson', 'neg-binomial']: # LS = _logL(distr, y, y, theta=theta) - if isinstance(distr, (Poisson, PoissonSoftplus)): + if isinstance(distr, (Poisson, PoissonSoftplus, NegBinomialSoftplus)): LS = distr.log_likelihood(y, y) else: LS = 0 @@ -61,7 +61,7 @@ def pseudo_R2(X, y, yhat, ynull_, distr, theta): """ # if distr in ['softplus', 'poisson', 'neg-binomial']: # LS = _logL(distr, y, y, theta=theta) - if isinstance(distr, (Poisson, PoissonSoftplus)): + if isinstance(distr, (Poisson, PoissonSoftplus, NegBinomialSoftplus)): LS = distr.log_likelihood(y, y) else: LS = 0 @@ -72,7 +72,7 @@ def pseudo_R2(X, y, yhat, ynull_, distr, theta): L1 = distr.log_likelihood(y, yhat) # if distr in ['softplus', 'poisson', 'neg-binomial']: - if isinstance(distr, (Poisson, PoissonSoftplus)): + if isinstance(distr, (Poisson, PoissonSoftplus, NegBinomialSoftplus)): score = (1 - (LS - L1) / (LS - L0)) else: score = (1 - L1 / L0) diff --git a/pyglmnet/pyglmnet.py b/pyglmnet/pyglmnet.py index 18b93bbf..f0e6ad33 100644 --- a/pyglmnet/pyglmnet.py +++ b/pyglmnet/pyglmnet.py @@ -15,7 +15,7 @@ from .externals.sklearn.utils.validation import check_is_fitted from .distributions import BaseDistribution, Gaussian, Poisson, \ - PoissonSoftplus, softplus + PoissonSoftplus, NegBinomialSoftplus, softplus ALLOWED_DISTRS = ['gaussian', 'binomial', 'softplus', 'poisson', 'probit', 'gamma', 'neg-binomial'] @@ -34,6 +34,9 @@ def _get_distr(distr): elif distr == 'softplus': distr = PoissonSoftplus() + elif distr == 'neg-binomial': + distr = NegBinomialSoftplus() + return distr @@ -633,6 +636,8 @@ def __init__(self, distr='poisson', alpha=0.5, self.distr = _get_distr(distr) if isinstance(self.distr, Poisson): self.distr.eta = eta + if isinstance(self.distr, NegBinomialSoftplus): + self.distr.theta = theta self.alpha = alpha self.reg_lambda = reg_lambda self.Tau = Tau From a93165dafa64717998adf45068114180c2de3dff Mon Sep 17 00:00:00 2001 From: Pavan Ramkumar Date: Tue, 18 Aug 2020 01:01:13 -0700 Subject: [PATCH 05/24] add Binomial and Probit classes --- pyglmnet/distributions.py | 146 ++++++++++++++++++++++++++++++++++++++ pyglmnet/pyglmnet.py | 21 ++++-- 2 files changed, 161 insertions(+), 6 deletions(-) diff --git a/pyglmnet/distributions.py b/pyglmnet/distributions.py index 91da2fa5..91bb641f 100644 --- a/pyglmnet/distributions.py +++ b/pyglmnet/distributions.py @@ -3,6 +3,7 @@ from abc import ABC, abstractmethod import numpy as np from scipy.special import expit, log1p, loggamma +from scipy.stats import norm def softplus(z): @@ -245,3 +246,148 @@ def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): gk = np.dot(gradient_beta_j.T, xk) hk = np.dot(partial_beta_0.T, xk ** 2) return gk, hk + + +class Binomial(BaseDistribution): + """Class for binomial distribution.""" + + def __init__(self): + """init.""" + pass + + def mu(self, z): + """Inverse link function.""" + mu = expit(z) + return mu + + def grad_mu(self, z): + """Gradient of inverse link.""" + grad_mu = expit(z) * (1 - expit(z)) + return grad_mu + + def log_likelihood(self, y, y_hat, z=None): + """Log L2-penalized likelihood.""" + # prevents underflow + if z is not None: + log_likelihood = np.sum(y * z - np.log(1 + np.exp(z))) + # for scoring + else: + log_likelihood = \ + np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) + return log_likelihood + + def grad_log_likelihood(self, X, y, beta0, beta): + """Gradient of L2-penalized log likelihood.""" + z = self._z(beta0, beta, X) + mu = self.mu(z) + 
grad_beta0 = np.sum(mu - y) + grad_beta = np.dot((mu - y).T, X).T + return grad_beta0, grad_beta + + def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): + """One-dimensional Gradient and Hessian of log likelihood.""" + z = self._z(beta0, beta, X) + mu = self.mu(z) + gk = np.sum((mu - y) * xk) + hk = np.sum(mu * (1.0 - mu) * xk * xk) + return gk, hk + + +class Probit(BaseDistribution): + """Class for probit distribution.""" + + def __init__(self): + """init.""" + pass + + def _probit_g1(self, z, pdfz, cdfz, thresh=5): + res = np.zeros_like(z) + res[z < -thresh] = np.log(-pdfz[z < -thresh] / z[z < -thresh]) + res[np.abs(z) <= thresh] = np.log(cdfz[np.abs(z) <= thresh]) + res[z > thresh] = -pdfz[z > thresh] / z[z > thresh] + return res + + def _probit_g2(self, z, pdfz, cdfz, thresh=5): + res = np.zeros_like(z) + res[z < -thresh] = pdfz[z < -thresh] / z[z < -thresh] + res[np.abs(z) <= thresh] = np.log(1 - cdfz[np.abs(z) <= thresh]) + res[z > thresh] = np.log(pdfz[z > thresh] / z[z > thresh]) + return res + + def _probit_g3(self, z, pdfz, cdfz, thresh=5): + res = np.zeros_like(z) + res[z < -thresh] = -z[z < -thresh] + res[np.abs(z) <= thresh] = \ + pdfz[np.abs(z) <= thresh] / cdfz[np.abs(z) <= thresh] + res[z > thresh] = pdfz[z > thresh] + return res + + def _probit_g4(self, z, pdfz, cdfz, thresh=5): + res = np.zeros_like(z) + res[z < -thresh] = pdfz[z < -thresh] + res[np.abs(z) <= thresh] = \ + pdfz[np.abs(z) <= thresh] / (1 - cdfz[np.abs(z) <= thresh]) + res[z > thresh] = z[z > thresh] + return res + + def _probit_g5(self, z, pdfz, cdfz, thresh=5): + res = np.zeros_like(z) + res[z < -thresh] = 0 * z[z < -thresh] + res[np.abs(z) <= thresh] = \ + z[np.abs(z) <= thresh] * pdfz[np.abs(z) <= thresh] / \ + cdfz[np.abs(z) <= thresh] + (pdfz[np.abs(z) <= thresh] / + cdfz[np.abs(z) <= thresh]) ** 2 + res[z > thresh] = z[z > thresh] * pdfz[z > thresh] + pdfz[z > thresh] ** 2 + return res + + def _probit_g6(self, z, pdfz, cdfz, thresh=5): + res = np.zeros_like(z) + res[z < -thresh] = \ + pdfz[z < -thresh] ** 2 - z[z < -thresh] * pdfz[z < -thresh] + res[np.abs(z) <= thresh] = \ + (pdfz[np.abs(z) <= thresh] / (1 - cdfz[np.abs(z) <= thresh])) ** 2 - \ + z[np.abs(z) <= thresh] * pdfz[np.abs(z) <= thresh] / \ + (1 - cdfz[np.abs(z) <= thresh]) + res[z > thresh] = 0 * z[z > thresh] + return res + + def mu(self, z): + """Inverse link function.""" + mu = norm.cdf(z) + return mu + + def grad_mu(self, z): + """Gradient of inverse link.""" + grad_mu = norm.pdf(z) + return grad_mu + + def log_likelihood(self, y, y_hat, z=None): + """Log L2-penalized likelihood.""" + if z is not None: + pdfz, cdfz = norm.pdf(z), norm.cdf(z) + log_likelihood = \ + np.sum(y * self._probit_g1(z, pdfz, cdfz) + + (1 - y) * self._probit_g2(z, pdfz, cdfz)) + else: + log_likelihood = \ + np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) + return log_likelihood + + def grad_log_likelihood(self, X, y, beta0, beta): + """Gradient of L2-penalized log likelihood.""" + z = self._z(beta0, beta, X) + mu = self.mu(z) + grad_beta0 = np.sum(mu - y) + grad_beta = np.dot((mu - y).T, X).T + return grad_beta0, grad_beta + + def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): + """One-dimensional Gradient and Hessian of log likelihood.""" + z = self._z(beta0, beta, X) + pdfz = norm.pdf(z) + cdfz = norm.cdf(z) + gk = -np.sum((y * self._probit_g3(z, pdfz, cdfz) - + (1 - y) * self._probit_g4(z, pdfz, cdfz)) * xk) + hk = np.sum((y * self._probit_g5(z, pdfz, cdfz) + + (1 - y) * self._probit_g6(z, pdfz, cdfz)) * (xk * xk)) + return gk, hk diff 
--git a/pyglmnet/pyglmnet.py b/pyglmnet/pyglmnet.py index f0e6ad33..6010d8be 100644 --- a/pyglmnet/pyglmnet.py +++ b/pyglmnet/pyglmnet.py @@ -15,7 +15,7 @@ from .externals.sklearn.utils.validation import check_is_fitted from .distributions import BaseDistribution, Gaussian, Poisson, \ - PoissonSoftplus, NegBinomialSoftplus, softplus + PoissonSoftplus, NegBinomialSoftplus, Binomial, Probit, softplus ALLOWED_DISTRS = ['gaussian', 'binomial', 'softplus', 'poisson', 'probit', 'gamma', 'neg-binomial'] @@ -37,6 +37,11 @@ def _get_distr(distr): elif distr == 'neg-binomial': distr = NegBinomialSoftplus() + elif distr == 'binomial': + distr = Binomial() + + elif distr == 'probit': + distr = Probit() return distr @@ -1039,7 +1044,7 @@ def predict(self, X): z = self.distr._z(self.beta0_, self.beta_, X) yhat = self.distr.mu(z) - if self.distr == 'binomial': + if isinstance(self.distr, Binomial): yhat = (yhat > 0.5).astype(int) yhat = np.asarray(yhat) return yhat @@ -1058,7 +1063,8 @@ def _predict_proba(self, X): The predicted targets of shape (n_samples,). """ - if self.distr not in ['binomial', 'probit']: + # if self.distr not in ['binomial', 'probit']: + if not isinstance(self.distr, (Binomial, Probit)): raise ValueError('This is only applicable for \ the binomial distribution.') @@ -1095,7 +1101,8 @@ def predict_proba(self, X): X = check_array(X, accept_sparse=False) check_is_fitted(self, 'is_fitted_') - if self.distr in ['binomial', 'probit']: + # if self.distr in ['binomial', 'probit']: + if isinstance(self.distr, (Binomial, Probit)): return self._predict_proba(X) else: warnings.warn('This is only applicable for \ @@ -1150,14 +1157,16 @@ def score(self, X, y): # For f1 as well if self.score_metric in ['accuracy']: - if self.distr not in ['binomial', 'multinomial']: + # if self.distr not in ['binomial', 'multinomial']: + if not isinstance(self.distr, (Binomial, Probit)): raise ValueError(self.score_metric + ' is only defined for binomial ' + 'or multinomial distributions') y = np.asarray(y).ravel() - if self.distr in ['binomial', 'probit'] and \ + # if self.distr in ['binomial', 'probit'] and \ + if isinstance(self.distr, (Binomial, Probit)) and \ self.score_metric != 'accuracy': yhat = self.predict_proba(X) else: From be79bdc024c7034fc3d76f0c33d11f9cd043268b Mon Sep 17 00:00:00 2001 From: Pavan Ramkumar Date: Tue, 18 Aug 2020 01:11:48 -0700 Subject: [PATCH 06/24] add GammaSoftplus class --- pyglmnet/distributions.py | 42 +++++++++++++++++++++++++++++++++++++++ pyglmnet/pyglmnet.py | 6 +++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/pyglmnet/distributions.py b/pyglmnet/distributions.py index 91bb641f..21fda8c8 100644 --- a/pyglmnet/distributions.py +++ b/pyglmnet/distributions.py @@ -391,3 +391,45 @@ def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): hk = np.sum((y * self._probit_g5(z, pdfz, cdfz) + (1 - y) * self._probit_g6(z, pdfz, cdfz)) * (xk * xk)) return gk, hk + + +class GammaSoftplus(BaseDistribution): + """Class for Gamma distribution with softplus inverse link.""" + + def __init__(self): + """init.""" + pass + + def mu(self, z): + """Inverse link function.""" + mu = softplus(z) + return mu + + def grad_mu(self, z): + """Gradient of inverse link.""" + grad_mu = expit(z) + return grad_mu + + def log_likelihood(self, y, y_hat, z=None): + """Log L2-penalized likelihood.""" + # see + # https://www.statistics.ma.tum.de/fileadmin/w00bdb/www/czado/lec8.pdf + nu = 1. 
# shape parameter, exponential for now + log_likelihood = np.sum(nu * (-y / y_hat - np.log(y_hat))) + return log_likelihood + + def grad_log_likelihood(self, X, y, beta0, beta): + """Gradient of L2-penalized log likelihood.""" + z = self._z(beta0, beta, X) + mu = self.mu(z) + grad_mu = self.grad_mu(z) + nu = 1. + grad_logl = (y / mu ** 2 - 1 / mu) * grad_mu + grad_beta0 = -nu * np.sum(grad_logl) + grad_beta = -nu * np.dot(grad_logl.T, X).T + return grad_beta0, grad_beta + + def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): + """One-dimensional Gradient and Hessian of log likelihood.""" + raise NotImplementedError('cdfast is not implemented for Gamma ' + 'distribution') diff --git a/pyglmnet/pyglmnet.py b/pyglmnet/pyglmnet.py index 6010d8be..6b005752 100644 --- a/pyglmnet/pyglmnet.py +++ b/pyglmnet/pyglmnet.py @@ -15,7 +15,8 @@ from .externals.sklearn.utils.validation import check_is_fitted from .distributions import BaseDistribution, Gaussian, Poisson, \ - PoissonSoftplus, NegBinomialSoftplus, Binomial, Probit, softplus + PoissonSoftplus, NegBinomialSoftplus, Binomial, Probit, GammaSoftplus, \ + softplus ALLOWED_DISTRS = ['gaussian', 'binomial', 'softplus', 'poisson', 'probit', 'gamma', 'neg-binomial'] @@ -42,6 +43,9 @@ def _get_distr(distr): elif distr == 'probit': distr = Probit() + + elif distr == 'gamma': + distr = GammaSoftplus() return distr From 9e43a027db7c4c36a4eaf2aba85b4777c7bad367 Mon Sep 17 00:00:00 2001 From: Mainak Jas Date: Tue, 18 Aug 2020 14:59:05 -0400 Subject: [PATCH 07/24] TST add test for distributions --- pyglmnet/utils.py | 4 +-- tests/test_distributions.py | 71 +++++++++++++++++++++++++++++++++++++ tests/test_pyglmnet.py | 33 +---------------- 3 files changed, 74 insertions(+), 34 deletions(-) create mode 100644 tests/test_distributions.py diff --git a/pyglmnet/utils.py b/pyglmnet/utils.py index 2940e151..0622f441 100644 --- a/pyglmnet/utils.py +++ b/pyglmnet/utils.py @@ -106,9 +106,9 @@ def _check_params(distr, max_iter, fit_intercept): err_msg = ('distr must be one of %s or a subclass of BaseDistribution. 
' 'Got %s' % (', '.join(ALLOWED_DISTRS), distr)) if isinstance(distr, str) and distr not in ALLOWED_DISTRS: - raise ValueError(err_msg) + raise ValueError(err_msg) if not isinstance(distr, str) and not isinstance(distr, BaseDistribution): - raise ValueError(err_msg) + raise TypeError(err_msg) if not isinstance(max_iter, int): raise ValueError('max_iter must be of type int') diff --git a/tests/test_distributions.py b/tests/test_distributions.py new file mode 100644 index 00000000..8ab48929 --- /dev/null +++ b/tests/test_distributions.py @@ -0,0 +1,71 @@ +"""Tests for distributions.""" + +from functools import partial + +import pytest + +import numpy as np +from numpy.testing import assert_allclose + +import scipy.sparse as sps +from scipy.optimize import approx_fprime +from sklearn.preprocessing import StandardScaler + +from pyglmnet import ALLOWED_DISTRS, simulate_glm, GLM, _grad_L2loss, _L2loss +from pyglmnet.distributions import BaseDistribution + + +def test_base_distribution(): + """Test the base distribution.""" + class TestDistr(BaseDistribution): + def mu(): + pass + + def grad_mu(): + pass + + class TestDistr2(TestDistr): + def log_likelihood(): + pass + + with pytest.raises(TypeError, match='abstract methods log_likelihood'): + distr = TestDistr() + + msg = 'Gradients of log likelihood are not specified' + with pytest.raises(NotImplementedError, match=msg): + distr = TestDistr2() + distr.grad_log_likelihood() + + msg = 'distr must be one of' + with pytest.raises(ValueError, match=msg): + GLM(distr='blah') + with pytest.raises(TypeError, match=msg): + GLM(distr=['gaussian']) + +@pytest.mark.parametrize("distr", ALLOWED_DISTRS) +def test_gradients(distr): + """Test gradient accuracy.""" + # data + scaler = StandardScaler() + n_samples, n_features = 1000, 100 + X = np.random.normal(0.0, 1.0, [n_samples, n_features]) + X = scaler.fit_transform(X) + + density = 0.1 + beta_ = np.zeros(n_features + 1) + beta_[0] = np.random.rand() + beta_[1:] = sps.rand(n_features, 1, density=density).toarray()[:, 0] + + reg_lambda = 0.1 + + glm = GLM(distr=distr, reg_lambda=reg_lambda) + y = simulate_glm(glm.distr, beta_[0], beta_[1:], X) + + func = partial(_L2loss, distr, glm.alpha, + glm.Tau, reg_lambda, X, y, glm.eta, glm.theta, glm.group) + grad = partial(_grad_L2loss, distr, glm.alpha, glm.Tau, + reg_lambda, X, y, + glm.eta, glm.theta) + approx_grad = approx_fprime(beta_, func, 1.5e-8) + analytical_grad = grad(beta_) + assert_allclose(approx_grad, analytical_grad, rtol=1e-5, atol=1e-3) diff --git a/tests/test_pyglmnet.py b/tests/test_pyglmnet.py index b9401a63..7acc02f4 100644 --- a/tests/test_pyglmnet.py +++ b/tests/test_pyglmnet.py @@ -3,14 +3,12 @@ import subprocess import os.path as op -from functools import partial import pytest import numpy as np from numpy.testing import assert_allclose, assert_array_equal from pytest import raises import scipy.sparse as sps -from scipy.optimize import approx_fprime from sklearn.datasets import make_regression from sklearn.preprocessing import StandardScaler @@ -18,7 +16,7 @@ from sklearn.linear_model import ElasticNet from sklearn.utils.estimator_checks import check_estimator -from pyglmnet import (GLM, GLMCV, _grad_L2loss, _L2loss, simulate_glm, +from pyglmnet import (GLM, GLMCV, simulate_glm, _gradhess_logloss_1d, _loss, datasets, ALLOWED_DISTRS) matplotlib.use('agg') @@ -29,35 +27,6 @@ def test_glm_estimator(): check_estimator(GLM) -@pytest.mark.parametrize("distr", ALLOWED_DISTRS) -def test_gradients(distr): - """Test gradient accuracy.""" - # data - 
scaler = StandardScaler()
-    n_samples, n_features = 1000, 100
-    X = np.random.normal(0.0, 1.0, [n_samples, n_features])
-    X = scaler.fit_transform(X)
-
-    density = 0.1
-    beta_ = np.zeros(n_features + 1)
-    beta_[0] = np.random.rand()
-    beta_[1:] = sps.rand(n_features, 1, density=density).toarray()[:, 0]
-
-    reg_lambda = 0.1
-
-    glm = GLM(distr=distr, reg_lambda=reg_lambda)
-    y = simulate_glm(glm.distr, beta_[0], beta_[1:], X)
-
-    func = partial(_L2loss, distr, glm.alpha,
-                   glm.Tau, reg_lambda, X, y, glm.eta, glm.theta, glm.group)
-    grad = partial(_grad_L2loss, distr, glm.alpha, glm.Tau,
-                   reg_lambda, X, y,
-                   glm.eta, glm.theta)
-    approx_grad = approx_fprime(beta_, func, 1.5e-8)
-    analytical_grad = grad(beta_)
-    assert_allclose(approx_grad, analytical_grad, rtol=1e-5, atol=1e-3)
-
-
 def test_tikhonov():
     """Tikhonov regularization test."""
     n_samples, n_features = 100, 10

From 381829119dbdbac0fa359424133666cb574969d4 Mon Sep 17 00:00:00 2001
From: Mainak Jas
Date: Tue, 18 Aug 2020 15:56:44 -0400
Subject: [PATCH 08/24] DOC: add example of cloglog

---
 examples/plot_custom_distributions.py | 76 +++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 examples/plot_custom_distributions.py

diff --git a/examples/plot_custom_distributions.py b/examples/plot_custom_distributions.py
new file mode 100644
index 00000000..edab74ff
--- /dev/null
+++ b/examples/plot_custom_distributions.py
@@ -0,0 +1,76 @@
+"""
+======================
+Rolling out custom GLM
+======================
+
+This example demonstrates how to roll out your own custom
+GLM class using Pyglmnet.
+"""
+########################################################
+
+# Author: Pavan Ramkumar
+# License: MIT
+
+from sklearn.model_selection import train_test_split
+from pyglmnet import GLMCV, datasets
+
+########################################################
+# Download and preprocess data files

+X, y = datasets.fetch_community_crime_data()
+n_samples, n_features = X.shape
+
+########################################################
+# Split the data into training and test sets
+
+X_train, X_test, y_train, y_test = \
+    train_test_split(X, y, test_size=0.33, random_state=0)
+
+########################################################
+# Now we define our own distribution class. This must
+# inherit from BaseDistribution. The BaseDistribution
+# class requires defining the following methods:
+# - mu: inverse link function
+# - grad_mu: gradient of the inverse link function
+# - log_likelihood: the log likelihood function
+# - grad_log_likelihood: the gradient of the log
+#   likelihood (used by the batch gradient solver).
+# All distributions in pyglmnet inherit from BaseDistribution.
+#
+# This is really powerful. For instance, we can start from
+# the existing Binomial distribution and override mu and grad_mu
+# if we want to use the cloglog link function.
+
+import numpy as np
+from pyglmnet.distributions import Binomial
+
+
+class CustomBinomial(Binomial):
+    """Custom binomial distribution."""
+
+    def mu(self, z):
+        """cloglog inverse link"""
+        mu = 1 - np.exp(-np.exp(z))
+        return mu
+
+    def grad_mu(self, z):
+        """Gradient of inverse link."""
+        grad_mu = np.exp(z - np.exp(z))
+        return grad_mu
+
+
+distr = CustomBinomial()
+
+########################################################
+# Now we pass it to the GLMCV class just as before.
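+
+########################################################
+# A quick sanity check of the link above (an illustrative sketch; the
+# names z_check and eps are assumed here, not taken from the patch):
+# grad_mu should match a central finite difference of mu.
+
+z_check = np.linspace(-3, 3, 7)
+eps = 1e-6
+num_grad = (distr.mu(z_check + eps) - distr.mu(z_check - eps)) / (2 * eps)
+# d/dz [1 - exp(-exp(z))] = exp(z) * exp(-exp(z)) = exp(z - exp(z))
+assert np.allclose(num_grad, distr.grad_mu(z_check), atol=1e-6)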
+ +# use the default value for reg_lambda +glm = GLMCV(distr=distr, alpha=0.05, score_metric='pseudo_R2', cv=3, + tol=1e-4) + +# fit model +glm.fit(X_train, y_train) + +# score the test set prediction +y_test_hat = glm.predict_proba(X_test) +print("test set pseudo $R^2$ = %f" % glm.score(X_test, y_test)) From f6536d1cf1c247aab8ca045c152d8b3614c31b0f Mon Sep 17 00:00:00 2001 From: Mainak Jas Date: Tue, 18 Aug 2020 16:37:04 -0400 Subject: [PATCH 09/24] Simplify Poisson --- pyglmnet/distributions.py | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/pyglmnet/distributions.py b/pyglmnet/distributions.py index 21fda8c8..598c5301 100644 --- a/pyglmnet/distributions.py +++ b/pyglmnet/distributions.py @@ -148,7 +148,7 @@ def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): return gk, hk -class PoissonSoftplus(BaseDistribution): +class PoissonSoftplus(Poisson): """Class for Poisson distribution with softplus inverse link.""" def __init__(self): @@ -165,34 +165,6 @@ def grad_mu(self, z): grad_mu = expit(z) return grad_mu - def log_likelihood(self, y, y_hat): - """Log L2-penalized likelihood.""" - eps = np.spacing(1) - log_likelihood = np.sum(y * np.log(y_hat + eps) - y_hat) - return log_likelihood - - def grad_log_likelihood(self, X, y, beta0, beta): - """Gradient of L2-penalized log likelihood.""" - z = self._z(beta0, beta, X) - mu = self.mu(z) - grad_mu = self.grad_mu(z) - grad_beta0 = np.sum(grad_mu) - np.sum(y * grad_mu / mu) - grad_beta = ((np.dot(grad_mu.T, X) - - np.dot((y * grad_mu / mu).T, X)).T) - return grad_beta0, grad_beta - - def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): - """One-dimensional Gradient and Hessian of log likelihood.""" - z = self._z(beta0, beta, X) - mu = self.mu(z) - s = expit(z) - gk = np.sum(s * xk) - np.sum(y * s / mu * xk) - - grad_s = s * (1 - s) - grad_s_by_mu = grad_s / mu - s / (mu ** 2) - hk = np.sum(grad_s * xk ** 2) - np.sum(y * grad_s_by_mu * xk ** 2) - return gk, hk - class NegBinomialSoftplus(BaseDistribution): """Class for Negative binomial distribution with softplus inverse link.""" From 63ee2ca335dbfea32360bb32841af9963a81e5a9 Mon Sep 17 00:00:00 2001 From: Mainak Jas Date: Tue, 18 Aug 2020 16:41:47 -0400 Subject: [PATCH 10/24] No need of init --- pyglmnet/distributions.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/pyglmnet/distributions.py b/pyglmnet/distributions.py index 598c5301..2ac636a5 100644 --- a/pyglmnet/distributions.py +++ b/pyglmnet/distributions.py @@ -56,10 +56,6 @@ def _z(self, beta0, beta, X): class Gaussian(BaseDistribution): """Class for Gaussian distribution.""" - def __init__(self): - """init.""" - pass - def mu(self, z): """Inverse link function.""" mu = z @@ -151,10 +147,6 @@ def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): class PoissonSoftplus(Poisson): """Class for Poisson distribution with softplus inverse link.""" - def __init__(self): - """init.""" - pass - def mu(self, z): """Inverse link function.""" mu = softplus(z) @@ -223,10 +215,6 @@ def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): class Binomial(BaseDistribution): """Class for binomial distribution.""" - def __init__(self): - """init.""" - pass - def mu(self, z): """Inverse link function.""" mu = expit(z) @@ -268,10 +256,6 @@ def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): class Probit(BaseDistribution): """Class for probit distribution.""" - def __init__(self): - """init.""" - pass - def _probit_g1(self, z, pdfz, cdfz, thresh=5): res = 
np.zeros_like(z) res[z < -thresh] = np.log(-pdfz[z < -thresh] / z[z < -thresh]) @@ -368,10 +352,6 @@ def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): class GammaSoftplus(BaseDistribution): """Class for Gamma distribution with softplus inverse link.""" - def __init__(self): - """init.""" - pass - def mu(self, z): """Inverse link function.""" mu = softplus(z) From 4cfe839e055df3444cf2e4e17280cc159063f9c4 Mon Sep 17 00:00:00 2001 From: Pavan Ramkumar Date: Tue, 18 Aug 2020 20:21:36 -0700 Subject: [PATCH 11/24] revert API of gradhess_log_likelihood_1d to accept z --- pyglmnet/distributions.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/pyglmnet/distributions.py b/pyglmnet/distributions.py index 2ac636a5..121bda81 100644 --- a/pyglmnet/distributions.py +++ b/pyglmnet/distributions.py @@ -80,9 +80,8 @@ def grad_log_likelihood(self, X, y, beta0, beta): grad_beta = np.dot((mu - y).T, X * grad_mu[:, None]).T return grad_beta0, grad_beta - def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): + def gradhess_log_likelihood_1d(self, xk, y, z): """One-dimensional Gradient and Hessian of log likelihood.""" - z = self._z(beta0, beta, X) gk = np.sum((z - y) * xk) hk = np.sum(xk * xk) return gk, hk @@ -127,9 +126,8 @@ def grad_log_likelihood(self, X, y, beta0, beta): np.dot((y * grad_mu / mu).T, X)).T) return grad_beta0, grad_beta - def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): + def gradhess_log_likelihood_1d(self, xk, y, z): """One-dimensional Gradient and Hessian of log likelihood.""" - z = self._z(beta0, beta, X) mu = self.mu(z) s = expit(z) gk = np.sum((mu[z <= self.eta] - y[z <= self.eta]) * @@ -195,13 +193,12 @@ def grad_log_likelihood(self, X, y, beta0, beta): grad_beta = np.dot(partial_beta_0.T, X) return grad_beta0, grad_beta - def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): + def gradhess_log_likelihood_1d(self, xk, y, z): """One-dimensional Gradient and Hessian of log likelihood.""" - z = self._z(beta0, beta, X) mu = self.mu(z) grad_mu = expit(z) hess_mu = np.exp(-z) * expit(z) ** 2 - + theta = self.theta gradient_beta_j = -grad_mu * (y / mu - (y + theta) / (mu + theta)) partial_beta_0_1 = hess_mu * (y / mu - (y + theta) / (mu + theta)) partial_beta_0_2 = grad_mu ** 2 * \ @@ -244,9 +241,8 @@ def grad_log_likelihood(self, X, y, beta0, beta): grad_beta = np.dot((mu - y).T, X).T return grad_beta0, grad_beta - def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): + def gradhess_log_likelihood_1d(self, xk, y, z): """One-dimensional Gradient and Hessian of log likelihood.""" - z = self._z(beta0, beta, X) mu = self.mu(z) gk = np.sum((mu - y) * xk) hk = np.sum(mu * (1.0 - mu) * xk * xk) @@ -337,9 +333,8 @@ def grad_log_likelihood(self, X, y, beta0, beta): grad_beta = np.dot((mu - y).T, X).T return grad_beta0, grad_beta - def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): + def gradhess_log_likelihood_1d(self, xk, y, z): """One-dimensional Gradient and Hessian of log likelihood.""" - z = self._z(beta0, beta, X) pdfz = norm.pdf(z) cdfz = norm.cdf(z) gk = -np.sum((y * self._probit_g3(z, pdfz, cdfz) - @@ -381,7 +376,7 @@ def grad_log_likelihood(self, X, y, beta0, beta): grad_beta = -nu * np.dot(grad_logl.T, X).T return grad_beta0, grad_beta - def gradhess_log_likelihood_1d(self, xk, y, beta0, beta): + def gradhess_log_likelihood_1d(self, xk, y, z): """One-dimensional Gradient and Hessian of log likelihood.""" raise NotImplementedError('cdfast is not implemented for Gamma ' 'distribution') From 
6a8b7f403dd17951895dc96f41a4e46f1ac55f0a Mon Sep 17 00:00:00 2001 From: Pavan Ramkumar Date: Tue, 18 Aug 2020 20:21:59 -0700 Subject: [PATCH 12/24] make most tests pass --- pyglmnet/pyglmnet.py | 55 ++++++++++++++++++++++--------------- tests/test_distributions.py | 3 +- tests/test_pyglmnet.py | 10 +++---- 3 files changed, 40 insertions(+), 28 deletions(-) diff --git a/pyglmnet/pyglmnet.py b/pyglmnet/pyglmnet.py index 6b005752..55425a65 100644 --- a/pyglmnet/pyglmnet.py +++ b/pyglmnet/pyglmnet.py @@ -465,9 +465,19 @@ def simulate_glm(distr, beta0, beta, X, eta=2.0, random_state=None, y: array simulated target data of shape (n_samples,) """ - if distr not in ALLOWED_DISTRS: - raise ValueError("'distr' must be in %s, got %s" - % (repr(ALLOWED_DISTRS), distr)) + err_msg = ('distr must be one of %s or a subclass of BaseDistribution. ' + 'Got %s' % (', '.join(ALLOWED_DISTRS), distr)) + if isinstance(distr, str) and distr not in ALLOWED_DISTRS: + raise ValueError(err_msg) + if not isinstance(distr, str) and not isinstance(distr, BaseDistribution): + raise TypeError(err_msg) + + if isinstance(distr, str): + distr = _get_distr(distr) + if isinstance(distr, Poisson): + distr.eta = eta + if isinstance(distr, NegBinomialSoftplus): + distr.theta = theta if not isinstance(beta0, float): raise ValueError("'beta0' must be float, got %s" % type(beta0)) @@ -475,28 +485,28 @@ def simulate_glm(distr, beta0, beta, X, eta=2.0, random_state=None, if beta.ndim != 1: raise ValueError("'beta' must be 1D, got %dD" % beta.ndim) + if not fit_intercept: + beta0 = 0. + + mu = distr.mu(distr._z(beta0, beta, X)) + if not sample: - return _lmb(distr, beta0, beta, X, eta, fit_intercept=fit_intercept) + return mu _random_state = check_random_state(random_state) - if distr == 'softplus' or distr == 'poisson': - y = _random_state.poisson( - _lmb(distr, beta0, beta, X, eta, - fit_intercept=fit_intercept)) - if distr == 'gaussian': - y = _random_state.normal( - _lmb(distr, beta0, beta, X, eta, - fit_intercept=fit_intercept)) - if distr == 'binomial' or distr == 'probit': - y = _random_state.binomial( - 1, - _lmb(distr, beta0, beta, X, eta, - fit_intercept=fit_intercept)) - if distr == 'gamma': - mu = _lmb(distr, beta0, beta, X, eta, fit_intercept=fit_intercept) + if isinstance(distr, (PoissonSoftplus, Poisson)): + y = _random_state.poisson(mu) + + if isinstance(distr, Gaussian): + y = _random_state.normal(mu) + + if isinstance(distr, (Binomial, Probit)): + y = _random_state.binomial(1, mu) + + if isinstance(distr, GammaSoftplus): y = np.exp(mu) - if distr == 'neg-binomial': - mu = _lmb(distr, beta0, beta, X, eta, fit_intercept=fit_intercept) + + if isinstance(distr, NegBinomialSoftplus): p = theta / (theta + mu) # Probability of success y = _random_state.negative_binomial(theta, p) return y @@ -838,7 +848,8 @@ def _cdfast(self, X, y, ActiveSet, beta, rl, fit_intercept=True): update = 1. / hk * gk if hk > 1. 
else self.learning_rate * gk # Update parameters, z - beta[k], z = beta[k] - update, z - update * xk + # beta[k], z = beta[k] - update, z - update * xk + beta[k] = beta[k] - update return beta def fit(self, X, y): diff --git a/tests/test_distributions.py b/tests/test_distributions.py index 8ab48929..e14dc0ce 100644 --- a/tests/test_distributions.py +++ b/tests/test_distributions.py @@ -42,6 +42,7 @@ def log_likelihood(): with pytest.raises(TypeError, match=msg): GLM(distr=['gaussian']) + @pytest.mark.parametrize("distr", ALLOWED_DISTRS) def test_gradients(distr): """Test gradient accuracy.""" @@ -63,7 +64,7 @@ def test_gradients(distr): func = partial(_L2loss, distr, glm.alpha, glm.Tau, reg_lambda, X, y, glm.eta, glm.theta, glm.group) - grad = partial(_grad_L2loss, distr, glm.alpha, glm.Tau, + grad = partial(_grad_L2loss, glm.distr, glm.alpha, glm.Tau, reg_lambda, X, y, glm.eta, glm.theta) approx_grad = approx_fprime(beta_, func, 1.5e-8) diff --git a/tests/test_pyglmnet.py b/tests/test_pyglmnet.py index 7acc02f4..f7ad73b0 100644 --- a/tests/test_pyglmnet.py +++ b/tests/test_pyglmnet.py @@ -17,7 +17,7 @@ from sklearn.utils.estimator_checks import check_estimator from pyglmnet import (GLM, GLMCV, simulate_glm, - _gradhess_logloss_1d, _loss, datasets, ALLOWED_DISTRS) + _loss, datasets, ALLOWED_DISTRS) matplotlib.use('agg') @@ -343,7 +343,9 @@ def test_cdfast(distr): z = beta_[0] + np.dot(X, beta_[1:]) k = 1 xk = X[:, k - 1] - gk, hk = _gradhess_logloss_1d(glm.distr, xk, y, z, glm.eta, glm.theta) + gk, hk = glm.distr.gradhess_log_likelihood_1d(xk, y, z) + gk = 1. / n_samples * gk + hk = 1. / n_samples * hk # test grad and hess if distr != 'multinomial': @@ -407,7 +409,6 @@ def test_random_state_consistency(): @pytest.mark.parametrize("distr", ALLOWED_DISTRS) def test_simulate_glm(distr): """Test that every generative model can be simulated from.""" - random_state = 1 state = np.random.RandomState(random_state) n_samples, n_features = 10, 3 @@ -427,13 +428,12 @@ def test_simulate_glm(distr): # If the distribution name is garbage it will fail distr = 'multivariate_gaussian_poisson' - with pytest.raises(ValueError, match="'distr' must be in"): + with pytest.raises(ValueError): simulate_glm(distr, 1.0, 1.0, np.array([[1.0]])) def test_api_input(): """Test that the input value of y can be of different types.""" - random_state = 1 state = np.random.RandomState(random_state) n_samples, n_features = 100, 5 From 051c1d93a871e775d4a8b1a374369ef50cb54922 Mon Sep 17 00:00:00 2001 From: Pavan Ramkumar Date: Tue, 18 Aug 2020 21:54:05 -0700 Subject: [PATCH 13/24] re-add gradhess_log_likelihood_1d for softplus --- pyglmnet/distributions.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pyglmnet/distributions.py b/pyglmnet/distributions.py index 121bda81..0e200a56 100644 --- a/pyglmnet/distributions.py +++ b/pyglmnet/distributions.py @@ -155,6 +155,17 @@ def grad_mu(self, z): grad_mu = expit(z) return grad_mu + def gradhess_log_likelihood_1d(self, xk, y, z): + """One-dimensional Gradient and Hessian of log likelihood.""" + mu = self.mu(z) + s = expit(z) + gk = np.sum(s * xk) - np.sum(y * s / mu * xk) + + grad_s = s * (1 - s) + grad_s_by_mu = grad_s / mu - s / (mu ** 2) + hk = np.sum(grad_s * xk ** 2) - np.sum(y * grad_s_by_mu * xk ** 2) + return gk, hk + class NegBinomialSoftplus(BaseDistribution): """Class for Negative binomial distribution with softplus inverse link.""" From e0c2f3107fe000f274d683bb66a016f5b902fc3e Mon Sep 17 00:00:00 2001 From: Pavan Ramkumar Date: Tue, 18 Aug 2020 21:54:31 
-0700 Subject: [PATCH 14/24] bug in cdfast update for z --- pyglmnet/pyglmnet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyglmnet/pyglmnet.py b/pyglmnet/pyglmnet.py index 55425a65..c5edf0d6 100644 --- a/pyglmnet/pyglmnet.py +++ b/pyglmnet/pyglmnet.py @@ -848,8 +848,8 @@ def _cdfast(self, X, y, ActiveSet, beta, rl, fit_intercept=True): update = 1. / hk * gk if hk > 1. else self.learning_rate * gk # Update parameters, z - # beta[k], z = beta[k] - update, z - update * xk - beta[k] = beta[k] - update + beta[k], z = beta[k] - update, z - update * xk + return beta def fit(self, X, y): From e6e6d227982c16acfa53f58b5d444aa5ae2a57c4 Mon Sep 17 00:00:00 2001 From: Pavan Ramkumar Date: Tue, 18 Aug 2020 21:54:57 -0700 Subject: [PATCH 15/24] style fix for test_compare_sklearn --- tests/test_pyglmnet.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_pyglmnet.py b/tests/test_pyglmnet.py index f7ad73b0..08c8afd4 100644 --- a/tests/test_pyglmnet.py +++ b/tests/test_pyglmnet.py @@ -285,7 +285,7 @@ def test_compare_sklearn(solver): def rmse(a, b): return np.sqrt(np.mean((a - b) ** 2)) - X, Y, coef_ = make_regression( + X, y, coef_ = make_regression( n_samples=1000, n_features=500, noise=0.1, n_informative=10, coef=True, random_state=42) @@ -294,18 +294,18 @@ def rmse(a, b): l1_ratio = 0.5 clf = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, tol=1e-5) - clf.fit(X, Y) + clf.fit(X, y) glm = GLM(distr='gaussian', alpha=l1_ratio, reg_lambda=alpha, solver=solver, tol=1e-6, max_iter=500) - glm.fit(X, Y) + glm.fit(X, y) y_sk = clf.predict(X) y_pg = glm.predict(X) - assert abs(rmse(Y, y_sk) - rmse(Y, y_pg)) < 0.5 + assert abs(rmse(y, y_sk) - rmse(y, y_pg)) < 0.5 glm = GLM(distr='gaussian', alpha=l1_ratio, reg_lambda=alpha, solver=solver, tol=1e-6, max_iter=5, fit_intercept=False) - glm.fit(X, Y) + glm.fit(X, y) assert glm.beta0_ == 0. 
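Note on PATCH 14 above: restoring the joint update of beta[k] and the cached linear predictor z is what makes _cdfast a true cyclic coordinate descent; with the z update dropped, every coordinate after the first evaluates its gradient against a stale predictor. A minimal standalone sketch of the pattern, assuming a hypothetical grad_hess_1d callable and a name (cd_sweep) that does not exist in pyglmnet:

    import numpy as np

    def cd_sweep(X, y, beta, grad_hess_1d, learning_rate=2e-1):
        """One cyclic coordinate-descent pass over all features."""
        z = np.dot(X, beta)  # cached linear predictor
        for k in range(beta.shape[0]):
            xk = X[:, k]
            gk, hk = grad_hess_1d(xk, y, z)
            update = 1. / hk * gk if hk > 1. else learning_rate * gk
            # The fix: move beta[k] and z together, so the next
            # coordinate's gradient is taken at the current beta.
            beta[k], z = beta[k] - update, z - update * xk
        return beta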
glm.predict(X) From 888e5b87a09a6d5a6c85c5ad079762d197db2efb Mon Sep 17 00:00:00 2001 From: Pavan Ramkumar Date: Tue, 18 Aug 2020 22:40:47 -0700 Subject: [PATCH 16/24] private method _set_distr derives self.distr_ object from self.distr str --- pyglmnet/pyglmnet.py | 95 +++++++++++++++---------------------- pyglmnet/utils.py | 4 +- tests/test_distributions.py | 5 +- tests/test_pyglmnet.py | 3 +- 4 files changed, 44 insertions(+), 63 deletions(-) diff --git a/pyglmnet/pyglmnet.py b/pyglmnet/pyglmnet.py index c5edf0d6..50d9dee4 100644 --- a/pyglmnet/pyglmnet.py +++ b/pyglmnet/pyglmnet.py @@ -22,33 +22,6 @@ 'probit', 'gamma', 'neg-binomial'] -def _get_distr(distr): - if isinstance(distr, BaseDistribution): - return distr - - elif distr == 'gaussian': - distr = Gaussian() - - elif distr == 'poisson': - distr = Poisson() - - elif distr == 'softplus': - distr = PoissonSoftplus() - - elif distr == 'neg-binomial': - distr = NegBinomialSoftplus() - - elif distr == 'binomial': - distr = Binomial() - - elif distr == 'probit': - distr = Probit() - - elif distr == 'gamma': - distr = GammaSoftplus() - return distr - - def _probit_g1(z, pdfz, cdfz, thresh=5): res = np.zeros_like(z) res[z < -thresh] = np.log(-pdfz[z < -thresh] / z[z < -thresh]) @@ -472,12 +445,8 @@ def simulate_glm(distr, beta0, beta, X, eta=2.0, random_state=None, if not isinstance(distr, str) and not isinstance(distr, BaseDistribution): raise TypeError(err_msg) - if isinstance(distr, str): - distr = _get_distr(distr) - if isinstance(distr, Poisson): - distr.eta = eta - if isinstance(distr, NegBinomialSoftplus): - distr.theta = theta + glm = GLM(distr=distr) + glm._set_distr() if not isinstance(beta0, float): raise ValueError("'beta0' must be float, got %s" % type(beta0)) @@ -488,7 +457,7 @@ def simulate_glm(distr, beta0, beta, X, eta=2.0, random_state=None, if not fit_intercept: beta0 = 0. - mu = distr.mu(distr._z(beta0, beta, X)) + mu = glm.distr_.mu(glm.distr_._z(beta0, beta, X)) if not sample: return mu @@ -652,11 +621,7 @@ def __init__(self, distr='poisson', alpha=0.5, _check_params(distr=distr, max_iter=max_iter, fit_intercept=fit_intercept) - self.distr = _get_distr(distr) - if isinstance(self.distr, Poisson): - self.distr.eta = eta - if isinstance(self.distr, NegBinomialSoftplus): - self.distr.theta = theta + self.distr = distr self.alpha = alpha self.reg_lambda = reg_lambda self.Tau = Tau @@ -674,9 +639,27 @@ def __init__(self, distr='poisson', alpha=0.5, self.theta = theta set_log_level(verbose) + def _set_distr(self): + distr_lookup = { + 'gaussian': Gaussian(), + 'poisson': Poisson(), + 'softplus': PoissonSoftplus(), + 'neg-binomial': NegBinomialSoftplus(), + 'binomial': Binomial(), + 'probit': Probit(), + 'gamma': GammaSoftplus() + } + self.distr_ = distr_lookup[self.distr] + if isinstance(self.distr_, Poisson): + self.distr_.eta = self.eta + if isinstance(self.distr_, NegBinomialSoftplus): + self.distr_.theta = self.theta + def _set_cv(cv, estimator=None, X=None, y=None): - """Set the default CV depending on whether clf - is classifier/regressor.""" + """Set the default CV. + + Depends on whether clf is classifier/regressor. 
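Note on PATCH 16 above: __init__ now stores the raw distr string untouched and the lookup is deferred to the private _set_distr, called at fit time. This keeps the estimator friendly to scikit-learn-style get_params/set_params cloning, since no constructor argument is transformed inside __init__, and it puts the resolved object in the conventionally named fitted attribute distr_. A reduced sketch of the pattern (SketchGLM is a stand-in, not pyglmnet's GLM; Gaussian and Poisson are the classes from this series):

    from pyglmnet.distributions import Gaussian, Poisson

    class SketchGLM:
        def __init__(self, distr='gaussian', eta=2.0):
            self.distr = distr  # keep the constructor argument as-is
            self.eta = eta

        def _set_distr(self):
            lookup = {'gaussian': Gaussian, 'poisson': Poisson}
            self.distr_ = lookup[self.distr]()  # trailing underscore: set at fit
            if isinstance(self.distr_, Poisson):
                self.distr_.eta = self.eta

        def fit(self, X, y):
            self._set_distr()  # resolve the string exactly once, at fit time
            return self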
+ """ # Detect whether classification or regression if estimator in ['classifier', 'regressor']: est_is_classifier = estimator == 'classifier' @@ -823,7 +806,7 @@ def _cdfast(self, X, y, ActiveSet, beta, rl, fit_intercept=True): # Calculate grad and hess of log likelihood term # gk, hk = _gradhess_logloss_1d(self.distr, xk, y, z, self.eta, # self.theta, fit_intercept) - gk, hk = self.distr.gradhess_log_likelihood_1d(xk, y, z) + gk, hk = self.distr_.gradhess_log_likelihood_1d(xk, y, z) gk = 1. / n_samples * gk hk = 1. / n_samples * hk @@ -849,7 +832,7 @@ def _cdfast(self, X, y, ActiveSet, beta, rl, fit_intercept=True): # Update parameters, z beta[k], z = beta[k] - update, z - update * xk - + return beta def fit(self, X, y): @@ -869,7 +852,7 @@ def fit(self, X, y): The fitted model. """ X, y = check_X_y(X, y, accept_sparse=False) - + self._set_distr() self.beta0_ = None self.beta_ = None self.ynull_ = None @@ -942,7 +925,7 @@ def fit(self, X, y): self.n_iter_ += 1 beta_old = beta.copy() if self.solver == 'batch-gradient': - grad = _grad_L2loss(self.distr, + grad = _grad_L2loss(self.distr_, alpha, self.Tau, reg_lambda, X, y, self.eta, self.theta, beta, self.fit_intercept) @@ -1056,10 +1039,10 @@ def predict(self, X): # yhat = _lmb(self.distr, self.beta0_, self.beta_, X, self.eta, # fit_intercept=True) - z = self.distr._z(self.beta0_, self.beta_, X) - yhat = self.distr.mu(z) + z = self.distr_._z(self.beta0_, self.beta_, X) + yhat = self.distr_.mu(z) - if isinstance(self.distr, Binomial): + if isinstance(self.distr_, Binomial): yhat = (yhat > 0.5).astype(int) yhat = np.asarray(yhat) return yhat @@ -1079,7 +1062,7 @@ def _predict_proba(self, X): """ # if self.distr not in ['binomial', 'probit']: - if not isinstance(self.distr, (Binomial, Probit)): + if not isinstance(self.distr_, (Binomial, Probit)): raise ValueError('This is only applicable for \ the binomial distribution.') @@ -1089,8 +1072,8 @@ def _predict_proba(self, X): # yhat = _lmb(self.distr, self.beta0_, self.beta_, X, self.eta, # fit_intercept=True) - z = self.distr._z(self.beta0_, self.beta_, X) - yhat = self.distr.mu(z) + z = self.distr_._z(self.beta0_, self.beta_, X) + yhat = self.distr_.mu(z) yhat = np.asarray(yhat) return yhat @@ -1117,7 +1100,7 @@ def predict_proba(self, X): check_is_fitted(self, 'is_fitted_') # if self.distr in ['binomial', 'probit']: - if isinstance(self.distr, (Binomial, Probit)): + if isinstance(self.distr_, (Binomial, Probit)): return self._predict_proba(X) else: warnings.warn('This is only applicable for \ @@ -1173,7 +1156,7 @@ def score(self, X, y): # For f1 as well if self.score_metric in ['accuracy']: # if self.distr not in ['binomial', 'multinomial']: - if not isinstance(self.distr, (Binomial, Probit)): + if not isinstance(self.distr_, (Binomial, Probit)): raise ValueError(self.score_metric + ' is only defined for binomial ' + 'or multinomial distributions') @@ -1181,7 +1164,7 @@ def score(self, X, y): y = np.asarray(y).ravel() # if self.distr in ['binomial', 'probit'] and \ - if isinstance(self.distr, (Binomial, Probit)) and \ + if isinstance(self.distr_, (Binomial, Probit)) and \ self.score_metric != 'accuracy': yhat = self.predict_proba(X) else: @@ -1189,10 +1172,10 @@ def score(self, X, y): # Check whether we have a list of estimators or a single estimator if self.score_metric == 'deviance': - return metrics.deviance(y, yhat, self.distr, self.theta) + return metrics.deviance(y, yhat, self.distr_, self.theta) elif self.score_metric == 'pseudo_R2': return metrics.pseudo_R2(X, y, yhat, self.ynull_, - 
self.distr, self.theta) + self.distr_, self.theta) if self.score_metric == 'accuracy': return metrics.accuracy(y, yhat) diff --git a/pyglmnet/utils.py b/pyglmnet/utils.py index 0622f441..82df0482 100644 --- a/pyglmnet/utils.py +++ b/pyglmnet/utils.py @@ -105,10 +105,8 @@ def _check_params(distr, max_iter, fit_intercept): err_msg = ('distr must be one of %s or a subclass of BaseDistribution. ' 'Got %s' % (', '.join(ALLOWED_DISTRS), distr)) - if isinstance(distr, str) and distr not in ALLOWED_DISTRS: + if distr not in ALLOWED_DISTRS: raise ValueError(err_msg) - if not isinstance(distr, str) and not isinstance(distr, BaseDistribution): - raise TypeError(err_msg) if not isinstance(max_iter, int): raise ValueError('max_iter must be of type int') diff --git a/tests/test_distributions.py b/tests/test_distributions.py index e14dc0ce..d9c0c1c6 100644 --- a/tests/test_distributions.py +++ b/tests/test_distributions.py @@ -39,8 +39,6 @@ def log_likelihood(): msg = 'distr must be one of' with pytest.raises(ValueError, match=msg): GLM(distr='blah') - with pytest.raises(TypeError, match=msg): - GLM(distr=['gaussian']) @pytest.mark.parametrize("distr", ALLOWED_DISTRS) @@ -60,11 +58,12 @@ def test_gradients(distr): reg_lambda = 0.1 glm = GLM(distr=distr, reg_lambda=reg_lambda) + glm._set_distr() y = simulate_glm(glm.distr, beta_[0], beta_[1:], X) func = partial(_L2loss, distr, glm.alpha, glm.Tau, reg_lambda, X, y, glm.eta, glm.theta, glm.group) - grad = partial(_grad_L2loss, glm.distr, glm.alpha, glm.Tau, + grad = partial(_grad_L2loss, glm.distr_, glm.alpha, glm.Tau, reg_lambda, X, y, glm.eta, glm.theta) approx_grad = approx_fprime(beta_, func, 1.5e-8) diff --git a/tests/test_pyglmnet.py b/tests/test_pyglmnet.py index 08c8afd4..490a495b 100644 --- a/tests/test_pyglmnet.py +++ b/tests/test_pyglmnet.py @@ -325,6 +325,7 @@ def test_cdfast(distr): return glm = GLM(distr, solver='cdfast') + glm._set_distr() np.random.seed(glm.random_state) @@ -343,7 +344,7 @@ def test_cdfast(distr): z = beta_[0] + np.dot(X, beta_[1:]) k = 1 xk = X[:, k - 1] - gk, hk = glm.distr.gradhess_log_likelihood_1d(xk, y, z) + gk, hk = glm.distr_.gradhess_log_likelihood_1d(xk, y, z) gk = 1. / n_samples * gk hk = 1. / n_samples * hk From d33c6cadcce75cf58f80500e37b58717878f0ac5 Mon Sep 17 00:00:00 2001 From: Pavan Ramkumar Date: Tue, 18 Aug 2020 23:22:47 -0700 Subject: [PATCH 17/24] remove legacy functions --- pyglmnet/__init__.py | 2 +- pyglmnet/distributions.py | 2 +- pyglmnet/metrics.py | 10 +- pyglmnet/pyglmnet.py | 321 +++--------------------------------- pyglmnet/utils.py | 5 +- tests/test_distributions.py | 2 +- tests/test_pyglmnet.py | 7 +- 7 files changed, 33 insertions(+), 316 deletions(-) diff --git a/pyglmnet/__init__.py b/pyglmnet/__init__.py index 89ae6e23..028a9030 100644 --- a/pyglmnet/__init__.py +++ b/pyglmnet/__init__.py @@ -19,7 +19,7 @@ __version__ = '1.2.dev0' -from .pyglmnet import GLM, GLMCV, _grad_L2loss, _L2loss, simulate_glm, _gradhess_logloss_1d, _loss, ALLOWED_DISTRS +from .pyglmnet import GLM, GLMCV, _grad_L2loss, _L2loss, simulate_glm, _loss, ALLOWED_DISTRS from .utils import softmax, label_binarizer, set_log_level from .datasets import fetch_tikhonov_data, fetch_rgc_spike_trains, fetch_community_crime_data, fetch_group_lasso_data from . 
import externals diff --git a/pyglmnet/distributions.py b/pyglmnet/distributions.py index 0e200a56..ca2f7021 100644 --- a/pyglmnet/distributions.py +++ b/pyglmnet/distributions.py @@ -34,7 +34,7 @@ def grad_mu(self, z): pass @abstractmethod - def log_likelihood(self, y, y_hat): + def log_likelihood(self, y, y_hat, z=None): """Log L2-penalized likelihood.""" pass diff --git a/pyglmnet/metrics.py b/pyglmnet/metrics.py index 28c018cc..d97754bc 100644 --- a/pyglmnet/metrics.py +++ b/pyglmnet/metrics.py @@ -1,7 +1,6 @@ -""" A set of scoring functions. """ +"""A set of scoring functions.""" import numpy as np -from .pyglmnet import _logL from .distributions import Poisson, PoissonSoftplus, NegBinomialSoftplus @@ -24,8 +23,6 @@ def deviance(y, yhat, distr, theta): score : float Deviance of the predicted labels. """ - # if distr in ['softplus', 'poisson', 'neg-binomial']: - # LS = _logL(distr, y, y, theta=theta) if isinstance(distr, (Poisson, PoissonSoftplus, NegBinomialSoftplus)): LS = distr.log_likelihood(y, y) else: @@ -59,19 +56,14 @@ def pseudo_R2(X, y, yhat, ynull_, distr, theta): score : float Pseudo-R2 score. """ - # if distr in ['softplus', 'poisson', 'neg-binomial']: - # LS = _logL(distr, y, y, theta=theta) if isinstance(distr, (Poisson, PoissonSoftplus, NegBinomialSoftplus)): LS = distr.log_likelihood(y, y) else: LS = 0 - # L0 = _logL(distr, y, ynull_, theta=theta) - # L1 = _logL(distr, y, yhat, theta=theta) L0 = distr.log_likelihood(y, ynull_) L1 = distr.log_likelihood(y, yhat) - # if distr in ['softplus', 'poisson', 'neg-binomial']: if isinstance(distr, (Poisson, PoissonSoftplus, NegBinomialSoftplus)): score = (1 - (LS - L1) / (LS - L0)) else: diff --git a/pyglmnet/pyglmnet.py b/pyglmnet/pyglmnet.py index 50d9dee4..8471181a 100644 --- a/pyglmnet/pyglmnet.py +++ b/pyglmnet/pyglmnet.py @@ -4,8 +4,6 @@ from copy import deepcopy import numpy as np -from scipy.special import expit, loggamma -from scipy.stats import norm from .utils import logger, set_log_level, _check_params, \ _verbose_iterable, _tqdm_log @@ -15,154 +13,12 @@ from .externals.sklearn.utils.validation import check_is_fitted from .distributions import BaseDistribution, Gaussian, Poisson, \ - PoissonSoftplus, NegBinomialSoftplus, Binomial, Probit, GammaSoftplus, \ - softplus + PoissonSoftplus, NegBinomialSoftplus, Binomial, Probit, GammaSoftplus ALLOWED_DISTRS = ['gaussian', 'binomial', 'softplus', 'poisson', 'probit', 'gamma', 'neg-binomial'] -def _probit_g1(z, pdfz, cdfz, thresh=5): - res = np.zeros_like(z) - res[z < -thresh] = np.log(-pdfz[z < -thresh] / z[z < -thresh]) - res[np.abs(z) <= thresh] = np.log(cdfz[np.abs(z) <= thresh]) - res[z > thresh] = -pdfz[z > thresh] / z[z > thresh] - return res - - -def _probit_g2(z, pdfz, cdfz, thresh=5): - res = np.zeros_like(z) - res[z < -thresh] = pdfz[z < -thresh] / z[z < -thresh] - res[np.abs(z) <= thresh] = np.log(1 - cdfz[np.abs(z) <= thresh]) - res[z > thresh] = np.log(pdfz[z > thresh] / z[z > thresh]) - return res - - -def _probit_g3(z, pdfz, cdfz, thresh=5): - res = np.zeros_like(z) - res[z < -thresh] = -z[z < -thresh] - res[np.abs(z) <= thresh] = \ - pdfz[np.abs(z) <= thresh] / cdfz[np.abs(z) <= thresh] - res[z > thresh] = pdfz[z > thresh] - return res - - -def _probit_g4(z, pdfz, cdfz, thresh=5): - res = np.zeros_like(z) - res[z < -thresh] = pdfz[z < -thresh] - res[np.abs(z) <= thresh] = \ - pdfz[np.abs(z) <= thresh] / (1 - cdfz[np.abs(z) <= thresh]) - res[z > thresh] = z[z > thresh] - return res - - -def _probit_g5(z, pdfz, cdfz, thresh=5): - res = np.zeros_like(z) - res[z 
< -thresh] = 0 * z[z < -thresh] - res[np.abs(z) <= thresh] = \ - z[np.abs(z) <= thresh] * pdfz[np.abs(z) <= thresh] / \ - cdfz[np.abs(z) <= thresh] + (pdfz[np.abs(z) <= thresh] / - cdfz[np.abs(z) <= thresh]) ** 2 - res[z > thresh] = z[z > thresh] * pdfz[z > thresh] + pdfz[z > thresh] ** 2 - return res - - -def _probit_g6(z, pdfz, cdfz, thresh=5): - res = np.zeros_like(z) - res[z < -thresh] = \ - pdfz[z < -thresh] ** 2 - z[z < -thresh] * pdfz[z < -thresh] - res[np.abs(z) <= thresh] = \ - (pdfz[np.abs(z) <= thresh] / (1 - cdfz[np.abs(z) <= thresh])) ** 2 - \ - z[np.abs(z) <= thresh] * pdfz[np.abs(z) <= thresh] / \ - (1 - cdfz[np.abs(z) <= thresh]) - res[z > thresh] = 0 * z[z > thresh] - return res - - -def _z(beta0, beta, X, fit_intercept): - """Compute z to be passed through non-linearity.""" - if fit_intercept: - z = beta0 + np.dot(X, beta) - else: - z = np.dot(X, np.r_[beta0, beta]) - return z - - -def _lmb(distr, beta0, beta, X, eta, fit_intercept=True): - """Conditional intensity function.""" - z = _z(beta0, beta, X, fit_intercept) - return _mu(distr, z, eta, fit_intercept) - - -def _mu(distr, z, eta, fit_intercept): - """The non-linearity (inverse link).""" - if distr in ['softplus', 'gamma', 'neg-binomial']: - mu = softplus(z) - elif distr == 'poisson': - mu = z.copy() - beta0 = (1 - eta) * np.exp(eta) if fit_intercept else 0. - mu[z > eta] = z[z > eta] * np.exp(eta) + beta0 - mu[z <= eta] = np.exp(z[z <= eta]) - elif distr == 'gaussian': - mu = z - elif distr == 'binomial': - mu = expit(z) - elif distr == 'probit': - mu = norm.cdf(z) - return mu - - -def _grad_mu(distr, z, eta): - """Derivative of the non-linearity.""" - if distr in ['softplus', 'gamma', 'neg-binomial']: - grad_mu = expit(z) - elif distr == 'poisson': - grad_mu = z.copy() - grad_mu[z > eta] = np.ones_like(z)[z > eta] * np.exp(eta) - grad_mu[z <= eta] = np.exp(z[z <= eta]) - elif distr == 'gaussian': - grad_mu = np.ones_like(z) - elif distr == 'binomial': - grad_mu = expit(z) * (1 - expit(z)) - elif distr in 'probit': - grad_mu = norm.pdf(z) - return grad_mu - - -def _logL(distr, y, y_hat, z=None, theta=1.0): - """The log likelihood.""" - if distr in ['softplus', 'poisson']: - eps = np.spacing(1) - logL = np.sum(y * np.log(y_hat + eps) - y_hat) - elif distr == 'gaussian': - logL = -0.5 * np.sum((y - y_hat)**2) - elif distr == 'binomial': - - # prevents underflow - if z is not None: - logL = np.sum(y * z - np.log(1 + np.exp(z))) - # for scoring - else: - logL = np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) - elif distr == 'probit': - if z is not None: - pdfz, cdfz = norm.pdf(z), norm.cdf(z) - logL = np.sum(y * _probit_g1(z, pdfz, cdfz) + - (1 - y) * _probit_g2(z, pdfz, cdfz)) - else: - logL = np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) - elif distr == 'gamma': - # see - # https://www.statistics.ma.tum.de/fileadmin/w00bdb/www/czado/lec8.pdf - nu = 1. 
# shape parameter, exponential for now - logL = np.sum(nu * (-y / y_hat - np.log(y_hat))) - elif distr == 'neg-binomial': - logL = np.sum(loggamma(y + theta) - loggamma(theta) - loggamma(y + 1) + - theta * np.log(theta) + y * np.log(y_hat) - (theta + y) * - np.log(y_hat + theta)) - return logL - - def _penalty(alpha, beta, Tau, group): """The penalty.""" # Combine L1 and L2 penalty terms @@ -209,9 +65,15 @@ def _loss(distr, alpha, Tau, reg_lambda, X, y, eta, theta, group, beta, fit_intercept=True): """Define the objective function for elastic net.""" n_samples, n_features = X.shape - z = _z(beta[0], beta[1:], X, fit_intercept) - y_hat = _mu(distr, z, eta, fit_intercept) - L = 1. / n_samples * _logL(distr, y, y_hat, z, theta) + if fit_intercept: + z = distr._z(beta[0], beta[1:], X) + else: + z = distr._z(0., beta, X) + y_hat = distr.mu(z) + if isinstance(distr, (Binomial, Probit)): + L = 1. / n_samples * distr.log_likelihood(y, y_hat, z) + else: + L = 1. / n_samples * distr.log_likelihood(y, y_hat) if fit_intercept: P = _penalty(alpha, beta[1:], Tau, group) else: @@ -224,9 +86,15 @@ def _L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, theta, group, beta, fit_intercept=True): """Define the objective function for elastic net.""" n_samples, n_features = X.shape - z = _z(beta[0], beta[1:], X, fit_intercept) - y_hat = _mu(distr, z, eta, fit_intercept) - L = 1. / n_samples * _logL(distr, y, y_hat, z, theta) + if fit_intercept: + z = distr._z(beta[0], beta[1:], X) + else: + z = distr._z(0., beta, X) + y_hat = distr.mu(z) + if isinstance(distr, (Binomial, Probit)): + L = 1. / n_samples * distr.log_likelihood(y, y_hat, z) + else: + L = 1. / n_samples * distr.log_likelihood(y, y_hat) if fit_intercept: P = 0.5 * (1 - alpha) * _L2penalty(beta[1:], Tau) else: @@ -248,10 +116,6 @@ def _grad_L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, theta, beta, Tau = np.eye(beta.shape[0]) InvCov = np.dot(Tau.T, Tau) - # z = _z(beta[0], beta[1:], X, fit_intercept) - # mu = _mu(distr, z, eta, fit_intercept) - # grad_mu = _grad_mu(distr, z, eta) - if fit_intercept: beta0_, beta_ = beta[0], beta[1:] else: @@ -259,45 +123,6 @@ def _grad_L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, theta, beta, grad_beta0, grad_beta = distr.grad_log_likelihood(X, y, beta0_, beta_) grad_beta0 = grad_beta0 if fit_intercept else 0. - # grad_beta0 = 0. - # if distr in ['poisson', 'softplus']: - # if fit_intercept: - # grad_beta0 = np.sum(grad_mu) - np.sum(y * grad_mu / mu) - # grad_beta = ((np.dot(grad_mu.T, X) - - # np.dot((y * grad_mu / mu).T, X)).T) - # - # elif distr == 'gaussian': - # if fit_intercept: - # grad_beta0 = np.sum((mu - y) * grad_mu) - # grad_beta = np.dot((mu - y).T, X * grad_mu[:, None]).T - # - # elif distr == 'binomial': - # if fit_intercept: - # grad_beta0 = np.sum(mu - y) - # grad_beta = np.dot((mu - y).T, X).T - # - # elif distr == 'probit': - # grad_logl = (y * _probit_g3(z, grad_mu, mu) - - # (1 - y) * _probit_g4(z, grad_mu, mu)) - # if fit_intercept: - # grad_beta0 = -np.sum(grad_logl) - # grad_beta = -np.dot(grad_logl.T, X).T - # - # elif distr == 'gamma': - # nu = 1. - # grad_logl = (y / mu ** 2 - 1 / mu) * grad_mu - # if fit_intercept: - # grad_beta0 = -nu * np.sum(grad_logl) - # grad_beta = -nu * np.dot(grad_logl.T, X).T - # - # elif distr == 'neg-binomial': - # partial_beta_0 = grad_mu * ((theta + y) / (mu + theta) - y / mu) - # - # if fit_intercept: - # grad_beta0 = np.sum(partial_beta_0) - # - # grad_beta = np.dot(partial_beta_0.T, X) - grad_beta0 *= 1. / n_samples grad_beta *= 1. 
/ n_samples @@ -312,104 +137,6 @@ def _grad_L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, theta, beta, return g -def _gradhess_logloss_1d(distr, xk, y, z, eta, theta, fit_intercept=True): - """ - Compute gradient (1st derivative) - and Hessian (2nd derivative) - of log likelihood for a single coordinate. - - Parameters - ---------- - xk: float - (n_samples,) - y: float - (n_samples,) - z: float - (n_samples,) - - Returns - ------- - gk: gradient, float: - (n_features + 1,) - hk: float: - (n_features + 1,) - """ - n_samples = xk.shape[0] - - if distr == 'softplus': - mu = _mu(distr, z, eta, fit_intercept) - s = expit(z) - gk = np.sum(s * xk) - np.sum(y * s / mu * xk) - - grad_s = s * (1 - s) - grad_s_by_mu = grad_s / mu - s / (mu ** 2) - hk = np.sum(grad_s * xk ** 2) - np.sum(y * grad_s_by_mu * xk ** 2) - - elif distr == 'poisson': - mu = _mu(distr, z, eta, fit_intercept) - s = expit(z) - gk = np.sum((mu[z <= eta] - y[z <= eta]) * - xk[z <= eta]) + \ - np.exp(eta) * \ - np.sum((1 - y[z > eta] / mu[z > eta]) * - xk[z > eta]) - hk = np.sum(mu[z <= eta] * xk[z <= eta] ** 2) + \ - np.exp(eta) ** 2 * \ - np.sum(y[z > eta] / (mu[z > eta] ** 2) * - (xk[z > eta] ** 2)) - - elif distr == 'gaussian': - gk = np.sum((z - y) * xk) - hk = np.sum(xk * xk) - - elif distr == 'binomial': - mu = _mu(distr, z, eta, fit_intercept) - gk = np.sum((mu - y) * xk) - hk = np.sum(mu * (1.0 - mu) * xk * xk) - - elif distr == 'probit': - pdfz = norm.pdf(z) - cdfz = norm.cdf(z) - gk = -np.sum((y * _probit_g3(z, pdfz, cdfz) - - (1 - y) * _probit_g4(z, pdfz, cdfz)) * xk) - hk = np.sum((y * _probit_g5(z, pdfz, cdfz) + - (1 - y) * _probit_g6(z, pdfz, cdfz)) * (xk * xk)) - - elif distr == 'neg-binomial': - mu = _mu(distr, z, eta, fit_intercept) - grad_mu = _grad_mu(distr, z, eta) - hess_mu = np.exp(-z) * expit(z)**2 - - gradient_beta_j = -grad_mu * (y / mu - (y + theta) / (mu + theta)) - partial_beta_0_1 = hess_mu * (y / mu - (y + theta) / (mu + theta)) - partial_beta_0_2 = grad_mu**2 * \ - ((y + theta) / (mu + theta)**2 - y / mu**2) - partial_beta_0 = -(partial_beta_0_1 + partial_beta_0_2) - gk = np.dot(gradient_beta_j.T, xk) - hk = np.dot(partial_beta_0.T, xk**2) - - elif distr == 'gamma': - raise NotImplementedError('cdfast is not implemented for Gamma ' - 'distribution') - elif distr == 'neg-binomial': - mu = _mu(distr, z, eta, fit_intercept) - grad_mu = _grad_mu(distr, z, eta) - hess_mu = np.exp(-z) * expit(z) ** 2 - - gradient_beta_j = -grad_mu * (y / mu - (y + theta) / (mu + theta)) - partial_beta_0_1 = hess_mu * (y / mu - (y + theta) / (mu + theta)) - partial_beta_0_2 = grad_mu ** 2 * \ - ((y + theta) / (mu + theta) ** 2 - y / mu ** 2) - partial_beta_0 = -(partial_beta_0_1 + partial_beta_0_2) - gk = np.dot(gradient_beta_j.T, xk) - hk = np.dot(partial_beta_0.T, xk ** 2) - - elif distr == 'gamma': - raise NotImplementedError('cdfast is not implemented for Gamma ' - 'distribution') - - return 1. / n_samples * gk, 1. 
/ n_samples * hk - def simulate_glm(distr, beta0, beta, X, eta=2.0, random_state=None, sample=False, theta=1.0, fit_intercept=True): @@ -791,7 +518,10 @@ def _cdfast(self, X, y, ActiveSet, beta, rl, fit_intercept=True): """ n_samples, n_features = X.shape reg_scale = rl * (1 - self.alpha) - z = _z(beta[0], beta[1:], X, fit_intercept) + if fit_intercept: + z = self.distr_._z(beta[0], beta[1:], X) + else: + z = self.distr_._z(0., beta, X) for k in range(0, n_features + int(fit_intercept)): # Only update parameters in active set if ActiveSet[k] != 0: @@ -1036,9 +766,6 @@ def predict(self, X): raise ValueError('Input data should be of type ndarray (got %s).' % type(X)) - # yhat = _lmb(self.distr, self.beta0_, self.beta_, X, self.eta, - # fit_intercept=True) - z = self.distr_._z(self.beta0_, self.beta_, X) yhat = self.distr_.mu(z) @@ -1070,8 +797,6 @@ def _predict_proba(self, X): raise ValueError('Input data should be of type ndarray (got %s).' % type(X)) - # yhat = _lmb(self.distr, self.beta0_, self.beta_, X, self.eta, - # fit_intercept=True) z = self.distr_._z(self.beta0_, self.beta_, X) yhat = self.distr_.mu(z) yhat = np.asarray(yhat) diff --git a/pyglmnet/utils.py b/pyglmnet/utils.py index 82df0482..6f9046c0 100644 --- a/pyglmnet/utils.py +++ b/pyglmnet/utils.py @@ -1,12 +1,9 @@ -""" -A few miscellaneous helper functions for pyglmnet.py -""" +"""A few miscellaneous helper functions for pyglmnet.""" import numpy as np from copy import copy import logging -from .distributions import BaseDistribution logger = logging.getLogger('pyglmnet') logger.addHandler(logging.StreamHandler()) diff --git a/tests/test_distributions.py b/tests/test_distributions.py index d9c0c1c6..ef4fe3c4 100644 --- a/tests/test_distributions.py +++ b/tests/test_distributions.py @@ -61,7 +61,7 @@ def test_gradients(distr): glm._set_distr() y = simulate_glm(glm.distr, beta_[0], beta_[1:], X) - func = partial(_L2loss, distr, glm.alpha, + func = partial(_L2loss, glm.distr_, glm.alpha, glm.Tau, reg_lambda, X, y, glm.eta, glm.theta, glm.group) grad = partial(_grad_L2loss, glm.distr_, glm.alpha, glm.Tau, reg_lambda, X, y, diff --git a/tests/test_pyglmnet.py b/tests/test_pyglmnet.py index 490a495b..7f10e689 100644 --- a/tests/test_pyglmnet.py +++ b/tests/test_pyglmnet.py @@ -172,10 +172,13 @@ def test_glmnet(distr, reg_lambda, fit_intercept, solver): group = None Tau = None + glm_ = GLM(distr=distr) + glm_._set_distr() + def callback(beta): Tau = None loss_trace.append( - _loss(distr, alpha, Tau, reg_lambda, + _loss(glm_.distr_, alpha, Tau, reg_lambda, X_train, y_train, eta, theta, group, beta, fit_intercept=fit_intercept)) @@ -194,7 +197,7 @@ def callback(beta): # true loss and beta should be recovered when reg_lambda == 0 if reg_lambda == 0.: # verify loss at convergence = loss when beta=beta_ - l_true = _loss(distr, alpha, Tau, reg_lambda, + l_true = _loss(glm_.distr_, alpha, Tau, reg_lambda, X_train, y_train, eta, theta, group, np.concatenate(([beta0], beta))) assert_allclose(loss_trace[-1], l_true, rtol=1e-4, atol=1e-5) From c495dfc86e15f1f7d6c0288d01f25c9e96c4ccd6 Mon Sep 17 00:00:00 2001 From: Pavan Ramkumar Date: Tue, 18 Aug 2020 23:31:45 -0700 Subject: [PATCH 18/24] remove more comments --- pyglmnet/pyglmnet.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pyglmnet/pyglmnet.py b/pyglmnet/pyglmnet.py index 8471181a..4eae1d29 100644 --- a/pyglmnet/pyglmnet.py +++ b/pyglmnet/pyglmnet.py @@ -788,7 +788,6 @@ def _predict_proba(self, X): The predicted targets of shape (n_samples,). 
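Note on PATCH 17 above: with the legacy _mu/_grad_mu/_logL string-dispatch helpers removed, every call site goes through a distribution object, so the same code path serves both the built-in distributions and user-defined subclasses of BaseDistribution. A small illustrative snippet under that assumption, using the Gaussian class from this series (the arrays are made-up data):

    import numpy as np
    from pyglmnet.distributions import Gaussian

    distr = Gaussian()
    y = np.array([0.5, 1.5, 2.0])
    z = np.array([0.6, 1.4, 2.1])        # linear predictor beta0 + X @ beta
    y_hat = distr.mu(z)                  # identity inverse link for Gaussian
    ll = distr.log_likelihood(y, y_hat)  # -0.5 * sum((y - y_hat) ** 2)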
""" - # if self.distr not in ['binomial', 'probit']: if not isinstance(self.distr_, (Binomial, Probit)): raise ValueError('This is only applicable for \ the binomial distribution.') @@ -824,7 +823,6 @@ def predict_proba(self, X): X = check_array(X, accept_sparse=False) check_is_fitted(self, 'is_fitted_') - # if self.distr in ['binomial', 'probit']: if isinstance(self.distr_, (Binomial, Probit)): return self._predict_proba(X) else: @@ -880,7 +878,6 @@ def score(self, X, y): # For f1 as well if self.score_metric in ['accuracy']: - # if self.distr not in ['binomial', 'multinomial']: if not isinstance(self.distr_, (Binomial, Probit)): raise ValueError(self.score_metric + ' is only defined for binomial ' + @@ -888,7 +885,6 @@ def score(self, X, y): y = np.asarray(y).ravel() - # if self.distr in ['binomial', 'probit'] and \ if isinstance(self.distr_, (Binomial, Probit)) and \ self.score_metric != 'accuracy': yhat = self.predict_proba(X) From be4cd0b564a5055ca818571ae326f2f297842607 Mon Sep 17 00:00:00 2001 From: Pavan Ramkumar Date: Tue, 18 Aug 2020 23:55:36 -0700 Subject: [PATCH 19/24] simulate methods for distribution classes --- pyglmnet/distributions.py | 61 +++++++++++++++++++++++++++++++++++++++ pyglmnet/pyglmnet.py | 22 +------------- 2 files changed, 62 insertions(+), 21 deletions(-) diff --git a/pyglmnet/distributions.py b/pyglmnet/distributions.py index ca2f7021..527ef889 100644 --- a/pyglmnet/distributions.py +++ b/pyglmnet/distributions.py @@ -5,6 +5,8 @@ from scipy.special import expit, log1p, loggamma from scipy.stats import norm +from .externals.sklearn.utils import check_random_state + def softplus(z): """Numerically stable version of log(1 + exp(z)).""" @@ -48,6 +50,11 @@ def gradhess_log_likelihood_1d(self): msg = (f"""Gradients and Hessians of 1-d log likelihood are not specified for {self.__class__} distribution""") # noqa raise NotImplementedError(msg) + def simulate(self, beta0, beta, X, random_state, sample): + """Simulate targets y given model beta0, beta, X.""" + msg = (f"""Simulation of model is not implemented for {self.__class__} distribution""") # noqa + raise NotImplementedError(msg) + def _z(self, beta0, beta, X): """Compute z to be passed through non-linearity.""" return beta0 + np.dot(X, beta) @@ -86,6 +93,15 @@ def gradhess_log_likelihood_1d(self, xk, y, z): hk = np.sum(xk * xk) return gk, hk + def simulate(self, beta0, beta, X, random_state, sample): + """Simulate targets y given model beta0, beta, X.""" + mu = self.mu(self._z(beta0, beta, X)) + if not sample: + return mu + else: + _random_state = check_random_state(random_state) + return _random_state.normal(mu) + class Poisson(BaseDistribution): """Class for Poisson distribution.""" @@ -141,6 +157,15 @@ def gradhess_log_likelihood_1d(self, xk, y, z): (xk[z > self.eta] ** 2)) return gk, hk + def simulate(self, beta0, beta, X, random_state, sample): + """Simulate targets y given model beta0, beta, X.""" + mu = self.mu(self._z(beta0, beta, X)) + if not sample: + return mu + else: + _random_state = check_random_state(random_state) + return _random_state.poisson(mu) + class PoissonSoftplus(Poisson): """Class for Poisson distribution with softplus inverse link.""" @@ -219,6 +244,16 @@ def gradhess_log_likelihood_1d(self, xk, y, z): hk = np.dot(partial_beta_0.T, xk ** 2) return gk, hk + def simulate(self, beta0, beta, X, random_state, sample): + """Simulate targets y given model beta0, beta, X.""" + mu = self.mu(self._z(beta0, beta, X)) + if not sample: + return mu + else: + _random_state = 
check_random_state(random_state) + p = theta / (theta + mu) # Probability of success + y = _random_state.negative_binomial(theta, p) + class Binomial(BaseDistribution): """Class for binomial distribution.""" @@ -259,6 +294,15 @@ def gradhess_log_likelihood_1d(self, xk, y, z): hk = np.sum(mu * (1.0 - mu) * xk * xk) return gk, hk + def simulate(self, beta0, beta, X, random_state, sample): + """Simulate targets y given model beta0, beta, X.""" + mu = self.mu(self._z(beta0, beta, X)) + if not sample: + return mu + else: + _random_state = check_random_state(random_state) + return _random_state.binomial(1, mu) + class Probit(BaseDistribution): """Class for probit distribution.""" @@ -354,6 +398,15 @@ def gradhess_log_likelihood_1d(self, xk, y, z): (1 - y) * self._probit_g6(z, pdfz, cdfz)) * (xk * xk)) return gk, hk + def simulate(self, beta0, beta, X, random_state, sample): + """Simulate targets y given model beta0, beta, X.""" + mu = self.mu(self._z(beta0, beta, X)) + if not sample: + return mu + else: + _random_state = check_random_state(random_state) + return _random_state.binomial(1, mu) + class GammaSoftplus(BaseDistribution): """Class for Gamma distribution with softplus inverse link.""" @@ -391,3 +444,11 @@ def gradhess_log_likelihood_1d(self, xk, y, z): """One-dimensional Gradient and Hessian of log likelihood.""" raise NotImplementedError('cdfast is not implemented for Gamma ' 'distribution') + + def simulate(self, beta0, beta, X, random_state, sample): + """Simulate targets y given model beta0, beta, X.""" + mu = self.mu(self._z(beta0, beta, X)) + if not sample: + return mu + else: + return np.exp(mu) diff --git a/pyglmnet/pyglmnet.py b/pyglmnet/pyglmnet.py index 4eae1d29..b3fe8b40 100644 --- a/pyglmnet/pyglmnet.py +++ b/pyglmnet/pyglmnet.py @@ -184,27 +184,7 @@ def simulate_glm(distr, beta0, beta, X, eta=2.0, random_state=None, if not fit_intercept: beta0 = 0. - mu = glm.distr_.mu(glm.distr_._z(beta0, beta, X)) - - if not sample: - return mu - - _random_state = check_random_state(random_state) - if isinstance(distr, (PoissonSoftplus, Poisson)): - y = _random_state.poisson(mu) - - if isinstance(distr, Gaussian): - y = _random_state.normal(mu) - - if isinstance(distr, (Binomial, Probit)): - y = _random_state.binomial(1, mu) - - if isinstance(distr, GammaSoftplus): - y = np.exp(mu) - - if isinstance(distr, NegBinomialSoftplus): - p = theta / (theta + mu) # Probability of success - y = _random_state.negative_binomial(theta, p) + y = glm.distr_.simulate(beta0, beta, X, random_state, sample) return y From b2a9cced0b8aaebffbd4b199b73481e7a2318cd2 Mon Sep 17 00:00:00 2001 From: Pavan Ramkumar Date: Wed, 19 Aug 2020 00:17:30 -0700 Subject: [PATCH 20/24] flake8 --- examples/plot_custom_distributions.py | 2 +- pyglmnet/distributions.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/plot_custom_distributions.py b/examples/plot_custom_distributions.py index edab74ff..68f2cd52 100644 --- a/examples/plot_custom_distributions.py +++ b/examples/plot_custom_distributions.py @@ -33,7 +33,7 @@ # - mu: inverse link function # - grad_mu: gradient of the inverse link function # - log_likelihood: the log likelihood function -# - grad_log_likelihood: the gradient of the log +# - grad_log_likelihood: the gradient of the log # likelihood. 
# All distributions in pyglmnet inherit from BaseDistribution # diff --git a/pyglmnet/distributions.py b/pyglmnet/distributions.py index 527ef889..62708ac6 100644 --- a/pyglmnet/distributions.py +++ b/pyglmnet/distributions.py @@ -145,7 +145,6 @@ def grad_log_likelihood(self, X, y, beta0, beta): def gradhess_log_likelihood_1d(self, xk, y, z): """One-dimensional Gradient and Hessian of log likelihood.""" mu = self.mu(z) - s = expit(z) gk = np.sum((mu[z <= self.eta] - y[z <= self.eta]) * xk[z <= self.eta]) + \ np.exp(self.eta) * \ @@ -251,8 +250,8 @@ def simulate(self, beta0, beta, X, random_state, sample): return mu else: _random_state = check_random_state(random_state) - p = theta / (theta + mu) # Probability of success - y = _random_state.negative_binomial(theta, p) + p = self.theta / (self.theta + mu) # Probability of success + return _random_state.negative_binomial(self.theta, p) class Binomial(BaseDistribution): From 179e1c6ab3a039783b5bdc42b3dac07544193a29 Mon Sep 17 00:00:00 2001 From: Mainak Jas Date: Wed, 19 Aug 2020 12:44:40 -0400 Subject: [PATCH 21/24] Flake8 and style --- examples/plot_custom_distributions.py | 4 +- pyglmnet/distributions.py | 112 +++++++++--------- .../externals/sklearn/utils/validation.py | 2 +- 3 files changed, 62 insertions(+), 56 deletions(-) diff --git a/examples/plot_custom_distributions.py b/examples/plot_custom_distributions.py index 68f2cd52..a13d38f6 100644 --- a/examples/plot_custom_distributions.py +++ b/examples/plot_custom_distributions.py @@ -41,8 +41,8 @@ # the existing Binomial distribution and override mu and grad_mu # if we want to use the cloglog link function. -import numpy as np -from pyglmnet.distributions import Binomial +import numpy as np # noqa: E402 +from pyglmnet.distributions import Binomial # noqa: E402 class CustomBinomial(Binomial): diff --git a/pyglmnet/distributions.py b/pyglmnet/distributions.py index 62708ac6..a6357778 100644 --- a/pyglmnet/distributions.py +++ b/pyglmnet/distributions.py @@ -303,60 +303,66 @@ def simulate(self, beta0, beta, X, random_state, sample): return _random_state.binomial(1, mu) +def _probit_g1(z, pdfz, cdfz, thresh=5): + res = np.zeros_like(z) + res[z < -thresh] = np.log(-pdfz[z < -thresh] / z[z < -thresh]) + res[np.abs(z) <= thresh] = np.log(cdfz[np.abs(z) <= thresh]) + res[z > thresh] = -pdfz[z > thresh] / z[z > thresh] + return res + + +def _probit_g2(z, pdfz, cdfz, thresh=5): + res = np.zeros_like(z) + res[z < -thresh] = pdfz[z < -thresh] / z[z < -thresh] + res[np.abs(z) <= thresh] = np.log(1 - cdfz[np.abs(z) <= thresh]) + res[z > thresh] = np.log(pdfz[z > thresh] / z[z > thresh]) + return res + + +def _probit_g3(z, pdfz, cdfz, thresh=5): + res = np.zeros_like(z) + res[z < -thresh] = -z[z < -thresh] + res[np.abs(z) <= thresh] = \ + pdfz[np.abs(z) <= thresh] / cdfz[np.abs(z) <= thresh] + res[z > thresh] = pdfz[z > thresh] + return res + + +def _probit_g4(z, pdfz, cdfz, thresh=5): + res = np.zeros_like(z) + res[z < -thresh] = pdfz[z < -thresh] + res[np.abs(z) <= thresh] = \ + pdfz[np.abs(z) <= thresh] / (1 - cdfz[np.abs(z) <= thresh]) + res[z > thresh] = z[z > thresh] + return res + + +def _probit_g5(z, pdfz, cdfz, thresh=5): + res = np.zeros_like(z) + res[z < -thresh] = 0 * z[z < -thresh] + res[np.abs(z) <= thresh] = \ + z[np.abs(z) <= thresh] * pdfz[np.abs(z) <= thresh] / \ + cdfz[np.abs(z) <= thresh] + (pdfz[np.abs(z) <= thresh] / + cdfz[np.abs(z) <= thresh]) ** 2 + res[z > thresh] = z[z > thresh] * pdfz[z > thresh] + pdfz[z > thresh] ** 2 + return res + + +def _probit_g6(z, pdfz, cdfz, 
thresh=5): + res = np.zeros_like(z) + res[z < -thresh] = \ + pdfz[z < -thresh] ** 2 - z[z < -thresh] * pdfz[z < -thresh] + res[np.abs(z) <= thresh] = \ + (pdfz[np.abs(z) <= thresh] / (1 - cdfz[np.abs(z) <= thresh])) ** 2 - \ + z[np.abs(z) <= thresh] * pdfz[np.abs(z) <= thresh] / \ + (1 - cdfz[np.abs(z) <= thresh]) + res[z > thresh] = 0 * z[z > thresh] + return res + + class Probit(BaseDistribution): """Class for probit distribution.""" - def _probit_g1(self, z, pdfz, cdfz, thresh=5): - res = np.zeros_like(z) - res[z < -thresh] = np.log(-pdfz[z < -thresh] / z[z < -thresh]) - res[np.abs(z) <= thresh] = np.log(cdfz[np.abs(z) <= thresh]) - res[z > thresh] = -pdfz[z > thresh] / z[z > thresh] - return res - - def _probit_g2(self, z, pdfz, cdfz, thresh=5): - res = np.zeros_like(z) - res[z < -thresh] = pdfz[z < -thresh] / z[z < -thresh] - res[np.abs(z) <= thresh] = np.log(1 - cdfz[np.abs(z) <= thresh]) - res[z > thresh] = np.log(pdfz[z > thresh] / z[z > thresh]) - return res - - def _probit_g3(self, z, pdfz, cdfz, thresh=5): - res = np.zeros_like(z) - res[z < -thresh] = -z[z < -thresh] - res[np.abs(z) <= thresh] = \ - pdfz[np.abs(z) <= thresh] / cdfz[np.abs(z) <= thresh] - res[z > thresh] = pdfz[z > thresh] - return res - - def _probit_g4(self, z, pdfz, cdfz, thresh=5): - res = np.zeros_like(z) - res[z < -thresh] = pdfz[z < -thresh] - res[np.abs(z) <= thresh] = \ - pdfz[np.abs(z) <= thresh] / (1 - cdfz[np.abs(z) <= thresh]) - res[z > thresh] = z[z > thresh] - return res - - def _probit_g5(self, z, pdfz, cdfz, thresh=5): - res = np.zeros_like(z) - res[z < -thresh] = 0 * z[z < -thresh] - res[np.abs(z) <= thresh] = \ - z[np.abs(z) <= thresh] * pdfz[np.abs(z) <= thresh] / \ - cdfz[np.abs(z) <= thresh] + (pdfz[np.abs(z) <= thresh] / - cdfz[np.abs(z) <= thresh]) ** 2 - res[z > thresh] = z[z > thresh] * pdfz[z > thresh] + pdfz[z > thresh] ** 2 - return res - - def _probit_g6(self, z, pdfz, cdfz, thresh=5): - res = np.zeros_like(z) - res[z < -thresh] = \ - pdfz[z < -thresh] ** 2 - z[z < -thresh] * pdfz[z < -thresh] - res[np.abs(z) <= thresh] = \ - (pdfz[np.abs(z) <= thresh] / (1 - cdfz[np.abs(z) <= thresh])) ** 2 - \ - z[np.abs(z) <= thresh] * pdfz[np.abs(z) <= thresh] / \ - (1 - cdfz[np.abs(z) <= thresh]) - res[z > thresh] = 0 * z[z > thresh] - return res - def mu(self, z): """Inverse link function.""" mu = norm.cdf(z) @@ -372,8 +378,8 @@ def log_likelihood(self, y, y_hat, z=None): if z is not None: pdfz, cdfz = norm.pdf(z), norm.cdf(z) log_likelihood = \ - np.sum(y * self._probit_g1(z, pdfz, cdfz) + - (1 - y) * self._probit_g2(z, pdfz, cdfz)) + np.sum(y * _probit_g1(z, pdfz, cdfz) + + (1 - y) * _probit_g2(z, pdfz, cdfz)) else: log_likelihood = \ np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) diff --git a/pyglmnet/externals/sklearn/utils/validation.py b/pyglmnet/externals/sklearn/utils/validation.py index 7cda34e4..3e09c504 100644 --- a/pyglmnet/externals/sklearn/utils/validation.py +++ b/pyglmnet/externals/sklearn/utils/validation.py @@ -119,7 +119,7 @@ def check_consistent_length(*arrays): uniques = np.unique(lengths) if len(uniques) > 1: raise ValueError("Found input variables with inconsistent numbers of" - " samples: %r" % [int(l) for l in lengths]) + " samples: %r" % [int(ll) for ll in lengths]) def _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy, From 615fcf90275f8c1a3614d068531c9fedb270e2d5 Mon Sep 17 00:00:00 2001 From: Mainak Jas Date: Wed, 19 Aug 2020 13:14:48 -0400 Subject: [PATCH 22/24] DOC update api and refactor softplus --- doc/api.rst | 30 +++++++++++++++++++----- 
pyglmnet/distributions.py | 49 +++++++++++++-------------------------- 2 files changed, 40 insertions(+), 39 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index fd0f5047..9d5b1f0b 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -6,14 +6,32 @@ API Documentation .. currentmodule:: pyglmnet -Classes -======= +GLM Classes +=========== -.. autoclass:: GLM - :members: +.. currentmodule:: pyglmnet + +.. autosummary:: + :toctree: generated/ + + GLM + GLMCV + +Distribution Classes +==================== + +.. currentmodule:: pyglmnet.distributions + +.. autosummary:: + :toctree: generated/ -.. autoclass:: GLMCV - :members: + BaseDistribution + Poisson + PoissonSoftplus + NegBinomialSoftplus + Binomial + Probit + GammaSoftplus Datasets diff --git a/pyglmnet/distributions.py b/pyglmnet/distributions.py index a6357778..988a7ca6 100644 --- a/pyglmnet/distributions.py +++ b/pyglmnet/distributions.py @@ -60,6 +60,19 @@ def _z(self, beta0, beta, X): return beta0 + np.dot(X, beta) +class SoftplusLink: + + def mu(self, z): + """Inverse link function.""" + mu = softplus(z) + return mu + + def grad_mu(self, z): + """Gradient of inverse link.""" + grad_mu = expit(z) + return grad_mu + + class Gaussian(BaseDistribution): """Class for Gaussian distribution.""" @@ -166,19 +179,9 @@ def simulate(self, beta0, beta, X, random_state, sample): return _random_state.poisson(mu) -class PoissonSoftplus(Poisson): +class PoissonSoftplus(Poisson, SoftplusLink): """Class for Poisson distribution with softplus inverse link.""" - def mu(self, z): - """Inverse link function.""" - mu = softplus(z) - return mu - - def grad_mu(self, z): - """Gradient of inverse link.""" - grad_mu = expit(z) - return grad_mu - def gradhess_log_likelihood_1d(self, xk, y, z): """One-dimensional Gradient and Hessian of log likelihood.""" mu = self.mu(z) @@ -191,23 +194,13 @@ def gradhess_log_likelihood_1d(self, xk, y, z): return gk, hk -class NegBinomialSoftplus(BaseDistribution): +class NegBinomialSoftplus(BaseDistribution, SoftplusLink): """Class for Negative binomial distribution with softplus inverse link.""" def __init__(self): """init.""" self.theta = None - def mu(self, z): - """Inverse link function.""" - mu = softplus(z) - return mu - - def grad_mu(self, z): - """Gradient of inverse link.""" - grad_mu = expit(z) - return grad_mu - def log_likelihood(self, y, y_hat): """Log L2-penalized likelihood.""" theta = self.theta @@ -413,19 +406,9 @@ def simulate(self, beta0, beta, X, random_state, sample): return _random_state.binomial(1, mu) -class GammaSoftplus(BaseDistribution): +class GammaSoftplus(BaseDistribution, SoftplusLink): """Class for Gamma distribution with softplus inverse link.""" - def mu(self, z): - """Inverse link function.""" - mu = softplus(z) - return mu - - def grad_mu(self, z): - """Gradient of inverse link.""" - grad_mu = expit(z) - return grad_mu - def log_likelihood(self, y, y_hat, z=None): """Log L2-penalized likelihood.""" # see From 9bdab7638565220005ee672fe708884934c44f9a Mon Sep 17 00:00:00 2001 From: Mainak Jas Date: Wed, 19 Aug 2020 13:16:34 -0400 Subject: [PATCH 23/24] FIX probit --- pyglmnet/distributions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyglmnet/distributions.py b/pyglmnet/distributions.py index 988a7ca6..9f87b9fd 100644 --- a/pyglmnet/distributions.py +++ b/pyglmnet/distributions.py @@ -390,10 +390,10 @@ def gradhess_log_likelihood_1d(self, xk, y, z): """One-dimensional Gradient and Hessian of log likelihood.""" pdfz = norm.pdf(z) cdfz = norm.cdf(z) - gk = -np.sum((y * 
self._probit_g3(z, pdfz, cdfz) - - (1 - y) * self._probit_g4(z, pdfz, cdfz)) * xk) - hk = np.sum((y * self._probit_g5(z, pdfz, cdfz) + - (1 - y) * self._probit_g6(z, pdfz, cdfz)) * (xk * xk)) + gk = -np.sum((y * _probit_g3(z, pdfz, cdfz) - + (1 - y) * _probit_g4(z, pdfz, cdfz)) * xk) + hk = np.sum((y * _probit_g5(z, pdfz, cdfz) + + (1 - y) * _probit_g6(z, pdfz, cdfz)) * (xk * xk)) return gk, hk def simulate(self, beta0, beta, X, random_state, sample): From 081fa04820881a0588731ce5588a75bc9eaee52c Mon Sep 17 00:00:00 2001 From: Mainak Jas Date: Wed, 19 Aug 2020 13:30:14 -0400 Subject: [PATCH 24/24] TST make basedistribution happy --- pyglmnet/distributions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyglmnet/distributions.py b/pyglmnet/distributions.py index 9f87b9fd..ffdc7279 100644 --- a/pyglmnet/distributions.py +++ b/pyglmnet/distributions.py @@ -179,7 +179,7 @@ def simulate(self, beta0, beta, X, random_state, sample): return _random_state.poisson(mu) -class PoissonSoftplus(Poisson, SoftplusLink): +class PoissonSoftplus(SoftplusLink, Poisson): """Class for Poisson distribution with softplus inverse link.""" def gradhess_log_likelihood_1d(self, xk, y, z): @@ -194,7 +194,7 @@ def gradhess_log_likelihood_1d(self, xk, y, z): return gk, hk -class NegBinomialSoftplus(BaseDistribution, SoftplusLink): +class NegBinomialSoftplus(SoftplusLink, BaseDistribution): """Class for Negative binomial distribution with softplus inverse link.""" def __init__(self): @@ -406,7 +406,7 @@ def simulate(self, beta0, beta, X, random_state, sample): return _random_state.binomial(1, mu) -class GammaSoftplus(BaseDistribution, SoftplusLink): +class GammaSoftplus(SoftplusLink, BaseDistribution): """Class for Gamma distribution with softplus inverse link.""" def log_likelihood(self, y, y_hat, z=None):
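Note on PATCH 24 above: the base-class reordering matters because of Python's method resolution order. With PoissonSoftplus(Poisson, SoftplusLink), the concrete Poisson.mu (the exponential-based inverse link) is found before the mixin's softplus version; with NegBinomialSoftplus(BaseDistribution, SoftplusLink), the abstract BaseDistribution.mu is found first, so the class cannot even be instantiated. Listing SoftplusLink first puts its concrete mu and grad_mu ahead in the MRO. A self-contained sketch of the same pitfall with stand-in classes:

    class Base:
        def mu(self, z):
            return 'exp link'

    class Mixin:
        def mu(self, z):
            return 'softplus link'

    class Wrong(Base, Mixin):   # mixin is shadowed by Base
        pass

    class Right(Mixin, Base):   # mixin wins, as in PATCH 24
        pass

    print(Wrong().mu(0.0))  # -> 'exp link'
    print(Right().mu(0.0))  # -> 'softplus link'
    print([c.__name__ for c in Right.__mro__])  # Right, Mixin, Base, object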