diff --git a/.gitignore b/.gitignore index bedeacf1..9aa1ade2 100644 --- a/.gitignore +++ b/.gitignore @@ -54,3 +54,6 @@ _build/ # PyCharm ######### .idea/ +.vscode/launch.json +.vscode/settings.json +.env diff --git a/pygam/distributions.py b/pygam/distributions.py index c7e87478..63cc925b 100644 --- a/pygam/distributions.py +++ b/pygam/distributions.py @@ -644,10 +644,184 @@ def sample(self, mu): return np.random.wald(mean=mu, scale=self.scale, size=None) +class TweedieDist(Distribution): + """ + Tweedie Distribution + """ + + def __init__(self, power, scale=None): + if 0 < power < 1: + raise ValueError( + "Power parameter p must not be between 0 and 1 (non-inclusive) for the Tweedie distribution." + ) + super(TweedieDist, self).__init__(name='tweedie', scale=scale) + self.power = power + + def log_pdf(self, y, mu, weights=None): + """ + Computes the log of the PDF (or PMF for y=0) of the Tweedie distribution. + + Parameters + ---------- + y : array-like of length n + Target values. + mu : array-like of length n + Expected values. + weights : array-like shape (n,) or None, default: None + Sample weights. If None, defaults to array of ones. + + Returns + ------- + log_pdf : np.array of length n + """ + y = np.asarray(y, dtype=np.float64) + mu = np.asarray(mu, dtype=np.float64) + + if np.any(y < 0) or np.any(mu <= 0): + raise ValueError( + "Negative values detected in y or non-positive values in mu." + ) + + if weights is None: + weights = np.ones_like(mu) + + phi = self.scale # dispersion parameter + p = self.power # power parameter + + # Initialize log_pdf array + log_pdf_values = np.zeros_like(y, dtype=np.float64) + + # Indices where y > 0 + idx_positive = y > 0 + # Indices where y == 0 + idx_zero = y == 0 + + # For y == 0 + if np.any(idx_zero): + mu_zero = mu[idx_zero] + # Compute log probability mass at zero + log_p_zero = -(mu_zero ** (2 - p)) / ((2 - p) * phi) + log_pdf_values[idx_zero] = log_p_zero + + # For y > 0 + if np.any(idx_positive): + y_pos = y[idx_positive] + mu_pos = mu[idx_positive] + + # Compute terms carefully to avoid numerical issues + term1 = (y_pos * mu_pos ** (1 - p)) / (1 - p) + term2 = mu_pos ** (2 - p) / (2 - p) + term3 = y_pos ** (2 - p) / (2 - p) + + log_pdf = (1 / phi) * (term1 - term2 - term3) + + # Adjust with log(y) term and constants if necessary + # Since constants cancel out in likelihood ratios, they are often omitted + log_pdf_values[idx_positive] = log_pdf + + return log_pdf_values + + @divide_weights + def V(self, mu): + return mu**self.power + + @multiply_weights + def deviance(self, y, mu, scaled=True): + """ + Computes the deviance for the Tweedie distribution. + + Parameters + ---------- + y : array-like of length n + Target values. + mu : array-like of length n + Expected values. + scaled : boolean, default: True + Whether to divide the deviance by the distribution scale. + + Returns + ------- + deviance : np.array of length n + """ + if np.any(y < 0) or np.any(mu < 0): + raise ValueError("Negative values detected in y or mu.") + p = self.power + if p < 0: + dev = 2 * ( + np.power(np.maximum(y, 0), 2 - p) / ((1 - p) * (2 - p)) + - y * np.power(mu, 1 - p) / (1 - p) + + np.power(mu, 2 - p) / (2 - p) + ) + elif p == 0: + dev = (y - mu) ** 2 + elif p == 1: + dev = 2 * (y * np.log(y / mu) - (y - mu)) + elif p == 2: + dev = 2 * (np.log(mu / y) + (y / mu) - 1) + else: + dev = 2 * ( + np.power(y, 2 - p) / ((1 - p) * (2 - p)) + - y * np.power(mu, 1 - p) / (1 - p) + + np.power(mu, 2 - p) / (2 - p) + ) + if scaled: + dev /= self.scale + return dev + + def sample(self, mu): + """ + Generates random samples from the Tweedie distribution. + + Parameters + ---------- + mu : array-like of shape n_samples + Expected values. + + Returns + ------- + random_samples : np.array of same shape as mu + """ + p = self.power + phi = self.scale + + if p <= 1: + raise ValueError( + "Power parameter p must be > 1 for the Tweedie distribution." + ) + elif 1 < p < 2: + # Compound Poisson-Gamma + lambda_ = mu ** (2 - p) / (phi * (2 - p)) + num_events = np.random.poisson(lambda_) + # Gamma parameters + a = (2 - p) / (p - 1) + s = phi * (p - 1) * mu ** (p - 1) + samples = np.zeros_like(mu) + positive_indices = num_events > 0 + samples[positive_indices] = np.array( + [ + np.sum(np.random.gamma(shape=a, scale=s[i], size=n)) + for i, n in enumerate(num_events[positive_indices]) + ] + ) + + return samples + elif p == 2: + # Corrected Gamma distribution parameters + shape = 1 / phi + scale = phi * mu + return np.random.gamma(shape=shape, scale=scale) + + else: + raise NotImplementedError( + "Sampling not implemented for p > 2 in TweedieDist." + ) + + DISTRIBUTIONS = { "normal": NormalDist, "poisson": PoissonDist, "binomial": BinomialDist, "gamma": GammaDist, "inv_gauss": InvGaussDist, + "tweedie": TweedieDist, } diff --git a/pygam/pygam.py b/pygam/pygam.py index 6ee05d5d..6a669bc4 100644 --- a/pygam/pygam.py +++ b/pygam/pygam.py @@ -176,6 +176,14 @@ def __init__( self.terms = TermList(terms) if isinstance(terms, Term) else terms self.fit_intercept = fit_intercept + # Handle additional parameters for specific distributions + if distribution == 'tweedie': + self.power = kwargs.pop('power', None) + if self.power is None: + raise ValueError("Tweedie distribution requires a 'power' parameter.") + else: + self.power = None + for k, v in kwargs.items(): if k not in self._plural: raise TypeError(f"__init__() got an unexpected keyword argument {k}") @@ -261,7 +269,10 @@ def _validate_params(self): ): raise ValueError(f"unsupported distribution {self.distribution}") if self.distribution in DISTRIBUTIONS: - self.distribution = DISTRIBUTIONS[self.distribution]() + if self.distribution == 'tweedie': + self.distribution = DISTRIBUTIONS[self.distribution](power=self.power) + else: + self.distribution = DISTRIBUTIONS[self.distribution]() # link if not ((self.link in LINKS) or isinstance(self.link, Link)): diff --git a/pygam/tests/test_penalties.py b/pygam/tests/test_penalties.py index e06941f9..7e7820fd 100644 --- a/pygam/tests/test_penalties.py +++ b/pygam/tests/test_penalties.py @@ -22,6 +22,7 @@ def test_single_spline_penalty(): monotonic_ and convexity_ should be 0. """ coef = np.array(1.0) + assert np.all(derivative(1, coef).toarray() == 0.0) assert np.all(l2(1, coef).toarray() == 1.0) assert np.all(monotonic_inc(1, coef).toarray() == 0.0) diff --git a/pygam/tests/test_tweedie.py b/pygam/tests/test_tweedie.py new file mode 100644 index 00000000..a207d57e --- /dev/null +++ b/pygam/tests/test_tweedie.py @@ -0,0 +1,133 @@ +import pytest +import numpy as np +from pygam.distributions import TweedieDist +from pygam import GAM, s + + +@pytest.fixture +def tweedie_dist(): + return TweedieDist(power=1.5, scale=1.0) + + +def test_log_pdf(tweedie_dist): + mu = np.array([1.0, 2.0, 3.0]) + y = np.array([1.5, 2.5, 3.5]) + weights = np.array([1.0, 1.0, 1.0]) + log_pdf = tweedie_dist.log_pdf(y, mu, weights) + assert log_pdf.shape == y.shape + assert np.all(np.isfinite(log_pdf)), "Log PDF contains non-finite values." + + +def test_deviance(tweedie_dist): + mu = np.array([1.0, 2.0, 3.0]) + y = np.array([1.5, 2.5, 3.5]) + deviance = tweedie_dist.deviance(y, mu, scaled=True) + assert deviance.shape == y.shape + assert np.all(deviance >= 0), "Deviance contains negative values." + + +def test_sample(tweedie_dist): + mu = np.array([1.0, 2.0, 3.0]) + # Generate 1000 samples for each mu + samples = np.array([tweedie_dist.sample(mu) for _ in range(1000)]) + sample_mean = np.mean(samples) + expected_mean = np.mean(mu) + # Adjust the tolerance based on the variance + tolerance = 0.1 * expected_mean + assert ( + abs(sample_mean - expected_mean) < tolerance + ), "Sample mean is not within the expected range." + + +def test_invalid_power(): + with pytest.raises(ValueError): + TweedieDist(power=0.5, scale=1.0) # Power less than 1 is invalid + + +def test_not_implemented_power(): + dist = TweedieDist(power=3.0, scale=1.0) + mu = np.array([1.0, 2.0, 3.0]) + with pytest.raises(NotImplementedError): + dist.sample(mu) + + +def test_gam_tweedie_fit(): + # Generate synthetic data + np.random.seed(0) + n_samples = 100 + X = np.linspace(0, 10, n_samples).reshape(-1, 1) + true_coef = 2.0 + # Generate target variable following a Tweedie distribution + # For simplicity, using Gamma distribution as a proxy when power=2 + y = X.flatten() * true_coef + np.random.gamma(shape=2.0, scale=1.0, size=n_samples) + + # Initialize and fit GAM with Tweedie distribution + gam = GAM(terms=s(0), distribution='tweedie', power=1.5, fit_intercept=True) + gam.fit(X, y) + + # Predict and check the shape of predictions + y_pred = gam.predict(X) + assert y_pred.shape == y.shape, "Predictions shape mismatch." + + # Check that predictions are finite + assert np.all(np.isfinite(y_pred)), "Predictions contain non-finite values." + + # Optionally, check if the mean of predictions is close to the mean of y + sample_mean = np.mean(y_pred) + expected_mean = np.mean(y) + assert ( + abs(sample_mean - expected_mean) < 1.0 + ), "Sample mean is not within the expected range." + + +def test_variance_function(tweedie_dist): + mu = np.array([1.0, 2.0, 3.0]) + variance = tweedie_dist.V(mu) + expected_variance = mu**tweedie_dist.power + assert np.allclose( + variance, expected_variance + ), "Variance function V(mu) is incorrect." + + +def test_zero_targets(tweedie_dist): + mu = np.array([1.0, 2.0, 3.0]) + y = np.array([0.0, 0.0, 0.0]) + log_pdf = tweedie_dist.log_pdf(y, mu) + assert log_pdf.shape == y.shape + assert np.all( + np.isfinite(log_pdf) + ), "Log PDF with zero targets contains non-finite values." + + +def test_negative_inputs(tweedie_dist): + mu = np.array([-1.0, 2.0, 3.0]) + y = np.array([1.0, -2.0, 3.0]) + with pytest.raises(ValueError): + tweedie_dist.log_pdf(y, mu) + with pytest.raises(ValueError): + tweedie_dist.deviance(y, mu) + + +def test_sample_with_zero_mu(tweedie_dist): + mu = np.array([0.0, 0.0, 0.0]) + samples = tweedie_dist.sample(mu) + assert np.all(samples == 0), "Samples with zero mu should be zeros." + + +def test_boundary_power_values(): + mu = np.array([1.0, 2.0, 3.0]) + y = np.array([1.0, 2.0, 3.0]) + + # Power approaching 1 + tweedie_dist = TweedieDist(power=1.0001, scale=1.0) + log_pdf = tweedie_dist.log_pdf(y, mu) + assert np.all( + np.isfinite(log_pdf) + ), "Log PDF near power=1 contains non-finite values." + + # Power approaching 2 + tweedie_dist = TweedieDist(power=1.9999, scale=1.0) + log_pdf = tweedie_dist.log_pdf(y, mu) + assert np.all( + np.isfinite(log_pdf) + ), "Log PDF near power=2 contains non-finite values."