Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions docs/source/api_reference/regression.rst
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,21 @@ This section lists simple regressors which can be used as baselines.
DeltaPointRegressor
DummyProbaRegressor

.. currentmodule:: skpro.regression.unconditional_distfit

.. autosummary::
:toctree: auto_generated/
:template: class.rst

UnconditionalDistfitRegressor

.. currentmodule:: skpro.regression.deterministic_reduction

.. autosummary::
:toctree: auto_generated/
:template: class.rst

DeterministicReductionRegressor

Linear regression
-----------------
Expand Down
27 changes: 27 additions & 0 deletions examples/baseline_regressors_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Example usage for baseline probabilistic regressors."""
import logging

import numpy as np
from sklearn.linear_model import LinearRegression

from skpro.regression.deterministic_reduction import DeterministicReductionRegressor
from skpro.regression.unconditional_distfit import UnconditionalDistfitRegressor

# Generate synthetic data
X = np.random.randn(100, 3)
y = 2 * X[:, 0] + np.random.randn(100)

# 1. Unconditional density baseline (featureless)
reg1 = UnconditionalDistfitRegressor()
reg1.fit(X, y)
dist1 = reg1.predict_proba(X)
logging.info("UnconditionalDistfitRegressor mean: %s", dist1.mean())
logging.info("Sample from unconditional: %s", dist1.sample(5))

# 2. Deterministic-style baseline (mean from regressor, constant variance)
reg2 = DeterministicReductionRegressor(LinearRegression(), distr_type="gaussian")
reg2.fit(X, y)
dist2 = reg2.predict_proba(X)
logging.info("DeterministicReductionRegressor mean: %s", dist2.mean)
logging.info("DeterministicReductionRegressor sigma: %s", dist2.sigma)
logging.info("Sample from deterministic baseline: %s", dist2.sample(5))
21 changes: 21 additions & 0 deletions examples/baseline_regressors_kde_hist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
"""Example: using KDE and histogram with UnconditionalDistfitRegressor."""
import logging

import numpy as np

from skpro.regression.unconditional_distfit import UnconditionalDistfitRegressor

X = np.random.randn(80, 2)
y = np.random.randn(80)

# KDE baseline
reg_kde = UnconditionalDistfitRegressor(fit_kde=True)
reg_kde.fit(X, y)
dist_kde = reg_kde.predict_proba(X)
logging.info("KDE baseline mean: %s", dist_kde.mean())

# Histogram baseline
reg_hist = UnconditionalDistfitRegressor(fit_histogram=True)
reg_hist.fit(X, y)
dist_hist = reg_hist.predict_proba(X)
logging.info("Histogram baseline mean: %s", dist_hist.mean())
40 changes: 40 additions & 0 deletions examples/benchmark_baseline_regressors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""Benchmark script for baseline probabilistic regressors."""
import logging

import numpy as np
from sklearn.linear_model import LinearRegression

from skpro.metrics import PinballLoss
from skpro.regression.deterministic_reduction import DeterministicReductionRegressor
from skpro.regression.unconditional_distfit import UnconditionalDistfitRegressor

# Generate synthetic data
X = np.random.randn(200, 5)
y = 3 * X[:, 0] - 2 * X[:, 1] + np.random.randn(200)

# Split
X_train, X_test = X[:150], X[150:]
y_train, y_test = y[:150], y[150:]

# Baseline 1: Unconditional
reg1 = UnconditionalDistfitRegressor()
reg1.fit(X_train, y_train)
dist1 = reg1.predict_proba(X_test)

# Baseline 2: Deterministic reduction
reg2 = DeterministicReductionRegressor(LinearRegression(), distr_type="gaussian")
reg2.fit(X_train, y_train)
dist2 = reg2.predict_proba(X_test)

# Evaluate pinball loss at alpha=0.1, 0.5, 0.9
alphas = [0.1, 0.5, 0.9]
for alpha in alphas:
loss1 = PinballLoss(alpha=alpha)(y_test, dist1)
loss2 = PinballLoss(alpha=alpha)(y_test, dist2)
logging.info(
"Alpha=%s: UnconditionalDistfitRegressor pinball loss=%.4f, "
"DeterministicReductionRegressor pinball loss=%.4f",
alpha,
loss1,
loss2,
)
4 changes: 4 additions & 0 deletions skpro/regression/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,17 @@
MapieCrossConformalRegressor,
MapieSplitConformalRegressor,
)
from skpro.regression.deterministic_reduction import DeterministicReductionRegressor
from skpro.regression.jackknife import MapieJackknifeAfterBootstrapRegressor
from skpro.regression.nonparametric import NadarayaWatsonCDE
from skpro.regression.unconditional_distfit import UnconditionalDistfitRegressor

__all__ = [
"DeterministicReductionRegressor",
"MapieSplitConformalRegressor",
"MapieCrossConformalRegressor",
"MapieConformalizedQuantileRegressor",
"MapieJackknifeAfterBootstrapRegressor",
"NadarayaWatsonCDE",
"UnconditionalDistfitRegressor",
]
146 changes: 146 additions & 0 deletions skpro/regression/deterministic_reduction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
"""Deterministic regression reduction baseline.

Outputs Gaussian (or Laplace) with mean=prediction, var=training sample var.
"""

import numpy as np

from skpro.distributions.laplace import Laplace
from skpro.distributions.normal import Normal
from skpro.regression.base import BaseProbaRegressor


class DeterministicReductionRegressor(BaseProbaRegressor):
    """Wrap a deterministic regressor to output a Gaussian or Laplace distribution.

    The predictive distribution has mean equal to the wrapped regressor's
    point prediction, and constant variance equal to the sample variance of
    the training ``y``. Multi-output y is not supported
    (raises ``NotImplementedError`` in ``fit``).

    Parameters
    ----------
    regressor : sklearn-compatible regressor
        Deterministic point-prediction regressor to wrap; cloned in ``fit``,
        so the passed instance is never mutated.
    distr_type : str, one of {"gaussian", "laplace"}, default="gaussian"
        Family of the predictive distribution. Both families are
        moment-matched to the training variance of ``y``.

    Attributes
    ----------
    regressor_ : fitted clone of ``regressor``
    train_mean_ : float, sample mean of the training ``y``
    train_var_ : float, sample variance (ddof=0) of the training ``y``

    Examples
    --------
    >>> from sklearn.linear_model import LinearRegression
    >>> from skpro.regression.deterministic_reduction import (
    ...     DeterministicReductionRegressor
    ... )
    >>> import pandas as pd
    >>> X = pd.DataFrame({"a": [1, 2, 3]})
    >>> y = pd.DataFrame([1, 2, 3])
    >>> reg = DeterministicReductionRegressor(LinearRegression(), distr_type="gaussian")
    >>> reg.fit(X, y)  # doctest: +ELLIPSIS
    DeterministicReductionRegressor(...)
    >>> dist = reg.predict_proba(X)
    >>> dist.mean()  # doctest: +NORMALIZE_WHITESPACE
         0
    0  1.0
    1  2.0
    2  3.0

    References
    ----------
    - Gaussian Processes for State Space Models and Change Point Detection
      (Turner, 2011 thesis). https://mlg.eng.cam.ac.uk/pub/pdf/Tur11.pdf
    - A Probabilistic View of Linear Regression
      (Bishop, PRML; Keng, 2016; various tutorials).
    - mlr3proba and related probabilistic ML frameworks.
    - Efficient and Distance-Aware Deep Regressor for Uncertainty Quantification
      (Bui et al., 2024).
      https://proceedings.mlr.press/v238/manh-bui24a/manh-bui24a.pdf
    """

    _tags = {
        "authors": ["arnavk23"],
        "estimator_type": "regressor_proba",
        # estimator tags
        # --------------
        "capability:multioutput": False,
        "capability:missing": True,
        "X_inner_mtype": "pd_DataFrame_Table",
        "y_inner_mtype": "pd_DataFrame_Table",
    }

    def __init__(self, regressor, distr_type="gaussian"):
        # validate eagerly so misconfiguration fails at construction,
        # not at predict time
        allowed_types = ["gaussian", "laplace"]
        if distr_type not in allowed_types:
            raise ValueError(
                f"distr_type must be one of {allowed_types}, got {distr_type}"
            )
        self.regressor = regressor
        self.distr_type = distr_type
        super().__init__()

    def _fit(self, X, y, C=None):
        """Fit the wrapped regressor and record training moments of y.

        Parameters
        ----------
        X : pandas DataFrame or array-like, feature matrix
        y : pandas DataFrame or array-like, single-column target
        C : ignored, present for interface compatibility

        Returns
        -------
        self : reference to self

        Raises
        ------
        NotImplementedError
            If y has more than one column (multi-output not supported).
        """
        import pandas as pd
        from sklearn.base import clone

        # Ensure X and y are DataFrames with string column names, so
        # downstream sklearn estimators see a uniform schema
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        X = X.copy()
        X.columns = [str(col) for col in X.columns]
        if not isinstance(y, pd.DataFrame):
            y = pd.DataFrame(y)
        y = y.copy()
        y.columns = [str(col) for col in y.columns]
        if y.shape[1] > 1:
            raise NotImplementedError(
                "DeterministicReductionRegressor only supports univariate y. "
                f"Got shape: {y.shape}"
            )
        # remember schema for predict-time coercion and output labeling
        self._X_cols = X.columns
        self._y_cols = y.columns
        self._X_index = X.index
        self._y_index = y.index
        # Clone the regressor to avoid mutating the parameter
        self.regressor_ = clone(self.regressor)
        self.regressor_ = self.regressor_.fit(
            X, y.values.ravel() if y.shape[1] == 1 else y
        )
        y_arr = y.values.flatten()
        self.train_mean_ = np.mean(y_arr)
        self.train_var_ = np.var(y_arr)
        return self

    def _predict_proba(self, X):
        """Predict a distribution per row: mean from regressor, fixed variance.

        Parameters
        ----------
        X : pandas DataFrame or array-like, feature matrix

        Returns
        -------
        Normal or Laplace skpro distribution, indexed like X,
        columns as in the training y.
        """
        import pandas as pd

        # Ensure X is a DataFrame with string column names
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self._X_cols)
        X = X.copy()
        X.columns = [str(col) for col in X.columns]
        mean_pred = self.regressor_.predict(X)
        # Ensure output shape matches y (n_samples, 1)
        if mean_pred.ndim == 1:
            mean_pred = mean_pred.reshape(-1, 1)
        # Return distribution with correct index/columns
        if self.distr_type == "gaussian":
            return Normal(
                mu=mean_pred,
                sigma=np.sqrt(self.train_var_),
                index=X.index,
                columns=self._y_cols,
            )
        if self.distr_type == "laplace":
            # Laplace variance is 2*scale^2, so scale = sqrt(var/2)
            return Laplace(
                mu=mean_pred,
                scale=np.sqrt(self.train_var_ / 2),
                index=X.index,
                columns=self._y_cols,
            )
        # unreachable given __init__ validation, kept as a defensive guard
        raise ValueError(f"Unknown distr_type: {self.distr_type}")

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Parameters
        ----------
        deep : bool, default=True
            If True, also include the parameters of the nested ``regressor``,
            keyed ``"regressor__<param>"`` per the sklearn convention. This
            is required for ``clone`` round-trips and grid-search addressing
            of nested parameters.

        Returns
        -------
        dict of parameter name -> value
        """
        # Only true hyperparameters, never fitted attributes
        params = {"regressor": self.regressor, "distr_type": self.distr_type}
        if deep and hasattr(self.regressor, "get_params"):
            nested = self.regressor.get_params(deep=True)
            params.update({f"regressor__{k}": v for k, v in nested.items()})
        return params

    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter sets for automated tests."""
        from sklearn.linear_model import LinearRegression

        return [
            {"regressor": LinearRegression(), "distr_type": "gaussian"},
            {"regressor": LinearRegression(), "distr_type": "laplace"},
        ]
85 changes: 85 additions & 0 deletions skpro/regression/tests/test_baseline_regressors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import numpy as np
import pytest
from sklearn.linear_model import LinearRegression

from skpro.regression.deterministic_reduction import DeterministicReductionRegressor
from skpro.regression.unconditional_distfit import UnconditionalDistfitRegressor


def test_unconditional_distfit_regressor():
    """Smoke test: fit/predict_proba round trip on univariate y."""
    # skip cleanly when the optional distfit soft dependency is absent;
    # the original skipif(not importorskip(...)) was always-false and ran
    # the import at collection time, skipping the wrong scope
    pytest.importorskip("distfit")
    X = np.random.randn(100, 3)
    y = np.random.randn(100)
    reg = UnconditionalDistfitRegressor(distr_type="norm")
    reg.fit(X, y)
    dist = reg.predict_proba(X)
    samples = dist.sample(10)
    assert samples.shape[0] == 10
    assert hasattr(dist, "pdf")
    assert hasattr(dist, "mean")
    assert hasattr(dist, "var")


def test_unconditional_distfit_regressor_invalid_distr_type():
    """Constructing with an unknown distr_type raises ValueError."""
    with pytest.raises(ValueError):
        UnconditionalDistfitRegressor(distr_type="not_a_dist")


def test_unconditional_distfit_regressor_multioutput():
    """Multi-output y is rejected with NotImplementedError."""
    pytest.importorskip("distfit")
    X = np.random.randn(100, 3)
    y = np.random.randn(100, 2)
    reg = UnconditionalDistfitRegressor(distr_type="norm")
    with pytest.raises(NotImplementedError):
        reg.fit(X, y)


def test_deterministic_reduction_regressor_gaussian():
    """Gaussian reduction: sigma equals sqrt of training variance."""
    X = np.random.randn(100, 2)
    y = np.random.randn(100)
    reg = DeterministicReductionRegressor(LinearRegression(), distr_type="gaussian")
    reg.fit(X, y)
    dist = reg.predict_proba(X)
    assert hasattr(dist, "mean")
    assert hasattr(dist, "sigma")
    assert np.allclose(dist.sigma, np.sqrt(np.var(y)))


def test_deterministic_reduction_regressor_invalid_distr_type():
    """Constructing with an unknown distr_type raises ValueError."""
    with pytest.raises(ValueError):
        DeterministicReductionRegressor(LinearRegression(), distr_type="not_a_dist")


def test_deterministic_reduction_regressor_multioutput():
    """Multi-output y is rejected with NotImplementedError."""
    X = np.random.randn(100, 2)
    y = np.random.randn(100, 2)
    reg = DeterministicReductionRegressor(LinearRegression(), distr_type="gaussian")
    with pytest.raises(NotImplementedError):
        reg.fit(X, y)


def test_deterministic_reduction_regressor_laplace():
    """Laplace reduction: scale equals sqrt(var/2) of training y."""
    X = np.random.randn(100, 2)
    y = np.random.randn(100)
    reg = DeterministicReductionRegressor(LinearRegression(), distr_type="laplace")
    reg.fit(X, y)
    dist = reg.predict_proba(X)
    assert hasattr(dist, "mu")
    assert hasattr(dist, "scale")
    assert np.allclose(dist.scale, np.sqrt(np.var(y) / 2))


def test_unconditional_distfit_regressor_non_dataframe():
    """Numpy arrays (not DataFrames) are accepted as X and y."""
    pytest.importorskip("distfit")
    X = np.random.randn(50, 2)
    y = np.random.randn(50)
    reg = UnconditionalDistfitRegressor(distr_type="norm")
    reg.fit(X, y)
    dist = reg.predict_proba(X)
    assert hasattr(dist, "mean")


def test_deterministic_reduction_regressor_non_dataframe():
    """Numpy arrays (not DataFrames) are accepted as X and y."""
    X = np.random.randn(50, 2)
    y = np.random.randn(50)
    reg = DeterministicReductionRegressor(LinearRegression(), distr_type="gaussian")
    reg.fit(X, y)
    dist = reg.predict_proba(X)
    assert hasattr(dist, "mean")
Loading
Loading