Skip to content

Commit a0c84c5

Browse files
committed
documented confidence.py
1 parent ce4c000 commit a0c84c5

File tree

1 file changed

+160
-33
lines changed

1 file changed

+160
-33
lines changed

quapy/method/confidence.py

+160-33
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,108 @@
1-
from functools import cached_property
21
import numpy as np
32
import quapy as qp
43
import quapy.functional as F
54
from quapy.data import LabelledCollection
65
from quapy.method.aggregative import AggregativeQuantifier
76
from scipy.stats import chi2
8-
from scipy.special import gamma
97
from sklearn.utils import resample
108
from abc import ABC, abstractmethod
119
from scipy.special import softmax, factorial
1210
import copy
1311
from functools import lru_cache
1412

15-
13+
"""
14+
This module provides implementations of different types of confidence regions, as well as an implementation of Bootstrap
15+
for AggregativeQuantifiers.
16+
"""
1617

1718
class ConfidenceRegionABC(ABC):
19+
"""
20+
Abstract class of confidence regions
21+
"""
1822

1923
@abstractmethod
2024
def point_estimate(self) -> np.ndarray:
25+
"""
26+
Returns the point estimate corresponding to a set of bootstrap estimates.
27+
28+
:return: np.ndarray
29+
"""
2130
...
2231

23-
def ndim(self):
32+
def ndim(self) -> int:
33+
"""
34+
Number of dimensions of the region. This number corresponds to the total number of classes. The dimensionality
35+
of the simplex is therefore ndim-1
36+
37+
:return: int
38+
"""
2439
return len(self.point_estimate())
2540

2641
@abstractmethod
27-
def coverage(self, true_value):
42+
def coverage(self, true_value) -> float:
43+
"""
44+
Checks whether a value, or a set of values, is contained in the confidence region. The method computes the
45+
fraction of these that are contained in the region, if more than one value is passed. If only one value is
46+
passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
47+
48+
:param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
49+
:return: float in [0,1]
50+
"""
2851
...
2952

3053
@lru_cache
3154
def simplex_portion(self):
55+
"""
56+
Computes the fraction of the simplex which is covered by the region. This is not the volume of the region
57+
itself (which could lie outside the boundaries of the simplex), but the actual fraction of the simplex
58+
contained in the region. A default implementation, based on Monte Carlo approximation, is provided.
59+
60+
:return: float, the fraction of the simplex covered by the region
61+
"""
3262
return self.montecarlo_proportion()
3363

3464
@lru_cache
3565
def montecarlo_proportion(self, n_trials=10_000):
66+
"""
67+
Estimates, via a Monte Carlo approach, the fraction of the simplex covered by the region. This is carried
68+
out by returning the fraction of the `n_trials` points, uniformly drawn at random from the simplex, that
69+
are included in the region. The value is only computed once when multiple calls are made.
70+
71+
:return: float in [0,1]
72+
"""
3673
with qp.util.temp_seed(0):
3774
uniform_simplex = F.uniform_simplex_sampling(n_classes=self.ndim(), size=n_trials)
3875
proportion = np.clip(self.coverage(uniform_simplex), 0., 1.)
3976
return proportion
4077

4178

4279
class WithConfidenceABC(ABC):
80+
"""
81+
Abstract class for quantifiers that return a confidence region along with the point estimate.
82+
"""
83+
4384
@abstractmethod
4485
def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
86+
"""
87+
Adds the method `quantify_conf` to the interface. This method returns not only the point-estimate, but
88+
also the confidence region around it.
89+
90+
:param instances: a np.ndarray of shape (n_instances, n_features,)
91+
:param confidence_level: float in (0, 1)
92+
:return: a tuple (`point_estimate`, `conf_region`), where `point_estimate` is a np.ndarray of shape
93+
(n_classes,) and `conf_region` is an object from :class:`ConfidenceRegionABC`
94+
"""
4595
...
4696

4797

4898
def simplex_volume(n):
99+
"""
100+
Computes the volume of the n-dimensional simplex. For n classes, the corresponding volume
101+
is :meth:`simplex_volume(n-1)` since the simplex has one degree of freedom less.
102+
103+
:param n: int, the dimensionality of the simplex
104+
:return: float, the volume of the n-dimensional simplex
105+
"""
49106
return 1 / factorial(n)
50107

51108

@@ -54,17 +111,16 @@ def within_ellipse_prop(values, mean, prec_matrix, chi2_critical):
54111
Checks the proportion of values that belong to the ellipse with center `mean` and precision matrix `prec_matrix`
55112
at a distance `chi2_critical`.
56113
57-
:param values: a np.ndarray with shape (ndim,) or (n_values,ndim,)
58-
:param mean: a np.ndarray with the mean of the sample
114+
:param values: a np.ndarray of shape (n_dim,) or (n_values, n_dim,)
115+
:param mean: a np.ndarray of shape (n_dim,) with the center of the ellipse
59116
:param prec_matrix: a np.ndarray with the precision matrix (inverse of the
60-
covariance matrix) of the sample. If this inverse cannot be computed
117+
covariance matrix) of the ellipse. If this inverse cannot be computed
61118
then None must be passed
62-
:param chi2_critical: the chi2 critical value
119+
:param chi2_critical: float, the chi2 critical value
63120
64-
:return: the fraction of values that are contained in the ellipse
65-
defined by the mean, the precision matrix, and the chi2_critical.
66-
If values is only one value, then either 0 (not contained) or
67-
1 (contained) is returned.
121+
:return: float in [0,1], the fraction of values that are contained in the ellipse
122+
defined by the mean (center), the precision matrix (shape), and the chi2_critical value (distance).
123+
If `values` is only one value, then either 0. (not contained) or 1. (contained) is returned.
68124
"""
69125
if prec_matrix is None:
70126
return 0.
@@ -84,6 +140,12 @@ def within_ellipse_prop(values, mean, prec_matrix, chi2_critical):
84140

85141

86142
class ConfidenceEllipseSimplex(ConfidenceRegionABC):
143+
"""
144+
Instantiates a Confidence Ellipse in the probability simplex.
145+
146+
:param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
147+
:param confidence_level: float, the confidence level (default 0.95)
148+
"""
87149

88150
def __init__(self, X, confidence_level=0.95):
89151

@@ -107,20 +169,32 @@ def __init__(self, X, confidence_level=0.95):
107169
self.chi2_critical_ = chi2.ppf(confidence_level, df=self.ddof)
108170

109171
def point_estimate(self):
172+
"""
173+
Returns the point estimate, the center of the ellipse.
174+
175+
:return: np.ndarray of shape (n_classes,)
176+
"""
110177
return self.mean_
111178

112179
def coverage(self, true_value):
113180
"""
114-
true_value can be an array (n_dimensions,) or a matrix (n_vectors, n_dimensions,)
115-
confidence_level None means that the confidence_level is taken from the __init__
116-
returns true or false depending on whether true_value is in the ellipse or not,
117-
or returns the proportion of true_values that are within the ellipse if more
118-
than one are passed
181+
Checks whether a value, or a set of values, is contained in the confidence region. The method computes the
182+
fraction of these that are contained in the region, if more than one value is passed. If only one value is
183+
passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
184+
185+
:param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
186+
:return: float in [0,1]
119187
"""
120188
return within_ellipse_prop(true_value, self.mean_, self.precision_matrix_, self.chi2_critical_)
121189

122190

123191
class ConfidenceEllipseCLR(ConfidenceRegionABC):
192+
"""
193+
Instantiates a Confidence Ellipse in the Centered-Log Ratio (CLR) space.
194+
195+
:param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
196+
:param confidence_level: float, the confidence level (default 0.95)
197+
"""
124198

125199
def __init__(self, X, confidence_level=0.95):
126200
self.clr = CLRtransformation()
@@ -129,26 +203,36 @@ def __init__(self, X, confidence_level=0.95):
129203
self.conf_region_clr = ConfidenceEllipseSimplex(Z, confidence_level=confidence_level)
130204

131205
def point_estimate(self):
132-
# Z_mean = self.conf_region_clr.mean()
133-
# return self.clr.inverse(Z_mean)
134-
# the inverse of the CLR does not coincide with the clean mean because the geometric mean
135-
# requires smoothing the prevalence vectors and this affects the softmax (inverse)
206+
"""
207+
Returns the point estimate, the center of the ellipse.
208+
209+
:return: np.ndarray of shape (n_classes,)
210+
"""
211+
# The inverse of the CLR does not coincide with the true mean, because the geometric mean
212+
# requires smoothing the prevalence vectors and this affects the softmax (inverse);
213+
# return self.clr.inverse(self.mean_) # <- does not coincide
136214
return self.mean_
137215

138216
def coverage(self, true_value):
139217
"""
140-
true_value can be an array (n_dimensions,) or a matrix (n_vectors, n_dimensions,)
141-
confidence_level None means that the confidence_level is taken from the __init__
142-
returns true or false depending on whether true_value is in the ellipse or not,
143-
or returns the proportion of true_values that are within the ellipse if more
144-
than one are passed
218+
Checks whether a value, or a set of values, is contained in the confidence region. The method computes the
219+
fraction of these that are contained in the region, if more than one value is passed. If only one value is
220+
passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
221+
222+
:param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
223+
:return: float in [0,1]
145224
"""
146225
transformed_values = self.clr(true_value)
147226
return self.conf_region_clr.coverage(transformed_values)
148227

149228

150229
class ConfidenceIntervals(ConfidenceRegionABC):
230+
"""
231+
Instantiates a region based on (independent) Confidence Intervals.
151232
233+
:param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
234+
:param confidence_level: float, the confidence level (default 0.95)
235+
"""
152236
def __init__(self, X, confidence_level=0.95):
153237
assert 0 < confidence_level < 1, f'{confidence_level=} must be in range(0,1)'
154238

@@ -158,14 +242,21 @@ def __init__(self, X, confidence_level=0.95):
158242
self.I_low, self.I_high = np.percentile(X, q=[2.5, 97.5], axis=0)
159243

160244
def point_estimate(self):
245+
"""
246+
Returns the point estimate, the class-wise average of the bootstrapped estimates
247+
248+
:return: np.ndarray of shape (n_classes,)
249+
"""
161250
return self.means_
162251

163252
def coverage(self, true_value):
164253
"""
165-
true_value can be an array (n_dimensions,) or a matrix (n_vectors, n_dimensions,)
166-
returns true or false depending on whether true_value is in the ellipse or not,
167-
or returns the proportion of true_values that are within the ellipse if more
168-
than one are passed
254+
Checks whether a value, or a set of values, is contained in the confidence region. The method computes the
255+
fraction of these that are contained in the region, if more than one value is passed. If only one value is
256+
passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
257+
258+
:param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
259+
:return: float in [0,1]
169260
"""
170261
within_intervals = np.logical_and(self.I_low <= true_value, true_value <= self.I_high)
171262
within_all_intervals = np.all(within_intervals, axis=-1, keepdims=True)
@@ -176,20 +267,56 @@ def coverage(self, true_value):
176267

177268
class CLRtransformation:
178269
"""
179-
Centered log-ratio
270+
Centered log-ratio (CLR) transformation, from compositional data analysis
180271
"""
181-
182272
def __call__(self, X, epsilon=1e-6):
273+
"""
274+
Applies the CLR function to X thus mapping the instances, which are contained in :math:`\\mathcal{R}^{n}` but
275+
actually lie on a :math:`\\mathcal{R}^{n-1}` simplex, onto an unrestricted space in :math:`\\mathcal{R}^{n}`
276+
277+
:param X: np.ndarray of (n_instances, n_dimensions) to be transformed
278+
:param epsilon: small float for prevalence smoothing
279+
:return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points
280+
"""
183281
X = np.asarray(X)
184282
X = qp.error.smooth(X, epsilon)
185283
G = np.exp(np.mean(np.log(X), axis=-1, keepdims=True)) # geometric mean
186284
return np.log(X / G)
187285

188286
def inverse(self, X):
287+
"""
288+
Inverse function. However, clr.inverse(clr(X)) does not exactly coincide with X due to smoothing.
289+
290+
:param X: np.ndarray of (n_instances, n_dimensions) to be transformed
291+
:return: np.ndarray of (n_instances, n_dimensions), the points mapped back onto the simplex via softmax
292+
"""
189293
return softmax(X, axis=-1)
190294

191295

192296
class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
297+
"""
298+
Aggregative Bootstrap allows any AggregativeQuantifier to get confidence regions around
299+
point-estimates of class prevalence values. This method implements some optimizations for
300+
speeding up the computations, which are only possible due to the two phases of the aggregative
301+
quantifiers.
302+
303+
During training, the bootstrap repetitions are only carried out over pre-classified training instances,
304+
after the classifier has been trained (only once), in order to train a series of aggregation
305+
functions (model-based approach).
306+
307+
During inference, the bootstrap repetitions are applied to the pre-classified test instances.
308+
309+
:param quantifier: an aggregative quantifier
310+
:param n_train_samples: int, the number of training resamplings (defaults to 1, set to > 1 to activate a
311+
model-based bootstrap approach)
312+
:param n_test_samples: int, the number of test resamplings (defaults to 500, set to > 1 to activate a
313+
population-based bootstrap approach)
314+
:param confidence_level: float, the confidence level for the confidence region (default 0.95)
315+
:param method: string, set to `intervals` for constructing confidence intervals (default), or to
316+
`ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for
317+
constructing an ellipse in the Centered-Log Ratio (CLR) unconstrained space.
318+
:param random_state: int for replicating samples, None (default) for non-replicable samples
319+
"""
193320

194321
METHODS = ['intervals', 'ellipse', 'ellipse-clr']
195322

0 commit comments

Comments
 (0)