1
- from functools import cached_property
2
1
import numpy as np
3
2
import quapy as qp
4
3
import quapy .functional as F
5
4
from quapy .data import LabelledCollection
6
5
from quapy .method .aggregative import AggregativeQuantifier
7
6
from scipy .stats import chi2
8
- from scipy .special import gamma
9
7
from sklearn .utils import resample
10
8
from abc import ABC , abstractmethod
11
9
from scipy .special import softmax , factorial
12
10
import copy
13
11
from functools import lru_cache
14
12
15
-
13
+ """
14
+ This module provides implementation of different types of confidence regions, and the implementation of Bootstrap
15
+ for AggregativeQuantifiers.
16
+ """
16
17
17
18
class ConfidenceRegionABC(ABC):
    """
    Abstract class of confidence regions.

    Concrete regions must implement :meth:`point_estimate` and :meth:`coverage`; the remaining
    methods are derived from those two.
    """

    @abstractmethod
    def point_estimate(self) -> np.ndarray:
        """
        Returns the point estimate corresponding to a set of bootstrap estimates.

        :return: np.ndarray
        """
        ...

    def ndim(self) -> int:
        """
        Number of dimensions of the region, computed as the length of the point estimate. This number
        corresponds to the total number of classes; the dimensionality of the simplex is therefore ndim-1.

        :return: int
        """
        return len(self.point_estimate())

    @abstractmethod
    def coverage(self, true_value) -> float:
        """
        Checks whether a value, or a set of values, is contained in the confidence region. The method computes
        the fraction of these that are contained in the region, if more than one value is passed. If only one
        value is passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not,
        respectively.

        :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
        :return: float in [0,1]
        """
        ...

    def _cached(self, key, compute):
        """
        Returns the value stored under `key` in this instance's private cache, calling `compute()` to
        fill it in on first use.

        The cache lives in the instance's `__dict__` rather than in a `functools.lru_cache` wrapped
        around the method: a method-level `lru_cache` is shared by the whole class, keeps every region
        alive for as long as its entry stays in the cache, and requires instances to be hashable.

        :param key: hashable identifier of the cached quantity
        :param compute: zero-argument callable that produces the value
        :return: the cached (or freshly computed) value
        """
        store = self.__dict__.setdefault('_region_cache', {})
        if key not in store:
            store[key] = compute()
        return store[key]

    def simplex_portion(self):
        """
        Computes the fraction of the simplex which is covered by the region. This is not the volume of the
        region itself (which could lie outside the boundaries of the simplex), but the actual fraction of the
        simplex contained in the region. A default implementation, based on Monte Carlo approximation, is
        provided. The value is computed once per instance and reused in subsequent calls.

        :return: float, the fraction of the simplex covered by the region
        """
        return self._cached('simplex_portion', self.montecarlo_proportion)

    def montecarlo_proportion(self, n_trials=10_000):
        """
        Estimates, via a Monte Carlo approach, the fraction of the simplex covered by the region. This is
        carried out by returning the fraction of the `n_trials` points, drawn from the simplex with
        `F.uniform_simplex_sampling`, that are included in the region according to :meth:`coverage`.
        The value is computed only once per instance and per value of `n_trials`.

        :param n_trials: int, number of points to draw (default 10,000)
        :return: float in [0,1]
        """
        def estimate():
            # a fixed seed (0) is used so that repeated estimates are drawn from the same sample
            with qp.util.temp_seed(0):
                uniform_simplex = F.uniform_simplex_sampling(n_classes=self.ndim(), size=n_trials)
                return np.clip(self.coverage(uniform_simplex), 0., 1.)

        return self._cached(('montecarlo_proportion', n_trials), estimate)
40
77
41
78
42
79
class WithConfidenceABC(ABC):
    """
    Abstract interface for methods that return, along with a point estimate, a confidence region
    around it.
    """

    @abstractmethod
    def quantify_conf(self, instances, confidence_level=None) -> "tuple[np.ndarray, ConfidenceRegionABC]":
        """
        Adds the method `quantify_conf` to the interface. This method returns not only the point-estimate, but
        also the confidence region around it.

        :param instances: a np.ndarray of shape (n_instances, n_features,)
        :param confidence_level: float in (0, 1); the default is None, whose handling is left to the
            implementing class (presumably a fallback to the level set at construction time -- to be
            confirmed against the implementations)
        :return: a tuple (`point_estimate`, `conf_region`), where `point_estimate` is a np.ndarray of shape
            (n_classes,) and `conf_region` is an object from :class:`ConfidenceRegionABC`
        """
        ...
46
96
47
97
48
98
def simplex_volume(n):
    """
    Computes the volume of the n-dimensional simplex as 1/n!. For a problem with n classes, the
    relevant volume is `simplex_volume(n-1)`, since the simplex has one degree of freedom less.

    :param n: int, the dimensionality of the simplex
    :return: float, the volume of the n-dimensional simplex
    """
    n_factorial = factorial(n)
    return 1 / n_factorial
50
107
51
108
@@ -54,17 +111,16 @@ def within_ellipse_prop(values, mean, prec_matrix, chi2_critical):
54
111
Checks the proportion of values that belong to the ellipse with center `mean` and precision matrix `prec_matrix`
55
112
at a distance `chi2_critical`.
56
113
57
- :param values: a np.ndarray with shape (ndim ,) or (n_values,ndim ,)
58
- :param mean: a np.ndarray with the mean of the sample
114
+ :param values: a np.ndarray of shape (n_dim ,) or (n_values, n_dim ,)
115
+ :param mean: a np.ndarray of shape (n_dim,) with the center of the ellipse
59
116
:param prec_matrix: a np.ndarray with the precision matrix (inverse of the
60
- covariance matrix) of the sample . If this inverse cannot be computed
117
+ covariance matrix) of the ellipse . If this inverse cannot be computed
61
118
then None must be passed
62
- :param chi2_critical: the chi2 critical value
119
+ :param chi2_critical: float, the chi2 critical value
63
120
64
- :return: the fraction of values that are contained in the ellipse
65
- defined by the mean, the precision matrix, and the chi2_critical.
66
- If values is only one value, then either 0 (not contained) or
67
- 1 (contained) is returned.
121
+ :return: float in [0,1], the fraction of values that are contained in the ellipse
122
+ defined by the mean (center), the precision matrix (shape), and the chi2_critical value (distance).
123
+ If `values` is only one value, then either 0. (not contained) or 1. (contained) is returned.
68
124
"""
69
125
if prec_matrix is None :
70
126
return 0.
@@ -84,6 +140,12 @@ def within_ellipse_prop(values, mean, prec_matrix, chi2_critical):
84
140
85
141
86
142
class ConfidenceEllipseSimplex (ConfidenceRegionABC ):
143
+ """
144
+ Instantiates a Confidence Ellipse in the probability simplex.
145
+
146
+ :param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
147
+ :param confidence_level: float, the confidence level (default 0.95)
148
+ """
87
149
88
150
def __init__ (self , X , confidence_level = 0.95 ):
89
151
@@ -107,20 +169,32 @@ def __init__(self, X, confidence_level=0.95):
107
169
self .chi2_critical_ = chi2 .ppf (confidence_level , df = self .ddof )
108
170
109
171
def point_estimate (self ):
172
+ """
173
+ Returns the point estimate, the center of the ellipse.
174
+
175
+ :return: np.ndarray of shape (n_classes,)
176
+ """
110
177
return self .mean_
111
178
112
179
def coverage (self , true_value ):
113
180
"""
114
- true_value can be an array (n_dimensions,) or a matrix (n_vectors, n_dimensions,)
115
- confidence_level None means that the confidence_level is taken from the __init__
116
- returns true or false depending on whether true_value is in the ellipse or not,
117
- or returns the proportion of true_values that are within the ellipse if more
118
- than one are passed
181
+ Checks whether a value, or a set of values, is contained in the confidence region. The method computes the
182
+ fraction of these that are contained in the region, if more than one value is passed. If only one value is
183
+ passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
184
+
185
+ :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
186
+ :return: float in [0,1]
119
187
"""
120
188
return within_ellipse_prop (true_value , self .mean_ , self .precision_matrix_ , self .chi2_critical_ )
121
189
122
190
123
191
class ConfidenceEllipseCLR (ConfidenceRegionABC ):
192
+ """
193
+ Instantiates a Confidence Ellipse in the Centered-Log Ratio (CLR) space.
194
+
195
+ :param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
196
+ :param confidence_level: float, the confidence level (default 0.95)
197
+ """
124
198
125
199
def __init__ (self , X , confidence_level = 0.95 ):
126
200
self .clr = CLRtransformation ()
@@ -129,26 +203,36 @@ def __init__(self, X, confidence_level=0.95):
129
203
self .conf_region_clr = ConfidenceEllipseSimplex (Z , confidence_level = confidence_level )
130
204
131
205
def point_estimate (self ):
132
- # Z_mean = self.conf_region_clr.mean()
133
- # return self.clr.inverse(Z_mean)
134
- # the inverse of the CLR does not coincide with the clean mean because the geometric mean
135
- # requires smoothing the prevalence vectors and this affects the softmax (inverse)
206
+ """
207
+ Returns the point estimate, the center of the ellipse.
208
+
209
+ :return: np.ndarray of shape (n_classes,)
210
+ """
211
+ # The inverse of the CLR does not coincide with the true mean, because the geometric mean
212
+ # requires smoothing the prevalence vectors and this affects the softmax (inverse);
213
+ # return self.clr.inverse(self.mean_) # <- does not coincide
136
214
return self .mean_
137
215
138
216
def coverage (self , true_value ):
139
217
"""
140
- true_value can be an array (n_dimensions,) or a matrix (n_vectors, n_dimensions,)
141
- confidence_level None means that the confidence_level is taken from the __init__
142
- returns true or false depending on whether true_value is in the ellipse or not,
143
- or returns the proportion of true_values that are within the ellipse if more
144
- than one are passed
218
+ Checks whether a value, or a set of values, is contained in the confidence region. The method computes the
219
+ fraction of these that are contained in the region, if more than one value is passed. If only one value is
220
+ passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
221
+
222
+ :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
223
+ :return: float in [0,1]
145
224
"""
146
225
transformed_values = self .clr (true_value )
147
226
return self .conf_region_clr .coverage (transformed_values )
148
227
149
228
150
229
class ConfidenceIntervals (ConfidenceRegionABC ):
230
+ """
231
+ Instantiates a region based on (independent) Confidence Intervals.
151
232
233
+ :param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
234
+ :param confidence_level: float, the confidence level (default 0.95)
235
+ """
152
236
def __init__ (self , X , confidence_level = 0.95 ):
153
237
assert 0 < confidence_level < 1 , f'{ confidence_level = } must be in range(0,1)'
154
238
@@ -158,14 +242,21 @@ def __init__(self, X, confidence_level=0.95):
158
242
self .I_low , self .I_high = np .percentile (X , q = [2.5 , 97.5 ], axis = 0 )
159
243
160
244
def point_estimate (self ):
245
+ """
246
+ Returns the point estimate, the class-wise average of the bootstrapped estimates
247
+
248
+ :return: np.ndarray of shape (n_classes,)
249
+ """
161
250
return self .means_
162
251
163
252
def coverage (self , true_value ):
164
253
"""
165
- true_value can be an array (n_dimensions,) or a matrix (n_vectors, n_dimensions,)
166
- returns true or false depending on whether true_value is in the ellipse or not,
167
- or returns the proportion of true_values that are within the ellipse if more
168
- than one are passed
254
+ Checks whether a value, or a set of values, is contained in the confidence region. The method computes the
255
+ fraction of these that are contained in the region, if more than one value is passed. If only one value is
256
+ passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
257
+
258
+ :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
259
+ :return: float in [0,1]
169
260
"""
170
261
within_intervals = np .logical_and (self .I_low <= true_value , true_value <= self .I_high )
171
262
within_all_intervals = np .all (within_intervals , axis = - 1 , keepdims = True )
@@ -176,20 +267,56 @@ def coverage(self, true_value):
176
267
177
268
class CLRtransformation:
    """
    Centered log-ratio (CLR) transformation.
    """

    def __call__(self, X, epsilon=1e-6):
        """
        Applies the CLR function to X: every vector (taken along the last axis) is smoothed with
        `qp.error.smooth`, divided by its geometric mean, and passed through the logarithm. Points that
        lie on the probability simplex are thus mapped onto an unrestricted real-valued space of the
        same number of dimensions.

        :param X: array-like of shape (n_instances, n_dimensions) to be transformed
        :param epsilon: small float for prevalence smoothing
        :return: np.ndarray of shape (n_instances, n_dimensions), the CLR-transformed points
        """
        smoothed = qp.error.smooth(np.asarray(X), epsilon)
        mean_of_logs = np.mean(np.log(smoothed), axis=-1, keepdims=True)
        geometric_mean = np.exp(mean_of_logs)
        return np.log(smoothed / geometric_mean)

    def inverse(self, X):
        """
        Inverse function, implemented as a softmax along the last axis. Note that clr.inverse(clr(X))
        does not exactly coincide with X, due to the smoothing applied by the forward transformation.

        :param X: np.ndarray of shape (n_instances, n_dimensions) with points in the CLR space
        :return: np.ndarray of shape (n_instances, n_dimensions), the points mapped back onto the simplex
        """
        return softmax(X, axis=-1)
190
294
191
295
192
296
class AggregativeBootstrap (WithConfidenceABC , AggregativeQuantifier ):
297
+ """
298
+ Aggregative Bootstrap allows any AggregativeQuantifier to get confidence regions around
299
+ point-estimates of class prevalence values. This method implements some optimizations for
300
+ speeding up the computations, which are only possible due to the two phases of the aggregative
301
+ quantifiers.
302
+
303
+ During training, the bootstrap repetitions are only carried out over pre-classified training instances,
304
+ after the classifier has been trained (only once), in order to train a series of aggregation
305
+ functions (model-based approach).
306
+
307
+ During inference, the bootstrap repetitions are applied to the pre-classified test instances.
308
+
309
+ :param quantifier: an aggregative quantifier
310
+ :param n_train_samples: int, the number of training resamplings (defaults to 1, set to > 1 to activate a
311
+ model-based bootstrap approach)
312
+ :param n_test_samples: int, the number of test resamplings (defaults to 500, set to > 1 to activate a
313
+ population-based bootstrap approach)
314
+ :param confidence_level: float, the confidence level for the confidence region (default 0.95)
315
+ :param method: string, set to `intervals` for constructing confidence intervals (default), or to
316
+ `ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for
317
+ constructing an ellipse in the Centered-Log Ratio (CLR) unconstrained space.
318
+ :param random_state: int for replicating samples, None (default) for non-replicable samples
319
+ """
193
320
194
321
METHODS = ['intervals' , 'ellipse' , 'ellipse-clr' ]
195
322
0 commit comments