@@ -2,15 +2,38 @@ brier_score <- function(x, y) {
22 sum((x - y )^ 2 )
33}
44
5- brier_resampling_p <- function (x , y , B = 10000 ) {
6- actual_brier <- brier_score(x , y )
7- brier_null <- replicate(B , {
8- yrep <- rbinom(length(x ), size = 1 , prob = x )
9- brier_score(x , yrep )
10- })
11- max(mean(actual_brier < = brier_null ), 0.5 / B )
12- }
13-
5+ # ' @title Binary calibration tests
6+ # '
7+ # ' @param x the predicted success probabilities
8+ # ' @param y the actual observed outcomes (just 0 or 1)
9+ # ' @param alpha the type I error rate for the test
10+ # ' @param B number of boostrap samples for the null distribution
11+ # '
12+ # ' @description Dimitriadis et al. propose several tests based on
13+ # ' comparing actual predictions to predictions when the probabilities are
14+ # ' calibrated. This yields several possible tests of correctly calibrated
15+ # ' predictions (i.e. that the expected proportion of true values matches the
16+ # ' predicted probability).
17+ # '
18+ # ' @details
19+ # ' The `brier_` functions represent a test based on brier score, while
20+ # ' the `miscalibration_` functions represent a test based on miscalibration.
21+ # ' In both cases we evaluate the null distribution via bootstrapping.
22+ # '
23+ # ' @returns `brier_resampling_test` and `miscalibration_resampling_test` return
24+ # ' an object of class `htest`, `brier_resampling_p` and `miscalibration_resampling_p`
25+ # ' return just the p-value (for easier use with automated workflows).
26+ # ' `binary_miscalibration` computes just the miscalibration component using
27+ # ' the PAV (pool adjacent violators) algorithm.
28+ # '
29+ # '
30+ # ' @references T. Dimitriadis, T. Gneiting, & A.I. Jordan,
31+ # ' Stable reliability diagrams for probabilistic classifiers,
32+ # ' Proc. Natl. Acad. Sci. U.S.A. 118 (8) e2016191118,
33+ # ' https://doi.org/10.1073/pnas.2016191118 (2021).
34+ # '
35+ # ' @rdname binary-calibration-tests
36+ # ' @export
1437brier_resampling_test <- function (x , y , alpha = 0.05 , B = 10000 ) {
1538 dname <- paste0(" x = " , deparse1(substitute(x )), " , y = " , deparse1(substitute(y )))
1639
@@ -35,6 +58,19 @@ brier_resampling_test <- function(x, y, alpha = 0.05, B = 10000) {
3558 class = " htest" )
3659}
3760
61+ # ' @rdname binary-calibration-tests
62+ # ' @export
63+ brier_resampling_p <- function (x , y , B = 10000 ) {
64+ actual_brier <- brier_score(x , y )
65+ brier_null <- replicate(B , {
66+ yrep <- rbinom(length(x ), size = 1 , prob = x )
67+ brier_score(x , yrep )
68+ })
69+ max(mean(actual_brier < = brier_null ), 0.5 / B )
70+ }
71+
72+ # ' @rdname binary-calibration-tests
73+ # ' @export
3874binary_miscalibration <- function (x ,y ) {
3975 require_package_version(" monotone" , " 0.1.2" , " miscalibration computations" )
4076 ord <- order(x , - y )
@@ -56,12 +92,15 @@ miscalibration_resampling_nulldist <- function(x,y, B = 1000) {
5692 })
5793}
5894
95+ # ' @rdname binary-calibration-tests
96+ # ' @export
5997miscalibration_resampling_p <- function (x ,y , B = 10000 ) {
6098 actual_miscalibration <- binary_miscalibration(x ,y )
6199 misc_null <- miscalibration_resampling_nulldist(x , y , B )
62100 max(mean(actual_miscalibration < = misc_null ), 0.5 / B )
63101}
64102
103+ # ' @rdname binary-calibration-tests
65104# ' @export
66105miscalibration_resampling_test <- function (x , y , alpha = 0.05 , B = 10000 ) {
67106 dname <- paste0(" x = " , deparse1(substitute(x )), " , y = " , deparse1(substitute(y )))
@@ -110,6 +149,8 @@ gaffke_ci_from_m <- function(m, alpha = 0.05) {
110149 ))
111150}
112151
152+ # ' @rdname gaffke_test
153+ # ' @export
113154gaffke_ci <- function (probs , B = 10000 , alpha = 0.05 ) {
114155 m <- gaffke_m(probs , B , alpha )
115156 gaffke_ci_from_m(m , alpha )
@@ -140,6 +181,8 @@ gaffke_p_from_m <- function(m, mu, B, alternative = c("two.sided", "less", "grea
140181 }
141182}
142183
184+ # ' @rdname gaffke_test
185+ # ' @export
143186gaffke_p <- function (probs , mu = 0.5 , alpha = 0.05 , B = 10000 , alternative = c(" two.sided" , " less" , " greater" )) {
144187 alternative <- match.arg(alternative )
145188
@@ -148,13 +191,46 @@ gaffke_p <- function(probs, mu = 0.5, alpha = 0.05, B = 10000, alternative = c("
148191}
149192
150193# ' Non-parametric test for the mean of a bounded variable.
194+ # '
195+ # ' @param x a vector of observed values
196+ # ' @param mu the mean under null hypothesis
197+ # ' @param alpha the level of the test
198+ # ' @param lb the lower bound for `x`
199+ # ' @param ub the upper bound for `x`
200+ # ' @param B number of bootstrap samples for the null distribution
201+ # ' @param alternative the alternative for the test.
202+ # '
203+ # ' @details The test is expected to be valid for any bounded distribution without further
204+ # ' assumptions. The test has been proven valid only for special cases but
205+ # ' no counterexample is known despite some efforts in the literature to find
206+ # ' some.
207+ # '
208+ # ' @description Test a null hypothesis about the mean of i.i.d. samples.
209+ # ' The test is based on Gaffke 2005, though a more detailed analysis and
210+ # ' exposition can be found in Learned-Miller & Thomas 2020.
211+ # '
212+ # ' @returns `gaffke_test` returns an object of class `htest`, `gaffke_p` and
213+ # ' `gaffke_ci` return just the p-value / CI as numeric for easier use in batch
214+ # ' workflows.
215+ # '
216+ # ' @references Gaffke, N. (2005).
217+ # ' “Three test statistics for a nonparametric one-sided hypothesis on the mean
218+ # ' of a nonnegative variable.” Mathematical Methods of Statistics, 14(4): 451–467.
219+ # '
220+ # ' Learned-Miller, E. and Thomas, P. S. (2020).
221+ # ' “A New Confidence Interval for the Mean of a Bounded Random Variable.”
222+ # ' https://arxiv.org/abs/1905.06208
223+ # '
224+ # ' @rdname gaffke_test
151225# ' @export
152226gaffke_test <- function (x , mu = 0.5 , alpha = 0.05 , lb = 0 , ub = 1 , B = 10000 , alternative = c(" two.sided" , " less" , " greater" )) {
153227 dname <- deparse1(substitute(x ))
154228 alternative <- match.arg(alternative )
155229
156230 stopifnot(length(lb ) == 1 )
157231 stopifnot(length(ub ) == 1 )
232+ stopifnot(is.finite(lb ))
233+ stopifnot(is.finite(ub ))
158234 stopifnot(all(x > = lb ))
159235 stopifnot(all(x < = ub ))
160236 stopifnot(length(B ) == 1 && B > 1 )
0 commit comments