Support binary calibration tests, improved calibration plots

martinmodrak · martinmodrak · commit 348a4945278a · 2025-08-06T21:38:40.000+02:00
diff --git a/.Rprofile b/.Rprofile
@@ -1,5 +1,5 @@
 source("renv/activate.R")
-source("~/.Rprofile")
+if(file.exists("~/.Rprofile")) source("~/.Rprofile")
 # Allows to change how all vignettes are run at once (especially to test rstan)
 options("SBC.vignettes_cmdstanr" = TRUE)
 
diff --git a/R/binary-calibration-tests.R b/R/binary-calibration-tests.R
@@ -0,0 +1,182 @@
+brier_score <- function(x, y) { # Brier-type score of predictions x against outcomes y
+  sum((y - x)^2) # summed, not averaged, squared differences
+}
+
+brier_resampling_p <- function(x, y, B = 10000) { # resampling p-value for calibration of x against y
+  observed_score <- brier_score(x, y)
+  # Null scores: outcomes redrawn as Bernoulli(x), i.e. as if x were perfectly calibrated
+  simulate_null_score <- function() brier_score(x, rbinom(length(x), size = 1, prob = x))
+  null_scores <- replicate(B, simulate_null_score())
+  p_value <- mean(null_scores >= observed_score) # share of null scores at least as large as observed
+  max(p_value, 0.5/B) # floored at 0.5/B so that 0 is never returned
+}
+
+brier_resampling_test <- function(x, y, alpha = 0.05, B = 10000) {
+  # Resampling test of the calibration of probabilities x for 0/1 outcomes y; returns an "htest" list
+  data_label <- paste0("x = ", deparse1(substitute(x)), ", y = ", deparse1(substitute(y)))
+  observed_score <- brier_score(x, y)
+  # Null scores: outcomes redrawn as Bernoulli(x), i.e. as if x were perfectly calibrated
+  simulate_null_score <- function() brier_score(x, rbinom(length(x), size = 1, prob = x))
+  null_scores <- replicate(B, simulate_null_score())
+  # Share of null scores at least as large as the observed one, floored at 0.5/B
+  p_value <- max(mean(null_scores >= observed_score), 0.5/B)
+  # Reported as "parameter": the (1 - alpha) quantile of the null scores
+  rejection_limit <- quantile(null_scores, probs = 1 - alpha)
+  names(rejection_limit) <- paste0(scales::percent(1 - alpha), " rejection limit")
+
+  result <- list(
+    method = paste0("Bootstrapped binary Brier score test (using ", B, " samples)"),
+    data.name = data_label,
+    p.value = p_value,
+    estimate = c("Brier score" = observed_score),
+    parameter = rejection_limit
+  )
+  class(result) <- "htest"
+  result
+}
+
+binary_miscalibration <- function(x,y) { # Brier score of x minus Brier score of the monotone refit
+  require_package_version("monotone", "0.1.2", "miscalibration computations")
+  idx <- order(x, -y) # increasing x; within ties in x the larger y comes first
+  x_sorted <- x[idx]
+  y_sorted <- y[idx]
+  # Monotone regression of the sorted outcomes, used as the recalibrated predictions
+  recalibrated <- monotone::monotone(y_sorted)
+  score_original <- mean((x_sorted - y_sorted)^2)
+  score_recalibrated <- mean((recalibrated - y_sorted)^2)
+  score_original - score_recalibrated
+}
+
+# B draws of the miscalibration statistic with outcomes simulated as Bernoulli(x).
+# Faster reimplementation from https://www.pnas.org/doi/full/10.1073/pnas.2016191118#sec-4 and the reliabilitydiag package
+miscalibration_resampling_nulldist <- function(x,y, B = 1000) { # y is accepted but not used
+  simulate_null_stat <- function() {
+    binary_miscalibration(x, rbinom(length(x), size = 1, prob = x))
+  }
+  replicate(B, simulate_null_stat())
+}
+
+miscalibration_resampling_p <- function(x,y, B = 10000) { # resampling p-value for miscalibration of x
+  observed <- binary_miscalibration(x,y)
+  null_stats <- miscalibration_resampling_nulldist(x, y, B)
+  max(mean(null_stats >= observed), 0.5/B) # floored at 0.5/B so that 0 is never returned
+}
+
+#' @export
+miscalibration_resampling_test <- function(x, y, alpha = 0.05, B = 10000) {
+  # Resampling test of the miscalibration of probabilities x for 0/1 outcomes y; returns an "htest" list
+  data_label <- paste0("x = ", deparse1(substitute(x)), ", y = ", deparse1(substitute(y)))
+  observed <- binary_miscalibration(x,y)
+  null_stats <- miscalibration_resampling_nulldist(x, y, B)
+  p_value <- max(mean(null_stats >= observed), 0.5/B) # floored at 0.5/B
+  # Reported as "parameter": the (1 - alpha) quantile of the null statistics
+  rejection_limit <- quantile(null_stats, probs = 1 - alpha)
+  names(rejection_limit) <- paste0(scales::percent(1 - alpha), " rejection limit")
+  result <- list(
+    method = paste0("Bootstrapped binary miscalibration test (using ", B, " samples)"),
+    data.name = data_label,
+    p.value = p_value,
+    estimate = c("miscalibration" = observed),
+    parameter = rejection_limit
+  )
+  class(result) <- "htest"
+  result
+}
+
+gaffke_m <- function(probs, B = 10000) { # simulation draws used by the Gaffke interval and p-value
+  require_package_version("MCMCpack", "1.0.0", "the Gaffke test")
+  n_weights <- length(probs) + 1
+  # B rows of Dirichlet(1, ..., 1) weights; the same draws feed both results
+  weights <- MCMCpack::rdirichlet(B, alpha = rep(1, n_weights))
+  sorted_probs <- sort(probs)
+
+  # Per draw: weighted sum of the given support points (one weight per point)
+  weighted_sums <- function(support) {
+    rowSums(sweep(weights, MARGIN = 2, STATS = support, FUN = "*"))
+  }
+
+  # "upr" uses the sorted values plus 1; "lwr" uses the mirrored values 1 - probs plus 1
+  list(lwr = weighted_sums(c(rev(1 - sorted_probs), 1)),
+       upr = weighted_sums(c(sorted_probs, 1)))
+}
+
+gaffke_ci_from_m <- function(m, alpha = 0.05) { # c(lower, upper) limits from the draws in m$lwr / m$upr
+  tail_prob <- 1 - alpha / 2
+  # Lower limit: quantile of the mirrored draws, mapped back via 1 - q
+  lower_limit <- 1 - quantile(m$lwr, probs = tail_prob)
+  # Upper limit: the same quantile of the direct draws
+  upper_limit <- quantile(m$upr, probs = tail_prob)
+
+  as.numeric(c(lower_limit, upper_limit))
+}
+
+gaffke_ci <- function(probs, B = 10000, alpha = 0.05) { # c(lower, upper) limits for the mean of probs
+  m <- gaffke_m(probs, B) # gaffke_m() has no alpha argument; passing one raised "unused argument"
+  gaffke_ci_from_m(m, alpha) # alpha only enters here, as the quantile level 1 - alpha / 2
+}
+
+gaffke_p_from_m <- function(m, mu, B = length(m$upr), alternative = c("two.sided", "less", "greater")) {
+  # p-value for the null hypothesis "mean == mu" computed from the draws returned by gaffke_m().
+  # m: list with numeric vectors `lwr` and `upr`.
+  # mu: hypothesised mean, on the same scale as the data given to gaffke_m().
+  # B: number of draws; only used to replace an empirical proportion of 0 by 0.5/B. Defaults to
+  #    the number of draws in `m`, so a call without B no longer fails when a proportion is 0.
+  # alternative: which tail(s) to use. Returns a single number.
+  alternative <- match.arg(alternative)
+
+  # Keep the reported p-value strictly positive
+  floor_at_half_draw <- function(prop) {
+    if(prop == 0) 0.5/B else prop
+  }
+  # Empirical tail proportions of the draws relative to mu
+  prob_low <- floor_at_half_draw(mean(1 - m$lwr <= mu))
+  prob_high <- floor_at_half_draw(mean(m$upr >= mu))
+
+  # match.arg() above guarantees one of the three values, so no fallback branch is needed
+  switch(alternative,
+    two.sided = min(prob_low, prob_high, 0.5) * 2, # doubled smaller tail, capped at 1
+    less = prob_high,
+    greater = prob_low
+  )
+}
+
+gaffke_p <- function(probs, mu = 0.5, alpha = 0.05, B = 10000, alternative = c("two.sided", "less", "greater")) {
+  # p-value for "mean of probs == mu"; `alpha` is unused and kept only for interface compatibility
+  alternative <- match.arg(alternative)
+  m <- gaffke_m(probs, B) # gaffke_m() has no alpha argument; passing one raised "unused argument"
+  gaffke_p_from_m(m, mu, B, alternative)
+}
+
+#' Non-parametric test for the mean of a bounded variable.
+#' @export
+gaffke_test <- function(x, mu = 0.5, alpha = 0.05, lb = 0, ub = 1, B = 10000, alternative = c("two.sided", "less", "greater")) {
+  # x: values within [lb, ub]; mu: mean under the null; alpha: the interval has level 1 - alpha;
+  # lb, ub: scalar bounds of the variable; B: number of simulation draws. Returns an "htest" list.
+  dname <- deparse1(substitute(x))
+  alternative <- match.arg(alternative)
+
+  stopifnot(length(lb) == 1, length(ub) == 1, lb < ub) # lb < ub: the rescaling divides by ub - lb
+  stopifnot(all(x >= lb), all(x <= ub))
+  stopifnot(length(B) == 1 && B > 1)
+  stopifnot(0 < alpha && alpha < 1)
+  stopifnot(mu >= lb && mu <= ub)
+
+  x_scaled <- (x - lb) / (ub - lb) # the helpers below work on values rescaled to [0, 1]
+  mu_scaled <- (mu - lb) / (ub - lb)
+  m <- gaffke_m(x_scaled, B = B)
+  p <- gaffke_p_from_m(m, mu_scaled, B = B, alternative = alternative) # B was missing in this call
+  ci <- lb + (ub - lb) * gaffke_ci_from_m(m, alpha = alpha) # back on the scale of x, like the estimate
+  attr(ci, "conf.level") <- 1 - alpha
+
+  structure(list(
+    method = paste0("Gaffke's test for the mean of a bounded variable  (using ", B, " samples)"),
+    data.name = dname,
+    p.value = p,
+    alternative = alternative,
+    null.value = c("mean" = mu),
+    conf.int = ci,
+    estimate = c("mean" = mean(x)),
+    parameter = c("lower bound" = lb, "upper bound" = ub)
+  ),
+  class = "htest")
+}
diff --git a/R/binary-calibration.R b/R/binary-calibration.R
@@ -19,7 +19,7 @@ binary_probabilities_from_stats <- function(stats) {
 }
 
 #' @export
-binary_calibration_from_stats <- function(stats, type = "isotonic", ...) {
+binary_calibration_from_stats <- function(stats, type = c("reliabilitydiag", "calibrationband"), ...) {
   stats <- binary_probabilities_from_stats(stats)
 
   stats_grouped <- dplyr::group_by(stats, variable)
@@ -29,8 +29,10 @@ binary_calibration_from_stats <- function(stats, type = "isotonic", ...) {
 }
 
 #' @export
-binary_calibration_base <- function(prob, outcome, type = "isotonic", ...) {
-  stopifnot(is.numeric(prob) && is.numeric(outcome))
+binary_calibration_base <- function(prob, outcome, uncertainty_prob = 0.95, type = c("reliabilitydiag", "calibrationband"), ...) {
+  stopifnot(is.numeric(prob))
+  stopifnot((is.numeric(outcome) || is.logical(outcome) || is.integer(outcome)))
+  outcome <- as.numeric(outcome)
   stopifnot(all(outcome %in% c(0,1)))
   stopifnot(all(prob >=0 & prob <= 1))
   stopifnot(length(prob) == length(outcome))
@@ -41,8 +43,22 @@ binary_calibration_base <- function(prob, outcome, type = "isotonic", ...) {
   outcome <- outcome[!na_indices]
 
   type <- match.arg(type)
-  if(type == "isotonic") {
-    require_package_version("calibrationband", "0.2", "to compute binary calibration with the type 'isotonic'.")
+  if(type == "reliabilitydiag") {
+    require_package_version("reliabilitydiag", "0.2.1", "to compute binary calibration with the type 'reliabilitydiag'.")
+    rel_diag <- reliabilitydiag::reliabilitydiag(
+      x = prob,
+      y = outcome,
+      region.level = uncertainty_prob,
+      ...
+    )
+    res <- data.frame(prob = rel_diag$x$regions$x, low = rel_diag$x$regions$lower, high = rel_diag$x$regions$upper)
+    res$estimate <- approx(x = c(rel_diag$x$bins$x_min, rel_diag$x$bins$x_max),
+                           y = rep(rel_diag$x$bins$CEP_pav, times = 2),
+                           xout = res$prob)$y
+
+    return(res)
+  } else if(type == "calibrationband") {
+    require_package_version("calibrationband", "0.2", "to compute binary calibration with the type 'calibrationband'.")
     # Need to remove extreme indices because they cause crashes in the package
     extreme_indices <- prob < 1e-10 | prob > 1 - 1e-10
     extreme_indices_mismatch <- extreme_indices & round(prob) != outcome
@@ -56,7 +72,7 @@ binary_calibration_base <- function(prob, outcome, type = "isotonic", ...) {
     # Avoiding https://github.com/marius-cp/calibrationband/issues/1
     prob <- round(prob, digits = 7)
 
-    bands <- calibrationband::calibration_bands(prob, outcome, ...)
+    bands <- calibrationband::calibration_bands(prob, outcome, alpha = 1 - uncertainty_prob, ...)
 
     res <- dplyr::transmute(bands$bands, prob = x, low = lwr, high = upr)
 
@@ -77,7 +93,7 @@ binary_calibration_base <- function(prob, outcome, type = "isotonic", ...) {
 
 
 #' @export
-plot_binary_calibration_diff <- function(stats, type = "isotonic", ...) {
+plot_binary_calibration_diff <- function(stats, type = c("reliabilitydiag", "calibrationband"), ...) {
   calib_df <- binary_calibration_from_stats(stats, type = type, ...)
 
   ggplot(calib_df, aes(x = prob, ymin = low - prob, ymax = high - prob, y = estimate - prob)) +
@@ -87,11 +103,11 @@ plot_binary_calibration_diff <- function(stats, type = "isotonic", ...) {
 }
 
 #' @export
-plot_binary_calibration <- function(stats, type = "isotonic", ...) {
+plot_binary_calibration <- function(stats, type = c("reliabilitydiag", "calibrationband"), ...) {
   calib_df <- binary_calibration_from_stats(stats, type = type, ...)
 
   ggplot(calib_df, aes(x = prob, ymin = low, ymax = high, y = estimate)) +
     geom_segment(x = 0, y = 0, xend = 1, yend = 1, color = "skyblue1", size = 2) +
     geom_ribbon(fill = "black", alpha = 0.33) +
-    geom_line() + facet_wrap(~variable)
+    geom_line() + facet_wrap(~variable) + coord_fixed()
 }