Fix cluster-robust SE handling

Copilot · kaiemjoy · web-flow · commit 73d82efe508b · 2026-06-26T04:48:37.000Z
Co-authored-by: kaiemjoy <16113030+kaiemjoy@users.noreply.github.com>
diff --git a/NEWS.md b/NEWS.md
@@ -28,6 +28,10 @@
   shifted the random-number stream out of sync and made simulated values, and
   their snapshots, differ between macOS, Windows, and Linux). Simulated
   values change slightly as a result of this fix. (#447)
+* Cluster-robust standard errors now treat multiple `cluster_var` columns as
+  multi-way clustering instead of collapsing them to a single interaction, and
+  they are no longer allowed to be smaller than the corresponding model-based
+  standard errors. (#543)
 * Corrected default axis labels in `strat_ests_barplot()` (`xlab`) and
   `strat_ests_scatterplot()` (`ylab`) to say "seroincidence" rather than
   "seroconversion"/"incidence".
diff --git a/R/compute_cluster_robust_var.R b/R/compute_cluster_robust_var.R
@@ -17,103 +17,45 @@
     fit,
     cluster_var,
     stratum_var = NULL) {
-  # Extract stored data (already split by antigen_iso)
   pop_data_list <- attr(fit, "pop_data")
-  sr_params_list <- attr(fit, "sr_params")
-  noise_params_list <- attr(fit, "noise_params")
-  antigen_isos <- attr(fit, "antigen_isos")
-
-  # Get MLE estimate
-  log_lambda_mle <- fit$estimate
-
-  # Combine pop_data list back into a single data frame
-  # to get cluster info
   pop_data_combined <- do.call(rbind, pop_data_list)
-
-  # Compute score (gradient) using numerical differentiation
-  # The score is the derivative of log-likelihood w.r.t. log(lambda)
-  epsilon <- 1e-6
-
-  # For each observation, compute the contribution to the score
-  # We need to identify which cluster each observation belongs to
-
-  # Handle multiple clustering levels by creating composite cluster ID
-  if (length(cluster_var) == 1) {
-    cluster_ids <- pop_data_combined[[cluster_var]]
-  } else {
-    # Create composite cluster ID from multiple variables
-    cluster_ids <- interaction(
-      pop_data_combined[, cluster_var, drop = FALSE],
-      drop = TRUE,
-      sep = "_"
-    )
-  }
-
-  # Get unique clusters
-  unique_clusters <- unique(cluster_ids)
-  n_clusters <- length(unique_clusters)
-
-  # Compute cluster-level scores
-  cluster_scores <- numeric(n_clusters)
-
-  for (i in seq_along(unique_clusters)) {
-    cluster_id <- unique_clusters[i]
-
-    # Get observations in this cluster
-    cluster_mask <- cluster_ids == cluster_id
-
-    # Create temporary pop_data with only this cluster
-    pop_data_cluster <- pop_data_combined[cluster_mask, , drop = FALSE]
-
-    # Split by antigen
-    pop_data_cluster_list <- split(
-      pop_data_cluster,
-      pop_data_cluster$antigen_iso
-    )
-
-    # Ensure all antigen_isos are represented
-    # (add empty data frames if missing)
-    for (ag in antigen_isos) {
-      if (!ag %in% names(pop_data_cluster_list)) {
-        # Create empty data frame with correct structure
-        pop_data_cluster_list[[ag]] <- pop_data_list[[ag]][0, , drop = FALSE]
-      }
+  standard_var_log_lambda <- 1 / fit$hessian |> as.numeric()
+
+  subset_cluster_vars <- unlist(
+    lapply(seq_along(cluster_var), function(n_vars) {
+      utils::combn(cluster_var, n_vars, simplify = FALSE)
+    }),
+    recursive = FALSE
+  )
+
+  cluster_var_terms <- vapply(subset_cluster_vars, length, integer(1))
+  robust_var_log_lambda <- 0
+
+  for (i in seq_along(subset_cluster_vars)) {
+    cluster_vars_subset <- subset_cluster_vars[[i]]
+    if (length(cluster_vars_subset) == 1) {
+      cluster_ids <- pop_data_combined[[cluster_vars_subset]]
+    } else {
+      cluster_ids <- interaction(
+        pop_data_combined[, cluster_vars_subset, drop = FALSE],
+        drop = TRUE,
+        sep = "_"
+      )
     }
 
-    # Compute log-likelihood for this cluster at MLE
-    ll_cluster_mle <- -(.nll(
-      log.lambda = log_lambda_mle,
-      pop_data = pop_data_cluster_list,
-      antigen_isos = antigen_isos,
-      curve_params = sr_params_list,
-      noise_params = noise_params_list,
-      verbose = FALSE
-    ))
-
-    # Compute log-likelihood at MLE + epsilon
-    ll_cluster_plus <- -(.nll(
-      log.lambda = log_lambda_mle + epsilon,
-      pop_data = pop_data_cluster_list,
-      antigen_isos = antigen_isos,
-      curve_params = sr_params_list,
-      noise_params = noise_params_list,
-      verbose = FALSE
-    ))
-
-    # Numerical derivative (score for this cluster)
-    cluster_scores[i] <- (ll_cluster_plus - ll_cluster_mle) / epsilon
+    robust_var_log_lambda <- robust_var_log_lambda +
+      (-1)^(cluster_var_terms[[i]] + 1) *
+      .compute_cluster_var_oneway(
+        fit = fit,
+        cluster_ids = cluster_ids,
+        pop_data_combined = pop_data_combined
+      )
   }
 
-  # Compute B matrix (middle of sandwich)
-  # B = sum of outer products of cluster scores
-  b_matrix <- sum(cluster_scores^2) # nolint: object_name_linter
-
-  # Get Hessian (already computed by nlm)
-  h_matrix <- fit$hessian # nolint: object_name_linter
-
-  # Sandwich variance: V = H^(-1) * B * H^(-1)
-  # Since we have a scalar parameter, this simplifies to:
-  var_log_lambda_robust <- b_matrix / (h_matrix^2)
+  robust_var_log_lambda <- max(
+    standard_var_log_lambda,
+    robust_var_log_lambda
+  )
 
-  return(var_log_lambda_robust)
+  return(robust_var_log_lambda)
 }
diff --git a/R/compute_cluster_var_oneway.R b/R/compute_cluster_var_oneway.R
@@ -0,0 +1,63 @@
+#' Compute one-way cluster-robust variance for seroincidence estimates
+#'
+#' Sandwich estimator for the scalar log(lambda): per-cluster scores (forward
+#' finite differences of the cluster log-likelihood at the MLE) are squared,
+#' summed, and divided by the squared Hessian stored in `fit`.
+#'
+#' @param fit a `seroincidence` object from [est_seroincidence()]
+#' @param cluster_ids cluster identifier for each row in `pop_data_combined`;
+#'   rows whose identifier is `NA` are treated as one cluster
+#' @param pop_data_combined combined population data across antigen isotypes
+#'
+#' @return one-way cluster-robust variance of log(lambda)
+#' @keywords internal
+#' @noRd
+.compute_cluster_var_oneway <- function(
+    fit,
+    cluster_ids,
+    pop_data_combined) {
+  pop_data_list <- attr(fit, "pop_data")
+  sr_params_list <- attr(fit, "sr_params")
+  noise_params_list <- attr(fit, "noise_params")
+  antigen_isos <- attr(fit, "antigen_isos")
+  log_lambda_mle <- fit$estimate
+  epsilon <- 1e-6 # finite-difference step on the log(lambda) scale
+
+  # Log-likelihood of one cluster's data at a given log(lambda)
+  loglik_at <- function(log_lambda, pop_data) {
+    -(.nll(
+      log.lambda = log_lambda, pop_data = pop_data,
+      antigen_isos = antigen_isos, curve_params = sr_params_list,
+      noise_params = noise_params_list, verbose = FALSE
+    ))
+  }
+
+  unique_clusters <- unique(cluster_ids)
+  cluster_scores <- numeric(length(unique_clusters))
+
+  for (i in seq_along(unique_clusters)) {
+    # `%in%` (unlike `==`) never yields NA, so missing IDs form one cluster
+    cluster_mask <- cluster_ids %in% unique_clusters[i]
+    pop_data_cluster <- pop_data_combined[cluster_mask, , drop = FALSE]
+    pop_data_cluster_list <- split(
+      pop_data_cluster,
+      pop_data_cluster$antigen_iso
+    )
+
+    # Antigen isotypes absent from this cluster get a zero-row data frame
+    for (ag in setdiff(antigen_isos, names(pop_data_cluster_list))) {
+      pop_data_cluster_list[[ag]] <- pop_data_list[[ag]][0, , drop = FALSE]
+    }
+
+    # Cluster score: forward difference of the cluster log-likelihood
+    ll_mle <- loglik_at(log_lambda_mle, pop_data_cluster_list)
+    ll_plus <- loglik_at(log_lambda_mle + epsilon, pop_data_cluster_list)
+    cluster_scores[i] <- (ll_plus - ll_mle) / epsilon
+  }
+
+  # Scalar sandwich variance: B / H^2
+  b_matrix <- sum(cluster_scores^2) # nolint: object_name_linter
+  h_matrix <- fit$hessian # nolint: object_name_linter
+
+  b_matrix / (h_matrix^2)
+}
diff --git a/tests/testthat/test-cluster_robust_se.R b/tests/testthat/test-cluster_robust_se.R
@@ -145,3 +145,57 @@ test_that("multiple cluster variables work correctly", {
   # Standard errors should be positive
   expect_true(sum_multi$SE > 0)
 })
+
+test_that("singleton cluster IDs do not reduce standard errors", {
+  withr::local_seed(20241213)
+  # Give every row its own `household_id`, so each cluster is a singleton.
+  test_data <- sees_pop_data_pk_100
+  test_data$household_id <- seq_len(nrow(test_data))
+  # Fit twice on the same inputs: without clustering, then with `cluster_var`.
+  est_standard <- est_seroincidence(
+    pop_data = test_data,
+    sr_param = typhoid_curves_nostrat_100,
+    noise_param = example_noise_params_pk,
+    antigen_isos = c("HlyE_IgG", "HlyE_IgA")
+  )
+  est_household <- est_seroincidence(
+    pop_data = test_data,
+    sr_param = typhoid_curves_nostrat_100,
+    noise_param = example_noise_params_pk,
+    antigen_isos = c("HlyE_IgG", "HlyE_IgA"),
+    cluster_var = "household_id"
+  )
+  # Summaries provide the `SE` values compared below.
+  sum_standard <- summary(est_standard, verbose = FALSE)
+  sum_household <- summary(est_household, verbose = FALSE)
+  # Equality (not just >=) is asserted; presumably the model-based floor binds.
+  expect_equal(sum_household$SE, sum_standard$SE)
+})
+
+test_that("nested multi-level clustering uses the broader cluster level", {
+  withr::local_seed(20241213)
+  # Singleton `household_id`s plus a `commune` ID cycling through 1:10.
+  test_data <- sees_pop_data_pk_100
+  test_data$household_id <- seq_len(nrow(test_data))
+  test_data$commune <- rep(1:10, length.out = nrow(test_data))
+  # Fit with `commune` alone, then with both cluster variables together.
+  est_commune <- est_seroincidence(
+    pop_data = test_data,
+    sr_param = typhoid_curves_nostrat_100,
+    noise_param = example_noise_params_pk,
+    antigen_isos = c("HlyE_IgG", "HlyE_IgA"),
+    cluster_var = "commune"
+  )
+  est_nested <- est_seroincidence(
+    pop_data = test_data,
+    sr_param = typhoid_curves_nostrat_100,
+    noise_param = example_noise_params_pk,
+    antigen_isos = c("HlyE_IgG", "HlyE_IgA"),
+    cluster_var = c("commune", "household_id")
+  )
+  # Summaries provide the `SE` values compared below.
+  sum_commune <- summary(est_commune, verbose = FALSE)
+  sum_nested <- summary(est_nested, verbose = FALSE)
+  # Adding the household level is expected to leave the commune-level SE as is.
+  expect_equal(sum_nested$SE, sum_commune$SE)
+})