some test improvements: HC, undersmoothing

CoryMcCartan · CoryMcCartan · commit 0358150bd0e2 · 2026-02-10T16:11:20.000-05:00
diff --git a/R/ei_test_car.R b/R/ei_test_car.R
@@ -11,21 +11,28 @@
 #'
 #' The test is a Kennedy-Cade (1996) style permutation test on a Wald statistic
 #' for the coefficients not included in the "reduced" model that would be fit
-#' by [ei_ridge()].
-#' The test is carried out by fitting a regression on a fully basis-expanded
-#' combination of covariates and predictors, and calculating a Wald statistic
-#' for the
+#' by [ei_ridge()]. The test statistic is asymptotically chi-squared under the
+#' null and may be anti-conservative in small samples, especially when the
+#' dimensionality of the basis expansion is large.
 #'
 #' @param spec An `ei_spec` object created with [ei_spec()]. The object
 #'   should use the `preproc` argument to specify a rich basis expansion of the
 #'   covariates and predictors.
 #' @inheritParams ei_ridge
 #' @param iter The number of permutations to use when estimating the null
-#'   distribution. Ignored when `use_chisq = TRUE`.
+#'   distribution, including the original identity permutation.
+#'   Ignored when `use_chisq = TRUE`.
+#' @param undersmooth A value to divide the estimated ridge penalty by when
+#'   partialling out the partially linear component of the model. A larger
+#'   value will smooth the partially linear component less, which may improve
+#'   Type I error control in finite samples at the cost of power.
 #' @param use_chisq If `TRUE`, use the asymptotic chi-squared distribution for
 #'   the Wald test statistic instead of conducting a permutation test. Only
 #'   appropriate for larger sample sizes (Helwig 2022 recommends at least 200
 #'   when a single predictor is used).
+#' @param use_hc If `TRUE`, use a heteroskedasticity-consistent covariance estimate.
+#'   More computationally intensive, but may make a difference in small samples
+#'   or when there is substantial heteroskedasticity.
 #'
 #' @returns A tibble with one row per outcome variable and columns describing
 #'    the test results. The `p.value` column contains the p-values for the test.
@@ -62,11 +69,20 @@
 #'     preproc = preproc
 #' )
 #'
-#' ei_test_car(spec, iter=19) # use a larger number in practice
+#' ei_test_car(spec, iter=20) # use a larger number in practice
 #'
 #' @export
-ei_test_car <- function(spec, weights, iter = 1000, use_chisq = FALSE) {
+ei_test_car <- function(spec, weights, iter = 1000, undersmooth = 1.5, use_chisq = nrow(spec) >= 2000, use_hc = FALSE) {
     validate_ei_spec(spec)
+    if (!has_preproc(spec)) {
+        cli_warn(c(
+            "{.arg preproc} was not specified in your {.cls ei_spec} object",
+            "i"="The {.fn ei_test_car} function relies on a rich basis expansion of the covariates and predictors.",
+            "x"="Without a basis expansion in {.arg preproc}, the test will not be able to detect violations of the CAR assumption.",
+            ">"="Consider basis expansions from the {.pkg bases} or {.pkg splines} package."
+        ))
+    }
+
     n = nrow(spec)
     x_col = attr(spec, "ei_x")
     z_col = attr(spec, "ei_z")
@@ -91,7 +107,7 @@ ei_test_car <- function(spec, weights, iter = 1000, use_chisq = FALSE) {
     # first, residualize out xz0
     udv0 = svd(xz0 * sqrt_w)
     fit0 = ridge_auto(udv0, y, sqrt_w, vcov = FALSE)
-    pen = fit0$penalty
+    pen = fit0$penalty / undersmooth
     d_pen_h = udv0$d^2 / (udv0$d^2 + pen)
     H0 = tcrossprod(scale_cols(udv0$u, d_pen_h), udv0$u)
     res = y - H0 %*% y
@@ -107,37 +123,70 @@ ei_test_car <- function(spec, weights, iter = 1000, use_chisq = FALSE) {
         out
     }
 
+    # observed test stat value
     fit = ridge_svd(udv, res, sqrt_w, pen, vcov = TRUE)
-    inv_vcov = pinv_sym(fit$vcov_u)
 
-    calc_wald = function(fit) {
-        colSums((inv_vcov %*% fit$coef) * fit$coef) / fit$sigma2
+    # set up Wald stat calculation; `calc_wald(fit, resp)` returns one statistic
+    # per outcome, where `resp` is the response matrix that `fit` was fit to
+    if (isTRUE(use_hc)) {
+        xzr = (xzf - H0 %*% xzf)
+        Sig_inv = solve((crossprod(xzr) + diag(ncol(xzr))*pen) / n)
+
+        calc_wald = function(fit, resp = res) {
+            W = numeric(n_y)
+            df = numeric(n_y)
+            for (j in seq_len(n_y)) {
+                # sandwich "meat" must use the residuals of *this* fit, so that
+                # permuted fits are not paired with the unpermuted response
+                Omega = crossprod((resp - fit$fitted)[, j] * xzr) / n
+                inv_vcov2 = pinv_sym(Sig_inv %*% Omega %*% Sig_inv)
+                df[j] = attr(inv_vcov2, "rank")
+                W[j] = n * crossprod(fit$coef[, j], inv_vcov2 %*% fit$coef[, j])
+            }
+            attr(W, "df") = df
+            W
+        }
+
+        W0 = calc_wald(fit)
+        df = attr(W0, "df")
+    } else {
+        inv_vcov = pinv_sym(fit$vcov_u)
+        df = rep(attr(inv_vcov, "rank"), n_y)
+
+        # `resp` is unused here; kept so both variants share one signature
+        calc_wald = function(fit, resp = res) {
+            colSums(fit$coef * (inv_vcov %*% fit$coef)) / fit$sigma2
+        }
+
+        W0 = calc_wald(fit)
     }
-    W0 = calc_wald(fit)
 
     if (!isTRUE(use_chisq)) {
-        if (!is.numeric(iter) && iter > 0) {
-            cli_abort("{.arg iter} must be a positive integer.")
+        # the identity permutation occupies column 1, so at least one more
+        # draw is needed for the loop below to be well-defined
+        if (!is.numeric(iter) || length(iter) != 1L || is.na(iter) || iter < 2) {
+            cli_abort("{.arg iter} must be an integer greater than 1.")
         }
 
         W = matrix(nrow = ncol(y), ncol = iter)
+        W[, 1] = W0
         pb = cli::cli_progress_bar("Running permutations", total = iter)
-        for (i in seq_len(iter)) {
+        for (i in seq(2, iter, 1)) {
             res_p = res[sample.int(n), , drop=FALSE]
             fit_p = ridge_svd(udv, res_p, sqrt_w, pen, vcov = FALSE)
-            W[, i] = calc_wald(fit_p)
+            W[, i] = calc_wald(fit_p, res_p)
             cli::cli_progress_update(id = pb)
         }
         cli::cli_progress_done(id = pb)
-        p_val = (rowSums(W >= W0) + 1) / (iter + 1)
+        p_val = rowSums(W >= W0) / iter
     } else {
-        p_val = pchisq(W0, df = attr(inv_vcov, "rank"), lower.tail = FALSE)
+        p_val = pchisq(W0, df = df, lower.tail = FALSE)
     }
 
     tibble::new_tibble(list(
         outcome = colnames(y),
         W = W0,
-        df = attr(inv_vcov, "rank"),
+        df = df,
         p.value = p_val
     ))
 }
diff --git a/explore/id_test.R b/explore/id_test.R
@@ -1,80 +1,55 @@
 devtools::load_all(".")
 library(tidyverse)
 
-# try on simulated data
-n = 1000
-n_perm = 100
-
-run_test <- function(n = 1000, n_x = 3, p = 2, r2 = 0.5, iter = 1000) {
-    spec0 = ei_synthetic(
+# settings passed to ei_synthetic() for each simulated dataset; n_x must be
+# defined here because it is no longer an argument of run_test()
+n = 2000
+p = 3
+n_x = 3
+r2 = 0.25
+specs = map(1:500, function(i) {
+     ei_synthetic(
         n,
         p,
         n_x = n_x,
-        z = c(1, rep(0, p - 1)),
         r2_xz = r2,
         r2_bz = r2,
         b_cov = 0.0004 * (1 + diag(n_x))
     )
+}, .progress = TRUE)
+
+# build an ei_spec from one simulated dataset and run ei_test_car() on it;
+# well_spec = FALSE keeps only covariate "z1", `...` is forwarded to the test
+run_test = function(spec0, well_spec = TRUE, ...) {
     spec = ei_spec(
         spec0,
         predictors = starts_with("x"),
         outcome = starts_with("y"),
         total = attr(spec0, "ei_n"),
-        covariates = starts_with("z"),
-        # covariates = "z1",
+        covariates = if (well_spec) starts_with("z") else "z1",
         preproc = function(z) {
             if (ncol(z) == 0) {
-                matrix(nrow=nrow(z), ncol=0)
+                matrix(nrow = nrow(z), ncol = 0)
             } else {
-                bases::b_tpsob(z, p = 50)
+                bases::b_tpsob(z, p = 25)
             }
         }
     )
 
-    ei_test_car(spec, iter = iter, use_chisq = F)
+    ei_test_car(spec, ...)
 }
 
-res = map(1:400, ~ run_test(n = 500, iter = 100), .progress = TRUE) |>
+# res: covariates restricted to "z1"; res2: all z covariates
+res = map(specs, ~ run_test(.x, well_spec = FALSE, use_hc = FALSE, iter = 50), .progress = TRUE) |>
+    bind_rows()
+res2 = map(specs, ~ run_test(.x, well_spec = TRUE, use_hc = FALSE, iter = 50), .progress = TRUE) |>
     bind_rows()
 
-hist(res$df, breaks=50)
 hist(res$p.value, breaks=50)
-hist(res$p.value[res$df > 0], breaks=50)
+hist(res2$p.value, breaks=50)
 mean(res$p.value <= 0.05)
-hist(res$W, breaks=50)
+mean(res2$p.value <= 0.05)
 hist(pchisq(res$W, res$df, lower.tail=F), breaks=50)
 plot(pchisq(res$W, res$df, lower.tail=F), res$p.value)
-
-
-# try on wallace
-data(elec_1968)
-elec_1968 = elec_1968 |>
-    mutate(
-        vap_nonwhite = 1 - vap_white,
-        z = bases::b_bart(pop_urban, pop_rural, educ_elem, educ_hsch, educ_coll, farm,
-            inc_00_03k, inc_03_08k, inc_08_25k, inc_25_99k)
-    )
-
-spec = ei_spec(elec_1968, c(vap_white, vap_black, vap_other), c(pres_dem_hum, pres_rep_nix, pres_ind_wal, pres_abs),
-               total = pres_total, covariates = c(state, z))
-
-m = ei_ridge(spec, bounds=F, sum_one=F)
-
-Z = cbind(elec_1968$z, model.matrix(~ state - 1, elec_1968))
-XZ = with(elec_1968, bases::b_tpsob(vap_white, vap_black, vap_other, pop_urban, pop_rural, educ_elem, educ_hsch, educ_coll, farm,
-            inc_00_03k, inc_03_08k, inc_08_25k, inc_25_99k, p=200)) |>
-  cbind(model.matrix(~ state - 1, elec_1968))
-
-udv = svd(XZ)
-n = nrow(Z)
-fit0 = ridge_auto(udv, resid(m), rep(1, n), vcov = FALSE)
-r2 = diag(as.matrix(cor(fit0$fitted, resid(m))^2))
-
-perm = replicate(1000, {
-    yy = resid(m)[sample(n), , drop=FALSE]
-    fit = ridge_svd(udv, yy, rep(1, n), penalty = fit0$penalty, vcov = FALSE)
-    diag(as.matrix(cor(fit$fitted, yy)^2))
-})
-
-r2
-(rowSums(perm >= r2) + 1) / (1000 + 1)
+plot(res$p.value, res2$p.value)
+plot(res$W, res2$W)
diff --git a/man/ei_test_car.Rd b/man/ei_test_car.Rd