Skip to content

Commit 27c4bac

Browse files
committed
initial CAR test; some type I error concerns
1 parent ef0adc9 commit 27c4bac

File tree

8 files changed

+233
-68
lines changed

8 files changed

+233
-68
lines changed

R/ei_ridge.R

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -485,7 +485,10 @@ residuals.ei_ridge <- function(object, ...) {
485485
object$y - object$fitted
486486
}
487487

488-
#' @describeIn ridge-methods Extract covariance of coefficient estimates.
488+
#' @describeIn ridge-methods Extract unscaled covariance of coefficient estimates.
489+
#' Covariance estimate is not currently heteroskedasticity-robust.
490+
#' Multiply by `sigma2` from the fitted model to get the covariance matrix for
491+
#' a particular outcome variable.
489492
#' @export
490493
vcov.ei_ridge <- function(object, ...) {
491494
object$vcov_u

R/ei_synthetic.R

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,8 @@ ei_synthetic = function(n, p = 0, n_x = 2, x = n_x:1, z = 0.25 * exp(-(seq_len(p
242242
ei_y = "y",
243243
ei_z = colnames(z),
244244
ei_n = rep(1, n),
245+
ei_preproc = default_preproc,
246+
ei_z_proc = run_preproc(d, default_preproc, colnames(z)),
245247
b = b,
246248
b_loc = b_loc,
247249
b_cov = b_cov,

R/ei_test_car.R

Lines changed: 126 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,135 @@
99
#' rich basis transformation of the covariates and predictors; a missing
1010
#' `preproc` will lead to a warning.
1111
#'
12+
#' The test is a Kennedy-Cade (1996) style permutation test on a Wald statistic
13+
#' for the coefficients not included in the "reduced" model that would be fit
14+
#' by [ei_ridge()].
1215
#' The test is carried out by fitting a regression on a fully basis-expanded
13-
#' combination of covariates and predictors, and calculating the F statistic
14-
#' compared to the "reduced" model that would be fit by [ei_ridge()]. To
15-
#' account for penalization, the null distribution is estimated by permuting
16-
#' the residuals from the reduced model.
16+
#' combination of covariates and predictors, and calculating a Wald statistic
17+
#' for the coefficients not contained in the reduced model.
1718
#'
18-
#' @param spec An `ei_spec` object created with [ei_spec()].
19+
#' @param spec An `ei_spec` object created with [ei_spec()]. The object
20+
#' should use the `preproc` argument to specify a rich basis expansion of the
21+
#' covariates and predictors.
22+
#' @inheritParams ei_ridge
23+
#' @param iter The number of permutations to use when estimating the null
24+
#' distribution. Ignored when `use_chisq = TRUE`.
25+
#' @param use_chisq If `TRUE`, use the asymptotic chi-squared distribution for
26+
#' the Wald test statistic instead of conducting a permutation test. Only
27+
#' appropriate for larger sample sizes (Helwig 2022 recommends at least 200
28+
#' when a single predictor is used).
1929
#'
20-
#' @returns A 1-row tibble with columns describing the test results. The
21-
# `p.value` column contains the p-value for the test.
30+
#' @returns A tibble with one row per outcome variable and columns describing
31+
#' the test results. The `p.value` column contains the p-values for the test.
32+
#' P-values are not adjusted by default; pass them to [stats::p.adjust()] if
33+
#' desired.
34+
#'
35+
#' @references
36+
#' Helwig, N. E. (2022). Robust Permutation Tests for Penalized Splines. _Stats_,
37+
#' 5(3), 916-933.
38+
#'
39+
#' Kennedy, P. E., & Cade, B. S. (1996). Randomization tests for multiple regression.
40+
#' _Communications in Statistics-Simulation and Computation_, 25(4), 923-936.
41+
#'
42+
#' McCartan, C., & Kuriwaki, S. (2025+). Identification and semiparametric
43+
#' estimation of conditional means from aggregate data.
44+
#' Working paper [arXiv:2509.20194](https://arxiv.org/abs/2509.20194).
45+
#'
46+
#' @examples
47+
#' data(elec_1968)
48+
#'
49+
#' # basis expansion: poly() with degree=2 not recommended in practice
50+
#' preproc = if (requireNamespace("bases", quietly = TRUE)) {
51+
#' ~ bases::b_bart(.x, trees = 100)
52+
#' } else {
53+
#' ~ poly(as.matrix(.x), degree=2, simple=TRUE)
54+
#' }
55+
#'
56+
#' spec = ei_spec(
57+
#' data = elec_1968,
58+
#' predictors = vap_white:vap_other,
59+
#' outcome = pres_dem_hum:pres_abs,
60+
#' total = pres_total,
61+
#' covariates = c(pop_city:pop_rural, farm:educ_coll, starts_with("inc_")),
62+
#' preproc = preproc
63+
#' )
64+
#'
65+
#' ei_test_car(spec, iter=19) # use a larger number in practice
2266
#'
2367
#' @export
24-
ei_test_car <- function(spec) {
68+
ei_test_car <- function(spec, weights, iter = 1000, use_chisq = FALSE) {
  validate_ei_spec(spec)
  n = nrow(spec)
  x_col = attr(spec, "ei_x")
  z_col = attr(spec, "ei_z")
  x = spec[, x_col, drop = FALSE]
  z_proc = attr(spec, "ei_z_proc")

  # "reduced" design: row-wise interaction of predictors with the
  # preprocessed covariates, as used by ei_ridge()
  int_scale = 1e5
  xz0 = row_kronecker(as.matrix(x), z_proc, int_scale)
  # "full" design: basis expansion over predictors and covariates jointly
  xzf = run_preproc(spec, z_col = c(x_col, z_col))

  if (missing(weights)) {
    weights = rep(1, n)
  } else {
    weights = eval_tidy(enquo(weights), spec)
  }
  sqrt_w = sqrt(weights / mean(weights))

  y = as.matrix(spec[, attr(spec, "ei_y"), drop = FALSE])
  n_y = ncol(y)

  # first, residualize out xz0 (the reduced model), reusing its
  # automatically-selected ridge penalty for the full fit below
  udv0 = svd(xz0 * sqrt_w)
  fit0 = ridge_auto(udv0, y, sqrt_w, vcov = FALSE)
  pen = fit0$penalty
  d_pen_h = udv0$d^2 / (udv0$d^2 + pen)
  H0 = tcrossprod(scale_cols(udv0$u, d_pen_h), udv0$u) # ridge hat matrix
  res = y - H0 %*% y
  udv = svd((xzf - H0 %*% xzf) * sqrt_w)

  # pseudo-inverse of a symmetric matrix; stores numerical rank as an attr
  pinv_sym = function(M) {
    eig = eigen(M, symmetric = TRUE)
    rk = seq_len(sum(eig$values > 1e-10))
    eig$values[rk] = 1 / eig$values[rk]
    out = tcrossprod(scale_cols(eig$vectors, eig$values), eig$vectors)
    attr(out, "rank") = length(rk)
    out
  }

  fit = ridge_svd(udv, res, sqrt_w, pen, vcov = TRUE)
  inv_vcov = pinv_sym(fit$vcov_u)

  # Wald statistic per outcome column: b' V^{-1} b / sigma^2
  calc_wald = function(fit) {
    colSums((inv_vcov %*% fit$coef) * fit$coef) / fit$sigma2
  }
  W0 = calc_wald(fit)

  if (!isTRUE(use_chisq)) {
    # FIX: original test `!is.numeric(iter) && iter > 0` could never trigger
    # for numeric `iter` (short-circuits on the first clause), so invalid
    # values like iter = 0 or iter = -5 slipped through.
    if (!is.numeric(iter) || length(iter) != 1 || iter < 1) {
      cli_abort("{.arg iter} must be a positive integer.")
    }

    # permutation null: refit on row-permuted residuals (Kennedy-Cade style)
    W = matrix(nrow = n_y, ncol = iter)
    pb = cli::cli_progress_bar("Running permutations", total = iter)
    for (i in seq_len(iter)) {
      res_p = res[sample.int(n), , drop = FALSE]
      fit_p = ridge_svd(udv, res_p, sqrt_w, pen, vcov = FALSE)
      W[, i] = calc_wald(fit_p)
      cli::cli_progress_update(id = pb)
    }
    cli::cli_progress_done(id = pb)
    # add-one smoothing keeps p-values in (0, 1]
    p_val = (rowSums(W >= W0) + 1) / (iter + 1)
  } else {
    # asymptotic reference distribution; see Helwig (2022) for sample-size
    # guidance on when this approximation is appropriate
    p_val = pchisq(W0, df = attr(inv_vcov, "rank"), lower.tail = FALSE)
  }

  tibble::new_tibble(list(
    outcome = colnames(y),
    W = W0,
    df = attr(inv_vcov, "rank"),
    p.value = p_val
  ))
}

_pkgdown.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ reference:
3030
- ei_sens_rv
3131
- ei_bench
3232
- plot.ei_sens
33+
- ei_test_car
3334
- title: Ecological modeling
3435
desc: Fit regression and weighting models to aggregate data to use in inference
3536
contents:

explore/id_test.R

Lines changed: 34 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -5,62 +5,45 @@ library(tidyverse)
55
n = 1000
66
n_perm = 100
77

8-
run_test <- function(n = 1000, n_x = 3, p = 2, n_perm = 1000) {
9-
spec0 = ei_synthetic(n, p, n_x = n_x, b_cov = 0.0004 * (1 + diag(n_x)))
10-
spec = ei_spec(spec0, starts_with("x"), starts_with("y"), total = attr(spec0, "ei_n"))
11-
# attr(spec, "b") |> hist(breaks=50)
12-
13-
y = as.matrix(spec0[, attr(spec0, "ei_y")])
14-
X = as.matrix(spec0[, attr(spec0, "ei_x")])
15-
Z = as.matrix(spec0[, attr(spec0, "ei_z")])
16-
XZ0 = row_kronecker(X, Z, 1e5)
17-
XZ = bases::b_tpsob(spec0[, c(attr(spec, "ei_x"), attr(spec, "ei_z"))], p = 200)
18-
# ZZ = bases::b_tpsob(spec[, c(attr(spec, "ei_z"))], p = 200)
19-
20-
udv0 = svd(XZ0)
21-
udv = svd(cbind(XZ0, XZ))
22-
23-
fit0 = ridge_auto(udv0, y, rep(1, n), vcov = FALSE)
24-
pen = fit0$penalty
25-
fit = ridge_svd(udv, y, rep(1, n), vcov = FALSE, penalty = pen)
26-
# fit = ridge_auto(udv, y, rep(1, n), vcov = FALSE)
27-
# pen = fit$penalty
28-
# fit0 = ridge_svd(udv0, y, rep(1, n), vcov = FALSE, penalty = pen)
29-
c(cor(fit0$fitted, y)^2)
30-
c(cor(fit$fitted, y)^2)
31-
32-
rss_full = colSums((y - fit$fitted)^2)
33-
rss_red = colSums((y - fit0$fitted)^2)
34-
F_stat = ((rss_red - rss_full) / (fit$df - fit0$df)) / (rss_full / (n - fit$df))
35-
36-
perm = replicate(
37-
n_perm,
38-
{
39-
yp = fit0$fitted + (y - fit0$fitted)[sample(n), , drop = FALSE]
40-
fit = ridge_svd(udv, yp, rep(1, n), vcov = FALSE, penalty = pen)
41-
# fit0 = ridge_svd(udv0, yp, rep(1, n), vcov = FALSE, penalty = pen)
42-
rss_full = colSums((yp - fit$fitted)^2)
43-
# rss_red = colSums((yp - fit0$fitted)^2)
44-
F_stat = ((rss_red - rss_full) / (fit$df - fit0$df)) / (rss_full / (n - fit$df))
45-
},
46-
simplify = FALSE
47-
) |>
48-
do.call(cbind, args = _)
49-
50-
tibble::tibble_row(
51-
F = F_stat,
52-
p = (rowSums(perm >= F_stat) + 1) / (n_perm + 1),
53-
p_param = pf(F_stat, fit$df - fit0$df, n - fit$df, lower.tail = FALSE)
8+
# Simulate one synthetic dataset and run the CAR permutation test on it.
# Returns the 1-row-per-outcome tibble produced by ei_test_car().
run_test <- function(n = 1000, n_x = 3, p = 2, r2 = 0.5, iter = 1000) {
  spec0 = ei_synthetic(
    n,
    p,
    n_x = n_x,
    z = c(1, rep(0, p - 1)),
    r2_xz = r2,
    r2_bz = r2,
    b_cov = 0.0004 * (1 + diag(n_x))
  )
  spec = ei_spec(
    spec0,
    predictors = starts_with("x"),
    outcome = starts_with("y"),
    total = attr(spec0, "ei_n"),
    covariates = starts_with("z"),
    # covariates = "z1",
    # basis expansion for covariates; degenerate (0-column) input gets an
    # empty matrix so the pipeline still runs with no covariates
    preproc = function(z) {
      if (ncol(z) == 0) {
        matrix(nrow = nrow(z), ncol = 0)
      } else {
        bases::b_tpsob(z, p = 50)
      }
    }
  )

  # FIX: use FALSE, not the reassignable shorthand F
  ei_test_car(spec, iter = iter, use_chisq = FALSE)
}
5636

57-
res = map(1:400, ~ run_test(n = 2000, n_perm = 50), .progress = TRUE) |>
37+
res = map(1:400, ~ run_test(n = 500, iter = 100), .progress = TRUE) |>
5838
bind_rows()
5939

60-
hist(res$p)
61-
mean(res$p <= 0.05)
62-
hist(res$F, breaks=50)
63-
hist(res$p_param, breaks=50)
40+
hist(res$df, breaks=50)
41+
hist(res$p.value, breaks=50)
42+
hist(res$p.value[res$df > 0], breaks=50)
43+
mean(res$p.value <= 0.05)
44+
hist(res$W, breaks=50)
45+
hist(pchisq(res$W, res$df, lower.tail=F), breaks=50)
46+
plot(pchisq(res$W, res$df, lower.tail=F), res$p.value)
6447

6548

6649
# try on wallace

man/ei_test_car.Rd

Lines changed: 62 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/ridge-methods.Rd

Lines changed: 4 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)