Skip to content

Commit 27c4bac

Browse files
committed
initial CAR test; some type I error concerns
1 parent ef0adc9 commit 27c4bac

File tree

8 files changed

+233
-68
lines changed

8 files changed

+233
-68
lines changed

R/ei_ridge.R

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -485,7 +485,10 @@ residuals.ei_ridge <- function(object, ...) {
485485
object$y - object$fitted
486486
}
487487

488-
#' @describeIn ridge-methods Extract covariance of coefficient estimates.
488+
#' @describeIn ridge-methods Extract unscaled covariance of coefficient estimates.
489+
#' Covariance estimate is not currently heteroskedasticity-robust.
490+
#' Multiply by `sigma2` from the fitted model to get the covariance matrix for
491+
#' a particular outcome variable.
489492
#' @export
490493
vcov.ei_ridge <- function(object, ...) {
491494
object$vcov_u

R/ei_synthetic.R

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,8 @@ ei_synthetic = function(n, p = 0, n_x = 2, x = n_x:1, z = 0.25 * exp(-(seq_len(p
242242
ei_y = "y",
243243
ei_z = colnames(z),
244244
ei_n = rep(1, n),
245+
ei_preproc = default_preproc,
246+
ei_z_proc = run_preproc(d, default_preproc, colnames(z)),
245247
b = b,
246248
b_loc = b_loc,
247249
b_cov = b_cov,

R/ei_test_car.R

Lines changed: 126 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,135 @@
99
#' rich basis transformation of the covariates and predictors; a missing
1010
#' `preproc` will lead to a warning.
1111
#'
12+
#' The test is a Kennedy-Cade (1996) style permutation test on a Wald statistic
13+
#' for the coefficients not included in the "reduced" model that would be fit
14+
#' by [ei_ridge()].
1215
#' The test is carried out by fitting a regression on a fully basis-expanded
13-
#' combination of covariates and predictors, and calculating the F statistic
14-
#' compared to the "reduced" model that would be fit by [ei_ridge()]. To
15-
#' account for penalization, the null distribution is estimated by permuting
16-
#' the residuals from the reduced model.
16+
#' combination of covariates and predictors, and calculating a Wald statistic
17+
#' for the coefficients not contained in the reduced model.
1718
#'
18-
#' @param spec An `ei_spec` object created with [ei_spec()].
19+
#' @param spec An `ei_spec` object created with [ei_spec()]. The object
20+
#' should use the `preproc` argument to specify a rich basis expansion of the
21+
#' covariates and predictors.
22+
#' @inheritParams ei_ridge
23+
#' @param iter The number of permutations to use when estimating the null
24+
#' distribution. Ignored when `use_chisq = TRUE`.
25+
#' @param use_chisq If `TRUE`, use the asymptotic chi-squared distribution for
26+
#' the Wald test statistic instead of conducting a permutation test. Only
27+
#' appropriate for larger sample sizes (Helwig 2022 recommends at least 200
28+
#' when a single predictor is used).
1929
#'
20-
#' @returns A 1-row tibble with columns describing the test results. The
21-
# `p.value` column contains the p-value for the test.
30+
#' @returns A tibble with one row per outcome variable and columns describing
31+
#' the test results. The `p.value` column contains the p-values for the test.
32+
#' P-values are not adjusted by default; pass them to [stats::p.adjust()] if
33+
#' desired.
34+
#'
35+
#' @references
36+
#' Helwig, N. E. (2022). Robust Permutation Tests for Penalized Splines. _Stats_,
37+
#' 5(3), 916-933.
38+
#'
39+
#' Kennedy, P. E., & Cade, B. S. (1996). Randomization tests for multiple regression.
40+
#' _Communications in Statistics-Simulation and Computation_, 25(4), 923-936.
41+
#'
42+
#' McCartan, C., & Kuriwaki, S. (2025+). Identification and semiparametric
43+
#' estimation of conditional means from aggregate data.
44+
#' Working paper [arXiv:2509.20194](https://arxiv.org/abs/2509.20194).
45+
#'
46+
#' @examples
47+
#' data(elec_1968)
48+
#'
49+
#' # basis expansion: poly() with degree=2 not recommended in practice
50+
#' preproc = if (requireNamespace("bases", quietly = TRUE)) {
51+
#' ~ bases::b_bart(.x, trees = 100)
52+
#' } else {
53+
#' ~ poly(as.matrix(.x), degree=2, simple=TRUE)
54+
#' }
55+
#'
56+
#' spec = ei_spec(
57+
#' data = elec_1968,
58+
#' predictors = vap_white:vap_other,
59+
#' outcome = pres_dem_hum:pres_abs,
60+
#' total = pres_total,
61+
#' covariates = c(pop_city:pop_rural, farm:educ_coll, starts_with("inc_")),
62+
#' preproc = preproc
63+
#' )
64+
#'
65+
#' ei_test_car(spec, iter=19) # use a larger number in practice
2266
#'
2367
#' @export
24-
ei_test_car <- function(spec) {
68+
ei_test_car <- function(spec, weights, iter = 1000, use_chisq = FALSE) {
  validate_ei_spec(spec)
  n = nrow(spec)
  x_col = attr(spec, "ei_x")
  z_col = attr(spec, "ei_z")
  x = spec[, x_col, drop = FALSE]
  z_proc = attr(spec, "ei_z_proc")

  # "reduced" design: row-wise interaction of predictors with the
  # preprocessed covariates, as used by ei_ridge()
  int_scale = 1e5
  xz0 = row_kronecker(as.matrix(x), z_proc, int_scale)
  # "full" design: basis expansion over predictors and covariates jointly
  xzf = run_preproc(spec, z_col = c(x_col, z_col))

  if (missing(weights)) {
    weights = rep(1, n)
  } else {
    weights = eval_tidy(enquo(weights), spec)
  }
  sqrt_w = sqrt(weights / mean(weights))

  y = as.matrix(spec[, attr(spec, "ei_y"), drop = FALSE])
  n_y = ncol(y)

  # first, residualize out xz0 (the reduced model), reusing its
  # automatically-selected ridge penalty for the full fit below
  udv0 = svd(xz0 * sqrt_w)
  fit0 = ridge_auto(udv0, y, sqrt_w, vcov = FALSE)
  pen = fit0$penalty
  d_pen_h = udv0$d^2 / (udv0$d^2 + pen)
  H0 = tcrossprod(scale_cols(udv0$u, d_pen_h), udv0$u) # ridge hat matrix
  res = y - H0 %*% y
  udv = svd((xzf - H0 %*% xzf) * sqrt_w)

  # pseudo-inverse of a symmetric matrix; stores numerical rank as an attr
  pinv_sym = function(M) {
    eig = eigen(M, symmetric = TRUE)
    rk = seq_len(sum(eig$values > 1e-10))
    eig$values[rk] = 1 / eig$values[rk]
    out = tcrossprod(scale_cols(eig$vectors, eig$values), eig$vectors)
    attr(out, "rank") = length(rk)
    out
  }

  fit = ridge_svd(udv, res, sqrt_w, pen, vcov = TRUE)
  inv_vcov = pinv_sym(fit$vcov_u)

  # Wald statistic per outcome column: b' V^{-1} b / sigma^2
  calc_wald = function(fit) {
    colSums((inv_vcov %*% fit$coef) * fit$coef) / fit$sigma2
  }
  W0 = calc_wald(fit)

  if (!isTRUE(use_chisq)) {
    # FIX: original test `!is.numeric(iter) && iter > 0` could never trigger
    # for numeric `iter` (short-circuits on the first clause), so invalid
    # values like iter = 0 or iter = -5 slipped through.
    if (!is.numeric(iter) || length(iter) != 1 || iter < 1) {
      cli_abort("{.arg iter} must be a positive integer.")
    }

    # permutation null: refit on row-permuted residuals (Kennedy-Cade style)
    W = matrix(nrow = n_y, ncol = iter)
    pb = cli::cli_progress_bar("Running permutations", total = iter)
    for (i in seq_len(iter)) {
      res_p = res[sample.int(n), , drop = FALSE]
      fit_p = ridge_svd(udv, res_p, sqrt_w, pen, vcov = FALSE)
      W[, i] = calc_wald(fit_p)
      cli::cli_progress_update(id = pb)
    }
    cli::cli_progress_done(id = pb)
    # add-one smoothing keeps p-values in (0, 1]
    p_val = (rowSums(W >= W0) + 1) / (iter + 1)
  } else {
    # asymptotic reference distribution; see Helwig (2022) for sample-size
    # guidance on when this approximation is appropriate
    p_val = pchisq(W0, df = attr(inv_vcov, "rank"), lower.tail = FALSE)
  }

  tibble::new_tibble(list(
    outcome = colnames(y),
    W = W0,
    df = attr(inv_vcov, "rank"),
    p.value = p_val
  ))
}

_pkgdown.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ reference:
3030
- ei_sens_rv
3131
- ei_bench
3232
- plot.ei_sens
33+
- ei_test_car
3334
- title: Ecological modeling
3435
desc: Fit regression and weighting models to aggregate data to use in inference
3536
contents:

explore/id_test.R

Lines changed: 34 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -5,62 +5,45 @@ library(tidyverse)
55
n = 1000
66
n_perm = 100
77

8-
run_test <- function(n = 1000, n_x = 3, p = 2, n_perm = 1000) {
9-
spec0 = ei_synthetic(n, p, n_x = n_x, b_cov = 0.0004 * (1 + diag(n_x)))
10-
spec = ei_spec(spec0, starts_with("x"), starts_with("y"), total = attr(spec0, "ei_n"))
11-
# attr(spec, "b") |> hist(breaks=50)
12-
13-
y = as.matrix(spec0[, attr(spec0, "ei_y")])
14-
X = as.matrix(spec0[, attr(spec0, "ei_x")])
15-
Z = as.matrix(spec0[, attr(spec0, "ei_z")])
16-
XZ0 = row_kronecker(X, Z, 1e5)
17-
XZ = bases::b_tpsob(spec0[, c(attr(spec, "ei_x"), attr(spec, "ei_z"))], p = 200)
18-
# ZZ = bases::b_tpsob(spec[, c(attr(spec, "ei_z"))], p = 200)
19-
20-
udv0 = svd(XZ0)
21-
udv = svd(cbind(XZ0, XZ))
22-
23-
fit0 = ridge_auto(udv0, y, rep(1, n), vcov = FALSE)
24-
pen = fit0$penalty
25-
fit = ridge_svd(udv, y, rep(1, n), vcov = FALSE, penalty = pen)
26-
# fit = ridge_auto(udv, y, rep(1, n), vcov = FALSE)
27-
# pen = fit$penalty
28-
# fit0 = ridge_svd(udv0, y, rep(1, n), vcov = FALSE, penalty = pen)
29-
c(cor(fit0$fitted, y)^2)
30-
c(cor(fit$fitted, y)^2)
31-
32-
rss_full = colSums((y - fit$fitted)^2)
33-
rss_red = colSums((y - fit0$fitted)^2)
34-
F_stat = ((rss_red - rss_full) / (fit$df - fit0$df)) / (rss_full / (n - fit$df))
35-
36-
perm = replicate(
37-
n_perm,
38-
{
39-
yp = fit0$fitted + (y - fit0$fitted)[sample(n), , drop = FALSE]
40-
fit = ridge_svd(udv, yp, rep(1, n), vcov = FALSE, penalty = pen)
41-
# fit0 = ridge_svd(udv0, yp, rep(1, n), vcov = FALSE, penalty = pen)
42-
rss_full = colSums((yp - fit$fitted)^2)
43-
# rss_red = colSums((yp - fit0$fitted)^2)
44-
F_stat = ((rss_red - rss_full) / (fit$df - fit0$df)) / (rss_full / (n - fit$df))
45-
},
46-
simplify = FALSE
47-
) |>
48-
do.call(cbind, args = _)
49-
50-
tibble::tibble_row(
51-
F = F_stat,
52-
p = (rowSums(perm >= F_stat) + 1) / (n_perm + 1),
53-
p_param = pf(F_stat, fit$df - fit0$df, n - fit$df, lower.tail = FALSE)
8+
# Simulate one synthetic dataset and run the CAR permutation test on it.
# Returns the 1-row-per-outcome tibble produced by ei_test_car().
run_test <- function(n = 1000, n_x = 3, p = 2, r2 = 0.5, iter = 1000) {
  spec0 = ei_synthetic(
    n,
    p,
    n_x = n_x,
    z = c(1, rep(0, p - 1)),
    r2_xz = r2,
    r2_bz = r2,
    b_cov = 0.0004 * (1 + diag(n_x))
  )
  spec = ei_spec(
    spec0,
    predictors = starts_with("x"),
    outcome = starts_with("y"),
    total = attr(spec0, "ei_n"),
    covariates = starts_with("z"),
    # covariates = "z1",
    # basis expansion for covariates; degenerate (0-column) input gets an
    # empty matrix so the pipeline still runs with no covariates
    preproc = function(z) {
      if (ncol(z) == 0) {
        matrix(nrow = nrow(z), ncol = 0)
      } else {
        bases::b_tpsob(z, p = 50)
      }
    }
  )

  # FIX: use FALSE, not the reassignable shorthand F
  ei_test_car(spec, iter = iter, use_chisq = FALSE)
}
5636

57-
res = map(1:400, ~ run_test(n = 2000, n_perm = 50), .progress = TRUE) |>
37+
res = map(1:400, ~ run_test(n = 500, iter = 100), .progress = TRUE) |>
5838
bind_rows()
5939

60-
hist(res$p)
61-
mean(res$p <= 0.05)
62-
hist(res$F, breaks=50)
63-
hist(res$p_param, breaks=50)
40+
hist(res$df, breaks=50)
41+
hist(res$p.value, breaks=50)
42+
hist(res$p.value[res$df > 0], breaks=50)
43+
mean(res$p.value <= 0.05)
44+
hist(res$W, breaks=50)
45+
hist(pchisq(res$W, res$df, lower.tail=F), breaks=50)
46+
plot(pchisq(res$W, res$df, lower.tail=F), res$p.value)
6447

6548

6649
# try on wallace

man/ei_test_car.Rd

Lines changed: 62 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/ridge-methods.Rd

Lines changed: 4 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)