Skip to content

Commit 0358150

Browse files
committed
some test improvements: HC, undersmoothing
1 parent 27c4bac commit 0358150

File tree

3 files changed

+102
-74
lines changed

3 files changed

+102
-74
lines changed

R/ei_test_car.R

Lines changed: 60 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -11,21 +11,28 @@
1111
#'
1212
#' The test is a Kennedy-Cade (1996) style permutation test on a Wald statistic
1313
#' for the coefficients not included in the "reduced" model that would be fit
14-
#' by [ei_ridge()].
15-
#' The test is carried out by fitting a regression on a fully basis-expanded
16-
#' combination of covariates and predictors, and calculating a Wald statistic
17-
#' for the
14+
#' by [ei_ridge()]. The test statistic is asymptotically chi-squared under the
15+
#' null and may be anti-conservative in small samples, especially when the
16+
#' dimensionality of the basis expansion is large.
1817
#'
1918
#' @param spec An `ei_spec` object created with [ei_spec()]. The object
2019
#' should use the `preproc` argument to specify a rich basis expansion of the
2120
#' covariates and predictors.
2221
#' @inheritParams ei_ridge
2322
#' @param iter The number of permutations to use when estimating the null
24-
#' distribution. Ignored when `use_chisq = TRUE`.
23+
#' distribution, including the original identity permutation.
24+
#' Ignored when `use_chisq = TRUE`.
25+
#' @param undersmooth A value to divide the estimated ridge penalty by when
26+
#' partialling out the partially linear component of the model. A larger
27+
#' value will smooth the partially linear component less, which may improve
28+
#' Type I error control in finite samples at the cost of power.
2529
#' @param use_chisq If `TRUE`, use the asymptotic chi-squared distribution for
2630
#' the Wald test statistic instead of conducting a permutation test. Only
2731
#' appropriate for larger sample sizes (Helwig 2022 recommends at least 200
2832
#' when a single predictor is used).
33+
#' @param use_hc If `TRUE`, use a heteroskedasticity-consistent covariance estimate.
34+
#' More computationally intensive, but may make a difference in small samples
35+
#' or when there is substantial heteroskedasticity.
2936
#'
3037
#' @returns A tibble with one row per outcome variable and columns describing
3138
#' the test results. The `p.value` column contains the p-values for the test.
@@ -62,11 +69,20 @@
6269
#' preproc = preproc
6370
#' )
6471
#'
65-
#' ei_test_car(spec, iter=19) # use a larger number in practice
72+
#' ei_test_car(spec, iter=20) # use a larger number in practice
6673
#'
6774
#' @export
68-
ei_test_car <- function(spec, weights, iter = 1000, use_chisq = FALSE) {
75+
ei_test_car <- function(spec, weights, iter = 1000, undersmooth = 1.5, use_chisq = nrow(spec) >= 2000, use_hc = FALSE) {
6976
validate_ei_spec(spec)
77+
if (!has_preproc(spec)) {
78+
cli_warn(c(
79+
"{.arg preproc} was not specified in your {.cls ei_spec} object",
80+
"i"="The {.fn ei_test_car} function relies on a rich basis expansion of the covariates and predictors.",
81+
"x"="Without a basis expansion in {.arg preproc}, the test will not be able to detect violations of the CAR assumption.",
82+
">"="Consider basis expansions from the {.pkg bases} or {.pkg splines} package."
83+
))
84+
}
85+
7086
n = nrow(spec)
7187
x_col = attr(spec, "ei_x")
7288
z_col = attr(spec, "ei_z")
@@ -91,7 +107,7 @@ ei_test_car <- function(spec, weights, iter = 1000, use_chisq = FALSE) {
91107
# first, residualize out xz0
92108
udv0 = svd(xz0 * sqrt_w)
93109
fit0 = ridge_auto(udv0, y, sqrt_w, vcov = FALSE)
94-
pen = fit0$penalty
110+
pen = fit0$penalty / undersmooth
95111
d_pen_h = udv0$d^2 / (udv0$d^2 + pen)
96112
H0 = tcrossprod(scale_cols(udv0$u, d_pen_h), udv0$u)
97113
res = y - H0 %*% y
@@ -107,37 +123,64 @@ ei_test_car <- function(spec, weights, iter = 1000, use_chisq = FALSE) {
107123
out
108124
}
109125

126+
# observed test stat value
110127
fit = ridge_svd(udv, res, sqrt_w, pen, vcov = TRUE)
111-
inv_vcov = pinv_sym(fit$vcov_u)
112128

113-
calc_wald = function(fit) {
114-
colSums((inv_vcov %*% fit$coef) * fit$coef) / fit$sigma2
129+
# set up Wald stat calculation
130+
if (isTRUE(use_hc)) {
131+
xzr = (xzf - H0 %*% xzf)
132+
Sig_inv = solve((crossprod(xzr) + diag(ncol(xzr))*pen) / n)
133+
134+
calc_wald = function(fit) {
135+
W = numeric(n_y)
136+
df = numeric(n_y)
137+
for (j in seq_len(n_y)) {
138+
Omega = crossprod((res - fit$fitted)[, j] * xzr) / n
139+
inv_vcov2 = pinv_sym(Sig_inv %*% Omega %*% Sig_inv)
140+
df[j] = attr(inv_vcov2, "rank")
141+
W[j] = n * crossprod(fit$coef[, j], inv_vcov2 %*% fit$coef[, j])
142+
}
143+
attr(W, "df") = df
144+
W
145+
}
146+
147+
W0 = calc_wald(fit)
148+
df = attr(W0, "df")
149+
} else {
150+
inv_vcov = pinv_sym(fit$vcov_u)
151+
df = rep(attr(inv_vcov, "rank"), n_y)
152+
153+
calc_wald = function(fit) {
154+
colSums(fit$coef * (inv_vcov %*% fit$coef)) / fit$sigma2
155+
}
156+
157+
W0 = calc_wald(fit)
115158
}
116-
W0 = calc_wald(fit)
117159

118160
if (!isTRUE(use_chisq)) {
119-
if (!is.numeric(iter) && iter > 0) {
161+
if (!is.numeric(iter) && iter > 1) {
120162
cli_abort("{.arg iter} must be a positive integer.")
121163
}
122164

123165
W = matrix(nrow = ncol(y), ncol = iter)
166+
W[, 1] = W0
124167
pb = cli::cli_progress_bar("Running permutations", total = iter)
125-
for (i in seq_len(iter)) {
168+
for (i in seq(2, iter, 1)) {
126169
res_p = res[sample.int(n), , drop=FALSE]
127170
fit_p = ridge_svd(udv, res_p, sqrt_w, pen, vcov = FALSE)
128171
W[, i] = calc_wald(fit_p)
129172
cli::cli_progress_update(id = pb)
130173
}
131174
cli::cli_progress_done(id = pb)
132-
p_val = (rowSums(W >= W0) + 1) / (iter + 1)
175+
p_val = rowSums(W >= W0) / iter
133176
} else {
134-
p_val = pchisq(W0, df = attr(inv_vcov, "rank"), lower.tail = FALSE)
177+
p_val = pchisq(W0, df = df, lower.tail = FALSE)
135178
}
136179

137180
tibble::new_tibble(list(
138181
outcome = colnames(y),
139182
W = W0,
140-
df = attr(inv_vcov, "rank"),
183+
df = df,
141184
p.value = p_val
142185
))
143186
}

explore/id_test.R

Lines changed: 19 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,80 +1,49 @@
11
devtools::load_all(".")
22
library(tidyverse)
33

4-
# try on simulated data
5-
n = 1000
6-
n_perm = 100
7-
8-
run_test <- function(n = 1000, n_x = 3, p = 2, r2 = 0.5, iter = 1000) {
9-
spec0 = ei_synthetic(
4+
n = 2000
5+
p = 3
6+
r2 = 0.25
7+
specs = map(1:500, function(i) {
8+
ei_synthetic(
109
n,
1110
p,
1211
n_x = n_x,
13-
z = c(1, rep(0, p - 1)),
1412
r2_xz = r2,
1513
r2_bz = r2,
1614
b_cov = 0.0004 * (1 + diag(n_x))
1715
)
16+
}, .progress = TRUE)
17+
18+
run_test = function(spec0, well_spec = TRUE, ...) {
1819
spec = ei_spec(
1920
spec0,
2021
predictors = starts_with("x"),
2122
outcome = starts_with("y"),
2223
total = attr(spec0, "ei_n"),
23-
covariates = starts_with("z"),
24-
# covariates = "z1",
24+
covariates = if (well_spec) starts_with("z") else "z1",
2525
preproc = function(z) {
2626
if (ncol(z) == 0) {
27-
matrix(nrow=nrow(z), ncol=0)
27+
matrix(nrow = nrow(z), ncol = 0)
2828
} else {
29-
bases::b_tpsob(z, p = 50)
29+
bases::b_tpsob(z, p = 25)
3030
}
3131
}
3232
)
3333

34-
ei_test_car(spec, iter = iter, use_chisq = F)
34+
ei_test_car(spec, ...)
3535
}
3636

37-
res = map(1:400, ~ run_test(n = 500, iter = 100), .progress = TRUE) |>
37+
res = map(specs, ~ run_test(.x, well_spec = F, use_hc=F, iter=50), .progress = TRUE) |>
38+
bind_rows()
39+
res2 = map(specs, ~ run_test(.x, well_spec = T, use_hc=F, iter=50), .progress = TRUE) |>
3840
bind_rows()
3941

40-
hist(res$df, breaks=50)
4142
hist(res$p.value, breaks=50)
42-
hist(res$p.value[res$df > 0], breaks=50)
43+
hist(res2$p.value, breaks=50)
4344
mean(res$p.value <= 0.05)
44-
hist(res$W, breaks=50)
45+
mean(res2$p.value <= 0.05)
4546
hist(pchisq(res$W, res$df, lower.tail=F), breaks=50)
4647
plot(pchisq(res$W, res$df, lower.tail=F), res$p.value)
47-
48-
49-
# try on wallace
50-
data(elec_1968)
51-
elec_1968 = elec_1968 |>
52-
mutate(
53-
vap_nonwhite = 1 - vap_white,
54-
z = bases::b_bart(pop_urban, pop_rural, educ_elem, educ_hsch, educ_coll, farm,
55-
inc_00_03k, inc_03_08k, inc_08_25k, inc_25_99k)
56-
)
57-
58-
spec = ei_spec(elec_1968, c(vap_white, vap_black, vap_other), c(pres_dem_hum, pres_rep_nix, pres_ind_wal, pres_abs),
59-
total = pres_total, covariates = c(state, z))
60-
61-
m = ei_ridge(spec, bounds=F, sum_one=F)
62-
63-
Z = cbind(elec_1968$z, model.matrix(~ state - 1, elec_1968))
64-
XZ = with(elec_1968, bases::b_tpsob(vap_white, vap_black, vap_other, pop_urban, pop_rural, educ_elem, educ_hsch, educ_coll, farm,
65-
inc_00_03k, inc_03_08k, inc_08_25k, inc_25_99k, p=200)) |>
66-
cbind(model.matrix(~ state - 1, elec_1968))
67-
68-
udv = svd(XZ)
69-
n = nrow(Z)
70-
fit0 = ridge_auto(udv, resid(m), rep(1, n), vcov = FALSE)
71-
r2 = diag(as.matrix(cor(fit0$fitted, resid(m))^2))
72-
73-
perm = replicate(1000, {
74-
yy = resid(m)[sample(n), , drop=FALSE]
75-
fit = ridge_svd(udv, yy, rep(1, n), penalty = fit0$penalty, vcov = FALSE)
76-
diag(as.matrix(cor(fit$fitted, yy)^2))
77-
})
78-
79-
r2
80-
(rowSums(perm >= r2) + 1) / (1000 + 1)
48+
plot(res$p.value, res2$p.value)
49+
plot(res$W, res2$W)

man/ei_test_car.Rd

Lines changed: 23 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)