Open
Description
In this Stan Discourse reply (reference model: Gaussian, multilevel), we observed a gap in predictive performance between the submodels and the reference model when search_terms
was NULL
(with a "jump" towards the reference model's performance at the full model size), but not when forcing both group-level terms to be selected first. Reprex copied (and reduced to the relevant part) from that reply (the example dataset may be found here; also note that the reprex writes some files to the current working directory):
# Setup -------------------------------------------------------------------
library(brms)
library(projpred)
options(warn = 1)
options(mc.cores = min(parallel::detectCores(logical = FALSE), 4))
options(brms.backend = "cmdstanr")
options(brms.file_refit = "on_change")
options(cmdstanr_write_stan_file_dir = ".")
options(projpred.extra_verbose = TRUE)
options(projpred.check_conv = TRUE)
# Data --------------------------------------------------------------------
indicator_data <- read.csv("example_dataset.csv")
# Subsample `N` observations:
N <- 3000
set.seed(123)
indicator_data_N <- indicator_data[
sample.int(nrow(indicator_data), size = N), , drop = FALSE
]
# Avoid `:` between grouping variables:
indicator_data_N[["iso_country_code_IA_village"]] <- paste(
indicator_data_N$iso_country_code, indicator_data_N$village, sep = "_"
)
# Reference model fit -----------------------------------------------------
rfit <- brm(
formula = log_tva ~ 1 + log_hh_size + education_cleaned + log_livestock_tlu +
log_land_cultivated + off_farm_any + till_not_by_hand + external_labour +
pesticide + debts_have + aidreceived + livestock_inputs_any +
land_irrigated_any + norm_growing_period + log_min_travel_time +
log_pop_dens + norm_gdl_country_shdi + (1 | iso_country_code) +
(1 | iso_country_code_IA_village),
data = indicator_data_N,
prior = c(
set_prior("normal(0, 1)", class = "b"),
set_prior("normal(0, 1)", class = "sd"),
set_prior("normal(0, 1)", class = "sigma"),
set_prior("normal(0, 1)", class = "Intercept")
),
family = gaussian(),
file = "rfit",
seed = 584356,
refresh = 0
)
# projpred ----------------------------------------------------------------
# Run kfold() separately to save time later when running cv_varsel() multiple
# times:
set.seed(3424511)
refm_kfold <- kfold(rfit, K = 5, save_fits = TRUE)
cvfits_crr <- structure(
list(fits = refm_kfold$fits[, "fit"]),
K = length(refm_kfold$fits[, "fit"]),
folds = sapply(seq_len(nrow(rfit$data)), function(ii) {
which(sapply(refm_kfold$fits[, "omitted"], "%in%", x = ii))
})
)
refmodel_obj <- get_refmodel(rfit, cvfits = cvfits_crr)
S_ref <- nrow(as.matrix(rfit))
# With the default of `search_terms = NULL`:
cvvs3 <- cv_varsel(
refmodel_obj,
cv_method = "kfold",
seed = 1,
nclusters = 3,
control = lme4::lmerControl(
optimizer = "Nelder_Mead"
)
)
print(plot(cvvs3, ranking_nterms_max = NA))
print(plot(cv_proportions(cvvs3)))
# Forcing both group-level terms to be selected first:
get_search_terms_forced <- function(forced_terms, optional_terms) {
forced_terms <- paste(forced_terms, collapse = " + ")
return(c(forced_terms, paste0(forced_terms, " + ", optional_terms)))
}
optional_predictors <- c(
"log_hh_size",
"education_cleaned",
"log_livestock_tlu",
"log_land_cultivated",
"off_farm_any",
"till_not_by_hand",
"external_labour",
"pesticide",
"debts_have",
"aidreceived",
"livestock_inputs_any",
"land_irrigated_any",
"norm_growing_period",
"log_min_travel_time",
"log_pop_dens",
"norm_gdl_country_shdi"
)
forced_predictors <- c("(1 | iso_country_code)",
"(1 | iso_country_code_IA_village)")
search_terms_forcedGL <- get_search_terms_forced(forced_predictors,
optional_predictors)
cvvs4 <- cv_varsel(
refmodel_obj,
cv_method = "kfold",
seed = 1,
nclusters = 3,
search_terms = search_terms_forcedGL,
control = lme4::lmerControl(
optimizer = "Nelder_Mead"
)
)
print(plot(cvvs4, ranking_nterms_max = NA))
print(plot(cv_proportions(cvvs4)))
Details may be found in the Stan Discourse reply.
Metadata
Metadata
Assignees
Labels
No labels