Skip to content

Predictive performance gap / jumpy behavior at full size in Gaussian multilevel model #441

Open
@fweber144

Description

@fweber144

In this Stan Discourse reply (reference model: Gaussian, multilevel), we observed a gap in predictive performance between the submodels and the reference model when search_terms was NULL (with a "jump" towards the reference model's performance at the full model size), but not when forcing both group-level terms to be selected first. Reprex copied (and reduced to the relevant part) from that reply (the example dataset may be found here; also note that the reprex writes some files to the current working directory):

# Setup -------------------------------------------------------------------

library(brms)
library(projpred)

options(warn = 1)

options(mc.cores = min(parallel::detectCores(logical = FALSE), 4))
options(brms.backend = "cmdstanr")
options(brms.file_refit = "on_change")
options(cmdstanr_write_stan_file_dir = ".")

options(projpred.extra_verbose = TRUE)
options(projpred.check_conv = TRUE)

# Data --------------------------------------------------------------------

indicator_data <- read.csv("example_dataset.csv")

# Subsample `N` observations:
N <- 3000
set.seed(123)
indicator_data_N <- indicator_data[
  sample.int(nrow(indicator_data), size = N), , drop = FALSE
]
# Avoid `:` between grouping variables:
indicator_data_N[["iso_country_code_IA_village"]] <- paste(
  indicator_data_N$iso_country_code, indicator_data_N$village, sep = "_"
)

# Reference model fit -----------------------------------------------------

rfit <- brm(
  formula = log_tva ~ 1 + log_hh_size + education_cleaned + log_livestock_tlu +
    log_land_cultivated + off_farm_any + till_not_by_hand + external_labour +
    pesticide + debts_have + aidreceived + livestock_inputs_any +
    land_irrigated_any + norm_growing_period + log_min_travel_time +
    log_pop_dens + norm_gdl_country_shdi + (1 | iso_country_code) +
    (1 | iso_country_code_IA_village),
  data = indicator_data_N,
  prior = c(
    set_prior("normal(0, 1)", class = "b"),
    set_prior("normal(0, 1)", class = "sd"),
    set_prior("normal(0, 1)", class = "sigma"),
    set_prior("normal(0, 1)", class = "Intercept")
  ),
  family = gaussian(),
  file = "rfit",
  seed = 584356,
  refresh = 0
)

# projpred ----------------------------------------------------------------

# Run kfold() separately to save time later when running cv_varsel() multiple
# times:
set.seed(3424511)
refm_kfold <- kfold(rfit, K = 5, save_fits = TRUE)
cvfits_crr <- structure(
  list(fits = refm_kfold$fits[, "fit"]),
  K = length(refm_kfold$fits[, "fit"]),
  folds = sapply(seq_len(nrow(rfit$data)), function(ii) {
    which(sapply(refm_kfold$fits[, "omitted"], "%in%", x = ii))
  })
)
refmodel_obj <- get_refmodel(rfit, cvfits = cvfits_crr)
S_ref <- nrow(as.matrix(rfit))

# With the default of `search_terms = NULL`:
cvvs3 <- cv_varsel(
  refmodel_obj,
  cv_method = "kfold",
  seed = 1,
  nclusters = 3,
  control = lme4::lmerControl(
    optimizer = "Nelder_Mead"
  )
)
print(plot(cvvs3, ranking_nterms_max = NA))

Screenshot from 2023-08-18 11-53-38

print(plot(cv_proportions(cvvs3)))

Screenshot from 2023-08-18 11-53-48

# Forcing both group-level terms to be selected first:
get_search_terms_forced <- function(forced_terms, optional_terms) {
  forced_terms <- paste(forced_terms, collapse = " + ")
  return(c(forced_terms, paste0(forced_terms, " + ", optional_terms)))
}
optional_predictors <- c(
  "log_hh_size",
  "education_cleaned",
  "log_livestock_tlu",
  "log_land_cultivated",
  "off_farm_any",
  "till_not_by_hand",
  "external_labour",
  "pesticide",
  "debts_have",
  "aidreceived",
  "livestock_inputs_any",
  "land_irrigated_any",
  "norm_growing_period",
  "log_min_travel_time",
  "log_pop_dens",
  "norm_gdl_country_shdi"
)
forced_predictors <- c("(1 | iso_country_code)",
                       "(1 | iso_country_code_IA_village)")
search_terms_forcedGL <- get_search_terms_forced(forced_predictors,
                                                 optional_predictors)
cvvs4 <- cv_varsel(
  refmodel_obj,
  cv_method = "kfold",
  seed = 1,
  nclusters = 3,
  search_terms = search_terms_forcedGL,
  control = lme4::lmerControl(
    optimizer = "Nelder_Mead"
  )
)
print(plot(cvvs4, ranking_nterms_max = NA))

Screenshot from 2023-08-18 11-53-57

print(plot(cv_proportions(cvvs4)))

Screenshot from 2023-08-18 11-54-04

Details may be found in the Stan Discourse reply.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions