|
| 1 | +#' ARD Continuous Survey Statistics |
| 2 | +#' |
| 3 | +#' Returns an ARD of weighted statistics using the `{survey}` package. |
| 4 | +#' |
| 5 | +#' @param data (`survey.design`)\cr |
| 6 | +#' a design object often created with [`survey::svydesign()`]. |
| 7 | +#' @param variables ([`tidy-select`][dplyr::dplyr_tidy_select])\cr |
| 8 | +#' columns to include in summaries. Default is `everything()`. |
| 9 | +#' @param by ([`tidy-select`][dplyr::dplyr_tidy_select])\cr |
| 10 | +#' results are calculated for **all combinations** of the columns specified, |
| 11 | +#' including unobserved combinations and unobserved factor levels. |
| 12 | +#' @param statistic ([`formula-list-selector`][syntax])\cr |
| 13 | +#' a named list, a list of formulas, |
| 14 | +#' or a single formula where the list element is a character vector of |
| 15 | +#' statistic names to include. See below for options. |
| 16 | +#' @param fmt_fn ([`formula-list-selector`][syntax])\cr |
| 17 | +#' a named list, a list of formulas, |
| 18 | +#' or a single formula where the list element is a named list of functions |
| 19 | +#' (or the RHS of a formula), |
| 20 | +#' e.g. `list(mpg = list(mean = \(x) round(x, digits = 2) |> as.character))`. |
| 21 | +#' @param stat_label ([`formula-list-selector`][syntax])\cr |
| 22 | +#' a named list, a list of formulas, or a single formula where |
| 23 | +#' the list element is either a named list or a list of formulas defining the |
| 24 | +#' statistic labels, e.g. `everything() ~ list(mean = "Mean", sd = "SD")` or |
| 25 | +#' `everything() ~ list(mean ~ "Mean", sd ~ "SD")`. |
| 26 | +#' |
| 27 | +#' @section statistic argument: |
| 28 | +#' |
| 29 | +#' The following statistics are available: |
| 30 | +#' `r cardx:::accepted_svy_stats(FALSE) |> shQuote() |> paste(collapse = ", ")`, |
| 31 | +#' where 'p##' is are the percentiles and `##` is an integer between 0 and 100. |
| 32 | +#' |
| 33 | +#' |
| 34 | +#' @return an ARD data frame of class 'card' |
| 35 | +#' @export |
| 36 | +#' |
| 37 | +#' @examplesIf cards::is_pkg_installed("survey", reference_pkg = "cardx") |
| 38 | +#' data(api, package = "survey") |
| 39 | +#' dclus1 <- survey::svydesign(id = ~dnum, weights = ~pw, data = apiclus1, fpc = ~fpc) |
| 40 | +#' |
| 41 | +#' ard_svycontinuous( |
| 42 | +#' data = dclus1, |
| 43 | +#' variables = api00, |
| 44 | +#' by = stype |
| 45 | +#' ) |
| 46 | +ard_svycontinuous <- function(data, variables, by = NULL, |
| 47 | + statistic = everything() ~ c("median", "p25", "p75"), |
| 48 | + fmt_fn = NULL, |
| 49 | + stat_label = NULL) { |
| 50 | + cards::check_pkg_installed("survey", reference_pkg = "cardx") |
| 51 | + |
| 52 | + # check inputs --------------------------------------------------------------- |
| 53 | + check_not_missing(data) |
| 54 | + check_class(data, cls = "survey.design") |
| 55 | + check_not_missing(variables) |
| 56 | + |
| 57 | + # process inputs ------------------------------------------------------------- |
| 58 | + cards::process_selectors(data$variables, variables = {{ variables }}, by = {{ by }}) |
| 59 | + variables <- setdiff(variables, by) |
| 60 | + cards::process_formula_selectors( |
| 61 | + data$variables[variables], |
| 62 | + statistic = statistic, |
| 63 | + fmt_fn = fmt_fn, |
| 64 | + stat_label = stat_label |
| 65 | + ) |
| 66 | + cards::fill_formula_selectors( |
| 67 | + data$variables[variables], |
| 68 | + statistic = formals(ard_svycontinuous)[["statistic"]] |> eval() |
| 69 | + ) |
| 70 | + cards::check_list_elements( |
| 71 | + x = statistic, |
| 72 | + predicate = \(x) all(x %in% accepted_svy_stats()), |
| 73 | + error_msg = c("Error in the values of the {.arg statistic} argument.", |
| 74 | + i = "Values must be in {.val {accepted_svy_stats(FALSE)}}" |
| 75 | + ) |
| 76 | + ) |
| 77 | + |
| 78 | + # compute the weighted statistics -------------------------------------------- |
| 79 | + df_stats <- |
| 80 | + map( |
| 81 | + names(statistic), |
| 82 | + function(variable) { |
| 83 | + map( |
| 84 | + statistic[[variable]], |
| 85 | + function(statistic) { |
| 86 | + .compute_svy_stat(data, variable = variable, by = by, stat_name = statistic) |
| 87 | + } |
| 88 | + ) |
| 89 | + } |
| 90 | + ) |> |
| 91 | + dplyr::bind_rows() |
| 92 | + |
| 93 | + |
| 94 | + # add stat_labels ------------------------------------------------------------ |
| 95 | + df_stats <- |
| 96 | + df_stats |> |
| 97 | + dplyr::left_join( |
| 98 | + .default_svy_stat_labels(), |
| 99 | + by = "stat_name" |
| 100 | + ) |> |
| 101 | + dplyr::mutate(stat_label = dplyr::coalesce(.data$stat_label, .data$stat_name)) |
| 102 | + if (!is_empty(stat_label)) { |
| 103 | + df_stats <- |
| 104 | + dplyr::rows_update( |
| 105 | + df_stats, |
| 106 | + dplyr::tibble( |
| 107 | + variable = names(stat_label), |
| 108 | + stat_name = map(.data$variable, ~ names(stat_label[[.x]])), |
| 109 | + stat_label = map(.data$variable, ~ stat_label[[.x]] |> |
| 110 | + unname() |> |
| 111 | + unlist()) |
| 112 | + ) |> |
| 113 | + tidyr::unnest(cols = c("stat_name", "stat_label")), |
| 114 | + by = "stat_name", |
| 115 | + unmatched = "ignore" |
| 116 | + ) |
| 117 | + } |
| 118 | + |
| 119 | + # add formatting stats ------------------------------------------------------- |
| 120 | + df_stats$fmt_fn <- list(1L) |
| 121 | + if (!is_empty(fmt_fn)) { |
| 122 | + df_stats <- |
| 123 | + dplyr::rows_update( |
| 124 | + df_stats, |
| 125 | + dplyr::tibble( |
| 126 | + variable = names(fmt_fn), |
| 127 | + stat_name = map(.data$variable, ~ names(fmt_fn[[.x]])), |
| 128 | + fmt_fn = map(.data$variable, ~ fmt_fn[[.x]] |> unname()) |
| 129 | + ) |> |
| 130 | + tidyr::unnest(cols = c("stat_name", "fmt_fn")), |
| 131 | + by = "stat_name", |
| 132 | + unmatched = "ignore" |
| 133 | + ) |
| 134 | + } |
| 135 | + |
| 136 | + # add class and return ARD object -------------------------------------------- |
| 137 | + df_stats |> |
| 138 | + dplyr::mutate(context = "continuous") |> |
| 139 | + cards::tidy_ard_column_order() %>% |
| 140 | + {structure(., class = c("card", class(.)))} # styler: off |
| 141 | +} |
| 142 | + |
| 143 | +.default_svy_stat_labels <- function(stat_label = NULL) { |
| 144 | + dplyr::tribble( |
| 145 | + ~stat_name, ~stat_label, |
| 146 | + "mean", "Mean", |
| 147 | + "median", "Median", |
| 148 | + "var", "Variance", |
| 149 | + "sd", "Standard Deviation", |
| 150 | + "sum", "Sum", |
| 151 | + "deff", "Design Effect", |
| 152 | + "mean.std.error", "SE(Mean)", |
| 153 | + "min", "Minimum", |
| 154 | + "max", "Maximum", |
| 155 | + "p25", "25% Percentile", |
| 156 | + "p75", "75% Percentile" |
| 157 | + ) |
| 158 | +} |
| 159 | + |
| 160 | +accepted_svy_stats <- function(expand_quantiles = TRUE) { |
| 161 | + base_stats <- |
| 162 | + c("mean", "median", "min", "max", "sum", "var", "sd", "mean.std.error", "deff") |
| 163 | + if (expand_quantiles) { |
| 164 | + return(c(base_stats, paste0("p", 0:100))) |
| 165 | + } |
| 166 | + c(base_stats, "p##") |
| 167 | +} |
| 168 | + |
| 169 | + |
| 170 | + |
| 171 | +# this function calculates the summary for a single variable, single statistic |
| 172 | +# and for all `by` levels. it returns an ARD data frame |
| 173 | +.compute_svy_stat <- function(data, variable, by = NULL, stat_name) { |
| 174 | + # difftime variable needs to be transformed into numeric for svyquantile |
| 175 | + if (inherits(data$variables[[variable]], "difftime")) { |
| 176 | + data$variables[[variable]] <- unclass(data$variables[[variable]]) |
| 177 | + } |
| 178 | + |
| 179 | + # styler: off |
| 180 | + if (stat_name %in% "mean") args <- list(FUN = survey::svymean) |
| 181 | + else if (stat_name %in% "sum") args <- list(FUN = survey::svytotal) |
| 182 | + else if (stat_name %in% "var") args <- list(FUN = survey::svyvar) |
| 183 | + else if (stat_name %in% "sd") args <- list(FUN = \(...) survey::svyvar(...) |> sqrt()) |
| 184 | + else if (stat_name %in% "mean.std.error") args <- list(FUN = \(...) survey::svymean(...) |> survey::SE()) |
| 185 | + else if (stat_name %in% "deff") args <- list(FUN = \(...) survey::svymean(..., deff = TRUE) |> survey::deff()) |
| 186 | + else if (stat_name %in% "min") args <- list(FUN = \(x, design, na.rm, ...) min(design$variables[[all.vars(x)]], na.rm = na.rm)) |
| 187 | + else if (stat_name %in% "max") args <- list(FUN = \(x, design, na.rm, ...) max(design$variables[[all.vars(x)]], na.rm = na.rm)) |
| 188 | + # define functions for the quantiles |
| 189 | + else if (stat_name %in% c("median", paste0("p", 0:100))) { |
| 190 | + quantile <- ifelse(stat_name %in% "median", 0.5, substr(stat_name, 2, nchar(stat_name)) |> as.numeric() %>% `/`(100)) |
| 191 | + # univariate results are returned in a different format from stratified. |
| 192 | + args <- |
| 193 | + if (is_empty(by)) list(FUN = \(...) survey::svyquantile(...)[[1]], quantiles = quantile) |
| 194 | + else list(FUN = \(...) survey::svyquantile(...), quantiles = quantile) |
| 195 | + } |
| 196 | + # styler: on |
| 197 | + |
| 198 | + # adding additional args to pass |
| 199 | + args <- |
| 200 | + args |> |
| 201 | + append( |
| 202 | + list( |
| 203 | + design = data, |
| 204 | + # if all values are NA, turn na.rm to FALSE to avoid error |
| 205 | + na.rm = !all(is.na(data$variables[[variable]])), |
| 206 | + keep.var = FALSE |
| 207 | + ) |
| 208 | + ) |
| 209 | + |
| 210 | + |
| 211 | + # if no by variable, calculate univariate statistics |
| 212 | + if (is_empty(by)) { |
| 213 | + args$x <- stats::reformulate(variable) |
| 214 | + # calculate statistic (and remove FUN from the argument list) |
| 215 | + stat <- |
| 216 | + cards::eval_capture_conditions( |
| 217 | + do.call(args$FUN, args = args |> utils::modifyList(list(FUN = NULL))) |
| 218 | + ) |
| 219 | + # if the result was calculated, then put it into a tibble |
| 220 | + if (!is.null(stat[["result"]])) { |
| 221 | + df_stat <- |
| 222 | + dplyr::tibble(variable, stat[["result"]][1]) |> |
| 223 | + set_names(c("variable", "stat")) |> |
| 224 | + dplyr::mutate( |
| 225 | + stat = as.list(unname(.data$stat)), |
| 226 | + warning = list(stat[["warning"]]), |
| 227 | + error = list(stat[["error"]]) |
| 228 | + ) |
| 229 | + } |
| 230 | + # otherwise, if there was an error return tibble with error message |
| 231 | + else { |
| 232 | + df_stat <- |
| 233 | + dplyr::tibble( |
| 234 | + variable = .env$variable, |
| 235 | + stat = list(NULL), |
| 236 | + warning = list(.env$stat[["warning"]]), |
| 237 | + error = list(.env$stat[["error"]]) |
| 238 | + ) |
| 239 | + } |
| 240 | + } |
| 241 | + |
| 242 | + # if there is by variable(s), calculate statistics for the combinations |
| 243 | + else { |
| 244 | + args$formula <- stats::reformulate(variable) |
| 245 | + args$by <- stats::reformulate(by) |
| 246 | + stat <- |
| 247 | + if (stat_name %in% c("median", paste0("p", 0:100))) { |
| 248 | + cards::eval_capture_conditions( |
| 249 | + do.call(survey::svyby, args) |> set_names(c(by, "quantile", "ci.2.5", "ci.97.5", "se")) |
| 250 | + ) |
| 251 | + } else if (stat_name %in% "deff") { |
| 252 | + stat <- |
| 253 | + cards::eval_capture_conditions( |
| 254 | + do.call( |
| 255 | + survey::svyby, |
| 256 | + args |> utils::modifyList(list(FUN = survey::svymean, deff = TRUE)) |
| 257 | + ) |> |
| 258 | + dplyr::select(all_of(by), dplyr::last_col()) # the last column is DEff |
| 259 | + ) |
| 260 | + } else { |
| 261 | + cards::eval_capture_conditions(do.call(survey::svyby, args)) |
| 262 | + } |
| 263 | + |
| 264 | + # if the result was calculated, then put it into a tibble |
| 265 | + if (!is.null(stat[["result"]])) { |
| 266 | + df_stat <- stat[["result"]][seq_len(length(by) + 1L)] |> |
| 267 | + dplyr::as_tibble() %>% |
| 268 | + # adding unobserved combinations of "by" variables |
| 269 | + { |
| 270 | + dplyr::full_join( |
| 271 | + cards::nest_for_ard(data$variables, by = by, key = "...ard_no_one_will_ever_pick_this...", rename = FALSE, list_columns = FALSE) |> |
| 272 | + dplyr::select(-"...ard_no_one_will_ever_pick_this..."), |
| 273 | + ., |
| 274 | + by = by |
| 275 | + ) |
| 276 | + } |> |
| 277 | + set_names(paste0("group", seq_along(by), "_level"), "stat") |> |
| 278 | + dplyr::bind_cols( |
| 279 | + dplyr::tibble(!!!c(by, variable)) |> |
| 280 | + set_names(paste0("group", seq_along(by)), "variable") |
| 281 | + ) |> |
| 282 | + dplyr::mutate( |
| 283 | + dplyr::across(c(cards::all_ard_groups("levels"), "stat"), as.list), |
| 284 | + warning = list(.env$stat[["warning"]]), |
| 285 | + error = list(.env$stat[["error"]]) |
| 286 | + ) |
| 287 | + } |
| 288 | + # otherwise, if there was an error return tibble with error message |
| 289 | + else { |
| 290 | + df_stat <- |
| 291 | + cards::nest_for_ard(data$variables, by = by, key = "...ard_no_one_will_ever_pick_this...") |> |
| 292 | + dplyr::select(-"...ard_no_one_will_ever_pick_this...") |> |
| 293 | + dplyr::mutate( |
| 294 | + variable = .env$variable, |
| 295 | + stat = list(NULL), |
| 296 | + warning = list(.env$stat[["warning"]]), |
| 297 | + error = list(.env$stat[["error"]]) |
| 298 | + ) |
| 299 | + } |
| 300 | + } |
| 301 | + |
| 302 | + df_stat |> |
| 303 | + dplyr::mutate(stat_name = .env$stat_name) |
| 304 | +} |
0 commit comments