Skip to content

Commit aef0c8f

Browse files
ddsjobergzdz2101
andauthored
Adding ard_svycontinuous() for weighted continuous variable summaries (#69)
**What changes are proposed in this pull request?** * Adding `ard_svycontinuous()` for weighted continuous variable summaries **Reference GitHub issue associated with pull request.** _e.g., 'closes #<issue number>'_ closes #68 -------------------------------------------------------------------------------- Pre-review Checklist (if item does not apply, mark is as complete) - [x] **All** GitHub Action workflows pass with a ✅ - [x] PR branch has pulled the most recent updates from master branch: `usethis::pr_merge_main()` - [x] If a bug was fixed, a unit test was added. - [x] Code coverage is suitable for any new functions/features (generally, 100% coverage for new code): `devtools::test_coverage()` - [ ] Request a reviewer Reviewer Checklist (if item does not apply, mark is as complete) - [ ] If a bug was fixed, a unit test was added. - [ ] Run `pkgdown::build_site()`. Check the R console for errors, and review the rendered website. - [ ] Code coverage is suitable for any new functions/features: `devtools::test_coverage()` When the branch is ready to be merged: - [ ] Update `NEWS.md` with the changes from this pull request under the heading "`# cards (development version)`". If there is an issue associated with the pull request, reference it in parentheses at the end update (see `NEWS.md` for examples). - [ ] **All** GitHub Action workflows pass with a ✅ - [ ] Approve Pull Request - [ ] Merge the PR. Please use "Squash and merge" or "Rebase and merge". --------- Signed-off-by: Daniel Sjoberg <danield.sjoberg@gmail.com> Co-authored-by: Zelos Zhu <zelos.zhu@atorusresearch.com>
1 parent 59d1855 commit aef0c8f

File tree

8 files changed

+827
-1
lines changed

8 files changed

+827
-1
lines changed

NAMESPACE

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,9 @@ export(ard_proptest)
2121
export(ard_regression)
2222
export(ard_regression_basic)
2323
export(ard_smd)
24-
export(ard_svyttest)
2524
export(ard_svychisq)
25+
export(ard_svycontinuous)
26+
export(ard_svyttest)
2627
export(ard_ttest)
2728
export(ard_vif)
2829
export(ard_wilcoxtest)

R/ard_svycontinuous.R

Lines changed: 304 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,304 @@
1+
#' ARD Continuous Survey Statistics
2+
#'
3+
#' Returns an ARD of weighted statistics using the `{survey}` package.
4+
#'
5+
#' @param data (`survey.design`)\cr
6+
#' a design object often created with [`survey::svydesign()`].
7+
#' @param variables ([`tidy-select`][dplyr::dplyr_tidy_select])\cr
8+
#' columns to include in summaries. Default is `everything()`.
9+
#' @param by ([`tidy-select`][dplyr::dplyr_tidy_select])\cr
10+
#' results are calculated for **all combinations** of the columns specified,
11+
#' including unobserved combinations and unobserved factor levels.
12+
#' @param statistic ([`formula-list-selector`][syntax])\cr
13+
#' a named list, a list of formulas,
14+
#' or a single formula where the list element is a character vector of
15+
#' statistic names to include. See below for options.
16+
#' @param fmt_fn ([`formula-list-selector`][syntax])\cr
17+
#' a named list, a list of formulas,
18+
#' or a single formula where the list element is a named list of functions
19+
#' (or the RHS of a formula),
20+
#' e.g. `list(mpg = list(mean = \(x) round(x, digits = 2) |> as.character))`.
21+
#' @param stat_label ([`formula-list-selector`][syntax])\cr
22+
#' a named list, a list of formulas, or a single formula where
23+
#' the list element is either a named list or a list of formulas defining the
24+
#' statistic labels, e.g. `everything() ~ list(mean = "Mean", sd = "SD")` or
25+
#' `everything() ~ list(mean ~ "Mean", sd ~ "SD")`.
26+
#'
27+
#' @section statistic argument:
28+
#'
29+
#' The following statistics are available:
30+
#' `r cardx:::accepted_svy_stats(FALSE) |> shQuote() |> paste(collapse = ", ")`,
31+
#' where 'p##' is are the percentiles and `##` is an integer between 0 and 100.
32+
#'
33+
#'
34+
#' @return an ARD data frame of class 'card'
35+
#' @export
36+
#'
37+
#' @examplesIf cards::is_pkg_installed("survey", reference_pkg = "cardx")
38+
#' data(api, package = "survey")
39+
#' dclus1 <- survey::svydesign(id = ~dnum, weights = ~pw, data = apiclus1, fpc = ~fpc)
40+
#'
41+
#' ard_svycontinuous(
42+
#' data = dclus1,
43+
#' variables = api00,
44+
#' by = stype
45+
#' )
46+
ard_svycontinuous <- function(data, variables, by = NULL,
47+
statistic = everything() ~ c("median", "p25", "p75"),
48+
fmt_fn = NULL,
49+
stat_label = NULL) {
50+
cards::check_pkg_installed("survey", reference_pkg = "cardx")
51+
52+
# check inputs ---------------------------------------------------------------
53+
check_not_missing(data)
54+
check_class(data, cls = "survey.design")
55+
check_not_missing(variables)
56+
57+
# process inputs -------------------------------------------------------------
58+
cards::process_selectors(data$variables, variables = {{ variables }}, by = {{ by }})
59+
variables <- setdiff(variables, by)
60+
cards::process_formula_selectors(
61+
data$variables[variables],
62+
statistic = statistic,
63+
fmt_fn = fmt_fn,
64+
stat_label = stat_label
65+
)
66+
cards::fill_formula_selectors(
67+
data$variables[variables],
68+
statistic = formals(ard_svycontinuous)[["statistic"]] |> eval()
69+
)
70+
cards::check_list_elements(
71+
x = statistic,
72+
predicate = \(x) all(x %in% accepted_svy_stats()),
73+
error_msg = c("Error in the values of the {.arg statistic} argument.",
74+
i = "Values must be in {.val {accepted_svy_stats(FALSE)}}"
75+
)
76+
)
77+
78+
# compute the weighted statistics --------------------------------------------
79+
df_stats <-
80+
map(
81+
names(statistic),
82+
function(variable) {
83+
map(
84+
statistic[[variable]],
85+
function(statistic) {
86+
.compute_svy_stat(data, variable = variable, by = by, stat_name = statistic)
87+
}
88+
)
89+
}
90+
) |>
91+
dplyr::bind_rows()
92+
93+
94+
# add stat_labels ------------------------------------------------------------
95+
df_stats <-
96+
df_stats |>
97+
dplyr::left_join(
98+
.default_svy_stat_labels(),
99+
by = "stat_name"
100+
) |>
101+
dplyr::mutate(stat_label = dplyr::coalesce(.data$stat_label, .data$stat_name))
102+
if (!is_empty(stat_label)) {
103+
df_stats <-
104+
dplyr::rows_update(
105+
df_stats,
106+
dplyr::tibble(
107+
variable = names(stat_label),
108+
stat_name = map(.data$variable, ~ names(stat_label[[.x]])),
109+
stat_label = map(.data$variable, ~ stat_label[[.x]] |>
110+
unname() |>
111+
unlist())
112+
) |>
113+
tidyr::unnest(cols = c("stat_name", "stat_label")),
114+
by = "stat_name",
115+
unmatched = "ignore"
116+
)
117+
}
118+
119+
# add formatting stats -------------------------------------------------------
120+
df_stats$fmt_fn <- list(1L)
121+
if (!is_empty(fmt_fn)) {
122+
df_stats <-
123+
dplyr::rows_update(
124+
df_stats,
125+
dplyr::tibble(
126+
variable = names(fmt_fn),
127+
stat_name = map(.data$variable, ~ names(fmt_fn[[.x]])),
128+
fmt_fn = map(.data$variable, ~ fmt_fn[[.x]] |> unname())
129+
) |>
130+
tidyr::unnest(cols = c("stat_name", "fmt_fn")),
131+
by = "stat_name",
132+
unmatched = "ignore"
133+
)
134+
}
135+
136+
# add class and return ARD object --------------------------------------------
137+
df_stats |>
138+
dplyr::mutate(context = "continuous") |>
139+
cards::tidy_ard_column_order() %>%
140+
{structure(., class = c("card", class(.)))} # styler: off
141+
}
142+
143+
.default_svy_stat_labels <- function(stat_label = NULL) {
144+
dplyr::tribble(
145+
~stat_name, ~stat_label,
146+
"mean", "Mean",
147+
"median", "Median",
148+
"var", "Variance",
149+
"sd", "Standard Deviation",
150+
"sum", "Sum",
151+
"deff", "Design Effect",
152+
"mean.std.error", "SE(Mean)",
153+
"min", "Minimum",
154+
"max", "Maximum",
155+
"p25", "25% Percentile",
156+
"p75", "75% Percentile"
157+
)
158+
}
159+
160+
accepted_svy_stats <- function(expand_quantiles = TRUE) {
161+
base_stats <-
162+
c("mean", "median", "min", "max", "sum", "var", "sd", "mean.std.error", "deff")
163+
if (expand_quantiles) {
164+
return(c(base_stats, paste0("p", 0:100)))
165+
}
166+
c(base_stats, "p##")
167+
}
168+
169+
170+
171+
# this function calculates the summary for a single variable, single statistic
172+
# and for all `by` levels. it returns an ARD data frame
173+
.compute_svy_stat <- function(data, variable, by = NULL, stat_name) {
174+
# difftime variable needs to be transformed into numeric for svyquantile
175+
if (inherits(data$variables[[variable]], "difftime")) {
176+
data$variables[[variable]] <- unclass(data$variables[[variable]])
177+
}
178+
179+
# styler: off
180+
if (stat_name %in% "mean") args <- list(FUN = survey::svymean)
181+
else if (stat_name %in% "sum") args <- list(FUN = survey::svytotal)
182+
else if (stat_name %in% "var") args <- list(FUN = survey::svyvar)
183+
else if (stat_name %in% "sd") args <- list(FUN = \(...) survey::svyvar(...) |> sqrt())
184+
else if (stat_name %in% "mean.std.error") args <- list(FUN = \(...) survey::svymean(...) |> survey::SE())
185+
else if (stat_name %in% "deff") args <- list(FUN = \(...) survey::svymean(..., deff = TRUE) |> survey::deff())
186+
else if (stat_name %in% "min") args <- list(FUN = \(x, design, na.rm, ...) min(design$variables[[all.vars(x)]], na.rm = na.rm))
187+
else if (stat_name %in% "max") args <- list(FUN = \(x, design, na.rm, ...) max(design$variables[[all.vars(x)]], na.rm = na.rm))
188+
# define functions for the quantiles
189+
else if (stat_name %in% c("median", paste0("p", 0:100))) {
190+
quantile <- ifelse(stat_name %in% "median", 0.5, substr(stat_name, 2, nchar(stat_name)) |> as.numeric() %>% `/`(100))
191+
# univariate results are returned in a different format from stratified.
192+
args <-
193+
if (is_empty(by)) list(FUN = \(...) survey::svyquantile(...)[[1]], quantiles = quantile)
194+
else list(FUN = \(...) survey::svyquantile(...), quantiles = quantile)
195+
}
196+
# styler: on
197+
198+
# adding additional args to pass
199+
args <-
200+
args |>
201+
append(
202+
list(
203+
design = data,
204+
# if all values are NA, turn na.rm to FALSE to avoid error
205+
na.rm = !all(is.na(data$variables[[variable]])),
206+
keep.var = FALSE
207+
)
208+
)
209+
210+
211+
# if no by variable, calculate univariate statistics
212+
if (is_empty(by)) {
213+
args$x <- stats::reformulate(variable)
214+
# calculate statistic (and remove FUN from the argument list)
215+
stat <-
216+
cards::eval_capture_conditions(
217+
do.call(args$FUN, args = args |> utils::modifyList(list(FUN = NULL)))
218+
)
219+
# if the result was calculated, then put it into a tibble
220+
if (!is.null(stat[["result"]])) {
221+
df_stat <-
222+
dplyr::tibble(variable, stat[["result"]][1]) |>
223+
set_names(c("variable", "stat")) |>
224+
dplyr::mutate(
225+
stat = as.list(unname(.data$stat)),
226+
warning = list(stat[["warning"]]),
227+
error = list(stat[["error"]])
228+
)
229+
}
230+
# otherwise, if there was an error return tibble with error message
231+
else {
232+
df_stat <-
233+
dplyr::tibble(
234+
variable = .env$variable,
235+
stat = list(NULL),
236+
warning = list(.env$stat[["warning"]]),
237+
error = list(.env$stat[["error"]])
238+
)
239+
}
240+
}
241+
242+
# if there is by variable(s), calculate statistics for the combinations
243+
else {
244+
args$formula <- stats::reformulate(variable)
245+
args$by <- stats::reformulate(by)
246+
stat <-
247+
if (stat_name %in% c("median", paste0("p", 0:100))) {
248+
cards::eval_capture_conditions(
249+
do.call(survey::svyby, args) |> set_names(c(by, "quantile", "ci.2.5", "ci.97.5", "se"))
250+
)
251+
} else if (stat_name %in% "deff") {
252+
stat <-
253+
cards::eval_capture_conditions(
254+
do.call(
255+
survey::svyby,
256+
args |> utils::modifyList(list(FUN = survey::svymean, deff = TRUE))
257+
) |>
258+
dplyr::select(all_of(by), dplyr::last_col()) # the last column is DEff
259+
)
260+
} else {
261+
cards::eval_capture_conditions(do.call(survey::svyby, args))
262+
}
263+
264+
# if the result was calculated, then put it into a tibble
265+
if (!is.null(stat[["result"]])) {
266+
df_stat <- stat[["result"]][seq_len(length(by) + 1L)] |>
267+
dplyr::as_tibble() %>%
268+
# adding unobserved combinations of "by" variables
269+
{
270+
dplyr::full_join(
271+
cards::nest_for_ard(data$variables, by = by, key = "...ard_no_one_will_ever_pick_this...", rename = FALSE, list_columns = FALSE) |>
272+
dplyr::select(-"...ard_no_one_will_ever_pick_this..."),
273+
.,
274+
by = by
275+
)
276+
} |>
277+
set_names(paste0("group", seq_along(by), "_level"), "stat") |>
278+
dplyr::bind_cols(
279+
dplyr::tibble(!!!c(by, variable)) |>
280+
set_names(paste0("group", seq_along(by)), "variable")
281+
) |>
282+
dplyr::mutate(
283+
dplyr::across(c(cards::all_ard_groups("levels"), "stat"), as.list),
284+
warning = list(.env$stat[["warning"]]),
285+
error = list(.env$stat[["error"]])
286+
)
287+
}
288+
# otherwise, if there was an error return tibble with error message
289+
else {
290+
df_stat <-
291+
cards::nest_for_ard(data$variables, by = by, key = "...ard_no_one_will_ever_pick_this...") |>
292+
dplyr::select(-"...ard_no_one_will_ever_pick_this...") |>
293+
dplyr::mutate(
294+
variable = .env$variable,
295+
stat = list(NULL),
296+
warning = list(.env$stat[["warning"]]),
297+
error = list(.env$stat[["error"]])
298+
)
299+
}
300+
}
301+
302+
df_stat |>
303+
dplyr::mutate(stat_name = .env$stat_name)
304+
}

_pkgdown.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ reference:
4444
- ard_regression
4545
- ard_regression_basic
4646
- ard_smd
47+
- ard_svycontinuous
4748
- ard_vif
4849

4950
- title: "Helpers"

inst/WORDLIST

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,11 @@ Xin
1717
agresti
1818
clopper
1919
coull
20+
deff
2021
funder
2122
jeffreys
2223
pearson
24+
sd
2325
strat
2426
vif
2527
wald

0 commit comments

Comments
 (0)