-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathcodelists.R
More file actions
347 lines (320 loc) · 12.6 KB
/
codelists.R
File metadata and controls
347 lines (320 loc) · 12.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
#' Dash to Equation
#'
#' Converts strings that are #-# style to a logical expression (but in a string format)
#' @param string
#'
#' @return string
#' @noRd
dash_to_eq <- function(string) {
front <- str_extract(string, "^.*(?=\\-)")
front_eq <- if_else(str_detect(front, "<|>|="), front, paste0(">=", front))
back <- str_extract(string, "(?<=\\-).*$")
back_eq <- if_else(str_detect(back, "<|>|="), back, paste0("<=", back))
paste0("x", front_eq, " & x", back_eq)
}
#' Create Subgroups
#'
#' @param ref_vec Vector of numeric values
#' @param grp_defs Vector of strings with groupings defined. Format must be
#' either: <00, >=00, 00-00, or 00-<00
#' @param grp_labs Vector of strings with labels defined. The labels correspond
#' to the associated `grp_defs`. i.e., "12-17" may translate to "12-17 years".
#' If no `grp_labs` specified then `grp_defs` will be used.
#'
#' @return Character vector of the values in the subgroups
#' @export
#'
#' @examples
#' create_subgrps(c(1:10), c("<2", "2-5", ">5"))
#' create_subgrps(c(1:10), c("<=2", ">2-5", ">5"))
#' create_subgrps(c(1:10), c("<2", "2-<5", ">=5"))
#' create_subgrps(c(1:10), c("<2", "2-<5", ">=5"), c("<2 years", "2-5 years", ">=5 years"))
create_subgrps <- function(ref_vec, grp_defs, grp_labs = NULL) {
if (!is.numeric(ref_vec)) {
cli_abort("ref_vec must be numeric")
}
if (is.null(grp_labs)) {
grp_labs <- grp_defs
}
# Create equations used to derive the subgroups
equations <- case_when(
str_detect(grp_defs, "-") ~ paste0("function(x){if_else(", dash_to_eq(grp_defs), ", '", grp_labs, "', '')}"),
str_detect(grp_defs, "^(<\\s?=|>\\s?=|<|>)\\s?\\d+") ~ paste0("function(x){if_else(x", grp_defs, ",'", grp_labs, "', '')}"),
TRUE ~ NA_character_
)
# Apply equations
if (all(!is.na(equations))) {
functions <- equations %>%
map(~ eval(parse(text = .)))
out <- functions %>%
map(~ .(ref_vec)) %>%
reduce(str_c) %>%
replace(. == "", NA)
} else {
na_index <- which(is.na(equations))
bad_defs <- grp_defs[na_index]
cli_abort(paste(
"Unable to decipher the following group definition{?s}: {bad_defs}.",
"Please check your controlled terminology."
))
}
# Find non-exclusive subgroups i.e., values that have been mapped to two groups
non_excl <- out |>
discard(is.na) |>
map(~ grp_labs[str_detect(.x, grp_labs)]) |>
keep(~ length(.) > 1) |>
unique()
# Throw error if groups are not exclusive
if (length(non_excl) > 0) {
msg <- map_chr(non_excl, ~ {
items <- paste(.x, collapse = ", ")
}) %>%
paste0(seq_along(.), ". ", .)
cli_abort(c(
"Group definitions are not exclusive. Please check your controlled terminology",
"The following group definitions overlap:",
msg
))
}
out
}
#' Create Variable from Codelist
#'
#' This functions uses code/decode pairs from a metacore object to create new
#' variables in the data
#'
#' @param data Dataset that contains the input variable
#' @param metacore A metacore object to get the codelist from. This should be a
#' subsetted metacore object (of subclass `DatasetMeta`) created using
#' `metacore::select_dataset`.
#' @param input_var Name of the variable that will be translated for the new
#' column
#' @param out_var Name of the output variable. Note: Unless a codelist is provided
#' the grouping will always be from the code of the codelist associates with
#' `out_var`.
#' @param codelist Optional argument to supply a codelist. Must be a data.frame
#' with `code` and `decode` columns such as those created by the function
#' `metacore::get_control_term`. If no codelist is provided the codelist
#' associated with the column supplied to `out_var` will be used. By default
#' `codelist` is `NULL`.
#' @param decode_to_code Direction of the translation. Default value is `TRUE`,
#' i.e., assumes the `input_var` is the decode column of the codelist.
#' Set to `FALSE` if the `input_var` is the code column of the codelist.
#' @param strict A logical value indicating whether to perform strict checking
#' against the codelist. If `TRUE` will issue a warning if values in the `input_var`
#' column are not present in the codelist. If `FALSE` no warning is issued and
#' values not present in the codelist will likely result in `NA` results.
#'
#' @return Dataset with a new column added
#' @export
#'
#' @examples
#' library(metacore)
#' library(tibble)
#' data <- tribble(
#' ~USUBJID, ~VAR1, ~VAR2,
#' 1, "M", "Male",
#' 2, "F", "Female",
#' 3, "F", "Female",
#' 4, "U", "Unknown",
#' 5, "M", "Male",
#' )
#' spec <- spec_to_metacore(metacore_example("p21_mock.xlsx"), quiet = TRUE)
#' dm_spec <- select_dataset(spec, "DM", quiet = TRUE)
#' create_var_from_codelist(data, dm_spec, VAR2, SEX)
#' create_var_from_codelist(data, dm_spec, "VAR2", "SEX")
#' create_var_from_codelist(data, dm_spec, VAR1, SEX, decode_to_code = FALSE)
#'
#' # Example providing a custom codelist
#' # This example also reverses the direction of translation
#' load(metacore_example("pilot_ADaM.rda"))
#' adlb_spec <- select_dataset(metacore, "ADLBC", quiet = TRUE)
#' adlb <- tibble(PARAMCD = c("ALB", "ALP", "ALT", "AST", "BILI", "BUN"))
#' create_var_from_codelist(
#' adlb,
#' adlb_spec,
#' PARAMCD,
#' PARAM,
#' codelist = get_control_term(adlb_spec, PARAMCD),
#' decode_to_code = FALSE,
#' strict = FALSE
#' )
#'
#' \dontrun{
#' # Example expecting warning where `strict` == `TRUE`
#' adlb <- tibble(PARAMCD = c("ALB", "ALP", "ALT", "AST", "BILI", "BUN", "DUMMY1", "DUMMY2"))
#' create_var_from_codelist(
#' adlb,
#' adlb_spec,
#' PARAMCD,
#' PARAM,
#' codelist = get_control_term(adlb_spec, PARAMCD),
#' decode_to_code = FALSE,
#' strict = TRUE
#' )
#' }
create_var_from_codelist <- function(data, metacore, input_var, out_var, codelist = NULL,
decode_to_code = TRUE, strict = TRUE) {
verify_DatasetMeta(metacore)
# Use codelist if provided, else use codelist of the out_var
if (!missing(codelist)) {
code_translation <- codelist
} else {
code_translation <- get_control_term(metacore, {{ out_var }})
}
if (is.vector(code_translation) | !("decode" %in% names(code_translation))) {
cli_abort("Expecting 'code_decode' type of control terminology. Actual \\
type is {typeof(code_translation)}. Check the structure of the codelist in the \\
{.obj metacore} object using {.fn View}.")
}
# Check decode_to_code is logical and set direction of translation
if (!is_logical(decode_to_code)) {
cli_abort("{.arg decode_to_code} must be either TRUE or FALSE.")
}
ref_var <- if (decode_to_code) "decode" else "code"
new_var <- if (decode_to_code) "code" else "decode"
# Pull data values and codelist values to check inconsistent overlap
values <- data |> pull({{ input_var }})
codelist <- code_translation |> pull(ref_var)
miss <- setdiff(values, codelist)
n_miss <- length(miss)
if (strict == TRUE && n_miss > 0) {
cli_warn(
"In {.fn create_var_from_codelist}: The following {qty(n_miss)}value{?s}
present in the input dataset {qty(n_miss)}{?is/are} not present in the codelist: {miss}"
)
}
input_var_str <- as_label(enexpr(input_var)) |>
str_remove_all("\"")
# Coerce join column to character to ensure join if input var is numeric
data <- data |> mutate(merge_on := as.character(.data[[input_var_str]]))
code_translation <- code_translation |>
mutate(
decode = as.character(decode),
code = as.character(code)
)
out <- data |>
left_join(code_translation, by = set_names(ref_var, "merge_on")) |>
rename({{ out_var }} := !!sym(new_var)) |>
select(-merge_on)
# Optionally coerce to numeric if the output values are numeric
if (all(str_detect(code_translation[[new_var]], "^\\d*$"))) {
out <- out |>
mutate({{ out_var }} := as.numeric({{ out_var }}))
}
out
}
#' Create Categorical Variable from Codelist
#'
#' Using the grouping from either the `decode_var` or `code_var` and a reference
#' variable (`ref_var`) it will create a categorical variable and the numeric
#' version of that categorical variable.
#'
#' @param data Dataset with reference variable in it
#' @param metacore A metacore object to get the codelist from. If the
#' variable has different codelists for different datasets the metacore object
#' will need to be subsetted using `select_dataset` from the metacore package.
#' @param ref_var Name of variable to be used as the reference i.e AGE when
#' creating AGEGR1
#' @param grp_var Name of the new grouped variable
#' @param num_grp_var Name of the new numeric decode for the grouped variable.
#' This is optional if no value given no variable will be created
#' @param create_from_decode Sets the `decode` column of the codelist as the column
#' from which the variable will be created. By default the column is `code`.
#' @param strict A logical value indicating whether to perform strict checking
#' against the codelist. If `TRUE` will issue a warning if values in the `ref_var`
#' column do not fit into the group definitions for the codelist in `grp_var`.
#' If `FALSE` no warning is issued and values not defined by the codelist will
#' likely result in `NA` results.
#'
#' @return dataset with new column added
#' @export
#' @examples
#' library(metacore)
#' library(haven)
#' library(dplyr)
#' load(metacore_example("pilot_ADaM.rda"))
#' spec <- metacore %>% select_dataset("ADSL")
#' dm <- read_xpt(metatools_example("dm.xpt")) %>%
#' select(USUBJID, AGE)
#' # Grouping Column Only
#' create_cat_var(dm, spec, AGE, AGEGR1)
#' # Grouping Column and Numeric Decode
#' create_cat_var(dm, spec, AGE, AGEGR1, AGEGR1N)
create_cat_var <- function(data, metacore, ref_var, grp_var, num_grp_var = NULL,
create_from_decode = FALSE, strict = TRUE) {
verify_DatasetMeta(metacore)
ct <- get_control_term(metacore, {{ grp_var }})
if (is.vector(ct) | !("decode" %in% names(ct))) {
cli_abort("Expecting 'code_decode' type of control terminology. Please check metacore object")
}
# Assign group definitions and labels
grp_defs <- pull(ct, code)
grp_labs <- if (create_from_decode) pull(ct, decode) else grp_defs
out <- data %>%
mutate({{ grp_var }} := create_subgrps({{ ref_var }}, grp_defs, grp_labs))
if (!is.null(enexpr(num_grp_var))) {
out <- out %>%
create_var_from_codelist(metacore, {{ grp_var }}, {{ num_grp_var }})
}
missing <- out |>
pull({{ grp_var }}) |>
is.na() |>
which() |>
length()
if (strict && missing > 0) {
cli_warn(paste(
"There {qty(missing)} {?is/are} {missing} {qty(missing)} observation{?s}",
"in {as_name(enquo(ref_var))} that {qty(missing)} {?does/do} not fit into",
"the provided categories for {as_name(enquo(grp_var))}. Please check your",
"controlled terminology."
))
}
out
}
#' Convert Variable to Factor with Levels Set by Control Terms
#'
#' This functions takes a dataset, a metacore object and a variable name. Then
#' looks at the metacore object for the control terms for the given variable and
#' uses that to convert the variable to a factor with those levels. If the
#' control terminology is a code list, the code column will be used. The
#' function fails if the control terminology is an external library
#' @param data A dataset containing the variable to be modified
#' @param metacore A metacore object to get the codelist from. If the
#' variable has different codelists for different datasets the metacore object
#' will need to be subsetted using `select_dataset` from the metacore package
#' @param var Name of variable to change
#'
#' @return Dataset with variable changed to a factor
#' @export
#'
#' @examples
#' library(metacore)
#' library(haven)
#' library(dplyr)
#' load(metacore_example("pilot_ADaM.rda"))
#' spec <- metacore %>% select_dataset("ADSL")
#' dm <- read_xpt(metatools_example("dm.xpt")) %>%
#' select(USUBJID, SEX, ARM)
#' # Variable with codelist control terms
#' convert_var_to_fct(dm, spec, SEX)
#' # Variable with permitted value control terms
#' convert_var_to_fct(dm, spec, ARM)
convert_var_to_fct <- function(data, metacore, var) {
verify_DatasetMeta(metacore)
code_translation <- get_control_term(metacore, {{ var }})
var_str <- as_label(enexpr(var)) %>%
str_remove_all("\"")
if (is.vector(code_translation)) {
levels <- code_translation
} else if ("code" %in% names(code_translation)) {
levels <- code_translation$code
} else {
stop("We currently don't have the ability to use external libraries")
}
if (!var_str %in% names(data)) {
stop(paste(var_str, "cannot be found in the dataset. Please create variable before converting to factor"))
}
data %>%
mutate({{ var }} := factor({{ var }}, levels = levels))
}