metatools/R/supp.R at b13c27de6ba8439043d39a771e8d2f6418cbb87d · pharmaverse/metatools · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
#' Build the observations for a single QNAM
#'
#' @param dataset Input dataset
#' @param qnam  QNAM value
#' @param qlabel QLABEL value
#' @param idvar IDVAR variable name (provided as a string)
#' @param qeval QEVAL value to be populated for this QNAM
#' @param qorig QORIG value to be populated for this QNAM
#' @param verbose Character string controlling message verbosity. One of:
#'   \describe{
#'     \item{`"message"`}{Show both warnings and messages (default)}
#'     \item{`"warn"`}{Show warnings but suppress messages}
#'     \item{`"silent"`}{Suppress all warnings and messages}
#'   }
#'
#' @return Observations structured in SUPP format
#' @export
#'
#'
build_qnam <- function(dataset, qnam, qlabel, idvar, qeval, qorig,
                       verbose = c("message", "warn", "silent")) {
  verbose <- validate_verbose(verbose)

  # Need QNAM as a variable
  qval <- as.symbol(qnam)

  # DM won't have an IDVAR so handle that
  if (is.na(idvar) || idvar == "") {
    dataset <- dataset %>%
      mutate(IDVARVAL = idvar)
    idvarval <- sym("IDVARVAL")
  } else {
    idvarval <- as.symbol(idvar)
  }

  dup_sup <- dataset %>%
    select(STUDYID, RDOMAIN = DOMAIN, USUBJID, !!idvarval, !!qval) %>%
    rename(IDVARVAL = !!idvarval, QVAL = !!qval) %>%
    filter(!is.na(QVAL)) %>%
    mutate(
      IDVAR = idvar,
      QNAM = qnam,
      QLABEL = qlabel,
      QORIG = qorig,
      QEVAL = qeval
    )

  out <- dup_sup %>%
    distinct(STUDYID, RDOMAIN,
      USUBJID, IDVARVAL, QNAM,
      .keep_all = TRUE
    ) %>%
    select(
      STUDYID, RDOMAIN, USUBJID, IDVAR,
      IDVARVAL, QNAM, QLABEL, QVAL,
      QORIG, QEVAL
    )

  test_out <- dup_sup %>%
    distinct()
  if (nrow(out) != nrow(test_out)) {
    stop("The combination of STUDYID, RDOMAIN, USUBJID, IDVARVAL, QNAM is ambiguous. Consider modifying the IDVAR",
      call. = FALSE
    )
  }

  blank_test <- out %>%
    pull(QVAL)
  if (any(blank_test == "")) {
    if (check_message(verbose)) {
      message(paste0("Empty QVAL rows removed for QNAM = ", unique(out$QNAM)))
    }
    out <- out %>%
      filter(QVAL != "")
  }
  out
}


#' Make Supplemental Qualifier
#'
#' @param dataset dataset the supp will be pulled from
#' @param metacore A subsetted metacore object to get the supp information from.
#'   If not already subsetted then a `dataset_name` will need to be provided
#' @param dataset_name `r lifecycle::badge("deprecated")` Optional string to
#'   specify the dataset that is being built. This is only needed if the metacore
#'   object provided hasn't already been subsetted.\cr
#'   Note: Deprecated in version 0.2.0. The `dataset_name` argument will be removed
#'   in a future release. Please use `metacore::select_dataset` to subset the
#'   `metacore` object to obtain metadata for a single dataset.
#'
#' @return a CDISC formatted SUPP dataset
#' @export
#'
#' @examples
#'
#' library(metacore)
#' library(safetyData)
#' library(tibble)
#' load(metacore_example("pilot_SDTM.rda"))
#' spec <- metacore %>% select_dataset("AE")
#' ae <- combine_supp(sdtm_ae, sdtm_suppae)
#' make_supp_qual(ae, spec) %>% as_tibble()
make_supp_qual <- function(dataset, metacore, dataset_name = deprecated()) {
  if (is_present(dataset_name)) {
    lifecycle::deprecate_warn(
      when = "0.2.0",
      what = "make_supp_qual(dataset_name)",
      details = cli_inform(c("i" = col_red("The {.arg dataset_name} argument will be removed in a future release.
      Please use {.fn metacore::select_dataset} to subset the {.obj metacore} object to obtain
      metadata for a single dataset.")))
    )
    metacore <- make_lone_dataset(metacore, dataset_name)
  }
  verify_DatasetMeta(metacore)

  supp_vars <- metacore$ds_vars %>%
    filter(supp_flag)
  if (nrow(supp_vars) == 0) {
    stop("No supplemental variables specified in metacore object. Please check your specifications",
      call. = FALSE
    )
  }

  supp_meta <- supp_vars %>%
    select(dataset, variable) %>%
    left_join(metacore$var_spec, by = "variable") %>%
    left_join(metacore$value_spec, by = c("dataset", "variable")) %>%
    left_join(metacore$supp, by = c("dataset", "variable")) %>%
    select(
      qnam = variable, qlabel = label,
      qorig = origin, qeval = qeval,
      idvar = idvar
    ) %>%
    distinct() # Protection against bad specs
  # TODO Addin in checks/coercion for when combining cols of different types
  pmap_dfr(supp_meta, build_qnam, dataset = dataset) %>%
    arrange(USUBJID, QNAM, IDVARVAL)
}

#' Combine the Domain and Supplemental Qualifier
#'
#' @param dataset Domain dataset
#' @param supp Supplemental Qualifier dataset
#'
#' @return a dataset with the supp variables added to it
#' @export
#
#' @examples
#' library(safetyData)
#' library(tibble)
#' combine_supp(sdtm_ae, sdtm_suppae) %>% as_tibble()
combine_supp <- function(dataset, supp) {
  if (!is.data.frame(dataset) | !is.data.frame(supp)) {
    stop("You must supply a domain and supplemental dataset", call. = FALSE)
  }
  if (nrow(supp) == 0) {
    warning("Zero rows in supp, returning original dataset unchanged")
    return(dataset)
  }
  supp_cols <- c(
    "STUDYID", "RDOMAIN", "USUBJID", "IDVAR", "IDVARVAL",
    "QNAM", "QLABEL", "QVAL", "QORIG"
  )
  maybe <- c("QEVAL")
  ext_supp_col <- names(supp) %>% discard(~ . %in% c(supp_cols, maybe))
  mis_supp_col <- supp_cols %>% discard(~ . %in% names(supp))
  if (length(ext_supp_col) > 0 | length(mis_supp_col) > 0) {
    mess <- "Supplemental datasets need to comply with CDISC standards\n"
    ext <- if_else(length(ext_supp_col) > 0,
      paste0("The following columns need to be removed:\n", paste0(ext_supp_col, collapse = "\n")),
      ""
    )
    mis <- if_else(length(mis_supp_col) > 0,
      paste0("The following columns are missing:\n", paste0(mis_supp_col, collapse = "\n")),
      ""
    )
    stop(paste0(mess, ext, mis))
  }
  all_qnam <- unique(supp$QNAM)
  existing_qnam <- intersect(all_qnam, names(dataset))
  if (length(existing_qnam) > 0) {
    stop(
      "The following column(s) would be created by combine_supp(), but are already in the original dataset:\n  ",
      paste(existing_qnam, sep = ", ")
    )
  }

  # In order to prevent issues when there are multiple IDVARS we need to merge
  # each IDVAR into the domain seperately (otherwise there is problems when the
  # two IDVARS don't overlap)

  supp_wides_prep <-
    supp %>%
    select(-any_of(c("QLABEL", "QORIG", "QEVAL"))) %>% # Removing columns not for the main dataset
    rename(DOMAIN = RDOMAIN) %>%
    group_by(IDVAR, QNAM) %>% # For when there are multiple IDs
    group_split()

  supp_wides <- purrr::pmap(.l = list(supp = supp_wides_prep), .f = combine_supp_make_wide)
  ret <- reduce(.x = append(list(dataset), supp_wides), .f = combine_supp_join)
  ret$IDVARVAL <- NULL

  labels_to_add <- unique(supp[, c("QNAM", "QLABEL")])
  for (current_idx in seq_len(nrow(labels_to_add))) {
    current_col <- labels_to_add$QNAM[current_idx]
    current_label <- labels_to_add$QLABEL[current_idx]
    attr(ret[[current_col]], "label") <- current_label
  }

  ret
}

# Create a wide version of `supp` for merging into the source dataset.
combine_supp_make_wide <- function(supp) {
  stopifnot(length(unique(supp$IDVAR)) == 1)
  stopifnot(length(unique(supp$QNAM)) == 1)
  # Get the IDVAR value to allow for renaming of IDVARVAL
  id_var <- unique(supp$IDVAR)

  wide_x <-
    supp %>%
    pivot_wider(
      names_from = QNAM,
      values_from = QVAL
    )
  wide_x$QNAM <- unique(supp$QNAM)
  if (!is.na(id_var) && id_var != "") {
    wide_x <-
      wide_x %>%
      mutate(IDVARVAL = str_trim(as.character(IDVARVAL)))
  } else {
    wide_x$IDVARVAL <- NULL
  }
  wide_x
}

combine_supp_join <- function(dataset, supp) {
  current_idvar <- unique(supp$IDVAR)
  current_qnam <- unique(supp$QNAM)
  stopifnot(length(current_idvar) == 1)
  stopifnot(length(current_qnam) == 1)

  by <- intersect(names(supp), c("STUDYID", "DOMAIN", "USUBJID", "IDVARVAL"))
  supp_prep <- supp %>% select(-QNAM, -IDVAR)
  new_column <- setdiff(names(supp_prep), by)
  stopifnot(length(new_column) == 1)

  # Prepare IDVARVAL
  ret <- dataset
  if ("IDVARVAL" %in% by) {
    # Match the IDVARVAL column in supp
    ret$IDVARVAL <- str_trim(as.character(ret[[current_idvar]]))
  } else {
    # A dummy column that can be removed later
    ret$IDVARVAL <- FALSE
  }

  # Put the new data in
  if (new_column %in% names(dataset)) {
    # Patch the data
    mask_na_ret_before <- is.na(ret[[new_column]])
    ret_orig <- ret
    ret <- dplyr::rows_patch(x = ret, y = supp_prep, by = by)
    mask_na_ret_after <- is.na(ret[[new_column]])

    expected_na_difference <- sum(!is.na(supp_prep[[new_column]]))
    actual_na_difference <- sum(!mask_na_ret_after) - sum(!mask_na_ret_before)
    if (expected_na_difference != actual_na_difference) {
      stop(
        "An unexpected number of rows were replaced while merging QNAM ", current_qnam, " and IDVAR ", current_idvar,
        "\n  Please verify that your SUPP domain is valid SDTM with only one matched row per key column set"
      )
    }
  } else {
    # Verify that nothing will be missed
    missing <- anti_join(supp_prep, ret, by = by)

    # Add message for when there are rows in the supp that didn't get merged
    if (nrow(missing) > 0) {
      missing_display <- missing %>%
        dplyr::transmute(
          USUBJID,
          !!current_idvar := IDVARVAL
        )
      msg <- "Not all rows of SUPP were merged."
      cli::cli_alert_warning(msg)

      cli::cli_text("")
      cli::cli_text("The following rows are missing:")
      cli::cli_rule()

      print(missing_display)

      cli::cli_rule()
      warning(msg, call. = FALSE)
    }

    # join the data
    ret <- left_join(ret, supp_prep, by = by)
  }
  ret
}

#' Handles the combining of datasets and supps for a single IDVAR
#'
#' @param dataset Domain dataset
#' @param supp Supplemental Qualifier dataset with a single IDVAR
#'
#' @return list of datasets
#' @noRd
combine_supp_by_idvar <- function(dataset, supp) {
  # Get the IDVAR value to allow for renaming of IDVARVAL
  id_var <- unique(supp$IDVAR)

  wide_x <- supp %>%
    pivot_wider(
      names_from = QNAM,
      values_from = QVAL
    ) %>%
    select(-IDVAR)

  if (!is.na(id_var) && id_var != "") {
    id_var_sym <- sym(id_var)

    by <- c("STUDYID", "DOMAIN", "USUBJID", "IDVARVAL")
    wide_x <- wide_x %>%
      mutate(IDVARVAL = as.character(IDVARVAL) %>%
        str_trim())
    #  Make a dummy IDVARVAL variable to merge on, won't effect the dataset
    dataset_chr <- dataset %>%
      mutate(IDVARVAL = as.character(!!id_var_sym) %>%
        str_trim())

    out <- left_join(dataset_chr, wide_x,
      by = by
    ) %>%
      select(-IDVARVAL)
    missing <- anti_join(wide_x, dataset_chr, by = by)

    # Add message for when there are rows in the supp that didn't get merged
    if (nrow(missing) > 0) {
      missing_txt <- capture.output(missing %>%
        select(USUBJID, !!sym(id_var)) %>%
        print()) %>%
        paste0(collapse = "\n")
      stop(
        paste0(
          "Not all rows of the Supp were merged. The following rows are missing:\n",
          missing_txt
        ),
        call. = FALSE
      )
    }
  } else {
    wide_x <- wide_x %>%
      select(-IDVARVAL)
    out <- left_join(dataset, wide_x,
      by = c("STUDYID", "DOMAIN", "USUBJID")
    )
  }
  out
}