pharmaversesdtm/data-raw/create_sdtms_data.R at 8164c8a03ff588adfadbd08c7496d4fa43b065d6 · pharmaverse/pharmaversesdtm · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#' Script to Automatically Generate R Documentation Files for Datasets
#'
#' This script generates R documentation files for datasets in the 'pharmaversesdtm' package.
#' It reads metadata from a JSON file ('inst/extdata/sdtms-specs.json') to create properly formatted
#' documentation. The script ensures consistency and reduces manual intervention by automating the process.
#'
#' ## Key Features
#' - Reads metadata from a structured JSON file.
#' - Automatically generates `.R` documentation files in the 'R' directory,
#'   including xxTEST and xxTESTCD details in a table when available.
#' - Handles missing or incomplete metadata with sensible defaults.
#' - Provides hyperlinks to dataset sources in the documentation.
#'
#' ## Dependencies
#' - `jsonlite`: For reading JSON metadata.
#' - `roxygen2`: For generating the final documentation from `.R` files.
#'
#' ## Usage Notes
#' - Do not manually edit the generated files, as they are subject to automatic regeneration.
#' - Update the JSON metadata file for changes and rerun this script.

# Load required library
library(jsonlite)
library(metatools)
library(cli)
library(stringr)
library(dplyr)

# Load metadata from JSON file
specs <- fromJSON("inst/extdata/sdtms-specs.json")$`sdtms-specs`

# Helper function to retrieve the label attribute for a column
#' @description Retrieve column label attribute or return a default.
#' @param data The dataset containing the column.
#' @param col_name The name of the column.
#' @return A string containing the label attribute or "undocumented field".
get_attr <- function(data, col_name) {
  att <- attr(data[[col_name]], "label")
  if (is.null(att) || att == "null") {
    att <- "undocumented field"
  }
  return(att)
}

#' @description Create an HTML hyperlink for use in documentation, return plain text if the source is text,
#' or a default message if the URL is invalid.
#' @param url The URL to link to or plain text if the source is not a URL.
#' @param link_text Optional text for the hyperlink. Defaults to an empty string.
#' @return A string containing the HTML anchor tag, plain text if the source is text, or a default message for invalid URLs.
generate_hyperlink <- function(url, link_text = "") {
  # Define a basic regular expression for URL validation
  url_pattern <- "^(https?://)"

  # Check if the input is a valid URL
  if (!nzchar(url) || is.null(url)) {
    return("The source is inaccessible.")
  } else if (!str_detect(string = url, pattern = url_pattern)) {
    # If the source is plain text (not a URL), return it as-is
    return(url)
  }

  # Generate and return the hyperlink if the URL is valid
  paste0("[Access the source of the ", link_text, " dataset.](", url, ")")
}

# Helper function to retrieve the dataset keyword to group by TA
#' @description Retrieve the dataset keyword to group datasets by TA.
#' @param dataset_name The name of the dataset.
#' @param specs The SDTM dataset specs
get_dataset_keyword <- function(dataset_name, specs) {
  # Check that dataset_name exists in specs$name
  if (!dataset_name %in% specs$name) {
    cli_abort("The dataset {.val {dataset_name}} is not present in {.field specs$name}.")
  }

  # Extract row for this dataset
  meta_row <- specs[specs$name == dataset_name, ]

  # Check that therapeutic_area is populated
  ta <- meta_row$therapeutic_area

  if (is.null(ta) || is.na(ta) || identical(ta, "") || nchar(trimws(ta)) == 0) {
    cli_abort(
      "The field {.field therapeutic_area} is missing or empty for dataset {.val {dataset_name}}."
    )
  }

  return(tolower(ta))
}

# Main function to write documentation for a dataset
#' @description Generate R documentation for a dataset.
#' @param data The dataset object.
#' @param dataset_name The name of the dataset.
#' @param dataset_label The label for the dataset. Defaults to "No label available".
#' @param dataset_description A description of the dataset. Defaults to "No description available".
#' @param dataset_author The author of the dataset. Defaults to NULL if not available.
#' @param dataset_source The source of the dataset, as a URL or text. Defaults to "No source available".
#' @param dataset_testnames Test names and test codes available as a dataset. Defaults to NULL if not available.
#' @param dataset_keyword A description of Therapeutic Area
write_doc <- function(data, dataset_name, dataset_label = "No label available",
                      dataset_description = "No description available", dataset_author = NULL,
                      dataset_source = "No source available", dataset_testnames = NULL,
                      dataset_keyword = "generic") {
  dataset_source <- generate_hyperlink(dataset_source, dataset_label)
  doc_string <- paste(
    "# This file is automatically generated by data-raw/create_sdtms_data.R.",
    "# For updating it please edit inst/extdata/sdtms-specs.json and rerun create_sdtms_data.R.",
    "# Manual edits are not recommended, as changes may be overwritten.",
    sprintf("#' %s", dataset_label),
    "#'",
    sprintf("#' %s", dataset_description),
    "#'",
    sprintf("#' @name %s", dataset_name),
    sprintf("#' @title %s", dataset_label),
    sprintf("#' @keywords dataset %s", dataset_keyword),
    sprintf("#' @description %s", dataset_description),
    "#' @docType data",
    sprintf("#' @format A data frame with %s columns:", ncol(data)),
    "#'   \\describe{",
    paste(sapply(names(data), function(col_name) {
      paste(sprintf("#'     \\item{%s}{%s}", col_name, get_attr(data, col_name)))
    }, USE.NAMES = FALSE), collapse = "\n"),
    "#'   }",
    "#'",
    sprintf("#' @source %s", dataset_source),
    sep = "\n"
  )

  if (!is.null(dataset_author) && dataset_author != "") {
    doc_string <- paste(doc_string, sprintf("#' @author %s", dataset_author), sep = "\n")
  }

  if (!is.null(dataset_testnames) && dataset_testnames != "") {
    doc_string <- paste(doc_string, sprintf("#' @details %s", dataset_testnames), sep = "\n")
  }

  doc_string <- paste(doc_string, sprintf("\"%s\"", dataset_name), sep = "\n")
  writeLines(doc_string, con = file.path("R", paste0(dataset_name, ".R")))
}

# List datasets and generate documentation
datasets <- data(package = "pharmaversesdtm")$results[, "Item"]
for (dataset_name in datasets) {
  data(list = dataset_name, package = "pharmaversesdtm")
  dataset <- get(dataset_name)
  metadata <- specs[specs$name == dataset_name, ]

  # For each dataset, save a CSV version in inst/extdata/
  write.csv(dataset, file = file.path("inst/extdata/", paste0(dataset_name, ".csv")), row.names = FALSE)

  # Add Test Codes and Test Names details in a table
  # Identify column names ending in TEST and TESTCD
  test_col <- names(dataset)[str_detect(string = names(dataset), pattern = "TEST$")]
  # names(dataset)[str_detect("TEST$", names(dataset))]
  testcd_col <- names(dataset)[str_detect(string = names(dataset), pattern = "TESTCD$")]
  # names(dataset)[str_detect("TESTCD$", names(dataset))]

  if (length(test_col) == 1 && length(testcd_col) == 1) {
    # Check both columns exist
    unique_tests <- unique(dataset[c(testcd_col, test_col)])
    unique_tests <- unique_tests[order(unique_tests[[testcd_col]]), ]

    tabular <- function(df, ...) {
      stopifnot(is.data.frame(df))

      align <- function(x) if (is.numeric(x)) "r" else "l"
      col_align <- vapply(df, align, character(1))

      cols <- lapply(df, format, ...)
      contents <- do.call(
        "paste",
        c(cols, list(sep = " \\tab ", collapse = "\\cr\n#'   "))
      )

      paste(sprintf("Contains a set of %d unique Test Short Name%s and Test Name%s: ", nrow(unique_tests), if_else(nrow(unique_tests) == 1, "", "s"), if_else(nrow(unique_tests) == 1, "", "s")),
        "\\tabular{", paste(col_align, collapse = ""), "}{\n#'   ",
        paste0("\\strong{", names(df), "}", sep = "", collapse = " \\tab "), " \\cr\n#'   ",
        trimws(contents), "\n#' }\n",
        sep = ""
      )
    }

    testnames <- tabular(unique_tests)
  } else {
    testnames <- NULL
  }

  if (nrow(metadata) == 0) {
    cli_warn(sprintf("No metadata found for %s - using default values.", dataset_name), call. = FALSE)
    dataset_label <- "No label available"
    dataset_description <- "No description available"
    dataset_author <- NULL
    dataset_source <- "No source available"
    dataset_testnames <- NULL
    # Add Therapeutic area keyword to the dataset name
    dataset_keyword <- NULL
  } else {
    dataset_label <- if_else(!is.na(metadata$label), metadata$label, "No label available")
    dataset_description <- if_else(!is.na(metadata$description), metadata$description, "No description available")
    dataset_author <- if (!is.na(metadata$author) && metadata$author != "") metadata$author else NULL
    dataset_source <- if_else(!is.na(metadata$source), metadata$source, "No source available")
    dataset_testnames <- if (!is.null(testnames) && testnames != "") testnames else NULL
    # Add Therapeutic area keyword to the dataset name
    dataset_keyword <- get_dataset_keyword(dataset_name, specs)
  }
  # Write the R documentation
  write_doc(dataset, dataset_name, dataset_label, dataset_description, dataset_author, dataset_source, dataset_testnames, dataset_keyword)
}

# Finalize the documentation
roxygen2::roxygenize(".", roclets = c("rd", "collate", "namespace"))