cmu-delphi · dshemetov · Mar 17, 2025 · Mar 19, 2025 · Mar 19, 2025 · Mar 19, 2025
diff --git a/R/new_epipredict_steps/layer_yeo_johnson.R b/R/new_epipredict_steps/layer_yeo_johnson.R
@@ -19,34 +19,28 @@
 #'   filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>%
 #'   select(geo_value, time_value, cases)
 #'
-#' pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000))
-#'
+#' # Create a recipe with a Yeo-Johnson transformation.
 #' r <- epi_recipe(jhu) %>%
-#'   step_epi_YeoJohnson(
-#'     df = pop_data,
-#'     df_pop_col = "value",
-#'     by = c("geo_value" = "states"),
-#'     cases, suffix = "_scaled"
-#'   ) %>%
-#'   step_epi_lag(cases_scaled, lag = c(0, 7, 14)) %>%
-#'   step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") %>%
+#'   step_epi_YeoJohnson(cases) %>%
+#'   step_epi_lag(cases, lag = 0) %>%
+#'   step_epi_ahead(cases, ahead = 0, role = "outcome") %>%
 #'   step_epi_naomit()
 #'
+#' # Create a frosting layer that will undo the Yeo-Johnson transformation.
 #' f <- frosting() %>%
 #'   layer_predict() %>%
-#'   layer_threshold(.pred) %>%
-#'   layer_naomit(.pred) %>%
-#'   layer_epi_YeoJohnson(.pred,
-#'     df = pop_data,
-#'     by = c("geo_value" = "states"),
-#'     df_pop_col = "value"
-#'   )
+#'   layer_epi_YeoJohnson(.pred)
 #'
+#' # Create a workflow and fit it.
 #' wf <- epi_workflow(r, linear_reg()) %>%
 #'   fit(jhu) %>%
 #'   add_frosting(f)
 #'
+#' # Forecast the workflow, which should reverse the Yeo-Johnson transformation.
 #' forecast(wf)
+#' # Compare to the original data.
+#' plot(density(jhu$cases))
+#' plot(density(forecast(wf)$cases))
 layer_epi_YeoJohnson <- function(frosting, ..., lambdas = NULL, by = NULL, id = rand_id("epi_YeoJohnson")) {
   checkmate::assert_tibble(lambdas, min.rows = 1, null.ok = TRUE)
 
@@ -116,28 +110,21 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data,
     unmatched = c("error", "drop")
   )
 
-  # TODO: There are many possibilities here:
-  # - (a) the terms can be empty, where we should probably default to
-  #   all_outcomes().
-  # - (b) explicitly giving all_outcomes(), we end here with terms being empty,
-  #   which doesn't seem right; need to make sure we pull in all the outcome
-  #   columns here. The question is what form should they have?
-  # - (c) if the user just specifies .pred, then we have to infer the outcome
-  #   from the mold, which is simple enough and the main case I have working.
-  # - (d) the user might specify outcomes of the form .pred_ahead_1_cases,
-  #   .pred_ahead_7_cases, etc. Is that the right format? Trying those out now
-  #   and getting errors downstream from forecast().
-  # Get the columns to transform.
   exprs <- rlang::expr(c(!!!object$terms))
   pos <- tidyselect::eval_select(exprs, components$predictions)
   col_names <- names(pos)
 
-  # For every column, we need to use the appropriate lambda column, which differs per row.
-  # Note that yj_inverse() is vectorized in x, but not in lambda.
+  # The `object$terms` is where the user specifies the columns they want to
+  # untransform. We need to match the outcomes with their lambda columns in our
+  # parameter table and then apply the inverse transformation.
   if (identical(col_names, ".pred")) {
-    # In this case, we don't get a hint for the outcome column name, so we need to
-    # infer it from the mold. `outcomes` is a vector of objects like
-    # ahead_1_cases, ahead_7_cases, etc. We want to extract the cases part.
+    # In this case, we don't get a hint for the outcome column name, so we need
+    # to infer it from the mold.
+    if (length(components$mold$outcomes) > 1) {
+      cli_abort("Only one outcome is allowed when specifying `.pred`.", call = rlang::caller_env())
+    }
+    # `outcomes` is a vector of objects like ahead_1_cases, ahead_7_cases, etc.
+    # We want to extract the cases part.
     outcome_cols <- names(components$mold$outcomes) %>%
       stringr::str_match("ahead_\\d+_(.*)") %>%
       magrittr::extract(, 2)
@@ -146,8 +133,14 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data,
       rowwise() %>%
       mutate(.pred := yj_inverse(.pred, !!sym(paste0(".lambda_", outcome_cols))))
   } else if (identical(col_names, character(0))) {
-    # In this case, we should assume the user wants to transform all outcomes.
-    cli::cli_abort("Not specifying columns to layer Yeo-Johnson is not implemented yet.", call = rlang::caller_env())
+    # Wish I could suggest `all_outcomes()` here, but currently it's the same as
+    # not specifying any terms. I don't want to spend time with dealing with
+    # this case until someone asks for it.
+    cli::cli_abort("Not specifying columns to layer Yeo-Johnson is not implemented.
+    If you had a single outcome, you can use `.pred` as a column name.
+    If you had multiple outcomes, you'll need to specify them like
+    `.pred_ahead_1_<outcome_col>`, `.pred_ahead_7_<outcome_col>`, etc.
+    ", call = rlang::caller_env())
   } else {
     # In this case, we assume that the user has specified the columns they want
     # transformed here. We then need to determine the lambda columns for each of
@@ -157,7 +150,9 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data,
     original_outcome_cols <- str_match(col_names, ".pred_ahead_\\d+_(.*)")[, 2]
     outcomes_wout_ahead <- str_match(names(components$mold$outcomes), "ahead_\\d+_(.*)")[,2]
     if (any(original_outcome_cols %nin% outcomes_wout_ahead)) {
-      cli_abort("All columns specified in `...` must be outcome columns.", call = rlang::caller_env())
+      cli_abort("All columns specified in `...` must be outcome columns.
+      They must be of the form `.pred_ahead_1_<outcome_col>`, `.pred_ahead_7_<outcome_col>`, etc.
+      ", call = rlang::caller_env())
     }
 
     for (i in seq_along(col_names)) {
@@ -184,7 +179,8 @@ print.layer_epi_YeoJohnson <- function(x, width = max(20, options()$width - 30),
 
 #' Inverse Yeo-Johnson transformation
 #'
-#' Inverse of `yj_transform` in step_yeo_johnson.R.
+#' Inverse of `yj_transform` in step_yeo_johnson.R. Note that this function is
+#' vectorized in x, but not in lambda.
 #'
 #' @keywords internal
 yj_inverse <- function(x, lambda, eps = 0.001) {
@@ -247,3 +243,16 @@ get_lambdas_in_layer <- function(workflow) {
   }
   lambdas
 }
+
+get_transformed_cols_in_layer <- function(workflow) {
+  this_recipe <- hardhat::extract_recipe(workflow)
+  if (!(this_recipe %>% recipes::detect_step("epi_YeoJohnson"))) {
+    cli_abort("`layer_epi_YeoJohnson` requires `step_epi_YeoJohnson` in the recipe.", call = rlang::caller_env())
+  }
+  for (step in this_recipe$steps) {
+    if (inherits(step, "step_epi_YeoJohnson")) {
+      lambdas <- step$lambdas
+      break
+    }
+  }
+}
diff --git a/R/new_epipredict_steps/step_yeo_johnson.R b/R/new_epipredict_steps/step_yeo_johnson.R
@@ -3,17 +3,16 @@
 #' `step_epi_YeoJohnson()` creates a *specification* of a recipe step that will
 #' transform data using a Yeo-Johnson transformation. This fork works with panel
 #' data and is meant for epidata.
-#' TODO: Do an edit pass on this docstring.
 #'
 #' @inheritParams step_center
-#' @param lambdas A numeric vector of transformation values. This
+#' @param lambdas Internal. A numeric vector of transformation values. This
 #'  is `NULL` until computed by [prep()].
 #' @param na_lambda_fill A numeric value to fill in for any
 #'  geos where the lambda cannot be estimated.
 #' @param limits A length 2 numeric vector defining the range to
 #'  compute the transformation parameter lambda.
-#' @param num_unique An integer where data that have less possible
-#'  values will not be evaluated for a transformation.
+#' @param num_unique An integer where data that have fewer than this
+#'  many unique values will not be evaluated for a transformation.
 #' @param na_rm A logical indicating whether missing values should be
 #'  removed.
 #' @param skip A logical. Should the step be skipped when the recipe is
@@ -22,11 +21,13 @@
 #' @template step-return
 #' @family individual transformation steps
 #' @export
-#' @details The Yeo-Johnson transformation is very similar to the
-#'  Box-Cox but does not require the input variables to be strictly
-#'  positive. In the package, the partial log-likelihood function is
-#'  directly optimized within a reasonable set of transformation
-#'  values (which can be changed by the user).
+#' @details The Yeo-Johnson transformation is variance-stabilizing
+#'  transformation, similar to the Box-Cox but does not require the input
+#'  variables to be strictly positive. In the package, the partial
+#'  log-likelihood function is directly optimized within a reasonable set of
+#'  transformation values (which can be changed by the user). The optimization
+#'  finds a lambda parameter for each group in the data that minimizes the
+#'  variance of the transformed data.
 #'
 #' This transformation is typically done on the outcome variable
 #'  using the residuals for a statistical model (such as ordinary
@@ -36,7 +37,7 @@
 #'  variable distributions more symmetric.
 #'
 #' If the transformation parameters are estimated to be very
-#'  closed to the bounds, or if the optimization fails, a value of
+#'  close to the bounds, or if the optimization fails, a value of
 #'  `NA` is used and no transformation is applied.
 #'
 #' # Tidying
@@ -54,28 +55,24 @@
 #'
 #' @references Yeo, I. K., and Johnson, R. A. (2000). A new family of power
 #'   transformations to improve normality or symmetry. *Biometrika*.
-#' @examplesIf rlang::is_installed("modeldata")
-#' data(biomass, package = "modeldata")
+#' @examplesIf
+#' jhu <- cases_deaths_subset %>%
+#'   filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>%
+#'   select(geo_value, time_value, cases)
+#' filtered_data <- jhu
 #'
-#' biomass_tr <- biomass[biomass$dataset == "Training", ]
-#' biomass_te <- biomass[biomass$dataset == "Testing", ]
-#'
-#' rec <- recipe(
-#'   HHV ~ carbon + hydrogen + oxygen + nitrogen + sulfur,
-#'   data = biomass_tr
-#' )
-#'
-#' yj_transform <- step_epi_YeoJohnson(rec, all_numeric())
-#'
-#' yj_estimates <- prep(yj_transform, training = biomass_tr)
-#'
-#' yj_te <- bake(yj_estimates, biomass_te)
-#'
-#' plot(density(biomass_te$sulfur), main = "before")
-#' plot(density(yj_te$sulfur), main = "after")
-#'
-#' tidy(yj_transform, number = 1)
-#' tidy(yj_estimates, number = 1)
+#' r <- epi_recipe(filtered_data) %>%
+#'   step_epi_YeoJohnson(cases)
+#' # View the recipe
+#' r
+#' # Fit the recipe
+#' tr <- r %>% prep(filtered_data)
+#' # View the lambda values
+#' tr$steps[[1]]$lambdas
+#' # View the transformed data
+#' df <- tr %>% bake(filtered_data)
+#' plot(density(df$cases))
+#' plot(density(filtered_data$cases))
 step_epi_YeoJohnson <- function(
   recipe,
   ...,
@@ -266,6 +263,8 @@ get_lambdas_yj_table <- function(training, col_names, limits, num_unique, na_lam
 
 #' Internal Functions
 #'
+#' Note that this function is vectorized in x, but not in lambda.
+#'
 #' @keywords internal
 #' @rdname recipes-internal
 #' @export
@@ -314,14 +313,14 @@ yj_transform <- function(x, lambda, ind_neg = NULL, eps = 0.001) {
   x
 }
 
-
 ## Helper for the log-likelihood calc for eq 3.1 of Yeo, I. K.,
 ## & Johnson, R. A. (2000). A new family of power transformations
 ## to improve normality or symmetry. Biometrika. page 957
 ll_yj <- function(lambda, y, ind_neg, const, eps = 0.001) {
   n <- length(y)
   y_t <- yj_transform(y, lambda, ind_neg)
-  mu_t <- mean(y_t)
+  # EDIT: Unused in the original recipes code.
+  # mu_t <- mean(y_t)
   var_t <- var(y_t) * (n - 1) / n
   res <- -.5 * n * log(var_t) + (lambda - 1) * const
   res
@@ -361,6 +360,7 @@ estimate_yj <- function(dat, limits = c(-5, 5), num_unique = 5, na_rm = TRUE, ca
 
   const <- sum(sign(dat) * log(abs(dat) + 1))
 
+  suppressWarnings(
   res <- optimize(
     yj_obj,
     interval = limits,
@@ -370,6 +370,7 @@ estimate_yj <- function(dat, limits = c(-5, 5), num_unique = 5, na_rm = TRUE, ca
     const = const,
     tol = .0001
   )
+  )
   lam <- res$maximum
   if (abs(limits[1] - lam) <= eps | abs(limits[2] - lam) <= eps) {
     lam <- NA

diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R
@@ -77,21 +77,47 @@ test_that("Yeo-Johnson steps and layers invert each other", {
 })
 
 test_that("Yeo-Johnson steps and layers invert each other when other_keys are present", {
-  skip("TODO")
-  jhu <- cases_deaths_subset %>%
-    filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>%
-    select(geo_value, time_value, cases)
-  filtered_data <- jhu
+  # Small synthetic grad_employ_dataset version.
+  filtered_data <- tribble(
+    ~geo_value, ~age_group, ~edu_qual, ~time_value, ~med_income_2y,
+    "ca", "25-34", "bachelor", 2017, 50000,
+    "ca", "25-34", "bachelor", 2018, 50500,
+    "ca", "25-34", "bachelor", 2019, 51000,
+    "ca", "25-34", "bachelor", 2020, 51500,
+    "ca", "25-34", "bachelor", 2021, 52000,
+    "ca", "25-34", "bachelor", 2022, 52500,
+    "ca", "35-1000", "bachelor", 2017, 3e10,
+    "ca", "35-1000", "bachelor", 2018, 3e10 + 10,
+    "ca", "35-1000", "bachelor", 2019, 3e10 + 20,
+    "ca", "35-1000", "bachelor", 2020, 3e10 + 30,
+    "ca", "35-1000", "bachelor", 2021, 3e10 + 40,
+    "ca", "35-1000", "bachelor", 2022, 3e10 + 50,
+    "ca", "25-34", "master", 2017, 2 * 50000,
+    "ca", "25-34", "master", 2018, 2 * 50500,
+    "ca", "25-34", "master", 2019, 2 * 51000,
+    "ca", "25-34", "master", 2020, 2 * 51500,
+    "ca", "25-34", "master", 2021, 2 * 52000,
+    "ca", "25-34", "master", 2022, 2 * 52500,
+    "ca", "35-1000", "master", 2017, 2 * 3e10,
+    "ca", "35-1000", "master", 2018, 2 * (3e10 + 10),
+    "ca", "35-1000", "master", 2019, 2 * (3e10 + 20),
+    "ca", "35-1000", "master", 2020, 2 * (3e10 + 30),
+    "ca", "35-1000", "master", 2021, 2 * (3e10 + 40),
+    "ca", "35-1000", "master", 2022, 2 * (3e10 + 50)
+  ) %>% as_epi_df(other_keys = c("age_group", "edu_qual"))
 
   # Get some lambda values
   r <- epi_recipe(filtered_data) %>%
-    step_epi_YeoJohnson(cases) %>%
-    step_epi_lag(cases, lag = 0) %>%
-    step_epi_ahead(cases, ahead = 0, role = "outcome") %>%
+    step_epi_YeoJohnson(med_income_2y) %>%
+    step_epi_lag(med_income_2y, lag = 0) %>%
+    step_epi_ahead(med_income_2y, ahead = 0, role = "outcome") %>%
     step_epi_naomit()
   tr <- r %>% prep(filtered_data)
-  # Check for fixed lambda values
-  expect_true(all(near(tr$steps[[1]]$lambdas$.lambda_cases, c(0.856, 0.207), tol = 0.001)))
+  expect_true(".lambda_med_income_2y" %in% names(tr$steps[[1]]$lambdas))
+  expect_true("geo_value" %in% names(tr$steps[[1]]$lambdas))
+  expect_true("age_group" %in% names(tr$steps[[1]]$lambdas))
+  expect_true("edu_qual" %in% names(tr$steps[[1]]$lambdas))
+  expect_true(is.numeric(tr$steps[[1]]$lambdas$.lambda_med_income_2y))
 
   # Make sure that the inverse transformation works
   f <- frosting() %>%
@@ -100,7 +126,7 @@ test_that("Yeo-Johnson steps and layers invert each other when other_keys are pr
   wf <- epi_workflow(r, linear_reg()) %>%
     fit(filtered_data) %>%
     add_frosting(f)
-  out1 <- filtered_data %>% as_tibble() %>% slice_max(time_value, by = geo_value)
-  out2 <- forecast(wf) %>% rename(cases = .pred)
+  out1 <- filtered_data %>% as_tibble() %>% slice_max(time_value, by = geo_value) %>% select(geo_value, age_group, time_value, med_income_2y) %>% arrange(geo_value, age_group, time_value)
+  out2 <- forecast(wf) %>% rename(med_income_2y = .pred) %>% select(geo_value, age_group, time_value, med_income_2y) %>% arrange(geo_value, age_group, time_value)
   expect_equal(out1, out2)
 })