Skip to content

feat: add yeo-johnson step #179

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 20 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 47 additions & 38 deletions R/new_epipredict_steps/layer_yeo_johnson.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,34 +19,28 @@
#' filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>%
#' select(geo_value, time_value, cases)
#'
#' pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000))
#'
#' # Create a recipe with a Yeo-Johnson transformation.
#' r <- epi_recipe(jhu) %>%
#' step_epi_YeoJohnson(
#' df = pop_data,
#' df_pop_col = "value",
#' by = c("geo_value" = "states"),
#' cases, suffix = "_scaled"
#' ) %>%
#' step_epi_lag(cases_scaled, lag = c(0, 7, 14)) %>%
#' step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") %>%
#' step_epi_YeoJohnson(cases) %>%
#' step_epi_lag(cases, lag = 0) %>%
#' step_epi_ahead(cases, ahead = 0, role = "outcome") %>%
#' step_epi_naomit()
#'
#' # Create a frosting layer that will undo the Yeo-Johnson transformation.
#' f <- frosting() %>%
#' layer_predict() %>%
#' layer_threshold(.pred) %>%
#' layer_naomit(.pred) %>%
#' layer_epi_YeoJohnson(.pred,
#' df = pop_data,
#' by = c("geo_value" = "states"),
#' df_pop_col = "value"
#' )
#' layer_epi_YeoJohnson(.pred)
#'
#' # Create a workflow and fit it.
#' wf <- epi_workflow(r, linear_reg()) %>%
#' fit(jhu) %>%
#' add_frosting(f)
#'
#' # Forecast the workflow, which should reverse the Yeo-Johnson transformation.
#' forecast(wf)
#' # Compare to the original data.
#' plot(density(jhu$cases))
#' plot(density(forecast(wf)$cases))
layer_epi_YeoJohnson <- function(frosting, ..., lambdas = NULL, by = NULL, id = rand_id("epi_YeoJohnson")) {
checkmate::assert_tibble(lambdas, min.rows = 1, null.ok = TRUE)

Expand Down Expand Up @@ -116,28 +110,21 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data,
unmatched = c("error", "drop")
)

# TODO: There are many possibilities here:
# - (a) the terms can be empty, where we should probably default to
# all_outcomes().
# - (b) explicitly giving all_outcomes(), we end here with terms being empty,
# which doesn't seem right; need to make sure we pull in all the outcome
# columns here. The question is what form should they have?
# - (c) if the user just specifies .pred, then we have to infer the outcome
# from the mold, which is simple enough and the main case I have working.
# - (d) the user might specify outcomes of the form .pred_ahead_1_cases,
# .pred_ahead_7_cases, etc. Is that the right format? Trying those out now
# and getting errors downstream from forecast().
# Get the columns to transform.
exprs <- rlang::expr(c(!!!object$terms))
pos <- tidyselect::eval_select(exprs, components$predictions)
col_names <- names(pos)

# For every column, we need to use the appropriate lambda column, which differs per row.
# Note that yj_inverse() is vectorized in x, but not in lambda.
# The `object$terms` is where the user specifies the columns they want to
# untransform. We need to match the outcomes with their lambda columns in our
# parameter table and then apply the inverse transformation.
if (identical(col_names, ".pred")) {
# In this case, we don't get a hint for the outcome column name, so we need to
# infer it from the mold. `outcomes` is a vector of objects like
# ahead_1_cases, ahead_7_cases, etc. We want to extract the cases part.
# In this case, we don't get a hint for the outcome column name, so we need
# to infer it from the mold.
if (length(components$mold$outcomes) > 1) {
cli_abort("Only one outcome is allowed when specifying `.pred`.", call = rlang::caller_env())
}
# `outcomes` is a vector of objects like ahead_1_cases, ahead_7_cases, etc.
# We want to extract the cases part.
outcome_cols <- names(components$mold$outcomes) %>%
stringr::str_match("ahead_\\d+_(.*)") %>%
magrittr::extract(, 2)
Expand All @@ -146,8 +133,14 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data,
rowwise() %>%
mutate(.pred := yj_inverse(.pred, !!sym(paste0(".lambda_", outcome_cols))))
} else if (identical(col_names, character(0))) {
# In this case, we should assume the user wants to transform all outcomes.
cli::cli_abort("Not specifying columns to layer Yeo-Johnson is not implemented yet.", call = rlang::caller_env())
# Wish I could suggest `all_outcomes()` here, but currently it's the same as
# not specifying any terms. I don't want to spend time with dealing with
# this case until someone asks for it.
cli::cli_abort("Not specifying columns to layer Yeo-Johnson is not implemented.
If you had a single outcome, you can use `.pred` as a column name.
If you had multiple outcomes, you'll need to specify them like
`.pred_ahead_1_<outcome_col>`, `.pred_ahead_7_<outcome_col>`, etc.
", call = rlang::caller_env())
} else {
# In this case, we assume that the user has specified the columns they want
# transformed here. We then need to determine the lambda columns for each of
Expand All @@ -157,7 +150,9 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data,
original_outcome_cols <- str_match(col_names, ".pred_ahead_\\d+_(.*)")[, 2]
outcomes_wout_ahead <- str_match(names(components$mold$outcomes), "ahead_\\d+_(.*)")[,2]
if (any(original_outcome_cols %nin% outcomes_wout_ahead)) {
cli_abort("All columns specified in `...` must be outcome columns.", call = rlang::caller_env())
cli_abort("All columns specified in `...` must be outcome columns.
They must be of the form `.pred_ahead_1_<outcome_col>`, `.pred_ahead_7_<outcome_col>`, etc.
", call = rlang::caller_env())
}

for (i in seq_along(col_names)) {
Expand All @@ -184,7 +179,8 @@ print.layer_epi_YeoJohnson <- function(x, width = max(20, options()$width - 30),

#' Inverse Yeo-Johnson transformation
#'
#' Inverse of `yj_transform` in step_yeo_johnson.R.
#' Inverse of `yj_transform` in step_yeo_johnson.R. Note that this function is
#' vectorized in x, but not in lambda.
#'
#' @keywords internal
yj_inverse <- function(x, lambda, eps = 0.001) {
Expand Down Expand Up @@ -247,3 +243,16 @@ get_lambdas_in_layer <- function(workflow) {
}
lambdas
}

get_transformed_cols_in_layer <- function(workflow) {
this_recipe <- hardhat::extract_recipe(workflow)
if (!(this_recipe %>% recipes::detect_step("epi_YeoJohnson"))) {
cli_abort("`layer_epi_YeoJohnson` requires `step_epi_YeoJohnson` in the recipe.", call = rlang::caller_env())
}
for (step in this_recipe$steps) {
if (inherits(step, "step_epi_YeoJohnson")) {
lambdas <- step$lambdas
break
}
}
}
67 changes: 34 additions & 33 deletions R/new_epipredict_steps/step_yeo_johnson.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,16 @@
#' `step_epi_YeoJohnson()` creates a *specification* of a recipe step that will
#' transform data using a Yeo-Johnson transformation. This fork works with panel
#' data and is meant for epidata.
#' TODO: Do an edit pass on this docstring.
#'
#' @inheritParams step_center
#' @param lambdas A numeric vector of transformation values. This
#' @param lambdas Internal. A numeric vector of transformation values. This
#' is `NULL` until computed by [prep()].
#' @param na_lambda_fill A numeric value to fill in for any
#' geos where the lambda cannot be estimated.
#' @param limits A length 2 numeric vector defining the range to
#' compute the transformation parameter lambda.
#' @param num_unique An integer where data that have less possible
#' values will not be evaluated for a transformation.
#' @param num_unique An integer where data that have fewer than this
#' many unique values will not be evaluated for a transformation.
#' @param na_rm A logical indicating whether missing values should be
#' removed.
#' @param skip A logical. Should the step be skipped when the recipe is
Expand All @@ -22,11 +21,13 @@
#' @template step-return
#' @family individual transformation steps
#' @export
#' @details The Yeo-Johnson transformation is very similar to the
#' Box-Cox but does not require the input variables to be strictly
#' positive. In the package, the partial log-likelihood function is
#' directly optimized within a reasonable set of transformation
#' values (which can be changed by the user).
#' @details The Yeo-Johnson transformation is variance-stabilizing
#' transformation, similar to the Box-Cox but does not require the input
#' variables to be strictly positive. In the package, the partial
#' log-likelihood function is directly optimized within a reasonable set of
#' transformation values (which can be changed by the user). The optimization
#' finds a lambda parameter for each group in the data that minimizes the
#' variance of the transformed data.
#'
#' This transformation is typically done on the outcome variable
#' using the residuals for a statistical model (such as ordinary
Expand All @@ -36,7 +37,7 @@
#' variable distributions more symmetric.
#'
#' If the transformation parameters are estimated to be very
#' closed to the bounds, or if the optimization fails, a value of
#' close to the bounds, or if the optimization fails, a value of
#' `NA` is used and no transformation is applied.
#'
#' # Tidying
Expand All @@ -54,28 +55,24 @@
#'
#' @references Yeo, I. K., and Johnson, R. A. (2000). A new family of power
#' transformations to improve normality or symmetry. *Biometrika*.
#' @examplesIf rlang::is_installed("modeldata")
#' data(biomass, package = "modeldata")
#' @examplesIf
#' jhu <- cases_deaths_subset %>%
#' filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>%
#' select(geo_value, time_value, cases)
#' filtered_data <- jhu
#'
#' biomass_tr <- biomass[biomass$dataset == "Training", ]
#' biomass_te <- biomass[biomass$dataset == "Testing", ]
#'
#' rec <- recipe(
#' HHV ~ carbon + hydrogen + oxygen + nitrogen + sulfur,
#' data = biomass_tr
#' )
#'
#' yj_transform <- step_epi_YeoJohnson(rec, all_numeric())
#'
#' yj_estimates <- prep(yj_transform, training = biomass_tr)
#'
#' yj_te <- bake(yj_estimates, biomass_te)
#'
#' plot(density(biomass_te$sulfur), main = "before")
#' plot(density(yj_te$sulfur), main = "after")
#'
#' tidy(yj_transform, number = 1)
#' tidy(yj_estimates, number = 1)
#' r <- epi_recipe(filtered_data) %>%
#' step_epi_YeoJohnson(cases)
#' # View the recipe
#' r
#' # Fit the recipe
#' tr <- r %>% prep(filtered_data)
#' # View the lambda values
#' tr$steps[[1]]$lambdas
#' # View the transformed data
#' df <- tr %>% bake(filtered_data)
#' plot(density(df$cases))
#' plot(density(filtered_data$cases))
step_epi_YeoJohnson <- function(
recipe,
...,
Expand Down Expand Up @@ -266,6 +263,8 @@ get_lambdas_yj_table <- function(training, col_names, limits, num_unique, na_lam

#' Internal Functions
#'
#' Note that this function is vectorized in x, but not in lambda.
#'
#' @keywords internal
#' @rdname recipes-internal
#' @export
Expand Down Expand Up @@ -314,14 +313,14 @@ yj_transform <- function(x, lambda, ind_neg = NULL, eps = 0.001) {
x
}


## Helper for the log-likelihood calc for eq 3.1 of Yeo, I. K.,
## & Johnson, R. A. (2000). A new family of power transformations
## to improve normality or symmetry. Biometrika. page 957
ll_yj <- function(lambda, y, ind_neg, const, eps = 0.001) {
n <- length(y)
y_t <- yj_transform(y, lambda, ind_neg)
mu_t <- mean(y_t)
# EDIT: Unused in the original recipes code.
# mu_t <- mean(y_t)
var_t <- var(y_t) * (n - 1) / n
res <- -.5 * n * log(var_t) + (lambda - 1) * const
res
Expand Down Expand Up @@ -361,6 +360,7 @@ estimate_yj <- function(dat, limits = c(-5, 5), num_unique = 5, na_rm = TRUE, ca

const <- sum(sign(dat) * log(abs(dat) + 1))

suppressWarnings(
res <- optimize(
yj_obj,
interval = limits,
Expand All @@ -370,6 +370,7 @@ estimate_yj <- function(dat, limits = c(-5, 5), num_unique = 5, na_rm = TRUE, ca
const = const,
tol = .0001
)
)
lam <- res$maximum
if (abs(limits[1] - lam) <= eps | abs(limits[2] - lam) <= eps) {
lam <- NA
Expand Down
50 changes: 38 additions & 12 deletions tests/testthat/test-yeo-johnson.R
Original file line number Diff line number Diff line change
Expand Up @@ -77,21 +77,47 @@ test_that("Yeo-Johnson steps and layers invert each other", {
})

test_that("Yeo-Johnson steps and layers invert each other when other_keys are present", {
skip("TODO")
jhu <- cases_deaths_subset %>%
filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>%
select(geo_value, time_value, cases)
filtered_data <- jhu
# Small synthetic grad_employ_dataset version.
filtered_data <- tribble(
~geo_value, ~age_group, ~edu_qual, ~time_value, ~med_income_2y,
"ca", "25-34", "bachelor", 2017, 50000,
"ca", "25-34", "bachelor", 2018, 50500,
"ca", "25-34", "bachelor", 2019, 51000,
"ca", "25-34", "bachelor", 2020, 51500,
"ca", "25-34", "bachelor", 2021, 52000,
"ca", "25-34", "bachelor", 2022, 52500,
"ca", "35-1000", "bachelor", 2017, 3e10,
"ca", "35-1000", "bachelor", 2018, 3e10 + 10,
"ca", "35-1000", "bachelor", 2019, 3e10 + 20,
"ca", "35-1000", "bachelor", 2020, 3e10 + 30,
"ca", "35-1000", "bachelor", 2021, 3e10 + 40,
"ca", "35-1000", "bachelor", 2022, 3e10 + 50,
"ca", "25-34", "master", 2017, 2 * 50000,
"ca", "25-34", "master", 2018, 2 * 50500,
"ca", "25-34", "master", 2019, 2 * 51000,
"ca", "25-34", "master", 2020, 2 * 51500,
"ca", "25-34", "master", 2021, 2 * 52000,
"ca", "25-34", "master", 2022, 2 * 52500,
"ca", "35-1000", "master", 2017, 2 * 3e10,
"ca", "35-1000", "master", 2018, 2 * (3e10 + 10),
"ca", "35-1000", "master", 2019, 2 * (3e10 + 20),
"ca", "35-1000", "master", 2020, 2 * (3e10 + 30),
"ca", "35-1000", "master", 2021, 2 * (3e10 + 40),
"ca", "35-1000", "master", 2022, 2 * (3e10 + 50)
) %>% as_epi_df(other_keys = c("age_group", "edu_qual"))

# Get some lambda values
r <- epi_recipe(filtered_data) %>%
step_epi_YeoJohnson(cases) %>%
step_epi_lag(cases, lag = 0) %>%
step_epi_ahead(cases, ahead = 0, role = "outcome") %>%
step_epi_YeoJohnson(med_income_2y) %>%
step_epi_lag(med_income_2y, lag = 0) %>%
step_epi_ahead(med_income_2y, ahead = 0, role = "outcome") %>%
step_epi_naomit()
tr <- r %>% prep(filtered_data)
# Check for fixed lambda values
expect_true(all(near(tr$steps[[1]]$lambdas$.lambda_cases, c(0.856, 0.207), tol = 0.001)))
expect_true(".lambda_med_income_2y" %in% names(tr$steps[[1]]$lambdas))
expect_true("geo_value" %in% names(tr$steps[[1]]$lambdas))
expect_true("age_group" %in% names(tr$steps[[1]]$lambdas))
expect_true("edu_qual" %in% names(tr$steps[[1]]$lambdas))
expect_true(is.numeric(tr$steps[[1]]$lambdas$.lambda_med_income_2y))

# Make sure that the inverse transformation works
f <- frosting() %>%
Expand All @@ -100,7 +126,7 @@ test_that("Yeo-Johnson steps and layers invert each other when other_keys are pr
wf <- epi_workflow(r, linear_reg()) %>%
fit(filtered_data) %>%
add_frosting(f)
out1 <- filtered_data %>% as_tibble() %>% slice_max(time_value, by = geo_value)
out2 <- forecast(wf) %>% rename(cases = .pred)
out1 <- filtered_data %>% as_tibble() %>% slice_max(time_value, by = geo_value) %>% select(geo_value, age_group, time_value, med_income_2y) %>% arrange(geo_value, age_group, time_value)
out2 <- forecast(wf) %>% rename(med_income_2y = .pred) %>% select(geo_value, age_group, time_value, med_income_2y) %>% arrange(geo_value, age_group, time_value)
expect_equal(out1, out2)
})