feat: support input transformations for features

sumny · sumny · commit cfca53b285a6 · 2025-04-11T16:29:52.000+02:00
diff --git a/R/SurrogateLearner.R b/R/SurrogateLearner.R
@@ -15,6 +15,11 @@
 #'   Can be `"mean"` to use mean imputation or `"random"` to sample values uniformly at random between the empirical minimum and maximum.
 #'   Default is `"random"`.
 #' }
+#' \item{`input_trafo`}{`character(1)`\cr
+#'   Which input transformation should be applied to numeric and integer features?
+#'   Can be `"none"` for no transformation or `"unitcube"` to min-max scale each feature to `[0, 1]` based on the bounds of the search space.
+#'   Default is `"none"`.
+#' }
 #' }
 #'
 #' @export
@@ -74,9 +79,10 @@ SurrogateLearner = R6Class("SurrogateLearner",
 
       ps = ps(
         catch_errors = p_lgl(),
-        impute_method = p_fct(c("mean", "random"), default = "random")
+        impute_method = p_fct(c("mean", "random"), default = "random"),
+        input_trafo = p_fct(c("none", "unitcube"), default = "none")
       )
-      ps$values = list(catch_errors = TRUE, impute_method = "random")
+      ps$values = list(catch_errors = TRUE, impute_method = "random", input_trafo = "none")
 
       super$initialize(learner = learner, archive = archive, cols_x = cols_x, cols_y = col_y, param_set = ps)
     },
@@ -90,7 +96,10 @@ SurrogateLearner = R6Class("SurrogateLearner",
     #' @return [data.table::data.table()] with the columns `mean` and `se`.
     predict = function(xdt) {
       assert_xdt(xdt)
-      xdt = fix_xdt_missing(xdt, cols_x = self$cols_x, archive = self$archive)
+      xdt = fix_xdt_missing(copy(xdt), cols_x = self$cols_x, archive = self$archive)
+      if (self$param_set$values$input_trafo == "unitcube") {
+        xdt = input_trafo_unitcube(xdt, search_space = self$archive$search_space)
+      }
 
       pred = self$learner$predict_newdata(newdata = xdt)
       if (self$learner$predict_type == "se") {
@@ -157,7 +166,10 @@ SurrogateLearner = R6Class("SurrogateLearner",
   private = list(
     # Train learner with new data.
     .update = function() {
-      xydt = self$archive$data[, c(self$cols_x, self$cols_y), with = FALSE]
+      xydt = copy(self$archive$data[, c(self$cols_x, self$cols_y), with = FALSE])
+      if (self$param_set$values$input_trafo == "unitcube") {
+        xydt = input_trafo_unitcube(xydt, search_space = self$archive$search_space)
+      }
       task = TaskRegr$new(id = "surrogate_task", backend = xydt, target = self$cols_y)
       assert_learnable(task, learner = self$learner)
       self$learner$train(task)
@@ -166,7 +178,10 @@ SurrogateLearner = R6Class("SurrogateLearner",
     # Train learner with new data.
     # Operates on an asynchronous archive and performs imputation as needed.
     .update_async = function() {
-      xydt = self$archive$rush$fetch_tasks_with_state(states = c("queued", "running", "finished"))[, c(self$cols_x, self$cols_y, "state"), with = FALSE]
+      xydt = copy(self$archive$rush$fetch_tasks_with_state(states = c("queued", "running", "finished"))[, c(self$cols_x, self$cols_y, "state"), with = FALSE])
+      if (self$param_set$values$input_trafo == "unitcube") {
+        xydt = input_trafo_unitcube(xydt, search_space = self$archive$search_space)
+      }
       if (self$param_set$values$impute_method == "mean") {
         mean_y = mean(xydt[[self$cols_y]], na.rm = TRUE)
         xydt[c("queued", "running"), (self$cols_y) := mean_y, on = "state"]
diff --git a/R/SurrogateLearnerCollection.R b/R/SurrogateLearnerCollection.R
@@ -17,6 +17,11 @@
 #'   Can be `"mean"` to use mean imputation or `"random"` to sample values uniformly at random between the empirical minimum and maximum.
 #'   Default is `"random"`.
 #' }
+#' \item{`input_trafo`}{`character(1)`\cr
+#'   Which input transformation should be applied to numeric and integer features?
+#'   Can be `"none"` for no transformation or `"unitcube"` to min-max scale each feature to `[0, 1]` based on the bounds of the search space.
+#'   Default is `"none"`.
+#' }
 #' }
 #'
 #' @export
@@ -89,9 +94,10 @@ SurrogateLearnerCollection = R6Class("SurrogateLearnerCollection",
 
       ps = ps(
         catch_errors = p_lgl(),
-        impute_method = p_fct(c("mean", "random"), default = "random")
+        impute_method = p_fct(c("mean", "random"), default = "random"),
+        input_trafo = p_fct(c("none", "unitcube"), default = "none")
       )
-      ps$values = list(catch_errors = TRUE, impute_method = "random")
+      ps$values = list(catch_errors = TRUE, impute_method = "random", input_trafo = "none")
 
       super$initialize(learner = learners, archive = archive, cols_x = cols_x, cols_y = cols_y, param_set = ps)
     },
@@ -107,7 +113,10 @@ SurrogateLearnerCollection = R6Class("SurrogateLearnerCollection",
     #' @return list of [data.table::data.table()]s with the columns `mean` and `se`.
     predict = function(xdt) {
       assert_xdt(xdt)
-      xdt = fix_xdt_missing(xdt, cols_x = self$cols_x, archive = self$archive)
+      xdt = fix_xdt_missing(copy(xdt), cols_x = self$cols_x, archive = self$archive)
+      if (self$param_set$values$input_trafo == "unitcube") {
+        xdt = input_trafo_unitcube(xdt, search_space = self$archive$search_space)
+      }
 
       preds = lapply(self$learner, function(learner) {
         pred = learner$predict_newdata(newdata = xdt)
@@ -185,7 +194,10 @@ SurrogateLearnerCollection = R6Class("SurrogateLearnerCollection",
     .update = function() {
       assert_true((length(self$cols_y) == length(self$learner)) || length(self$cols_y) == 1L)  # either as many cols_y as learner or only one
       one_to_multiple = length(self$cols_y) == 1L
-      xydt = self$archive$data[, c(self$cols_x, self$cols_y), with = FALSE]
+      xydt = copy(self$archive$data[, c(self$cols_x, self$cols_y), with = FALSE])
+      if (self$param_set$values$input_trafo == "unitcube") {
+        xydt = input_trafo_unitcube(xydt, search_space = self$archive$search_space)
+      }
       features = setdiff(names(xydt), self$cols_y)
       tasks = lapply(self$cols_y, function(col_y) {
         # if this turns out to be a bottleneck, we can also operate on a single task here
@@ -214,7 +226,10 @@ SurrogateLearnerCollection = R6Class("SurrogateLearnerCollection",
       assert_true((length(self$cols_y) == length(self$learner)) || length(self$cols_y) == 1L)  # either as many cols_y as learner or only one
       one_to_multiple = length(self$cols_y) == 1L
 
-      xydt = self$archive$rush$fetch_tasks_with_state(states = c("queued", "running", "finished"))[, c(self$cols_x, self$cols_y, "state"), with = FALSE]
+      xydt = copy(self$archive$rush$fetch_tasks_with_state(states = c("queued", "running", "finished"))[, c(self$cols_x, self$cols_y, "state"), with = FALSE])
+      if (self$param_set$values$input_trafo == "unitcube") {
+        xydt = input_trafo_unitcube(xydt, search_space = self$archive$search_space)
+      }
       if (self$param_set$values$impute_method == "mean") {
         walk(self$cols_y, function(col) {
           mean_y = mean(xydt[[col]], na.rm = TRUE)
diff --git a/R/helper.R b/R/helper.R
@@ -172,10 +172,33 @@ assert_xdt = function(xdt) {
 assert_learner_surrogate = function(x, .var.name = vname(x)) {
   # NOTE: this is buggy in checkmate; assert should always return x invisible not TRUE as is the case here
   assert(check_learner_surrogate(x), .var.name = .var.name)
-
   x
 }
 
+# Min-max scale numeric and integer features to the unit cube.
+#
+# For every numeric or integer parameter of `search_space` that is also a column
+# of `xydt`, the column is replaced by `(x - lower) / (upper - lower)`.
+# Columns that are not such parameters (e.g. targets) are left untouched.
+# A degenerate parameter with `lower == upper` is mapped to `0` rather than `NaN`.
+# Bounds are assumed to be finite; infinite bounds yield non-finite values.
+#
+# NOTE: `xydt` is modified by reference via `set()`; pass a copy if the input must be preserved.
+# Returns `xydt`.
+input_trafo_unitcube = function(xydt, search_space) {
+  parameters = intersect(names(which(search_space$is_number)), names(xydt))  # numeric or integer features present in the data
+  for (parameter in parameters) {
+    lower = search_space$lower[[parameter]]
+    span = search_space$upper[[parameter]] - lower
+    value = xydt[[parameter]] - lower  # keeps NAs; equals 0 for a degenerate parameter
+    if (span != 0) {
+      value = value / span
+    }
+    set(xydt, j = parameter, value = value)
+  }
+  xydt
+}
+
 #' Check if Redis Server is Available
 #'
 #' Attempts to establish a connection to a Redis server using the \CRANpkg{redux} package
diff --git a/man/SurrogateLearner.Rd b/man/SurrogateLearner.Rd
diff --git a/man/SurrogateLearnerCollection.Rd b/man/SurrogateLearnerCollection.Rd
diff --git a/tests/testthat/test_SurrogateLearner.R b/tests/testthat/test_SurrogateLearner.R
@@ -26,6 +26,13 @@ test_that("SurrogateLearner API works", {
   surrogate$learner$predict_type = "response"
   expect_equal(surrogate$predict_type, surrogate$learner$predict_type)
   expect_error({surrogate$predict_type = "response"}, "is read-only")
+
+  # unitcube input transformation for numeric and integer features
+  surrogate = SurrogateLearner$new(learner = REGR_FEATURELESS, archive = inst$archive)
+  surrogate$param_set$values$input_trafo = "unitcube"
+  surrogate$update()
+  expect_learner(surrogate$learner)
+  expect_data_table(surrogate$predict(xdt), col.names = "named", nrows = 5, ncols = 2, any.missing = FALSE)
 })
 
 test_that("predict_types are recognized", {
@@ -50,9 +57,10 @@ test_that("param_set", {
   inst = MAKE_INST_1D()
   surrogate = SurrogateLearner$new(learner = REGR_FEATURELESS, archive = inst$archive)
   expect_r6(surrogate$param_set, "ParamSet")
-  expect_setequal(surrogate$param_set$ids(), c("catch_errors", "impute_method"))
+  expect_setequal(surrogate$param_set$ids(), c("catch_errors", "impute_method", "input_trafo"))
   expect_equal(surrogate$param_set$class[["catch_errors"]], "ParamLgl")
   expect_equal(surrogate$param_set$class[["impute_method"]], "ParamFct")
+  expect_equal(surrogate$param_set$class[["input_trafo"]], "ParamFct")
   expect_error({surrogate$param_set = list()}, regexp = "param_set is read-only.")
 })
 
diff --git a/tests/testthat/test_SurrogateLearnerCollection.R b/tests/testthat/test_SurrogateLearnerCollection.R
@@ -35,6 +35,13 @@ test_that("SurrogateLearnerCollection API works", {
   expect_equal(surrogate$predict_type, surrogate$learner[[2L]]$predict_type)
   expect_error({surrogate$predict_type = "response"}, "is read-only")
 
+  # unitcube input transformation for numeric and integer features
+  surrogate = SurrogateLearnerCollection$new(learners = list(REGR_FEATURELESS, REGR_FEATURELESS$clone(deep = TRUE)), archive = inst$archive)
+  surrogate$param_set$values$input_trafo = "unitcube"
+  surrogate$update()
+  expect_learner(surrogate$learner[[1L]])
+  expect_learner(surrogate$learner[[2L]])
+  expect_list(surrogate$predict(xdt), len = 2L)
 })
 
 test_that("predict_types are recognized", {
@@ -60,9 +67,10 @@ test_that("param_set", {
   inst = MAKE_INST(OBJ_1D_2, PS_1D, trm("evals", n_evals = 5L))
   surrogate = SurrogateLearnerCollection$new(learners = list(REGR_FEATURELESS, REGR_FEATURELESS$clone(deep = TRUE)), archive = inst$archive)
   expect_r6(surrogate$param_set, "ParamSet")
-  expect_setequal(surrogate$param_set$ids(), c("catch_errors", "impute_method"))
+  expect_setequal(surrogate$param_set$ids(), c("catch_errors", "impute_method", "input_trafo"))
   expect_equal(surrogate$param_set$class[["catch_errors"]], "ParamLgl")
   expect_equal(surrogate$param_set$class[["impute_method"]], "ParamFct")
+  expect_equal(surrogate$param_set$class[["input_trafo"]], "ParamFct")
   expect_error({surrogate$param_set = list()}, regexp = "param_set is read-only.")
 })