From c6f410a04cd58353afa47d2259cad8689da3c342 Mon Sep 17 00:00:00 2001 From: be-marc Date: Tue, 3 Dec 2024 13:16:49 +0100 Subject: [PATCH 01/22] ... --- R/resample.R | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/R/resample.R b/R/resample.R index cc1bb88f2..9fe5d012b 100644 --- a/R/resample.R +++ b/R/resample.R @@ -114,9 +114,17 @@ resample = function(task, learner, resampling, store_models = FALSE, store_backe data.table(learner = replicate(n, learner), mode = "train") } - res = future_map(n, workhorse, iteration = seq_len(n), learner = grid$learner, mode = grid$mode, + browser() + + if (FALSE) { + grid$iteration = seq_len(n) + system.time(mirai_map(grid, workhorse, .args = list(task = task, resampling = resampling, store_models = store_models, lgr_threshold = lgr_threshold, pb = pb, unmarshal = unmarshal))) + } + + + system.time(future_map(n, workhorse, iteration = seq_len(n), learner = grid$learner, mode = grid$mode, MoreArgs = list(task = task, resampling = resampling, store_models = store_models, lgr_threshold = lgr_threshold, pb = pb, unmarshal = unmarshal) - ) + )) data = data.table( task = list(task), From e79da7383a7bd255fe6e7e658d1249424443e0a7 Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 27 Mar 2025 11:42:12 +0100 Subject: [PATCH 02/22] ... 
--- R/resample.R | 12 +--- R/resample_mirai.R | 142 +++++++++++++++++++++++++++++++++++++++++++++ attic/mirai.qmd | 70 ++++++++++++++++++++++ 3 files changed, 214 insertions(+), 10 deletions(-) create mode 100644 R/resample_mirai.R create mode 100644 attic/mirai.qmd diff --git a/R/resample.R b/R/resample.R index 9fe5d012b..cc1bb88f2 100644 --- a/R/resample.R +++ b/R/resample.R @@ -114,17 +114,9 @@ resample = function(task, learner, resampling, store_models = FALSE, store_backe data.table(learner = replicate(n, learner), mode = "train") } - browser() - - if (FALSE) { - grid$iteration = seq_len(n) - system.time(mirai_map(grid, workhorse, .args = list(task = task, resampling = resampling, store_models = store_models, lgr_threshold = lgr_threshold, pb = pb, unmarshal = unmarshal))) - } - - - system.time(future_map(n, workhorse, iteration = seq_len(n), learner = grid$learner, mode = grid$mode, + res = future_map(n, workhorse, iteration = seq_len(n), learner = grid$learner, mode = grid$mode, MoreArgs = list(task = task, resampling = resampling, store_models = store_models, lgr_threshold = lgr_threshold, pb = pb, unmarshal = unmarshal) - )) + ) data = data.table( task = list(task), diff --git a/R/resample_mirai.R b/R/resample_mirai.R new file mode 100644 index 000000000..751b689a9 --- /dev/null +++ b/R/resample_mirai.R @@ -0,0 +1,142 @@ +#' @title Resample a Learner on a Task +#' +#' @description +#' Runs a resampling (possibly in parallel): +#' Repeatedly apply [Learner] `learner` on a training set of [Task] `task` to train a model, +#' then use the trained model to predict observations of a test set. +#' Training and test sets are defined by the [Resampling] `resampling`. +#' +#' @param task ([Task]). +#' @param learner ([Learner]). +#' @param resampling ([Resampling]). 
+#' @template param_store_models +#' @template param_store_backends +#' @template param_encapsulate +#' @template param_allow_hotstart +#' @template param_clone +#' @template param_unmarshal +#' @return [ResampleResult]. +#' +#' @template section_predict_sets +#' @template section_parallelization +#' @template section_progress_bars +#' @template section_logging +#' +#' @note +#' The fitted models are discarded after the predictions have been computed in order to reduce memory consumption. +#' If you need access to the models for later analysis, set `store_models` to `TRUE`. +#' +#' @template seealso_resample +#' @export +#' @examples +#' task = tsk("penguins") +#' learner = lrn("classif.rpart") +#' resampling = rsmp("cv") +#' +#' # Explicitly instantiate the resampling for this task for reproduciblity +#' set.seed(123) +#' resampling$instantiate(task) +#' +#' rr = resample(task, learner, resampling) +#' print(rr) +#' +#' # Retrieve performance +#' rr$score(msr("classif.ce")) +#' rr$aggregate(msr("classif.ce")) +#' +#' # merged prediction objects of all resampling iterations +#' pred = rr$prediction() +#' pred$confusion +#' +#' # Repeat resampling with featureless learner +#' rr_featureless = resample(task, lrn("classif.featureless"), resampling) +#' +#' # Convert results to BenchmarkResult, then combine them +#' bmr1 = as_benchmark_result(rr) +#' bmr2 = as_benchmark_result(rr_featureless) +#' print(bmr1$combine(bmr2)) +resample = function(task, learner, resampling, store_models = FALSE, store_backends = TRUE, encapsulate = NA_character_, allow_hotstart = FALSE, clone = c("task", "learner", "resampling"), unmarshal = TRUE) { + assert_subset(clone, c("task", "learner", "resampling")) + task = assert_task(as_task(task, clone = "task" %in% clone)) + learner = assert_learner(as_learner(learner, clone = "learner" %in% clone, discard_state = TRUE)) + resampling = assert_resampling(as_resampling(resampling, clone = "resampling" %in% clone)) + assert_flag(store_models) + 
assert_flag(store_backends) + # this does not check the internal validation task as it might not be set yet + assert_learnable(task, learner) + assert_flag(unmarshal) + + set_encapsulation(list(learner), encapsulate) + if (!resampling$is_instantiated) { + resampling = resampling$instantiate(task) + } + + n = resampling$iters + pb = if (isNamespaceLoaded("progressr")) { + # NB: the progress bar needs to be created in this env + pb = progressr::progressor(steps = n) + } else { + NULL + } + lgr_threshold = map_int(mlr_reflections$loggers, "threshold") + + grid = if (allow_hotstart) { + + lg$debug("Resampling with hotstart enabled.") + + hotstart_grid = map_dtr(seq_len(n), function(iteration) { + if (!is.null(learner$hotstart_stack)) { + # search for hotstart learner + task_hashes = resampling_task_hashes(task, resampling, learner) + start_learner = get_private(learner$hotstart_stack)$.start_learner(learner$clone(), task_hashes[iteration]) + } + if (is.null(learner$hotstart_stack) || is.null(start_learner)) { + # no hotstart learners stored or no adaptable model found + lg$debug("Resampling with hotstarting not possible. 
No start learner found.") + mode = "train" + } else { + # hotstart learner found + lg$debug("Resampling with hotstarting.") + start_learner$param_set$values = insert_named(start_learner$param_set$values, learner$param_set$values) + learner = start_learner + mode = "hotstart" + } + data.table(learner = list(learner), mode = mode) + }) + + # null hotstart stack to reduce overhead in parallelization + walk(hotstart_grid$learner, function(learner) { + learner$hotstart_stack = NULL + learner + }) + hotstart_grid + } else { + data.table(learner = replicate(n, learner), mode = "train") + } + + grid$iteration = seq_len(n) + res = mirai::mirai_map(grid, workhorse, .args = list(task = task, resampling = resampling, store_models = store_models, lgr_threshold = lgr_threshold, pb = pb, unmarshal = unmarshal)) + + data = data.table( + task = list(task), + learner = grid$learner, + learner_state = map(res, "learner_state"), + resampling = list(resampling), + iteration = seq_len(n), + prediction = map(res, "prediction"), + uhash = UUIDgenerate(), + param_values = map(res, "param_values"), + learner_hash = map_chr(res, "learner_hash") + ) + + result_data = ResultData$new(data, store_backends = store_backends) + + # the worker already ensures that models are sent back in marshaled form if unmarshal = FALSE, so we don't have + # to do anything in this case. 
This allows us to minimize the amount of marshaling in those situtions where + # the model is available in both states on the worker + if (unmarshal && store_models) { + result_data$unmarshal() + } + + ResampleResult$new(result_data) +} diff --git a/attic/mirai.qmd b/attic/mirai.qmd new file mode 100644 index 000000000..158235100 --- /dev/null +++ b/attic/mirai.qmd @@ -0,0 +1,70 @@ +--- +title: "Mirai and mlr3 Demo" +author: "Your Name" +date: "2025-03-27" +format: html +--- + +```{r} +library(mlr3) + +learner = lrn("classif.rpart") +learner$encapsulate("callr", lrn("classif.featureless")) +task = tsk("pima") +resampling = rsmp("cv", folds = 3) + +rr = resample(task, learner, resampling) +``` + +# + + + +# Segfaults + +Simulate a learner with many segfaults. + +```{r} +daemons(0) +daemons(3) + +x = mirai_map(seq(6), function(i) { + if (i == 2) return(2) + tools::pskill(Sys.getpid()) +}) +``` + +```{r} +# no daemons left +# periodically check the status in mlr3? +status() + +# mlr3 could restart local daemons +# what about remote daemons? +launch_local(3) + +# all finished +status() + +x[.progress] +``` + +# Simple R Errors + +```{r} +daemons(0) +daemons(3) + +x = mirai_map(seq(3), function(i) { + if (i == 2) stop("Simple R error") + i +}) +``` + +```{r} +# all daemons available +status() + +x[.progress] +``` + From a9fa258dd08207a9957b2212acad5a37e7d6d8ac Mon Sep 17 00:00:00 2001 From: be-marc Date: Tue, 8 Apr 2025 18:03:22 +0200 Subject: [PATCH 03/22] ... 
--- .Rbuildignore | 1 + R/Learner.R | 2 +- R/resample_mirai.R | 4 +- attic/mirai.qmd | 276 ++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 253 insertions(+), 30 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index e978731f6..efd7953f3 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -23,3 +23,4 @@ ^cran-comments\.md$ ^CRAN-SUBMISSION$ ^benchmark$ +^attic$ diff --git a/R/Learner.R b/R/Learner.R index 72014c8f5..f59d62cf0 100644 --- a/R/Learner.R +++ b/R/Learner.R @@ -479,7 +479,7 @@ Learner = R6Class("Learner", #' #' @return `self` (invisibly). encapsulate = function(method, fallback = NULL) { - assert_choice(method, c("none", "try", "evaluate", "callr")) + assert_choice(method, c("none", "try", "evaluate", "callr", "mirai")) if (method != "none") { assert_learner(fallback, task_type = self$task_type) diff --git a/R/resample_mirai.R b/R/resample_mirai.R index 751b689a9..7084f4408 100644 --- a/R/resample_mirai.R +++ b/R/resample_mirai.R @@ -55,7 +55,7 @@ #' bmr1 = as_benchmark_result(rr) #' bmr2 = as_benchmark_result(rr_featureless) #' print(bmr1$combine(bmr2)) -resample = function(task, learner, resampling, store_models = FALSE, store_backends = TRUE, encapsulate = NA_character_, allow_hotstart = FALSE, clone = c("task", "learner", "resampling"), unmarshal = TRUE) { +resample_mirai = function(task, learner, resampling, store_models = FALSE, store_backends = TRUE, encapsulate = NA_character_, allow_hotstart = FALSE, clone = c("task", "learner", "resampling"), unmarshal = TRUE) { assert_subset(clone, c("task", "learner", "resampling")) task = assert_task(as_task(task, clone = "task" %in% clone)) learner = assert_learner(as_learner(learner, clone = "learner" %in% clone, discard_state = TRUE)) @@ -115,7 +115,7 @@ resample = function(task, learner, resampling, store_models = FALSE, store_backe } grid$iteration = seq_len(n) - res = mirai::mirai_map(grid, workhorse, .args = list(task = task, resampling = resampling, store_models = store_models, 
lgr_threshold = lgr_threshold, pb = pb, unmarshal = unmarshal)) + res = mirai::collect_mirai(mirai::mirai_map(grid, workhorse, .args = list(task = task, resampling = resampling, store_models = store_models, lgr_threshold = lgr_threshold, pb = pb, unmarshal = unmarshal))) data = data.table( task = list(task), diff --git a/attic/mirai.qmd b/attic/mirai.qmd index 158235100..4b3568573 100644 --- a/attic/mirai.qmd +++ b/attic/mirai.qmd @@ -1,70 +1,292 @@ --- -title: "Mirai and mlr3 Demo" -author: "Your Name" +title: "mirai and mlr3 Demo" +author: "Marc Becker" date: "2025-03-27" format: html --- + +1. Use `mirai` for parallelization + ```{r} -library(mlr3) +learner = lrn("classif.rpart") +task = tsk("pima") +resampling = rsmp("cv", folds = 3) + +future::plan("multisession", workers = 3) + +resample(task, learner, resampling) +``` + + +This means replacing `future_map` with `mirai_map` or promoting the `future.mirai` backend. + +Can `mirai` replace all `future` backends? + +Is it difficult to install mirai on some systems? + +How does nested parallelization work? + Currently the user calls `future::plan(list("multisession", "sequential"))` + So the user can switch between inner, outer or parallelizing both loops + With `mirai` we have to call `daemons()` for the inner loop? + +2. Add `mirai` as an encapsulation option +```{r} learner = lrn("classif.rpart") -learner$encapsulate("callr", lrn("classif.featureless")) +learner$encapsulate("callr", fallback = lrn("classif.featureless")) task = tsk("pima") resampling = rsmp("cv", folds = 3) -rr = resample(task, learner, resampling) +resample(task, learner, resampling) +``` + + +Setting a timeout works but the `mirai` must be stopped with `stop_mirai()` +Why is the `mirai` not stopped automatically? +We could wait for `errorValue 5` and then call `stop_mirai()` in mlr3 + +Can `mirai` record warnings? + +Could `mirai` enforce a memory limit? + +3. 
Combine parallelization and encapsulation with `mirai` + +```{r} +learner = lrn("classif.rpart") +learner$encapsulate("callr", fallback = lrn("classif.featureless")) +task = tsk("pima") +resampling = rsmp("cv", folds = 3) + +future::plan("multisession", workers = 3) + +resample(task, learner, resampling) +``` + +Any disadvantages of calling a `mirai` in a `mirai`? + +# Parallelization + +Use future `multisession` backend + +```{r} +library(mlr3) +library(future) + +lgr::get_logger("mlr3")$set_threshold("warn") + +learner = lrn("classif.rpart") +task = tsk("pima") +resampling = rsmp("subsampling", repeats = 100) + +future::plan("multisession", workers = 5) + +# 3.1 seconds +microbenchmark::microbenchmark( + resample = resample(task, learner, resampling), + times = 10, + unit = "ms" +) +``` + +Use future `multisession` backend and combine resampling iterations to chunks + +```{r} +library(mlr3) +library(future) + +lgr::get_logger("mlr3")$set_threshold("warn") + +learner = lrn("classif.rpart") +task = tsk("pima") +resampling = rsmp("subsampling", repeats = 100) + +future::plan("multisession", workers = 5) + +options(mlr3.exec_chunk_bins = 5) + +# 1 second +microbenchmark::microbenchmark( + resample = resample(task, learner, resampling), + times = 10, + unit = "ms" +) +``` + +Use `future.mirai` backend + +```{r} +library(mlr3) +library(future) +library(future.mirai) + +lgr::get_logger("mlr3")$set_threshold("warn") + +learner = lrn("classif.rpart") +task = tsk("pima") +resampling = rsmp("subsampling", repeats = 100) + +mirai::daemons(0) +mirai::daemons(5) +future::plan("mirai_cluster") + +# 650 ms +microbenchmark::microbenchmark( + resample = resample(task, learner, resampling), + times = 50, + unit = "ms" +) +``` + +Replace `future_map` with `mirai_map` + +```{r} +library(mlr3) +library(mirai) + +learner = lrn("classif.rpart") +task = tsk("pima") +resampling = rsmp("subsampling", repeats = 100) + +daemons(0) +daemons(5) + +# 440 ms +microbenchmark::microbenchmark( + 
resample = resample_mirai(task, learner, resampling), + times = 50, + unit = "ms" +) ``` -# +# Encapsulation +```{r} +library(mirai) +learner = lrn("classif.rpart") +learner$encapsulate("mirai", fallback = lrn("classif.featureless")) +task = tsk("pima") +resampling = rsmp("subsampling", repeats = 100) -# Segfaults +# 6 seconds +daemons(1) +status() -Simulate a learner with many segfaults. +system.time({rr = resample(task, learner, resampling)}) +``` ```{r} +learner = lrn("classif.rpart") +learner$encapsulate("callr", fallback = lrn("classif.featureless")) +task = tsk("pima") +resampling = rsmp("subsampling", repeats = 100) + +# 90 seconds +system.time({rr = resample(task, learner, resampling)}) +``` + +# Encapsulation and Parallelization + +```{r} +library(mirai) +library(data.table) + daemons(0) daemons(3) -x = mirai_map(seq(6), function(i) { - if (i == 2) return(2) - tools::pskill(Sys.getpid()) +# setup encapsulation daemons +everywhere({ + mirai::daemons(1) }) + +Sys.sleep(1) + +# 3 resample iterations +x = collect_mirai(mirai_map(seq(3), function(i) { + + # train + model = mirai::collect_mirai(mirai::mirai({ + if (i == 2) stop("Simple R error") + sprintf("model_%i_%i", i, Sys.getpid()) + }, i = i)) + + # fit fallback learner + if (mirai::is_mirai_error(model)) model = sprintf("fallback_model_%i_%i", i, Sys.getpid()) + + # predict + prediction = mirai::collect_mirai(mirai::mirai({rnorm(1)})) + + data.table::data.table(model = model, prediction = prediction, pid = Sys.getpid()) +})) + +rbindlist(x) + +# model prediction pid +# +# 1: model_1_1858301 -0.2694271 1857905 +# 2: model_2_1858313 -0.3860743 1857907 +# 3: model_3_1858325 -0.0356403 1857909 ``` +mirai + mirai + ```{r} -# no daemons left -# periodically check the status in mlr3? 
+library(mirai) + +learner = lrn("classif.rpart") +learner$encapsulate("mirai", fallback = lrn("classif.featureless")) +task = tsk("pima") +resample = rsmp("subsampling", repeats = 100) + +daemons(0) +daemons(5) + status() -# mlr3 could restart local daemons -# what about remote daemons? -launch_local(3) +everywhere({ + mirai::daemons(1) +}) + +x = mirai::mirai({mirai::status()}) +x$data -# all finished status() -x[.progress] +# 1.5 seconds +system.time({rr = resample_mirai(task, learner, resample)}) ``` -# Simple R Errors +mirai + callr ```{r} +library(mirai) + +learner = lrn("classif.rpart") +learner$encapsulate("callr", fallback = lrn("classif.featureless")) +task = tsk("pima") +resample = rsmp("subsampling", repeats = 100) + daemons(0) -daemons(3) +daemons(5) -x = mirai_map(seq(3), function(i) { - if (i == 2) stop("Simple R error") - i -}) +# 20 seconds +system.time({rr = resample_mirai(task, learner, resample)}) ``` +future + callr + ```{r} -# all daemons available -status() +learner = lrn("classif.rpart") +learner$encapsulate("callr", fallback = lrn("classif.featureless")) +task = tsk("pima") +resample = rsmp("subsampling", repeats = 100) + +future::plan("multisession", workers = 5) + +options(mlr3.exec_chunk_bins = 5) -x[.progress] +# 26 seconds +system.time({rr = resample_mirai(task, learner, resample)}) ``` From e39c244cd191c928212ca88d05c579aa1acbb1ea Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 17 Apr 2025 20:31:16 +0200 Subject: [PATCH 04/22] ... 
--- .gitignore | 1 + attic/mirai.qmd | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 093a40e42..0ee0146ff 100644 --- a/.gitignore +++ b/.gitignore @@ -182,3 +182,4 @@ revdep/ # misc Meta/ Rplots.pdf +.cursor/rules/equal.mdc diff --git a/attic/mirai.qmd b/attic/mirai.qmd index 4b3568573..6393925b3 100644 --- a/attic/mirai.qmd +++ b/attic/mirai.qmd @@ -21,7 +21,8 @@ resample(task, learner, resampling) This means replacing `future_map` with `mirai_map` or promoting the `future.mirai` backend. -Can `mirai` replace all `future` backends? +**Can `mirai` replace all `future` backends?** + Is it difficult to install mirai on some systems? @@ -42,13 +43,22 @@ resample(task, learner, resampling) ``` -Setting a timeout works but the `mirai` must be stopped with `stop_mirai()` + + +**Can `mirai` record warnings? Is this valueable for mirai?** + For `callr` we catch warnings and write them to a log file. + We parse the log file after the process has finished. + Not sure why we do this. Maybe to record warnings in case of segfaults? + +**Could `mirai` enforce a memory limit?** + This would be very useful for AutML. -Can `mirai` record warnings? -Could `mirai` enforce a memory limit? +**Do I need to use `daemons()`? Or is a daemon automatically started when I call `mirai::mirai()`?** + Already in there. + 3. Combine parallelization and encapsulation with `mirai` @@ -233,6 +243,7 @@ mirai + mirai ```{r} library(mirai) +library(mlr3) learner = lrn("classif.rpart") learner$encapsulate("mirai", fallback = lrn("classif.featureless")) From 9ae87a44d8ac4471a6efc667a646e61111368c2d Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 22 May 2025 11:17:25 +0200 Subject: [PATCH 05/22] ... 
--- DESCRIPTION | 2 + NAMESPACE | 2 + R/benchmark_mirai.R | 223 +++++++++++++++++++++++++++++++++++++++++ attic/mirai.qmd | 34 +++++++ man/BenchmarkResult.Rd | 3 +- man/ResampleResult.Rd | 3 +- man/benchmark.Rd | 3 +- man/benchmark_grid.Rd | 3 +- man/benchmark_mirai.Rd | 206 +++++++++++++++++++++++++++++++++++++ man/resample.Rd | 3 +- man/resample_mirai.Rd | 179 +++++++++++++++++++++++++++++++++ 11 files changed, 656 insertions(+), 5 deletions(-) create mode 100644 R/benchmark_mirai.R create mode 100644 man/benchmark_mirai.Rd create mode 100644 man/resample_mirai.Rd diff --git a/DESCRIPTION b/DESCRIPTION index c79f908d6..334ffe6a3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -184,6 +184,7 @@ Collate: 'auto_convert.R' 'benchmark.R' 'benchmark_grid.R' + 'benchmark_mirai.R' 'bibentries.R' 'default_fallback.R' 'default_measures.R' @@ -202,6 +203,7 @@ Collate: 'predict.R' 'reexports.R' 'resample.R' + 'resample_mirai.R' 'score_roc_measures.R' 'set_threads.R' 'set_validate.R' diff --git a/NAMESPACE b/NAMESPACE index 8e3b86a46..59be2beb7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -226,6 +226,7 @@ export(assert_validate) export(auto_convert) export(benchmark) export(benchmark_grid) +export(benchmark_mirai) export(callback_resample) export(check_prediction_data) export(clbk) @@ -258,6 +259,7 @@ export(msr) export(msrs) export(partition) export(resample) +export(resample_mirai) export(rsmp) export(rsmps) export(score_roc_measures) diff --git a/R/benchmark_mirai.R b/R/benchmark_mirai.R new file mode 100644 index 000000000..39619205a --- /dev/null +++ b/R/benchmark_mirai.R @@ -0,0 +1,223 @@ +#' @title Benchmark Multiple Learners on Multiple Tasks +#' +#' @description +#' Runs a benchmark on arbitrary combinations of tasks ([Task]), learners ([Learner]), and resampling strategies ([Resampling]), possibly in parallel. +#' +#' For large-scale benchmarking we recommend to use the \CRANpkg{mlr3batchmark} package. 
+#' This package runs benchmark experiments on high-performance computing clusters and handles failed experiments. +#' +#' @param design ([data.frame()])\cr +#' Data frame (or [data.table::data.table()]) with three columns: "task", "learner", and "resampling". +#' Each row defines a resampling by providing a [Task], [Learner] and an instantiated [Resampling] strategy. +#' The helper function [benchmark_grid()] can assist in generating an exhaustive design (see examples) and +#' instantiate the [Resampling]s per [Task]. +#' Additionally, you can set the additional column 'param_values', see [benchmark_grid()]. +#' @template param_store_models +#' @template param_store_backends +#' @template param_encapsulate +#' @template param_allow_hotstart +#' @template param_clone +#' @template param_unmarshal +#' @template param_callbacks +#' +#' @return [BenchmarkResult]. +#' +#' @note +#' The fitted models are discarded after the predictions have been scored in order to reduce memory consumption. +#' If you need access to the models for later analysis, set `store_models` to `TRUE`. 
+#' +#' @template section_predict_sets +#' @template section_parallelization +#' @template section_progress_bars +#' @template section_logging +#' +#' @template seealso_benchmark +#' @export +#' @examples +#' # benchmarking with benchmark_grid() +#' tasks = lapply(c("penguins", "sonar"), tsk) +#' learners = lapply(c("classif.featureless", "classif.rpart"), lrn) +#' resamplings = rsmp("cv", folds = 3) +#' +#' design = benchmark_grid(tasks, learners, resamplings) +#' print(design) +#' +#' set.seed(123) +#' bmr = benchmark(design) +#' +#' ## Data of all resamplings +#' head(as.data.table(bmr)) +#' +#' ## Aggregated performance values +#' aggr = bmr$aggregate() +#' print(aggr) +#' +#' ## Extract predictions of first resampling result +#' rr = aggr$resample_result[[1]] +#' as.data.table(rr$prediction()) +#' +#' # Benchmarking with a custom design: +#' # - fit classif.featureless on penguins with a 3-fold CV +#' # - fit classif.rpart on sonar using a holdout +#' tasks = list(tsk("penguins"), tsk("sonar")) +#' learners = list(lrn("classif.featureless"), lrn("classif.rpart")) +#' resamplings = list(rsmp("cv", folds = 3), rsmp("holdout")) +#' +#' design = data.table::data.table( +#' task = tasks, +#' learner = learners, +#' resampling = resamplings +#' ) +#' +#' ## Instantiate resamplings +#' design$resampling = Map( +#' function(task, resampling) resampling$clone()$instantiate(task), +#' task = design$task, resampling = design$resampling +#' ) +#' +#' ## Run benchmark +#' bmr = benchmark(design) +#' print(bmr) +#' +#' ## Get the training set of the 2nd iteration of the featureless learner on penguins +#' rr = bmr$aggregate()[learner_id == "classif.featureless"]$resample_result[[1]] +#' rr$resampling$train_set(2) +benchmark_mirai = function(design, store_models = FALSE, store_backends = TRUE, encapsulate = NA_character_, allow_hotstart = FALSE, clone = c("task", "learner", "resampling"), unmarshal = TRUE, callbacks = NULL) { + assert_subset(clone, c("task", "learner", 
"resampling")) + assert_data_frame(design, min.rows = 1L) + assert_names(names(design), must.include = c("task", "learner", "resampling")) + assert_flag(unmarshal) + design$task = list(assert_tasks(as_tasks(design$task))) + design$learner = list(assert_learners(as_learners(design$learner))) + design$resampling = list(assert_resamplings(as_resamplings(design$resampling), instantiated = TRUE)) + if (is.null(design$param_values)) { + design$param_values = list() + } else { + design$param_values = list(assert_param_values(design$param_values, n_learners = length(design$learner))) + } + assert_flag(store_models) + assert_flag(store_backends) + callbacks = assert_callbacks(as_callbacks(callbacks)) + + # check for multiple task types + task_types = unique(map_chr(design$task, "task_type")) + if (length(task_types) > 1) { + stopf("Multiple task types detected, but mixing types is not supported: %s", str_collapse(task_types)) + } + learner_types = unique(map_chr(design$learner, "task_type")) + if (length(learner_types) > 1) { + stopf("Multiple learner types detected, but mixing types is not supported: %s", str_collapse(learner_types)) + } + + setDT(design) + task = learner = resampling = NULL + if ("task" %chin% clone) { + design[, "task" := list(list(task[[1L]]$clone())), by = list(hashes(task))] + } + if ("learner" %chin% clone) { + design[, "learner" := list(list(learner[[1L]]$clone())), by = list(hashes(learner))] + } + if ("resampling" %chin% clone) { + design[, "resampling" := list(list(resampling[[1L]]$clone())), by = list(hashes(resampling))] + } + + # set encapsulation + fallback + set_encapsulation(design$learner, encapsulate) + + # expand the design: add rows for each resampling iteration and param_values + grid = pmap_dtr(design, function(task, learner, resampling, param_values) { + iters = resampling$iters + n_params = max(1L, length(param_values)) + # insert constant values + param_values = map(param_values, function(values) 
insert_named(learner$param_set$values, values)) + assert_learnable(task, learner, unlist(param_values, recursive = FALSE)) + + # check that all row ids of the resampling are present in the task + if (resampling$task_row_hash != task$row_hash) { + stopf("Resampling '%s' is not instantiated on task '%s'", resampling$id, task$id) + } + + data.table( + task = list(task), learner = list(learner), resampling = list(resampling), + iteration = rep(seq_len(iters), times = n_params), + param_values = if (is.null(param_values)) list() else rep(param_values, each = iters), + uhash = rep(UUIDgenerate(n = n_params), each = iters) + ) + }) + + n = nrow(grid) + + # set default mode + set(grid, j = "mode", value = "train") + + lg$info("Running benchmark with %i resampling iterations", n) + pb = if (isNamespaceLoaded("progressr")) { + # NB: the progress bar needs to be created in this env + pb = progressr::progressor(steps = n) + } else { + NULL + } + + # add hot start learners + if (allow_hotstart) { + hotstart_grid = pmap_dtr(grid, function(task, learner, resampling, iteration, ...) 
{ + if (!is.null(learner$hotstart_stack)) { + # search for hotstart learner + learner = learner$clone() + task_hashes = resampling_task_hashes(task, resampling, learner) + start_learner = get_private(learner$hotstart_stack)$.start_learner(learner, task_hashes[iteration]) + } + if (is.null(learner$hotstart_stack) || is.null(start_learner)) { + # no hotstart learners stored or no adaptable model found + mode = "train" + } else { + # hotstart learner found + start_learner$param_set$values = insert_named(start_learner$param_set$values, learner$param_set$values) + learner = start_learner + mode = "hotstart" + } + data.table(learner = list(learner), mode = mode) + }) + + # null hotstart stack to reduce overhead in parallelization + walk(hotstart_grid$learner, function(learner) { + learner$hotstart_stack = NULL + learner + }) + set(grid, j = "learner", value = hotstart_grid$learner) + set(grid, j = "mode", value = hotstart_grid$mode) + } + + # fixme + uhashes = grid$uhash + grid$uhash = NULL + + res = mirai::collect_mirai(mirai::mirai_map(grid,workhorse, .args = list( + store_models = store_models, + lgr_index = lgr::logger_index(), + pb = pb, + unmarshal = unmarshal, + callbacks = callbacks))) + + grid = insert_named(grid, list( + learner_state = map(res, "learner_state"), + prediction = map(res, "prediction"), + param_values = map(res, "param_values"), + learner_hash = map_chr(res, "learner_hash") + )) + grid$uhash = uhashes + + lg$info("Finished benchmark") + + set(grid, j = "mode", value = NULL) + + data_extra = if (length(callbacks) && any(map_lgl(res, function(x) !is.null(x$data_extra)))) map(res, "data_extra") + + result_data = ResultData$new(grid, data_extra, store_backends = store_backends) + + if (unmarshal && store_models) { + result_data$unmarshal() + } + + BenchmarkResult$new(result_data) +} diff --git a/attic/mirai.qmd b/attic/mirai.qmd index 6393925b3..51a46c05f 100644 --- a/attic/mirai.qmd +++ b/attic/mirai.qmd @@ -301,3 +301,37 @@ 
options(mlr3.exec_chunk_bins = 5) system.time({rr = resample_mirai(task, learner, resample)}) ``` +# Nested Parallelization + +```{r} +library(mirai) + +daemons(0) +daemons(5) + +status() + +everywhere({ + mirai::daemons(2) +}) + +x = mirai::mirai({mirai::status()}) +x$data + +status() +``` + +# Benchmark in Parallel + +```{r} +library(mirai) +library(mlr3) + +learner = lrn("classif.rpart") +task = tsk("pima") +resampling = rsmp("cv", folds = 3) + +grid = benchmark_grid(list(tsk("spam"), tsk("sonar")), learner, resampling) + +bmr = benchmark_mirai(grid) +``` diff --git a/man/BenchmarkResult.Rd b/man/BenchmarkResult.Rd index 25361e337..5a71e70dc 100644 --- a/man/BenchmarkResult.Rd +++ b/man/BenchmarkResult.Rd @@ -122,7 +122,8 @@ bmr$set_threshold(0.7, uhashes = uhashes(bmr, learner_ids = "classif.featureless Other benchmark: \code{\link{benchmark}()}, -\code{\link{benchmark_grid}()} +\code{\link{benchmark_grid}()}, +\code{\link{benchmark_mirai}()} } \concept{benchmark} \section{Active bindings}{ diff --git a/man/ResampleResult.Rd b/man/ResampleResult.Rd index ee9ca4abe..184cc7180 100644 --- a/man/ResampleResult.Rd +++ b/man/ResampleResult.Rd @@ -53,7 +53,8 @@ rr$errors } Other resample: -\code{\link{resample}()} +\code{\link{resample}()}, +\code{\link{resample_mirai}()} } \concept{resample} \section{Active bindings}{ diff --git a/man/benchmark.Rd b/man/benchmark.Rd index 23e9063c7..d46ead09c 100644 --- a/man/benchmark.Rd +++ b/man/benchmark.Rd @@ -200,6 +200,7 @@ rr$resampling$train_set(2) Other benchmark: \code{\link{BenchmarkResult}}, -\code{\link{benchmark_grid}()} +\code{\link{benchmark_grid}()}, +\code{\link{benchmark_mirai}()} } \concept{benchmark} diff --git a/man/benchmark_grid.Rd b/man/benchmark_grid.Rd index db72c5613..6e2a720fd 100644 --- a/man/benchmark_grid.Rd +++ b/man/benchmark_grid.Rd @@ -113,6 +113,7 @@ benchmark(grid) Other benchmark: \code{\link{BenchmarkResult}}, -\code{\link{benchmark}()} +\code{\link{benchmark}()}, 
+\code{\link{benchmark_mirai}()} } \concept{benchmark} diff --git a/man/benchmark_mirai.Rd b/man/benchmark_mirai.Rd new file mode 100644 index 000000000..75d1559a8 --- /dev/null +++ b/man/benchmark_mirai.Rd @@ -0,0 +1,206 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/benchmark_mirai.R +\name{benchmark_mirai} +\alias{benchmark_mirai} +\title{Benchmark Multiple Learners on Multiple Tasks} +\usage{ +benchmark_mirai( + design, + store_models = FALSE, + store_backends = TRUE, + encapsulate = NA_character_, + allow_hotstart = FALSE, + clone = c("task", "learner", "resampling"), + unmarshal = TRUE, + callbacks = NULL +) +} +\arguments{ +\item{design}{(\code{\link[=data.frame]{data.frame()}})\cr +Data frame (or \code{\link[data.table:data.table]{data.table::data.table()}}) with three columns: "task", "learner", and "resampling". +Each row defines a resampling by providing a \link{Task}, \link{Learner} and an instantiated \link{Resampling} strategy. +The helper function \code{\link[=benchmark_grid]{benchmark_grid()}} can assist in generating an exhaustive design (see examples) and +instantiate the \link{Resampling}s per \link{Task}. +Additionally, you can set the additional column 'param_values', see \code{\link[=benchmark_grid]{benchmark_grid()}}.} + +\item{store_models}{(\code{logical(1)})\cr +Store the fitted model in the resulting object= +Set to \code{TRUE} if you want to further analyse the models or want to +extract information like variable importance.} + +\item{store_backends}{(\code{logical(1)})\cr +Keep the \link{DataBackend} of the \link{Task} in the \link{ResampleResult}? +Set to \code{TRUE} if your performance measures require a \link{Task}, +or to analyse results more conveniently. +Set to \code{FALSE} to reduce the file size and memory footprint +after serialization. 
+The current default is \code{TRUE}, but this eventually will be changed +in a future release.} + +\item{encapsulate}{(\code{character(1)})\cr +If not \code{NA}, enables encapsulation by setting the field +\code{Learner$encapsulate} to one of the supported values: +\code{"none"} (disable encapsulation), +\code{"try"} (captures errors but output is printed to the console and not logged), +\code{"evaluate"} (execute via \CRANpkg{evaluate}) and +\code{"callr"} (start in external session via \CRANpkg{callr}). +If \code{NA}, encapsulation is not changed, i.e. the settings of the +individual learner are active. +Additionally, if encapsulation is set to \code{"evaluate"} or \code{"callr"}, +the fallback learner is set to the featureless learner if the learner +does not already have a fallback configured.} + +\item{allow_hotstart}{(\code{logical(1)})\cr +Determines if learner(s) are hot started with trained models in +\verb{$hotstart_stack}. See also \link{HotstartStack}.} + +\item{clone}{(\code{character()})\cr +Select the input objects to be cloned before proceeding by +providing a set with possible values \code{"task"}, \code{"learner"} and +\code{"resampling"} for \link{Task}, \link{Learner} and \link{Resampling}, respectively. +Per default, all input objects are cloned.} + +\item{unmarshal}{\code{\link{Learner}}\cr +Whether to unmarshal learners that were marshaled during the execution. +If \code{TRUE} all models are stored in unmarshaled form. +If \code{FALSE}, all learners (that need marshaling) are stored in marshaled form.} + +\item{callbacks}{(List of \link[mlr3misc:Callback]{mlr3misc::Callback})\cr +Callbacks to be executed during the resampling process. +See \link{CallbackResample} and \link{ContextResample} for details.} +} +\value{ +\link{BenchmarkResult}. +} +\description{ +Runs a benchmark on arbitrary combinations of tasks (\link{Task}), learners (\link{Learner}), and resampling strategies (\link{Resampling}), possibly in parallel. 
+ +For large-scale benchmarking we recommend to use the \CRANpkg{mlr3batchmark} package. +This package runs benchmark experiments on high-performance computing clusters and handles failed experiments. +} +\note{ +The fitted models are discarded after the predictions have been scored in order to reduce memory consumption. +If you need access to the models for later analysis, set \code{store_models} to \code{TRUE}. +} +\section{Predict Sets}{ + +If you want to compare the performance of a learner on the training with the performance +on the test set, you have to configure the \link{Learner} to predict on multiple sets by +setting the field \code{predict_sets} to \code{c("train", "test")} (default is \code{"test"}). +Each set yields a separate \link{Prediction} object during resampling. +In the next step, you have to configure the measures to operate on the respective Prediction object: + +\if{html}{\out{
}}\preformatted{m1 = msr("classif.ce", id = "ce.train", predict_sets = "train") +m2 = msr("classif.ce", id = "ce.test", predict_sets = "test") +}\if{html}{\out{
}} + +The (list of) created measures can finally be passed to \verb{$aggregate()} or \verb{$score()}. +} + +\section{Parallelization}{ + + +This function can be parallelized with the \CRANpkg{future} package. +One job is one resampling iteration, and all jobs are send to an apply function +from \CRANpkg{future.apply} in a single batch. +To select a parallel backend, use \code{\link[future:plan]{future::plan()}}. +More on parallelization can be found in the book: +\url{https://mlr3book.mlr-org.com/chapters/chapter10/advanced_technical_aspects_of_mlr3.html} +} + +\section{Progress Bars}{ + +This function supports progress bars via the package \CRANpkg{progressr}. +Simply wrap the function call in \code{\link[progressr:with_progress]{progressr::with_progress()}} to enable them. +Alternatively, call \code{\link[progressr:handlers]{progressr::handlers()}} with \code{global = TRUE} to enable progress bars +globally. +We recommend the \CRANpkg{progress} package as backend which can be enabled with +\code{progressr::handlers("progress")}. +} + +\section{Logging}{ + + +The \CRANpkg{mlr3} uses the \CRANpkg{lgr} package for logging. +\CRANpkg{lgr} supports multiple log levels which can be queried with +\code{getOption("lgr.log_levels")}. + +To suppress output and reduce verbosity, you can lower the log from the +default level \code{"info"} to \code{"warn"}: + +\if{html}{\out{
}}\preformatted{lgr::get_logger("mlr3")$set_threshold("warn") +}\if{html}{\out{
}} + +To get additional log output for debugging, increase the log level to \code{"debug"} +or \code{"trace"}: + +\if{html}{\out{
}}\preformatted{lgr::get_logger("mlr3")$set_threshold("debug") +}\if{html}{\out{
}} + +To log to a file or a data base, see the documentation of \link[lgr:lgr-package]{lgr::lgr-package}. +} + +\examples{ +# benchmarking with benchmark_grid() +tasks = lapply(c("penguins", "sonar"), tsk) +learners = lapply(c("classif.featureless", "classif.rpart"), lrn) +resamplings = rsmp("cv", folds = 3) + +design = benchmark_grid(tasks, learners, resamplings) +print(design) + +set.seed(123) +bmr = benchmark(design) + +## Data of all resamplings +head(as.data.table(bmr)) + +## Aggregated performance values +aggr = bmr$aggregate() +print(aggr) + +## Extract predictions of first resampling result +rr = aggr$resample_result[[1]] +as.data.table(rr$prediction()) + +# Benchmarking with a custom design: +# - fit classif.featureless on penguins with a 3-fold CV +# - fit classif.rpart on sonar using a holdout +tasks = list(tsk("penguins"), tsk("sonar")) +learners = list(lrn("classif.featureless"), lrn("classif.rpart")) +resamplings = list(rsmp("cv", folds = 3), rsmp("holdout")) + +design = data.table::data.table( + task = tasks, + learner = learners, + resampling = resamplings +) + +## Instantiate resamplings +design$resampling = Map( + function(task, resampling) resampling$clone()$instantiate(task), + task = design$task, resampling = design$resampling +) + +## Run benchmark +bmr = benchmark(design) +print(bmr) + +## Get the training set of the 2nd iteration of the featureless learner on penguins +rr = bmr$aggregate()[learner_id == "classif.featureless"]$resample_result[[1]] +rr$resampling$train_set(2) +} +\seealso{ +\itemize{ +\item Chapter in the \href{https://mlr3book.mlr-org.com/}{mlr3book}: +\url{https://mlr3book.mlr-org.com/chapters/chapter3/evaluation_and_benchmarking.html#sec-benchmarking} +\item Package \CRANpkg{mlr3viz} for some generic visualizations. +\item \CRANpkg{mlr3benchmark} for post-hoc analysis of benchmark results. 
+} + +Other benchmark: +\code{\link{BenchmarkResult}}, +\code{\link{benchmark}()}, +\code{\link{benchmark_grid}()} +} +\concept{benchmark} diff --git a/man/resample.Rd b/man/resample.Rd index d5e10af0d..19a86de2a 100644 --- a/man/resample.Rd +++ b/man/resample.Rd @@ -178,6 +178,7 @@ print(bmr1$combine(bmr2)) } Other resample: -\code{\link{ResampleResult}} +\code{\link{ResampleResult}}, +\code{\link{resample_mirai}()} } \concept{resample} diff --git a/man/resample_mirai.Rd b/man/resample_mirai.Rd new file mode 100644 index 000000000..069043511 --- /dev/null +++ b/man/resample_mirai.Rd @@ -0,0 +1,179 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/resample_mirai.R +\name{resample_mirai} +\alias{resample_mirai} +\title{Resample a Learner on a Task} +\usage{ +resample_mirai( + task, + learner, + resampling, + store_models = FALSE, + store_backends = TRUE, + encapsulate = NA_character_, + allow_hotstart = FALSE, + clone = c("task", "learner", "resampling"), + unmarshal = TRUE +) +} +\arguments{ +\item{task}{(\link{Task}).} + +\item{learner}{(\link{Learner}).} + +\item{resampling}{(\link{Resampling}).} + +\item{store_models}{(\code{logical(1)})\cr +Store the fitted model in the resulting object= +Set to \code{TRUE} if you want to further analyse the models or want to +extract information like variable importance.} + +\item{store_backends}{(\code{logical(1)})\cr +Keep the \link{DataBackend} of the \link{Task} in the \link{ResampleResult}? +Set to \code{TRUE} if your performance measures require a \link{Task}, +or to analyse results more conveniently. +Set to \code{FALSE} to reduce the file size and memory footprint +after serialization. 
+The current default is \code{TRUE}, but this eventually will be changed +in a future release.} + +\item{encapsulate}{(\code{character(1)})\cr +If not \code{NA}, enables encapsulation by setting the field +\code{Learner$encapsulate} to one of the supported values: +\code{"none"} (disable encapsulation), +\code{"try"} (captures errors but output is printed to the console and not logged), +\code{"evaluate"} (execute via \CRANpkg{evaluate}) and +\code{"callr"} (start in external session via \CRANpkg{callr}). +If \code{NA}, encapsulation is not changed, i.e. the settings of the +individual learner are active. +Additionally, if encapsulation is set to \code{"evaluate"} or \code{"callr"}, +the fallback learner is set to the featureless learner if the learner +does not already have a fallback configured.} + +\item{allow_hotstart}{(\code{logical(1)})\cr +Determines if learner(s) are hot started with trained models in +\verb{$hotstart_stack}. See also \link{HotstartStack}.} + +\item{clone}{(\code{character()})\cr +Select the input objects to be cloned before proceeding by +providing a set with possible values \code{"task"}, \code{"learner"} and +\code{"resampling"} for \link{Task}, \link{Learner} and \link{Resampling}, respectively. +Per default, all input objects are cloned.} + +\item{unmarshal}{\code{\link{Learner}}\cr +Whether to unmarshal learners that were marshaled during the execution. +If \code{TRUE} all models are stored in unmarshaled form. +If \code{FALSE}, all learners (that need marshaling) are stored in marshaled form.} +} +\value{ +\link{ResampleResult}. +} +\description{ +Runs a resampling (possibly in parallel): +Repeatedly apply \link{Learner} \code{learner} on a training set of \link{Task} \code{task} to train a model, +then use the trained model to predict observations of a test set. +Training and test sets are defined by the \link{Resampling} \code{resampling}. 
+} +\note{ +The fitted models are discarded after the predictions have been computed in order to reduce memory consumption. +If you need access to the models for later analysis, set \code{store_models} to \code{TRUE}. +} +\section{Predict Sets}{ + +If you want to compare the performance of a learner on the training with the performance +on the test set, you have to configure the \link{Learner} to predict on multiple sets by +setting the field \code{predict_sets} to \code{c("train", "test")} (default is \code{"test"}). +Each set yields a separate \link{Prediction} object during resampling. +In the next step, you have to configure the measures to operate on the respective Prediction object: + +\if{html}{\out{
}}\preformatted{m1 = msr("classif.ce", id = "ce.train", predict_sets = "train") +m2 = msr("classif.ce", id = "ce.test", predict_sets = "test") +}\if{html}{\out{
}} + +The (list of) created measures can finally be passed to \verb{$aggregate()} or \verb{$score()}. +} + +\section{Parallelization}{ + + +This function can be parallelized with the \CRANpkg{future} package. +One job is one resampling iteration, and all jobs are send to an apply function +from \CRANpkg{future.apply} in a single batch. +To select a parallel backend, use \code{\link[future:plan]{future::plan()}}. +More on parallelization can be found in the book: +\url{https://mlr3book.mlr-org.com/chapters/chapter10/advanced_technical_aspects_of_mlr3.html} +} + +\section{Progress Bars}{ + +This function supports progress bars via the package \CRANpkg{progressr}. +Simply wrap the function call in \code{\link[progressr:with_progress]{progressr::with_progress()}} to enable them. +Alternatively, call \code{\link[progressr:handlers]{progressr::handlers()}} with \code{global = TRUE} to enable progress bars +globally. +We recommend the \CRANpkg{progress} package as backend which can be enabled with +\code{progressr::handlers("progress")}. +} + +\section{Logging}{ + + +The \CRANpkg{mlr3} uses the \CRANpkg{lgr} package for logging. +\CRANpkg{lgr} supports multiple log levels which can be queried with +\code{getOption("lgr.log_levels")}. + +To suppress output and reduce verbosity, you can lower the log from the +default level \code{"info"} to \code{"warn"}: + +\if{html}{\out{
}}\preformatted{lgr::get_logger("mlr3")$set_threshold("warn") +}\if{html}{\out{
}} + +To get additional log output for debugging, increase the log level to \code{"debug"} +or \code{"trace"}: + +\if{html}{\out{
}}\preformatted{lgr::get_logger("mlr3")$set_threshold("debug") +}\if{html}{\out{
}} + +To log to a file or a data base, see the documentation of \link[lgr:lgr-package]{lgr::lgr-package}. +} + +\examples{ +task = tsk("penguins") +learner = lrn("classif.rpart") +resampling = rsmp("cv") + +# Explicitly instantiate the resampling for this task for reproduciblity +set.seed(123) +resampling$instantiate(task) + +rr = resample(task, learner, resampling) +print(rr) + +# Retrieve performance +rr$score(msr("classif.ce")) +rr$aggregate(msr("classif.ce")) + +# merged prediction objects of all resampling iterations +pred = rr$prediction() +pred$confusion + +# Repeat resampling with featureless learner +rr_featureless = resample(task, lrn("classif.featureless"), resampling) + +# Convert results to BenchmarkResult, then combine them +bmr1 = as_benchmark_result(rr) +bmr2 = as_benchmark_result(rr_featureless) +print(bmr1$combine(bmr2)) +} +\seealso{ +\itemize{ +\item \code{\link[=as_benchmark_result]{as_benchmark_result()}} to convert to a \link{BenchmarkResult}. +\item Chapter in the \href{https://mlr3book.mlr-org.com/}{mlr3book}: +\url{https://mlr3book.mlr-org.com/chapters/chapter3/evaluation_and_benchmarking.html#sec-resampling} +\item Package \CRANpkg{mlr3viz} for some generic visualizations. +} + +Other resample: +\code{\link{ResampleResult}}, +\code{\link{resample}()} +} +\concept{resample} From 67e6031cc68e524d922ab01828f92a566f86bd24 Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 22 May 2025 12:43:05 +0200 Subject: [PATCH 06/22] ... 
--- DESCRIPTION | 1 + R/benchmark_mirai.R | 223 -------------------------------------------- R/helper_exec.R | 3 + R/resample_mirai.R | 142 ---------------------------- 4 files changed, 4 insertions(+), 365 deletions(-) delete mode 100644 R/benchmark_mirai.R delete mode 100644 R/resample_mirai.R diff --git a/DESCRIPTION b/DESCRIPTION index 334ffe6a3..b74e1c912 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -67,6 +67,7 @@ Suggests: codetools, datasets, future.callr, + mirai, mlr3data, progressr, remotes, diff --git a/R/benchmark_mirai.R b/R/benchmark_mirai.R deleted file mode 100644 index 39619205a..000000000 --- a/R/benchmark_mirai.R +++ /dev/null @@ -1,223 +0,0 @@ -#' @title Benchmark Multiple Learners on Multiple Tasks -#' -#' @description -#' Runs a benchmark on arbitrary combinations of tasks ([Task]), learners ([Learner]), and resampling strategies ([Resampling]), possibly in parallel. -#' -#' For large-scale benchmarking we recommend to use the \CRANpkg{mlr3batchmark} package. -#' This package runs benchmark experiments on high-performance computing clusters and handles failed experiments. -#' -#' @param design ([data.frame()])\cr -#' Data frame (or [data.table::data.table()]) with three columns: "task", "learner", and "resampling". -#' Each row defines a resampling by providing a [Task], [Learner] and an instantiated [Resampling] strategy. -#' The helper function [benchmark_grid()] can assist in generating an exhaustive design (see examples) and -#' instantiate the [Resampling]s per [Task]. -#' Additionally, you can set the additional column 'param_values', see [benchmark_grid()]. -#' @template param_store_models -#' @template param_store_backends -#' @template param_encapsulate -#' @template param_allow_hotstart -#' @template param_clone -#' @template param_unmarshal -#' @template param_callbacks -#' -#' @return [BenchmarkResult]. -#' -#' @note -#' The fitted models are discarded after the predictions have been scored in order to reduce memory consumption. 
-#' If you need access to the models for later analysis, set `store_models` to `TRUE`. -#' -#' @template section_predict_sets -#' @template section_parallelization -#' @template section_progress_bars -#' @template section_logging -#' -#' @template seealso_benchmark -#' @export -#' @examples -#' # benchmarking with benchmark_grid() -#' tasks = lapply(c("penguins", "sonar"), tsk) -#' learners = lapply(c("classif.featureless", "classif.rpart"), lrn) -#' resamplings = rsmp("cv", folds = 3) -#' -#' design = benchmark_grid(tasks, learners, resamplings) -#' print(design) -#' -#' set.seed(123) -#' bmr = benchmark(design) -#' -#' ## Data of all resamplings -#' head(as.data.table(bmr)) -#' -#' ## Aggregated performance values -#' aggr = bmr$aggregate() -#' print(aggr) -#' -#' ## Extract predictions of first resampling result -#' rr = aggr$resample_result[[1]] -#' as.data.table(rr$prediction()) -#' -#' # Benchmarking with a custom design: -#' # - fit classif.featureless on penguins with a 3-fold CV -#' # - fit classif.rpart on sonar using a holdout -#' tasks = list(tsk("penguins"), tsk("sonar")) -#' learners = list(lrn("classif.featureless"), lrn("classif.rpart")) -#' resamplings = list(rsmp("cv", folds = 3), rsmp("holdout")) -#' -#' design = data.table::data.table( -#' task = tasks, -#' learner = learners, -#' resampling = resamplings -#' ) -#' -#' ## Instantiate resamplings -#' design$resampling = Map( -#' function(task, resampling) resampling$clone()$instantiate(task), -#' task = design$task, resampling = design$resampling -#' ) -#' -#' ## Run benchmark -#' bmr = benchmark(design) -#' print(bmr) -#' -#' ## Get the training set of the 2nd iteration of the featureless learner on penguins -#' rr = bmr$aggregate()[learner_id == "classif.featureless"]$resample_result[[1]] -#' rr$resampling$train_set(2) -benchmark_mirai = function(design, store_models = FALSE, store_backends = TRUE, encapsulate = NA_character_, allow_hotstart = FALSE, clone = c("task", "learner", "resampling"), 
unmarshal = TRUE, callbacks = NULL) { - assert_subset(clone, c("task", "learner", "resampling")) - assert_data_frame(design, min.rows = 1L) - assert_names(names(design), must.include = c("task", "learner", "resampling")) - assert_flag(unmarshal) - design$task = list(assert_tasks(as_tasks(design$task))) - design$learner = list(assert_learners(as_learners(design$learner))) - design$resampling = list(assert_resamplings(as_resamplings(design$resampling), instantiated = TRUE)) - if (is.null(design$param_values)) { - design$param_values = list() - } else { - design$param_values = list(assert_param_values(design$param_values, n_learners = length(design$learner))) - } - assert_flag(store_models) - assert_flag(store_backends) - callbacks = assert_callbacks(as_callbacks(callbacks)) - - # check for multiple task types - task_types = unique(map_chr(design$task, "task_type")) - if (length(task_types) > 1) { - stopf("Multiple task types detected, but mixing types is not supported: %s", str_collapse(task_types)) - } - learner_types = unique(map_chr(design$learner, "task_type")) - if (length(learner_types) > 1) { - stopf("Multiple learner types detected, but mixing types is not supported: %s", str_collapse(learner_types)) - } - - setDT(design) - task = learner = resampling = NULL - if ("task" %chin% clone) { - design[, "task" := list(list(task[[1L]]$clone())), by = list(hashes(task))] - } - if ("learner" %chin% clone) { - design[, "learner" := list(list(learner[[1L]]$clone())), by = list(hashes(learner))] - } - if ("resampling" %chin% clone) { - design[, "resampling" := list(list(resampling[[1L]]$clone())), by = list(hashes(resampling))] - } - - # set encapsulation + fallback - set_encapsulation(design$learner, encapsulate) - - # expand the design: add rows for each resampling iteration and param_values - grid = pmap_dtr(design, function(task, learner, resampling, param_values) { - iters = resampling$iters - n_params = max(1L, length(param_values)) - # insert constant values - 
param_values = map(param_values, function(values) insert_named(learner$param_set$values, values)) - assert_learnable(task, learner, unlist(param_values, recursive = FALSE)) - - # check that all row ids of the resampling are present in the task - if (resampling$task_row_hash != task$row_hash) { - stopf("Resampling '%s' is not instantiated on task '%s'", resampling$id, task$id) - } - - data.table( - task = list(task), learner = list(learner), resampling = list(resampling), - iteration = rep(seq_len(iters), times = n_params), - param_values = if (is.null(param_values)) list() else rep(param_values, each = iters), - uhash = rep(UUIDgenerate(n = n_params), each = iters) - ) - }) - - n = nrow(grid) - - # set default mode - set(grid, j = "mode", value = "train") - - lg$info("Running benchmark with %i resampling iterations", n) - pb = if (isNamespaceLoaded("progressr")) { - # NB: the progress bar needs to be created in this env - pb = progressr::progressor(steps = n) - } else { - NULL - } - - # add hot start learners - if (allow_hotstart) { - hotstart_grid = pmap_dtr(grid, function(task, learner, resampling, iteration, ...) 
{ - if (!is.null(learner$hotstart_stack)) { - # search for hotstart learner - learner = learner$clone() - task_hashes = resampling_task_hashes(task, resampling, learner) - start_learner = get_private(learner$hotstart_stack)$.start_learner(learner, task_hashes[iteration]) - } - if (is.null(learner$hotstart_stack) || is.null(start_learner)) { - # no hotstart learners stored or no adaptable model found - mode = "train" - } else { - # hotstart learner found - start_learner$param_set$values = insert_named(start_learner$param_set$values, learner$param_set$values) - learner = start_learner - mode = "hotstart" - } - data.table(learner = list(learner), mode = mode) - }) - - # null hotstart stack to reduce overhead in parallelization - walk(hotstart_grid$learner, function(learner) { - learner$hotstart_stack = NULL - learner - }) - set(grid, j = "learner", value = hotstart_grid$learner) - set(grid, j = "mode", value = hotstart_grid$mode) - } - - # fixme - uhashes = grid$uhash - grid$uhash = NULL - - res = mirai::collect_mirai(mirai::mirai_map(grid,workhorse, .args = list( - store_models = store_models, - lgr_index = lgr::logger_index(), - pb = pb, - unmarshal = unmarshal, - callbacks = callbacks))) - - grid = insert_named(grid, list( - learner_state = map(res, "learner_state"), - prediction = map(res, "prediction"), - param_values = map(res, "param_values"), - learner_hash = map_chr(res, "learner_hash") - )) - grid$uhash = uhashes - - lg$info("Finished benchmark") - - set(grid, j = "mode", value = NULL) - - data_extra = if (length(callbacks) && any(map_lgl(res, function(x) !is.null(x$data_extra)))) map(res, "data_extra") - - result_data = ResultData$new(grid, data_extra, store_backends = store_backends) - - if (unmarshal && store_models) { - result_data$unmarshal() - } - - BenchmarkResult$new(result_data) -} diff --git a/R/helper_exec.R b/R/helper_exec.R index cc3f95296..84b73ef7f 100644 --- a/R/helper_exec.R +++ b/R/helper_exec.R @@ -23,6 +23,9 @@ future_map = function(n, 
FUN, ..., MoreArgs = list()) { if (getOption("mlr3.debug", FALSE)) { lg$info("Running experiments sequentially in debug mode with %i iterations", n) mapply(FUN, ..., MoreArgs = MoreArgs, SIMPLIFY = FALSE, USE.NAMES = FALSE) + } else if (requireNamespace("mirai", quietly = TRUE) && mirai::status()$connections) { + lg$debug("Running resample() via mirai with %i iterations", n) + mirai::collect_mirai(mirai::mirai_map(data.table(...), workhorse, .args = c(MoreArgs, list(is_sequential = FALSE)))) } else { is_sequential = inherits(plan(), "sequential") scheduling = if (!is_sequential && isTRUE(getOption("mlr3.exec_random", TRUE))) structure(TRUE, ordering = "random") else TRUE diff --git a/R/resample_mirai.R b/R/resample_mirai.R deleted file mode 100644 index 7084f4408..000000000 --- a/R/resample_mirai.R +++ /dev/null @@ -1,142 +0,0 @@ -#' @title Resample a Learner on a Task -#' -#' @description -#' Runs a resampling (possibly in parallel): -#' Repeatedly apply [Learner] `learner` on a training set of [Task] `task` to train a model, -#' then use the trained model to predict observations of a test set. -#' Training and test sets are defined by the [Resampling] `resampling`. -#' -#' @param task ([Task]). -#' @param learner ([Learner]). -#' @param resampling ([Resampling]). -#' @template param_store_models -#' @template param_store_backends -#' @template param_encapsulate -#' @template param_allow_hotstart -#' @template param_clone -#' @template param_unmarshal -#' @return [ResampleResult]. -#' -#' @template section_predict_sets -#' @template section_parallelization -#' @template section_progress_bars -#' @template section_logging -#' -#' @note -#' The fitted models are discarded after the predictions have been computed in order to reduce memory consumption. -#' If you need access to the models for later analysis, set `store_models` to `TRUE`. 
-#' -#' @template seealso_resample -#' @export -#' @examples -#' task = tsk("penguins") -#' learner = lrn("classif.rpart") -#' resampling = rsmp("cv") -#' -#' # Explicitly instantiate the resampling for this task for reproduciblity -#' set.seed(123) -#' resampling$instantiate(task) -#' -#' rr = resample(task, learner, resampling) -#' print(rr) -#' -#' # Retrieve performance -#' rr$score(msr("classif.ce")) -#' rr$aggregate(msr("classif.ce")) -#' -#' # merged prediction objects of all resampling iterations -#' pred = rr$prediction() -#' pred$confusion -#' -#' # Repeat resampling with featureless learner -#' rr_featureless = resample(task, lrn("classif.featureless"), resampling) -#' -#' # Convert results to BenchmarkResult, then combine them -#' bmr1 = as_benchmark_result(rr) -#' bmr2 = as_benchmark_result(rr_featureless) -#' print(bmr1$combine(bmr2)) -resample_mirai = function(task, learner, resampling, store_models = FALSE, store_backends = TRUE, encapsulate = NA_character_, allow_hotstart = FALSE, clone = c("task", "learner", "resampling"), unmarshal = TRUE) { - assert_subset(clone, c("task", "learner", "resampling")) - task = assert_task(as_task(task, clone = "task" %in% clone)) - learner = assert_learner(as_learner(learner, clone = "learner" %in% clone, discard_state = TRUE)) - resampling = assert_resampling(as_resampling(resampling, clone = "resampling" %in% clone)) - assert_flag(store_models) - assert_flag(store_backends) - # this does not check the internal validation task as it might not be set yet - assert_learnable(task, learner) - assert_flag(unmarshal) - - set_encapsulation(list(learner), encapsulate) - if (!resampling$is_instantiated) { - resampling = resampling$instantiate(task) - } - - n = resampling$iters - pb = if (isNamespaceLoaded("progressr")) { - # NB: the progress bar needs to be created in this env - pb = progressr::progressor(steps = n) - } else { - NULL - } - lgr_threshold = map_int(mlr_reflections$loggers, "threshold") - - grid = if 
(allow_hotstart) { - - lg$debug("Resampling with hotstart enabled.") - - hotstart_grid = map_dtr(seq_len(n), function(iteration) { - if (!is.null(learner$hotstart_stack)) { - # search for hotstart learner - task_hashes = resampling_task_hashes(task, resampling, learner) - start_learner = get_private(learner$hotstart_stack)$.start_learner(learner$clone(), task_hashes[iteration]) - } - if (is.null(learner$hotstart_stack) || is.null(start_learner)) { - # no hotstart learners stored or no adaptable model found - lg$debug("Resampling with hotstarting not possible. No start learner found.") - mode = "train" - } else { - # hotstart learner found - lg$debug("Resampling with hotstarting.") - start_learner$param_set$values = insert_named(start_learner$param_set$values, learner$param_set$values) - learner = start_learner - mode = "hotstart" - } - data.table(learner = list(learner), mode = mode) - }) - - # null hotstart stack to reduce overhead in parallelization - walk(hotstart_grid$learner, function(learner) { - learner$hotstart_stack = NULL - learner - }) - hotstart_grid - } else { - data.table(learner = replicate(n, learner), mode = "train") - } - - grid$iteration = seq_len(n) - res = mirai::collect_mirai(mirai::mirai_map(grid, workhorse, .args = list(task = task, resampling = resampling, store_models = store_models, lgr_threshold = lgr_threshold, pb = pb, unmarshal = unmarshal))) - - data = data.table( - task = list(task), - learner = grid$learner, - learner_state = map(res, "learner_state"), - resampling = list(resampling), - iteration = seq_len(n), - prediction = map(res, "prediction"), - uhash = UUIDgenerate(), - param_values = map(res, "param_values"), - learner_hash = map_chr(res, "learner_hash") - ) - - result_data = ResultData$new(data, store_backends = store_backends) - - # the worker already ensures that models are sent back in marshaled form if unmarshal = FALSE, so we don't have - # to do anything in this case. 
This allows us to minimize the amount of marshaling in those situtions where - # the model is available in both states on the worker - if (unmarshal && store_models) { - result_data$unmarshal() - } - - ResampleResult$new(result_data) -} From c050efa7550ebe850596b58dad3c9ef2055f46c5 Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 22 May 2025 12:45:09 +0200 Subject: [PATCH 07/22] ... --- attic/mirai.qmd | 337 ------------------------------------------------ 1 file changed, 337 deletions(-) delete mode 100644 attic/mirai.qmd diff --git a/attic/mirai.qmd b/attic/mirai.qmd deleted file mode 100644 index 51a46c05f..000000000 --- a/attic/mirai.qmd +++ /dev/null @@ -1,337 +0,0 @@ ---- -title: "mirai and mlr3 Demo" -author: "Marc Becker" -date: "2025-03-27" -format: html ---- - - -1. Use `mirai` for parallelization - -```{r} -learner = lrn("classif.rpart") -task = tsk("pima") -resampling = rsmp("cv", folds = 3) - -future::plan("multisession", workers = 3) - -resample(task, learner, resampling) -``` - - -This means replacing `future_map` with `mirai_map` or promoting the `future.mirai` backend. - -**Can `mirai` replace all `future` backends?** - - -Is it difficult to install mirai on some systems? - -How does nested parallelization work? - Currently the user calls `future::plan(list("multisession", "sequential"))` - So the user can switch between inner, outer or parallelizing both loops - With `mirai` we have to call `daemons()` for the inner loop? - -2. Add `mirai` as an encapsulation option - -```{r} -learner = lrn("classif.rpart") -learner$encapsulate("callr", fallback = lrn("classif.featureless")) -task = tsk("pima") -resampling = rsmp("cv", folds = 3) - -resample(task, learner, resampling) -``` - - - - -**Can `mirai` record warnings? Is this valueable for mirai?** - For `callr` we catch warnings and write them to a log file. - We parse the log file after the process has finished. - Not sure why we do this. Maybe to record warnings in case of segfaults? 
- -**Could `mirai` enforce a memory limit?** - This would be very useful for AutML. - - -**Do I need to use `daemons()`? Or is a daemon automatically started when I call `mirai::mirai()`?** - Already in there. - - -3. Combine parallelization and encapsulation with `mirai` - -```{r} -learner = lrn("classif.rpart") -learner$encapsulate("callr", fallback = lrn("classif.featureless")) -task = tsk("pima") -resampling = rsmp("cv", folds = 3) - -future::plan("multisession", workers = 3) - -resample(task, learner, resampling) -``` - -Any disadvantages of calling a `mirai` in a `mirai`? - -# Parallelization - -Use future `multisession` backend - -```{r} -library(mlr3) -library(future) - -lgr::get_logger("mlr3")$set_threshold("warn") - -learner = lrn("classif.rpart") -task = tsk("pima") -resampling = rsmp("subsampling", repeats = 100) - -future::plan("multisession", workers = 5) - -# 3.1 seconds -microbenchmark::microbenchmark( - resample = resample(task, learner, resampling), - times = 10, - unit = "ms" -) -``` - -Use future `multisession` backend and combine resampling iterations to chunks - -```{r} -library(mlr3) -library(future) - -lgr::get_logger("mlr3")$set_threshold("warn") - -learner = lrn("classif.rpart") -task = tsk("pima") -resampling = rsmp("subsampling", repeats = 100) - -future::plan("multisession", workers = 5) - -options(mlr3.exec_chunk_bins = 5) - -# 1 second -microbenchmark::microbenchmark( - resample = resample(task, learner, resampling), - times = 10, - unit = "ms" -) -``` - -Use `future.mirai` backend - -```{r} -library(mlr3) -library(future) -library(future.mirai) - -lgr::get_logger("mlr3")$set_threshold("warn") - -learner = lrn("classif.rpart") -task = tsk("pima") -resampling = rsmp("subsampling", repeats = 100) - -mirai::daemons(0) -mirai::daemons(5) -future::plan("mirai_cluster") - -# 650 ms -microbenchmark::microbenchmark( - resample = resample(task, learner, resampling), - times = 50, - unit = "ms" -) -``` - -Replace `future_map` with `mirai_map` - 
-```{r} -library(mlr3) -library(mirai) - -learner = lrn("classif.rpart") -task = tsk("pima") -resampling = rsmp("subsampling", repeats = 100) - -daemons(0) -daemons(5) - -# 440 ms -microbenchmark::microbenchmark( - resample = resample_mirai(task, learner, resampling), - times = 50, - unit = "ms" -) -``` - -# Encapsulation - -```{r} -library(mirai) - -learner = lrn("classif.rpart") -learner$encapsulate("mirai", fallback = lrn("classif.featureless")) -task = tsk("pima") -resampling = rsmp("subsampling", repeats = 100) - -# 6 seconds -daemons(1) -status() - -system.time({rr = resample(task, learner, resampling)}) -``` - -```{r} -learner = lrn("classif.rpart") -learner$encapsulate("callr", fallback = lrn("classif.featureless")) -task = tsk("pima") -resampling = rsmp("subsampling", repeats = 100) - -# 90 seconds -system.time({rr = resample(task, learner, resampling)}) -``` - -# Encapsulation and Parallelization - -```{r} -library(mirai) -library(data.table) - -daemons(0) -daemons(3) - -# setup encapsulation daemons -everywhere({ - mirai::daemons(1) -}) - -Sys.sleep(1) - -# 3 resample iterations -x = collect_mirai(mirai_map(seq(3), function(i) { - - # train - model = mirai::collect_mirai(mirai::mirai({ - if (i == 2) stop("Simple R error") - sprintf("model_%i_%i", i, Sys.getpid()) - }, i = i)) - - # fit fallback learner - if (mirai::is_mirai_error(model)) model = sprintf("fallback_model_%i_%i", i, Sys.getpid()) - - # predict - prediction = mirai::collect_mirai(mirai::mirai({rnorm(1)})) - - data.table::data.table(model = model, prediction = prediction, pid = Sys.getpid()) -})) - -rbindlist(x) - -# model prediction pid -# -# 1: model_1_1858301 -0.2694271 1857905 -# 2: model_2_1858313 -0.3860743 1857907 -# 3: model_3_1858325 -0.0356403 1857909 -``` - -mirai + mirai - -```{r} -library(mirai) -library(mlr3) - -learner = lrn("classif.rpart") -learner$encapsulate("mirai", fallback = lrn("classif.featureless")) -task = tsk("pima") -resample = rsmp("subsampling", repeats = 100) - 
-daemons(0) -daemons(5) - -status() - -everywhere({ - mirai::daemons(1) -}) - -x = mirai::mirai({mirai::status()}) -x$data - -status() - -# 1.5 seconds -system.time({rr = resample_mirai(task, learner, resample)}) -``` - -mirai + callr - -```{r} -library(mirai) - -learner = lrn("classif.rpart") -learner$encapsulate("callr", fallback = lrn("classif.featureless")) -task = tsk("pima") -resample = rsmp("subsampling", repeats = 100) - -daemons(0) -daemons(5) - -# 20 seconds -system.time({rr = resample_mirai(task, learner, resample)}) -``` - -future + callr - -```{r} -learner = lrn("classif.rpart") -learner$encapsulate("callr", fallback = lrn("classif.featureless")) -task = tsk("pima") -resample = rsmp("subsampling", repeats = 100) - -future::plan("multisession", workers = 5) - -options(mlr3.exec_chunk_bins = 5) - -# 26 seconds -system.time({rr = resample_mirai(task, learner, resample)}) -``` - -# Nested Parallelization - -```{r} -library(mirai) - -daemons(0) -daemons(5) - -status() - -everywhere({ - mirai::daemons(2) -}) - -x = mirai::mirai({mirai::status()}) -x$data - -status() -``` - -# Benchmark in Parallel - -```{r} -library(mirai) -library(mlr3) - -learner = lrn("classif.rpart") -task = tsk("pima") -resampling = rsmp("cv", folds = 3) - -grid = benchmark_grid(list(tsk("spam"), tsk("sonar")), learner, resampling) - -bmr = benchmark_mirai(grid) -``` From 1cdcfc7a8690a3a0efc6e476f25098123c4c135f Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 22 May 2025 12:46:42 +0200 Subject: [PATCH 08/22] ... 
--- DESCRIPTION | 4 +- NAMESPACE | 2 - man/BenchmarkResult.Rd | 3 +- man/ResampleResult.Rd | 3 +- man/benchmark.Rd | 3 +- man/benchmark_grid.Rd | 3 +- man/benchmark_mirai.Rd | 206 ----------------------------------------- man/resample.Rd | 3 +- man/resample_mirai.Rd | 179 ----------------------------------- 9 files changed, 7 insertions(+), 399 deletions(-) delete mode 100644 man/benchmark_mirai.Rd delete mode 100644 man/resample_mirai.Rd diff --git a/DESCRIPTION b/DESCRIPTION index b74e1c912..f1c928024 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -74,6 +74,8 @@ Suggests: RhpcBLASctl, rpart, testthat (>= 3.2.0) +Remotes: + mlr-org/mlr3misc@mirai Encoding: UTF-8 Config/testthat/edition: 3 Config/testthat/parallel: false @@ -185,7 +187,6 @@ Collate: 'auto_convert.R' 'benchmark.R' 'benchmark_grid.R' - 'benchmark_mirai.R' 'bibentries.R' 'default_fallback.R' 'default_measures.R' @@ -204,7 +205,6 @@ Collate: 'predict.R' 'reexports.R' 'resample.R' - 'resample_mirai.R' 'score_roc_measures.R' 'set_threads.R' 'set_validate.R' diff --git a/NAMESPACE b/NAMESPACE index 59be2beb7..8e3b86a46 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -226,7 +226,6 @@ export(assert_validate) export(auto_convert) export(benchmark) export(benchmark_grid) -export(benchmark_mirai) export(callback_resample) export(check_prediction_data) export(clbk) @@ -259,7 +258,6 @@ export(msr) export(msrs) export(partition) export(resample) -export(resample_mirai) export(rsmp) export(rsmps) export(score_roc_measures) diff --git a/man/BenchmarkResult.Rd b/man/BenchmarkResult.Rd index 5a71e70dc..25361e337 100644 --- a/man/BenchmarkResult.Rd +++ b/man/BenchmarkResult.Rd @@ -122,8 +122,7 @@ bmr$set_threshold(0.7, uhashes = uhashes(bmr, learner_ids = "classif.featureless Other benchmark: \code{\link{benchmark}()}, -\code{\link{benchmark_grid}()}, -\code{\link{benchmark_mirai}()} +\code{\link{benchmark_grid}()} } \concept{benchmark} \section{Active bindings}{ diff --git a/man/ResampleResult.Rd b/man/ResampleResult.Rd 
index 184cc7180..ee9ca4abe 100644 --- a/man/ResampleResult.Rd +++ b/man/ResampleResult.Rd @@ -53,8 +53,7 @@ rr$errors } Other resample: -\code{\link{resample}()}, -\code{\link{resample_mirai}()} +\code{\link{resample}()} } \concept{resample} \section{Active bindings}{ diff --git a/man/benchmark.Rd b/man/benchmark.Rd index d46ead09c..23e9063c7 100644 --- a/man/benchmark.Rd +++ b/man/benchmark.Rd @@ -200,7 +200,6 @@ rr$resampling$train_set(2) Other benchmark: \code{\link{BenchmarkResult}}, -\code{\link{benchmark_grid}()}, -\code{\link{benchmark_mirai}()} +\code{\link{benchmark_grid}()} } \concept{benchmark} diff --git a/man/benchmark_grid.Rd b/man/benchmark_grid.Rd index 6e2a720fd..db72c5613 100644 --- a/man/benchmark_grid.Rd +++ b/man/benchmark_grid.Rd @@ -113,7 +113,6 @@ benchmark(grid) Other benchmark: \code{\link{BenchmarkResult}}, -\code{\link{benchmark}()}, -\code{\link{benchmark_mirai}()} +\code{\link{benchmark}()} } \concept{benchmark} diff --git a/man/benchmark_mirai.Rd b/man/benchmark_mirai.Rd deleted file mode 100644 index 75d1559a8..000000000 --- a/man/benchmark_mirai.Rd +++ /dev/null @@ -1,206 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/benchmark_mirai.R -\name{benchmark_mirai} -\alias{benchmark_mirai} -\title{Benchmark Multiple Learners on Multiple Tasks} -\usage{ -benchmark_mirai( - design, - store_models = FALSE, - store_backends = TRUE, - encapsulate = NA_character_, - allow_hotstart = FALSE, - clone = c("task", "learner", "resampling"), - unmarshal = TRUE, - callbacks = NULL -) -} -\arguments{ -\item{design}{(\code{\link[=data.frame]{data.frame()}})\cr -Data frame (or \code{\link[data.table:data.table]{data.table::data.table()}}) with three columns: "task", "learner", and "resampling". -Each row defines a resampling by providing a \link{Task}, \link{Learner} and an instantiated \link{Resampling} strategy. 
-The helper function \code{\link[=benchmark_grid]{benchmark_grid()}} can assist in generating an exhaustive design (see examples) and -instantiate the \link{Resampling}s per \link{Task}. -Additionally, you can set the additional column 'param_values', see \code{\link[=benchmark_grid]{benchmark_grid()}}.} - -\item{store_models}{(\code{logical(1)})\cr -Store the fitted model in the resulting object= -Set to \code{TRUE} if you want to further analyse the models or want to -extract information like variable importance.} - -\item{store_backends}{(\code{logical(1)})\cr -Keep the \link{DataBackend} of the \link{Task} in the \link{ResampleResult}? -Set to \code{TRUE} if your performance measures require a \link{Task}, -or to analyse results more conveniently. -Set to \code{FALSE} to reduce the file size and memory footprint -after serialization. -The current default is \code{TRUE}, but this eventually will be changed -in a future release.} - -\item{encapsulate}{(\code{character(1)})\cr -If not \code{NA}, enables encapsulation by setting the field -\code{Learner$encapsulate} to one of the supported values: -\code{"none"} (disable encapsulation), -\code{"try"} (captures errors but output is printed to the console and not logged), -\code{"evaluate"} (execute via \CRANpkg{evaluate}) and -\code{"callr"} (start in external session via \CRANpkg{callr}). -If \code{NA}, encapsulation is not changed, i.e. the settings of the -individual learner are active. -Additionally, if encapsulation is set to \code{"evaluate"} or \code{"callr"}, -the fallback learner is set to the featureless learner if the learner -does not already have a fallback configured.} - -\item{allow_hotstart}{(\code{logical(1)})\cr -Determines if learner(s) are hot started with trained models in -\verb{$hotstart_stack}. 
See also \link{HotstartStack}.} - -\item{clone}{(\code{character()})\cr -Select the input objects to be cloned before proceeding by -providing a set with possible values \code{"task"}, \code{"learner"} and -\code{"resampling"} for \link{Task}, \link{Learner} and \link{Resampling}, respectively. -Per default, all input objects are cloned.} - -\item{unmarshal}{\code{\link{Learner}}\cr -Whether to unmarshal learners that were marshaled during the execution. -If \code{TRUE} all models are stored in unmarshaled form. -If \code{FALSE}, all learners (that need marshaling) are stored in marshaled form.} - -\item{callbacks}{(List of \link[mlr3misc:Callback]{mlr3misc::Callback})\cr -Callbacks to be executed during the resampling process. -See \link{CallbackResample} and \link{ContextResample} for details.} -} -\value{ -\link{BenchmarkResult}. -} -\description{ -Runs a benchmark on arbitrary combinations of tasks (\link{Task}), learners (\link{Learner}), and resampling strategies (\link{Resampling}), possibly in parallel. - -For large-scale benchmarking we recommend to use the \CRANpkg{mlr3batchmark} package. -This package runs benchmark experiments on high-performance computing clusters and handles failed experiments. -} -\note{ -The fitted models are discarded after the predictions have been scored in order to reduce memory consumption. -If you need access to the models for later analysis, set \code{store_models} to \code{TRUE}. -} -\section{Predict Sets}{ - -If you want to compare the performance of a learner on the training with the performance -on the test set, you have to configure the \link{Learner} to predict on multiple sets by -setting the field \code{predict_sets} to \code{c("train", "test")} (default is \code{"test"}). -Each set yields a separate \link{Prediction} object during resampling. -In the next step, you have to configure the measures to operate on the respective Prediction object: - -\if{html}{\out{
}}\preformatted{m1 = msr("classif.ce", id = "ce.train", predict_sets = "train") -m2 = msr("classif.ce", id = "ce.test", predict_sets = "test") -}\if{html}{\out{
}} - -The (list of) created measures can finally be passed to \verb{$aggregate()} or \verb{$score()}. -} - -\section{Parallelization}{ - - -This function can be parallelized with the \CRANpkg{future} package. -One job is one resampling iteration, and all jobs are send to an apply function -from \CRANpkg{future.apply} in a single batch. -To select a parallel backend, use \code{\link[future:plan]{future::plan()}}. -More on parallelization can be found in the book: -\url{https://mlr3book.mlr-org.com/chapters/chapter10/advanced_technical_aspects_of_mlr3.html} -} - -\section{Progress Bars}{ - -This function supports progress bars via the package \CRANpkg{progressr}. -Simply wrap the function call in \code{\link[progressr:with_progress]{progressr::with_progress()}} to enable them. -Alternatively, call \code{\link[progressr:handlers]{progressr::handlers()}} with \code{global = TRUE} to enable progress bars -globally. -We recommend the \CRANpkg{progress} package as backend which can be enabled with -\code{progressr::handlers("progress")}. -} - -\section{Logging}{ - - -The \CRANpkg{mlr3} uses the \CRANpkg{lgr} package for logging. -\CRANpkg{lgr} supports multiple log levels which can be queried with -\code{getOption("lgr.log_levels")}. - -To suppress output and reduce verbosity, you can lower the log from the -default level \code{"info"} to \code{"warn"}: - -\if{html}{\out{
}}\preformatted{lgr::get_logger("mlr3")$set_threshold("warn") -}\if{html}{\out{
}} - -To get additional log output for debugging, increase the log level to \code{"debug"} -or \code{"trace"}: - -\if{html}{\out{
}}\preformatted{lgr::get_logger("mlr3")$set_threshold("debug") -}\if{html}{\out{
}} - -To log to a file or a data base, see the documentation of \link[lgr:lgr-package]{lgr::lgr-package}. -} - -\examples{ -# benchmarking with benchmark_grid() -tasks = lapply(c("penguins", "sonar"), tsk) -learners = lapply(c("classif.featureless", "classif.rpart"), lrn) -resamplings = rsmp("cv", folds = 3) - -design = benchmark_grid(tasks, learners, resamplings) -print(design) - -set.seed(123) -bmr = benchmark(design) - -## Data of all resamplings -head(as.data.table(bmr)) - -## Aggregated performance values -aggr = bmr$aggregate() -print(aggr) - -## Extract predictions of first resampling result -rr = aggr$resample_result[[1]] -as.data.table(rr$prediction()) - -# Benchmarking with a custom design: -# - fit classif.featureless on penguins with a 3-fold CV -# - fit classif.rpart on sonar using a holdout -tasks = list(tsk("penguins"), tsk("sonar")) -learners = list(lrn("classif.featureless"), lrn("classif.rpart")) -resamplings = list(rsmp("cv", folds = 3), rsmp("holdout")) - -design = data.table::data.table( - task = tasks, - learner = learners, - resampling = resamplings -) - -## Instantiate resamplings -design$resampling = Map( - function(task, resampling) resampling$clone()$instantiate(task), - task = design$task, resampling = design$resampling -) - -## Run benchmark -bmr = benchmark(design) -print(bmr) - -## Get the training set of the 2nd iteration of the featureless learner on penguins -rr = bmr$aggregate()[learner_id == "classif.featureless"]$resample_result[[1]] -rr$resampling$train_set(2) -} -\seealso{ -\itemize{ -\item Chapter in the \href{https://mlr3book.mlr-org.com/}{mlr3book}: -\url{https://mlr3book.mlr-org.com/chapters/chapter3/evaluation_and_benchmarking.html#sec-benchmarking} -\item Package \CRANpkg{mlr3viz} for some generic visualizations. -\item \CRANpkg{mlr3benchmark} for post-hoc analysis of benchmark results. 
-} - -Other benchmark: -\code{\link{BenchmarkResult}}, -\code{\link{benchmark}()}, -\code{\link{benchmark_grid}()} -} -\concept{benchmark} diff --git a/man/resample.Rd b/man/resample.Rd index 19a86de2a..d5e10af0d 100644 --- a/man/resample.Rd +++ b/man/resample.Rd @@ -178,7 +178,6 @@ print(bmr1$combine(bmr2)) } Other resample: -\code{\link{ResampleResult}}, -\code{\link{resample_mirai}()} +\code{\link{ResampleResult}} } \concept{resample} diff --git a/man/resample_mirai.Rd b/man/resample_mirai.Rd deleted file mode 100644 index 069043511..000000000 --- a/man/resample_mirai.Rd +++ /dev/null @@ -1,179 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/resample_mirai.R -\name{resample_mirai} -\alias{resample_mirai} -\title{Resample a Learner on a Task} -\usage{ -resample_mirai( - task, - learner, - resampling, - store_models = FALSE, - store_backends = TRUE, - encapsulate = NA_character_, - allow_hotstart = FALSE, - clone = c("task", "learner", "resampling"), - unmarshal = TRUE -) -} -\arguments{ -\item{task}{(\link{Task}).} - -\item{learner}{(\link{Learner}).} - -\item{resampling}{(\link{Resampling}).} - -\item{store_models}{(\code{logical(1)})\cr -Store the fitted model in the resulting object= -Set to \code{TRUE} if you want to further analyse the models or want to -extract information like variable importance.} - -\item{store_backends}{(\code{logical(1)})\cr -Keep the \link{DataBackend} of the \link{Task} in the \link{ResampleResult}? -Set to \code{TRUE} if your performance measures require a \link{Task}, -or to analyse results more conveniently. -Set to \code{FALSE} to reduce the file size and memory footprint -after serialization. 
-The current default is \code{TRUE}, but this eventually will be changed -in a future release.} - -\item{encapsulate}{(\code{character(1)})\cr -If not \code{NA}, enables encapsulation by setting the field -\code{Learner$encapsulate} to one of the supported values: -\code{"none"} (disable encapsulation), -\code{"try"} (captures errors but output is printed to the console and not logged), -\code{"evaluate"} (execute via \CRANpkg{evaluate}) and -\code{"callr"} (start in external session via \CRANpkg{callr}). -If \code{NA}, encapsulation is not changed, i.e. the settings of the -individual learner are active. -Additionally, if encapsulation is set to \code{"evaluate"} or \code{"callr"}, -the fallback learner is set to the featureless learner if the learner -does not already have a fallback configured.} - -\item{allow_hotstart}{(\code{logical(1)})\cr -Determines if learner(s) are hot started with trained models in -\verb{$hotstart_stack}. See also \link{HotstartStack}.} - -\item{clone}{(\code{character()})\cr -Select the input objects to be cloned before proceeding by -providing a set with possible values \code{"task"}, \code{"learner"} and -\code{"resampling"} for \link{Task}, \link{Learner} and \link{Resampling}, respectively. -Per default, all input objects are cloned.} - -\item{unmarshal}{\code{\link{Learner}}\cr -Whether to unmarshal learners that were marshaled during the execution. -If \code{TRUE} all models are stored in unmarshaled form. -If \code{FALSE}, all learners (that need marshaling) are stored in marshaled form.} -} -\value{ -\link{ResampleResult}. -} -\description{ -Runs a resampling (possibly in parallel): -Repeatedly apply \link{Learner} \code{learner} on a training set of \link{Task} \code{task} to train a model, -then use the trained model to predict observations of a test set. -Training and test sets are defined by the \link{Resampling} \code{resampling}. 
-} -\note{ -The fitted models are discarded after the predictions have been computed in order to reduce memory consumption. -If you need access to the models for later analysis, set \code{store_models} to \code{TRUE}. -} -\section{Predict Sets}{ - -If you want to compare the performance of a learner on the training with the performance -on the test set, you have to configure the \link{Learner} to predict on multiple sets by -setting the field \code{predict_sets} to \code{c("train", "test")} (default is \code{"test"}). -Each set yields a separate \link{Prediction} object during resampling. -In the next step, you have to configure the measures to operate on the respective Prediction object: - -\if{html}{\out{
}}\preformatted{m1 = msr("classif.ce", id = "ce.train", predict_sets = "train") -m2 = msr("classif.ce", id = "ce.test", predict_sets = "test") -}\if{html}{\out{
}} - -The (list of) created measures can finally be passed to \verb{$aggregate()} or \verb{$score()}. -} - -\section{Parallelization}{ - - -This function can be parallelized with the \CRANpkg{future} package. -One job is one resampling iteration, and all jobs are send to an apply function -from \CRANpkg{future.apply} in a single batch. -To select a parallel backend, use \code{\link[future:plan]{future::plan()}}. -More on parallelization can be found in the book: -\url{https://mlr3book.mlr-org.com/chapters/chapter10/advanced_technical_aspects_of_mlr3.html} -} - -\section{Progress Bars}{ - -This function supports progress bars via the package \CRANpkg{progressr}. -Simply wrap the function call in \code{\link[progressr:with_progress]{progressr::with_progress()}} to enable them. -Alternatively, call \code{\link[progressr:handlers]{progressr::handlers()}} with \code{global = TRUE} to enable progress bars -globally. -We recommend the \CRANpkg{progress} package as backend which can be enabled with -\code{progressr::handlers("progress")}. -} - -\section{Logging}{ - - -The \CRANpkg{mlr3} uses the \CRANpkg{lgr} package for logging. -\CRANpkg{lgr} supports multiple log levels which can be queried with -\code{getOption("lgr.log_levels")}. - -To suppress output and reduce verbosity, you can lower the log from the -default level \code{"info"} to \code{"warn"}: - -\if{html}{\out{
}}\preformatted{lgr::get_logger("mlr3")$set_threshold("warn") -}\if{html}{\out{
}} - -To get additional log output for debugging, increase the log level to \code{"debug"} -or \code{"trace"}: - -\if{html}{\out{
}}\preformatted{lgr::get_logger("mlr3")$set_threshold("debug") -}\if{html}{\out{
}} - -To log to a file or a data base, see the documentation of \link[lgr:lgr-package]{lgr::lgr-package}. -} - -\examples{ -task = tsk("penguins") -learner = lrn("classif.rpart") -resampling = rsmp("cv") - -# Explicitly instantiate the resampling for this task for reproduciblity -set.seed(123) -resampling$instantiate(task) - -rr = resample(task, learner, resampling) -print(rr) - -# Retrieve performance -rr$score(msr("classif.ce")) -rr$aggregate(msr("classif.ce")) - -# merged prediction objects of all resampling iterations -pred = rr$prediction() -pred$confusion - -# Repeat resampling with featureless learner -rr_featureless = resample(task, lrn("classif.featureless"), resampling) - -# Convert results to BenchmarkResult, then combine them -bmr1 = as_benchmark_result(rr) -bmr2 = as_benchmark_result(rr_featureless) -print(bmr1$combine(bmr2)) -} -\seealso{ -\itemize{ -\item \code{\link[=as_benchmark_result]{as_benchmark_result()}} to convert to a \link{BenchmarkResult}. -\item Chapter in the \href{https://mlr3book.mlr-org.com/}{mlr3book}: -\url{https://mlr3book.mlr-org.com/chapters/chapter3/evaluation_and_benchmarking.html#sec-resampling} -\item Package \CRANpkg{mlr3viz} for some generic visualizations. -} - -Other resample: -\code{\link{ResampleResult}}, -\code{\link{resample}()} -} -\concept{resample} From 597fa9ba6c7743af5c0b66f40ce98e52aa9bed4c Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 22 May 2025 12:47:24 +0200 Subject: [PATCH 09/22] ... --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index ce4e11331..254df4405 100644 --- a/.gitignore +++ b/.gitignore @@ -183,4 +183,4 @@ revdep/ # misc Meta/ Rplots.pdf -.cursor/rules/equal.mdc +.cursor/ \ No newline at end of file From 63709a646cde637d26a7bbbe0298417735593969 Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 22 May 2025 12:49:28 +0200 Subject: [PATCH 10/22] ... 
--- R/helper_exec.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/helper_exec.R b/R/helper_exec.R index 84b73ef7f..04c27d9a0 100644 --- a/R/helper_exec.R +++ b/R/helper_exec.R @@ -25,7 +25,7 @@ future_map = function(n, FUN, ..., MoreArgs = list()) { mapply(FUN, ..., MoreArgs = MoreArgs, SIMPLIFY = FALSE, USE.NAMES = FALSE) } else if (requireNamespace("mirai", quietly = TRUE) && mirai::status()$connections) { lg$debug("Running resample() via mirai with %i iterations", n) - mirai::collect_mirai(mirai::mirai_map(data.table(...), workhorse, .args = c(MoreArgs, list(is_sequential = FALSE)))) + mirai::collect_mirai(mirai::mirai_map(data.frame(...), workhorse, .args = c(MoreArgs, list(is_sequential = FALSE)))) } else { is_sequential = inherits(plan(), "sequential") scheduling = if (!is_sequential && isTRUE(getOption("mlr3.exec_random", TRUE))) structure(TRUE, ordering = "random") else TRUE From 6c45a1579fb5e2be81f19380f57a69f3662920a4 Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 22 May 2025 12:55:05 +0200 Subject: [PATCH 11/22] ... 
--- R/helper_exec.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/helper_exec.R b/R/helper_exec.R index 04c27d9a0..84b73ef7f 100644 --- a/R/helper_exec.R +++ b/R/helper_exec.R @@ -25,7 +25,7 @@ future_map = function(n, FUN, ..., MoreArgs = list()) { mapply(FUN, ..., MoreArgs = MoreArgs, SIMPLIFY = FALSE, USE.NAMES = FALSE) } else if (requireNamespace("mirai", quietly = TRUE) && mirai::status()$connections) { lg$debug("Running resample() via mirai with %i iterations", n) - mirai::collect_mirai(mirai::mirai_map(data.frame(...), workhorse, .args = c(MoreArgs, list(is_sequential = FALSE)))) + mirai::collect_mirai(mirai::mirai_map(data.table(...), workhorse, .args = c(MoreArgs, list(is_sequential = FALSE)))) } else { is_sequential = inherits(plan(), "sequential") scheduling = if (!is_sequential && isTRUE(getOption("mlr3.exec_random", TRUE))) structure(TRUE, ordering = "random") else TRUE From 3d90df8135d06dcba99125156477df566854dd6c Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 22 May 2025 13:01:26 +0200 Subject: [PATCH 12/22] ... 
--- R/helper_exec.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/helper_exec.R b/R/helper_exec.R index 84b73ef7f..b275d053d 100644 --- a/R/helper_exec.R +++ b/R/helper_exec.R @@ -25,7 +25,7 @@ future_map = function(n, FUN, ..., MoreArgs = list()) { mapply(FUN, ..., MoreArgs = MoreArgs, SIMPLIFY = FALSE, USE.NAMES = FALSE) } else if (requireNamespace("mirai", quietly = TRUE) && mirai::status()$connections) { lg$debug("Running resample() via mirai with %i iterations", n) - mirai::collect_mirai(mirai::mirai_map(data.table(...), workhorse, .args = c(MoreArgs, list(is_sequential = FALSE)))) + mirai::collect_mirai(mirai::mirai_map(data.table(...), FUN, .args = c(MoreArgs, list(is_sequential = FALSE)))) } else { is_sequential = inherits(plan(), "sequential") scheduling = if (!is_sequential && isTRUE(getOption("mlr3.exec_random", TRUE))) structure(TRUE, ordering = "random") else TRUE From 8ef3cbb72823f058b599e31ea277385e49c60736 Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 22 May 2025 13:14:24 +0200 Subject: [PATCH 13/22] ... --- inst/testthat/helper_misc.R | 8 +++ ...test_parallel.R => test_parallel_future.R} | 0 tests/testthat/test_parallel_mirai.R | 56 +++++++++++++++++++ 3 files changed, 64 insertions(+) rename tests/testthat/{test_parallel.R => test_parallel_future.R} (100%) create mode 100644 tests/testthat/test_parallel_mirai.R diff --git a/inst/testthat/helper_misc.R b/inst/testthat/helper_misc.R index 5b17636de..cff6d3244 100644 --- a/inst/testthat/helper_misc.R +++ b/inst/testthat/helper_misc.R @@ -17,6 +17,14 @@ with_future = function(backend, expr, ...) 
{ force(expr) } +with_mirai = function(expr) { + requireNamespace("mirai") + mirai::daemons(1) + on.exit(mirai::daemons(0), add = TRUE) + force(expr) + expect_true(mirai::status()$mirai["completed"] > 0) +} + private = function(x) { x[[".__enclos_env__"]][["private"]] } diff --git a/tests/testthat/test_parallel.R b/tests/testthat/test_parallel_future.R similarity index 100% rename from tests/testthat/test_parallel.R rename to tests/testthat/test_parallel_future.R diff --git a/tests/testthat/test_parallel_mirai.R b/tests/testthat/test_parallel_mirai.R new file mode 100644 index 000000000..261107816 --- /dev/null +++ b/tests/testthat/test_parallel_mirai.R @@ -0,0 +1,56 @@ +skip_if_not_installed("mirai") + +test_that("parallel resample", { + with_mirai({ + task = tsk("pima") + learner = lrn("classif.rpart") + rr = resample(task, learner, rsmp("cv", folds = 3)) + expect_resample_result(rr) + expect_data_table(rr$errors, nrows = 0L) + }) +}) + +test_that("parallel benchmark", { + task = tsk("pima") + learner = lrn("classif.rpart") + + with_mirai({ + bmr = benchmark(benchmark_grid(task, learner, rsmp("cv", folds = 3))) + }) + expect_benchmark_result(bmr) + expect_equal(bmr$aggregate(conditions = TRUE)$warnings, 0L) + expect_equal(bmr$aggregate(conditions = TRUE)$errors, 0L) +}) + +test_that("real parallel resample", { + with_mirai({ + task = tsk("pima") + learner = lrn("classif.rpart") + rr = resample(task, learner, rsmp("cv", folds = 3)) + + expect_resample_result(rr) + expect_data_table(rr$errors, nrows = 0L) + }) +}) + +test_that("data table threads are not changed in main session", { + skip_on_os("mac") # number of threads cannot be changed on mac + skip_on_cran() + + old_dt_threads = getDTthreads() + on.exit({ + setDTthreads(old_dt_threads) + }, add = TRUE) + setDTthreads(2L) + + task = tsk("sonar") + learner = lrn("classif.debug", predict_type = "prob") + resampling = rsmp("cv", folds = 3L) + measure = msr("classif.auc") + + rr1 = with_seed(123, resample(task, 
learner, resampling)) + expect_equal(getDTthreads(), 2L) + + rr2 = with_seed(123, with_mirai(resample(task, learner, resampling))) + expect_equal(getDTthreads(), 2L) +}) From f511627370226e81e24dea23c7d4deec24bcbcb5 Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 22 May 2025 13:22:40 +0200 Subject: [PATCH 14/22] ... --- man-roxygen/section_parallelization.R | 7 ++++--- man/benchmark.Rd | 7 ++++--- man/resample.Rd | 7 ++++--- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/man-roxygen/section_parallelization.R b/man-roxygen/section_parallelization.R index 6c4fed578..e1212314d 100644 --- a/man-roxygen/section_parallelization.R +++ b/man-roxygen/section_parallelization.R @@ -1,8 +1,9 @@ #' @section Parallelization: #' -#' This function can be parallelized with the \CRANpkg{future} package. -#' One job is one resampling iteration, and all jobs are send to an apply function -#' from \CRANpkg{future.apply} in a single batch. +#' This function can be parallelized with the \CRANpkg{future} or \CRANpkg{mirai} package. +#' One job is one resampling iteration. +#' All jobs are sent to an apply function from \CRANpkg{future.apply} or `mirai::mirai_map()` in a single batch. #' To select a parallel backend, use [future::plan()]. +#' To use \CRANpkg{mirai}, call `mirai::daemons()` before calling this function. #' More on parallelization can be found in the book: #' \url{https://mlr3book.mlr-org.com/chapters/chapter10/advanced_technical_aspects_of_mlr3.html} diff --git a/man/benchmark.Rd b/man/benchmark.Rd index 23e9063c7..6422852ef 100644 --- a/man/benchmark.Rd +++ b/man/benchmark.Rd @@ -100,10 +100,11 @@ The (list of) created measures can finally be passed to \verb{$aggregate()} or \ \section{Parallelization}{ -This function can be parallelized with the \CRANpkg{future} package. -One job is one resampling iteration, and all jobs are send to an apply function -from \CRANpkg{future.apply} in a single batch.
+This function can be parallelized with the \CRANpkg{future} or \CRANpkg{mirai} package. +One job is one resampling iteration. +All jobs are sent to an apply function from \CRANpkg{future.apply} or \code{mirai::mirai_map()} in a single batch. To select a parallel backend, use \code{\link[future:plan]{future::plan()}}. +To use \CRANpkg{mirai}, call \code{mirai::daemons()} before calling this function. More on parallelization can be found in the book: \url{https://mlr3book.mlr-org.com/chapters/chapter10/advanced_technical_aspects_of_mlr3.html} } diff --git a/man/resample.Rd b/man/resample.Rd index d5e10af0d..0ccea0334 100644 --- a/man/resample.Rd +++ b/man/resample.Rd @@ -101,10 +101,11 @@ The (list of) created measures can finally be passed to \verb{$aggregate()} or \ \section{Parallelization}{ -This function can be parallelized with the \CRANpkg{future} package. -One job is one resampling iteration, and all jobs are send to an apply function -from \CRANpkg{future.apply} in a single batch. +This function can be parallelized with the \CRANpkg{future} or \CRANpkg{mirai} package. +One job is one resampling iteration. +All jobs are sent to an apply function from \CRANpkg{future.apply} or \code{mirai::mirai_map()} in a single batch. To select a parallel backend, use \code{\link[future:plan]{future::plan()}}. +To use \CRANpkg{mirai}, call \code{mirai::daemons()} before calling this function. More on parallelization can be found in the book: \url{https://mlr3book.mlr-org.com/chapters/chapter10/advanced_technical_aspects_of_mlr3.html} } From 1198a7d9668cee15de539fe6439b26a1afc8d000 Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 22 May 2025 13:25:37 +0200 Subject: [PATCH 15/22] ...
--- R/Learner.R | 5 +++++ man/Learner.Rd | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/R/Learner.R b/R/Learner.R index 3a6e012c1..8b8add191 100644 --- a/R/Learner.R +++ b/R/Learner.R @@ -536,6 +536,11 @@ Learner = R6Class("Learner", #' * `"callr"`: Uses the package \CRANpkg{callr} to call the learner, measure time and do the logging. #' This encapsulation spawns a separate R session in which the learner is called. #' While this comes with a considerable overhead, it also guards your session from being teared down by segfaults. + #' * `"mirai"`: Uses the package \CRANpkg{mirai} to call the learner, measure time and do the logging. + #' This encapsulation calls the function in a `mirai` on a `daemon`. + #' The `daemon` can be pre-started via `daemons(1)`. + #' All encapsulated function calls are executed in this `daemon`. + #' Using mirai is similarly safe as callr but much faster if several function calls are encapsulated one after the other on the same daemon. #' #' The fallback learner is fitted to create valid predictions in case that either the model fitting or the prediction of the original learner fails. #' If the training step or the predict step of the original learner fails, the fallback is used to make the predictions. diff --git a/man/Learner.Rd b/man/Learner.Rd index 96027fbd9..119db9f80 100644 --- a/man/Learner.Rd +++ b/man/Learner.Rd @@ -638,6 +638,11 @@ Output is printed to the console and not logged. \item \code{"callr"}: Uses the package \CRANpkg{callr} to call the learner, measure time and do the logging. This encapsulation spawns a separate R session in which the learner is called. While this comes with a considerable overhead, it also guards your session from being teared down by segfaults. +\item \code{"mirai"}: Uses the package \CRANpkg{mirai} to call the learner, measure time and do the logging. +This encapsulation calls the function in a \code{mirai} on a \code{daemon}. +The \code{daemon} can be pre-started via \code{daemons(1)}. 
+All encapsulated function calls are executed in this \code{daemon}. +Using mirai is similarly safe as callr but much faster if several function calls are encapsulated one after the other on the same daemon. } The fallback learner is fitted to create valid predictions in case that either the model fitting or the prediction of the original learner fails. From 8437803519a032aed1bab17a95b793aa78c10b2f Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 22 May 2025 13:34:14 +0200 Subject: [PATCH 16/22] ... --- R/Learner.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/Learner.R b/R/Learner.R index 8b8add191..4b9284ff1 100644 --- a/R/Learner.R +++ b/R/Learner.R @@ -538,8 +538,8 @@ Learner = R6Class("Learner", #' While this comes with a considerable overhead, it also guards your session from being teared down by segfaults. #' * `"mirai"`: Uses the package \CRANpkg{mirai} to call the learner, measure time and do the logging. #' This encapsulation calls the function in a `mirai` on a `daemon`. - #' The `daemon` can be pre-started via `daemons(1)`. - #' All encapsulated function calls are executed in this `daemon`. + #' The `daemon` can be pre-started via `daemons(1)`, otherwise a new R session will be created for each encapsulated call. + #' If a `deamon` is already running, it will be used to executed all calls. #' Using mirai is similarly safe as callr but much faster if several function calls are encapsulated one after the other on the same daemon. #' #' The fallback learner is fitted to create valid predictions in case that either the model fitting or the prediction of the original learner fails. From e28bc3856151c7db962554cb138a12cbd5527b5f Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 22 May 2025 13:43:19 +0200 Subject: [PATCH 17/22] ... 
--- man-roxygen/section_parallelization.R | 2 ++ man/Learner.Rd | 4 ++-- man/benchmark.Rd | 2 ++ man/resample.Rd | 2 ++ 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/man-roxygen/section_parallelization.R b/man-roxygen/section_parallelization.R index e1212314d..a7de99c34 100644 --- a/man-roxygen/section_parallelization.R +++ b/man-roxygen/section_parallelization.R @@ -5,5 +5,7 @@ #' All jobs are send to an apply function from \CRANpkg{future.apply} or `mirai::mirai_map()` in a single batch. #' To select a parallel backend, use [future::plan()]. #' To use \CRANpkg{mirai}, call `mirai::daemons()` before calling this function. +#' The \CRANpkg{future} package guarantees reproducible results independent of the parallel backend. +#' The results of \CRANpkg{mirai} will not be the same but can be made reproducible by setting a `seed` and `dispatcher = FALSE` when calling `mirai::daemons()`. #' More on parallelization can be found in the book: #' \url{https://mlr3book.mlr-org.com/chapters/chapter10/advanced_technical_aspects_of_mlr3.html} diff --git a/man/Learner.Rd b/man/Learner.Rd index 119db9f80..8d6b10518 100644 --- a/man/Learner.Rd +++ b/man/Learner.Rd @@ -640,8 +640,8 @@ This encapsulation spawns a separate R session in which the learner is called. While this comes with a considerable overhead, it also guards your session from being teared down by segfaults. \item \code{"mirai"}: Uses the package \CRANpkg{mirai} to call the learner, measure time and do the logging. This encapsulation calls the function in a \code{mirai} on a \code{daemon}. -The \code{daemon} can be pre-started via \code{daemons(1)}. -All encapsulated function calls are executed in this \code{daemon}. +The \code{daemon} can be pre-started via \code{daemons(1)}, otherwise a new R session will be created for each encapsulated call. +If a \code{deamon} is already running, it will be used to executed all calls. 
Using mirai is similarly safe as callr but much faster if several function calls are encapsulated one after the other on the same daemon. } diff --git a/man/benchmark.Rd b/man/benchmark.Rd index 6422852ef..51e17507c 100644 --- a/man/benchmark.Rd +++ b/man/benchmark.Rd @@ -105,6 +105,8 @@ One job is one resampling iteration. All jobs are send to an apply function from \CRANpkg{future.apply} or \code{mirai::mirai_map()} in a single batch. To select a parallel backend, use \code{\link[future:plan]{future::plan()}}. To use \CRANpkg{mirai}, call \code{mirai::daemons()} before calling this function. +The \CRANpkg{future} package guarantees reproducible results independent of the parallel backend. +The results of \CRANpkg{mirai} will not be the same but can be made reproducible by setting a \code{seed} and \code{dispatcher = FALSE} when calling \code{mirai::daemons()}. More on parallelization can be found in the book: \url{https://mlr3book.mlr-org.com/chapters/chapter10/advanced_technical_aspects_of_mlr3.html} } diff --git a/man/resample.Rd b/man/resample.Rd index 0ccea0334..d52fe1a8e 100644 --- a/man/resample.Rd +++ b/man/resample.Rd @@ -106,6 +106,8 @@ One job is one resampling iteration. All jobs are send to an apply function from \CRANpkg{future.apply} or \code{mirai::mirai_map()} in a single batch. To select a parallel backend, use \code{\link[future:plan]{future::plan()}}. To use \CRANpkg{mirai}, call \code{mirai::daemons()} before calling this function. +The \CRANpkg{future} package guarantees reproducible results independent of the parallel backend. +The results of \CRANpkg{mirai} will not be the same but can be made reproducible by setting a \code{seed} and \code{dispatcher = FALSE} when calling \code{mirai::daemons()}. 
More on parallelization can be found in the book: \url{https://mlr3book.mlr-org.com/chapters/chapter10/advanced_technical_aspects_of_mlr3.html} } From 2ee1b76500bf4ae583d7149b7e9336ab0fe1f39a Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 22 May 2025 13:43:56 +0200 Subject: [PATCH 18/22] ... --- .Rbuildignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.Rbuildignore b/.Rbuildignore index d2ba50e2e..cd8350605 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -25,3 +25,4 @@ ^benchmark$ ^attic$ ^.cursor$ + From 2e60779d3c1f3fab43e69bf774404d00c12de561 Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 22 May 2025 13:46:07 +0200 Subject: [PATCH 19/22] ... --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 254df4405..dcf4d2fc1 100644 --- a/.gitignore +++ b/.gitignore @@ -183,4 +183,4 @@ revdep/ # misc Meta/ Rplots.pdf -.cursor/ \ No newline at end of file +.cursor/ From 5f08eaaf05d44c3fe4f10fe64007f5d6731a88c3 Mon Sep 17 00:00:00 2001 From: be-marc Date: Thu, 22 May 2025 13:49:11 +0200 Subject: [PATCH 20/22] ... --- R/Learner.R | 2 +- man-roxygen/section_parallelization.R | 6 +++--- man/Learner.Rd | 2 +- man/benchmark.Rd | 6 +++--- man/resample.Rd | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/R/Learner.R b/R/Learner.R index 4b9284ff1..6e00a9437 100644 --- a/R/Learner.R +++ b/R/Learner.R @@ -540,7 +540,7 @@ Learner = R6Class("Learner", #' This encapsulation calls the function in a `mirai` on a `daemon`. #' The `daemon` can be pre-started via `daemons(1)`, otherwise a new R session will be created for each encapsulated call. #' If a `deamon` is already running, it will be used to executed all calls. - #' Using mirai is similarly safe as callr but much faster if several function calls are encapsulated one after the other on the same daemon. 
+ #' Using `"mirai"` is similarly safe as `callr` but much faster if several learners are encapsulated one after the other on the same daemon. #' #' The fallback learner is fitted to create valid predictions in case that either the model fitting or the prediction of the original learner fails. #' If the training step or the predict step of the original learner fails, the fallback is used to make the predictions. diff --git a/man-roxygen/section_parallelization.R b/man-roxygen/section_parallelization.R index a7de99c34..c419d4bef 100644 --- a/man-roxygen/section_parallelization.R +++ b/man-roxygen/section_parallelization.R @@ -4,8 +4,8 @@ #' One job is one resampling iteration. #' All jobs are send to an apply function from \CRANpkg{future.apply} or `mirai::mirai_map()` in a single batch. #' To select a parallel backend, use [future::plan()]. -#' To use \CRANpkg{mirai}, call `mirai::daemons()` before calling this function. -#' The \CRANpkg{future} package guarantees reproducible results independent of the parallel backend. -#' The results of \CRANpkg{mirai} will not be the same but can be made reproducible by setting a `seed` and `dispatcher = FALSE` when calling `mirai::daemons()`. +#' To use `mirai`, call `mirai::daemons()` before calling this function. +#' The `future` package guarantees reproducible results independent of the parallel backend. +#' The results of `mirai` will not be the same but can be made reproducible by setting a `seed` and `dispatcher = FALSE` when calling `mirai::daemons()`. #' More on parallelization can be found in the book: #' \url{https://mlr3book.mlr-org.com/chapters/chapter10/advanced_technical_aspects_of_mlr3.html} diff --git a/man/Learner.Rd b/man/Learner.Rd index 8d6b10518..b1358ddac 100644 --- a/man/Learner.Rd +++ b/man/Learner.Rd @@ -642,7 +642,7 @@ While this comes with a considerable overhead, it also guards your session from This encapsulation calls the function in a \code{mirai} on a \code{daemon}.
The \code{daemon} can be pre-started via \code{daemons(1)}, otherwise a new R session will be created for each encapsulated call. If a \code{deamon} is already running, it will be used to executed all calls. -Using mirai is similarly safe as callr but much faster if several function calls are encapsulated one after the other on the same daemon. +Using \verb{"mirai"} is similarly safe as \code{callr} but much faster if several learners are encapsulated one after the other on the same daemon. } The fallback learner is fitted to create valid predictions in case that either the model fitting or the prediction of the original learner fails. diff --git a/man/benchmark.Rd b/man/benchmark.Rd index 51e17507c..6472e81c8 100644 --- a/man/benchmark.Rd +++ b/man/benchmark.Rd @@ -104,9 +104,9 @@ This function can be parallelized with the \CRANpkg{future} or \CRANpkg{mirai} p One job is one resampling iteration. All jobs are send to an apply function from \CRANpkg{future.apply} or \code{mirai::mirai_map()} in a single batch. To select a parallel backend, use \code{\link[future:plan]{future::plan()}}. -To use \CRANpkg{mirai}, call \code{mirai::daemons()} before calling this function. -The \CRANpkg{future} package guarantees reproducible results independent of the parallel backend. -The results of \CRANpkg{mirai} will not be the same but can be made reproducible by setting a \code{seed} and \code{dispatcher = FALSE} when calling \code{mirai::daemons()}. +To use \code{mirai}, call \code{mirai::daemons()} before calling this function. +The \code{future} package guarantees reproducible results independent of the parallel backend. +The results of \code{mirai} will not be the same but can be made reproducible by setting a \code{seed} and \code{dispatcher = FALSE} when calling \code{mirai::daemons()}.
More on parallelization can be found in the book: \url{https://mlr3book.mlr-org.com/chapters/chapter10/advanced_technical_aspects_of_mlr3.html} } diff --git a/man/resample.Rd b/man/resample.Rd index d52fe1a8e..1304ff283 100644 --- a/man/resample.Rd +++ b/man/resample.Rd @@ -105,9 +105,9 @@ This function can be parallelized with the \CRANpkg{future} or \CRANpkg{mirai} p One job is one resampling iteration. All jobs are send to an apply function from \CRANpkg{future.apply} or \code{mirai::mirai_map()} in a single batch. To select a parallel backend, use \code{\link[future:plan]{future::plan()}}. -To use \CRANpkg{mirai}, call \code{mirai::daemons()} before calling this function. -The \CRANpkg{future} package guarantees reproducible results independent of the parallel backend. -The results of \CRANpkg{mirai} will not be the same but can be made reproducible by setting a \code{seed} and \code{dispatcher = FALSE} when calling \code{mirai::daemons()}. +To use \code{mirai}, call \code{mirai::daemons()} before calling this function. +The \code{future} package guarantees reproducible results independent of the parallel backend. +The results of \code{mirai} will not be the same but can be made reproducible by setting a \code{seed} and \code{dispatcher = FALSE} when calling \code{mirai::daemons()}. More on parallelization can be found in the book: \url{https://mlr3book.mlr-org.com/chapters/chapter10/advanced_technical_aspects_of_mlr3.html} } From db0af10f6813b9d7d1731f0c073f1a5caca1cf42 Mon Sep 17 00:00:00 2001 From: be-marc Date: Fri, 23 May 2025 13:44:34 +0200 Subject: [PATCH 21/22] ... 
--- DESCRIPTION | 2 +- R/helper_exec.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index f1c928024..a84f5385f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -67,7 +67,7 @@ Suggests: codetools, datasets, future.callr, - mirai, + mirai (>= 2.3.0), mlr3data, progressr, remotes, diff --git a/R/helper_exec.R b/R/helper_exec.R index b275d053d..6868aecbc 100644 --- a/R/helper_exec.R +++ b/R/helper_exec.R @@ -23,7 +23,7 @@ future_map = function(n, FUN, ..., MoreArgs = list()) { if (getOption("mlr3.debug", FALSE)) { lg$info("Running experiments sequentially in debug mode with %i iterations", n) mapply(FUN, ..., MoreArgs = MoreArgs, SIMPLIFY = FALSE, USE.NAMES = FALSE) - } else if (requireNamespace("mirai", quietly = TRUE) && mirai::status()$connections) { + } else if (requireNamespace("mirai", quietly = TRUE) && mirai::daemons_set()) { lg$debug("Running resample() via mirai with %i iterations", n) mirai::collect_mirai(mirai::mirai_map(data.table(...), FUN, .args = c(MoreArgs, list(is_sequential = FALSE)))) } else { From 867d128b8633091be1199f92e1ef2a5ba95223df Mon Sep 17 00:00:00 2001 From: be-marc Date: Mon, 26 May 2025 11:40:12 +0200 Subject: [PATCH 22/22] chore: remotes --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index a84f5385f..b41b9c50b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -75,7 +75,7 @@ Suggests: rpart, testthat (>= 3.2.0) Remotes: - mlr-org/mlr3misc@mirai + mlr-org/mlr3misc Encoding: UTF-8 Config/testthat/edition: 3 Config/testthat/parallel: false