2323# ' *Note*: All Benchmark Suites on OpenML are also collections.
2424# '
2525# ' @section Caching:
26- # ' The OpenML collection itself cannot be cached, because it can be modified in-place
27- # ' on the server, e.g. by adding or removing tasks or runs.
28- # ' The construction argument `cache` therefore only controls whether caching is applied to the
29- # ' OpenML objects that are contained in the collection.
26+ # ' Because collections on OpenML can be modified (ids can be added), it is not possible to cache
27+ # ' this object.
3028# '
3129# ' @section mlr3 Integration:
3230# ' * Obtain a list of [mlr3::Task]s using [mlr3::as_tasks].
3735# ' @references
3836# ' `r format_bib("vanschoren2014")`
3937# ' @export
40- # ' @examples
41- # ' try({
42- # ' library("mlr3")
43- # ' # OpenML Run collection:
44- # ' run_collection = OMLCollection$new(id = 232)
45- # ' # using sugar
46- # ' run_collection = ocl(id = 232)
47- # ' print(run_collection)
48- # '
49- # ' # OpenML task collection:
50- # ' task_collection = OMLCollection$new(id = 258)
51- # ' # using sugar
52- # ' task_collection = ocl(id = 258)
53- # ' print(task_collection)
54- # ' }, silent = TRUE)
38+ # ' @template examples
5539OMLCollection = R6Class(" OMLCollection" ,
5640 inherit = OMLObject ,
5741 public = list (
5842 # ' @description
5943 # ' Creates a new instance of this [R6][R6::R6Class] class.
6044 # '
6145 # ' @template param_id
62- # ' @param cache (`logical(1)` | `character(1)`)\cr
63- # ' See field `cache` for an explanation of possible values.
64- # ' Defaults to value of option `"mlr3oml.cache"`, or `FALSE` if not set.
65- # ' The collection itself is not cached, this is because it can be modified in-place on OpenML,
66- # ' e.g. by adding or removing tasks or runs. This parameter therefore only controls whether
67- # ' the contained elements are cached when loaded, e.g. when accessing the included tasks.
68- # ' @template param_parquet
6946 # ' @template param_test_server
7047 initialize = function (
7148 id ,
72- cache = cache_default(),
73- parquet = parquet_default(),
7449 test_server = test_server_default()
7550 ) {
     # Forward to the parent (OMLObject) initializer; the last argument is a fixed type tag
     # identifying this object as a collection.
76- private $ .parquet = assert_flag(parquet )
77- super $ initialize(id , cache , test_server , " collection" )
51+ super $ initialize(id , test_server , " collection" )
7852 },
7953 # ' @description
8054 # ' Prints the object.
8155 print = function () {
56+ # trigger download first for better printing
57+ self $ desc
8258 catf(" <OMLCollection: %i> %s" , self $ id , as_short_string(self $ name ))
8359 catf(" * data: %i" , length(self $ data_ids ))
8460 catf(" * tasks: %i" , length(self $ task_ids ))
@@ -89,6 +65,12 @@ OMLCollection = R6Class("OMLCollection",
8965 if (self $ test_server ) {
9066 catf(" * Using test server" )
9167 }
68+ },
69+ # ' @description
70+ # ' Downloads the whole object for offline usage.
71+ download = function () {
     # Accessing the `desc` field is what triggers the download (same idiom as in print()).
72+ self $ desc
     # Hand the object back invisibly so the call does not auto-print at the console.
73+ invisible (self )
9274 }
9375 ),
9476 active = list (
@@ -124,72 +106,7 @@ OMLCollection = R6Class("OMLCollection",
124106 run_ids = function () self $ desc $ runs $ run_id ,
125107 # ' @field task_ids (`integer(n)`)\cr
126108 # ' A vector containing the task ids of the collection.
127- task_ids = function () self $ desc $ task $ task_id ,
128- # ' @field runs (`data.table()`)
129- # ' A data.table summarizing the runs included in the collection. Returns NULL for
130- # ' Task Collections.
131- runs = function () {
132- if (self $ main_entity_type == " task" ) {
133- messagef(" Main entity type is task, returning NULL." )
134- return (NULL )
135- }
136- if (is.null(private $ .runs )) {
137- runs = map(
138- self $ run_ids ,
139- function (x ) OMLRun $ new(x , cache = self $ cache_dir , parquet = self $ parquet ,
140- test_server = self $ test_server
141- )
142- )
143-
144- private $ .runs = make_run_table(runs )
145- }
146- return (private $ .runs )
147- },
148- # ' @field flows (`data.table()`)
149- # ' A data.table summarizing the flows included in the collection. Returns `NULL` for
150- # ' Task Collections.
151- flows = function () {
152- if (self $ main_entity_type == " task" ) {
153- messagef(" Main entity type is task, returning NULL." )
154- return (NULL )
155- }
156- if (is.null(private $ .flows )) {
157- flows = map(
158- self $ flow_ids ,
159- function (x ) OMLFlow $ new(x , cache = self $ cache_dir , test_server = self $ test_server )
160- )
161- private $ .flows = make_flow_table(flows )
162- }
163- return (private $ .flows )
164- },
165- # ' @field data (`data.table()`)
166- # ' A data.table summarizing the datasets included in the collection.
167- data = function () {
168- if (is.null(private $ .data )) {
169- datasets = map(
170- self $ data_ids ,
171- function (x ) OMLData $ new(x , cache = self $ cache_dir , parquet = self $ parquet ,
172- test_server = self $ test_server
173- )
174- )
175- private $ .data = make_dataset_table(datasets )
176- }
177- return (private $ .data )
178- },
179- # ' @field tasks (`data.table()`)
180- # ' A data.table summarizing the tasks included in the collection.
181- tasks = function () {
182- if (is.null(private $ .tasks )) {
183- tasks = map(
184- self $ task_ids ,
185- function (x ) OMLTask $ new(x , cache = self $ cache_dir , parquet = self $ parquet ,
186- test_server = self $ test_server
187- )
188- )
189- private $ .tasks = make_task_table(tasks )
190- }
191- return (private $ .tasks )
192- }
109+ task_ids = function () self $ desc $ task $ task_id
193110 ),
194111 private = list (
195112 .runs = NULL ,
@@ -205,87 +122,25 @@ OMLCollection = R6Class("OMLCollection",
205122# ' @export
as_benchmark_result.OMLCollection = function(x, ...) {
  # Convert a run collection into a single benchmark result.
  #
  # x:   an OMLCollection whose main entity type is "run".
  # ...: further arguments forwarded to OMLRun$new() for every run id.
  #
  # Returns the benchmark result obtained by combining the resample results
  # of all runs in the collection.

  # Only run collections carry run ids that can become resample results.
  assert_true(x$main_entity_type == "run")

  # Resolve the run ids on the same server the collection was read from.
  # A `test_server` supplied by the caller via `...` still takes precedence.
  # NOTE(review): relies on OMLRun$new() accepting `test_server` - confirm.
  run_args = list(...)
  if (!"test_server" %in% names(run_args)) {
    run_args[["test_server"]] = x$test_server
  }

  rrs = map(x$run_ids, function(id) {
    as_resample_result(do.call(OMLRun$new, c(list(id), run_args)))
  })

  # Combine the individual resample results and convert them in one go.
  as_benchmark_result(invoke(c, .args = rrs))
}
212129
213130# ' @importFrom mlr3 as_tasks
214131# ' @export
215132as_tasks.OMLCollection = function (x , ... ) {
216- map(x $ tasks [[" task" ]], as_task , ... )
217- }
218-
219- # ' @importFrom mlr3 as_learners
220- # ' @export
221- as_learners.OMLCollection = function (x , ... ) {
222- map(x $ flows [[" flow" ]], as_learner , ... )
133+ map(x $ task_ids , function (id ) tsk(" oml" , task_id = id , ... ))
223134}
224135
225136# ' @importFrom mlr3 as_resamplings
226137# ' @export
227138as_resamplings.OMLCollection = function (x , ... ) {
228- map(x $ tasks [[" task" ]], as_resampling , ... )
229- }
230-
231- make_task_table = function (tasks ) {
232- g = function (task ) {
233- list (
234- id = task $ id ,
235- task = list (task ),
236- data = as_short_string(task $ data $ name ),
237- task_type = task $ task_type ,
238- target = tryCatch(task $ target_names , error = function (x ) NA_character_ ), # can have length > 1
239- nrow = as.integer(task $ data $ quality(" NumberOfInstances" )),
240- ncol = task $ data $ quality(" NumberOfFeatures" ),
241- missing = task $ data $ quality(" NumberOfMissingValues" ),
242- numeric = task $ data $ quality(" NumberOfNumericFeatures" ),
243- symbolic = task $ data $ quality(" NumberOfSymbolicFeatures" ),
244- binary = task $ data $ quality(" NumberOfBinaryFeatures" ),
245- task_splits = task $ estimation_procedure $ type %??% " none"
246- )
247- }
248- setkeyv(map_dtr(tasks , g , .fill = TRUE ), " id" )[]
249- }
250-
251- make_flow_table = function (flows ) {
252- g = function (flow ) {
253- list (
254- id = flow $ id ,
255- flow = list (flow ),
256- name = as_short_string(flow $ name )
257- )
258- }
259- setkeyv(map_dtr(flows , g ), " id" )[]
260- }
261-
262- make_dataset_table = function (datasets ) {
263- g = function (dataset ) {
264- list (
265- id = dataset $ id ,
266- data = list (dataset ),
267- name = dataset $ name ,
268- nrow = as.integer(dataset $ quality(" NumberOfInstances" )),
269- ncol = dataset $ quality(" NumberOfFeatures" ),
270- missing = dataset $ quality(" NumberOfMissingValues" ),
271- numeric = dataset $ quality(" NumberOfNumericFeatures" ),
272- symbolic = dataset $ quality(" NumberOfSymbolicFeatures" ),
273- binary = dataset $ quality(" NumberOfBinaryFeatures" )
274- )
275- }
276- setkeyv(map_dtr(datasets , g , .fill = TRUE ), " id" )[]
139+ map(x $ task_ids , function (id ) rsmp(" oml" , task_id = id , ... ))
277140}
278141
279- make_run_table = function (runs ) {
280- g = function (run ) {
281- list (
282- id = run $ id ,
283- run = list (run ),
284- task_type = run $ task_type ,
285- data = as_short_string(run $ desc $ input_data $ dataset $ name ),
286- flow = as_short_string(run $ desc $ flow_name ),
287- task_splits = run $ task $ estimation_procedure $ type
288- )
289- }
290- setkeyv(map_dtr(runs , g , .fill = TRUE ), " id" )[]
142+ # ' @importFrom mlr3 as_learners
143+ # ' @export
as_learners.OMLCollection = function(x, ...) {
  # Convert every flow of the collection into a learner.
  #
  # x:   an OMLCollection.
  # ...: further arguments forwarded to OMLFlow$new() for every flow id.
  #
  # Returns a list with one learner per entry of `x$flow_ids`.

  # Resolve the flow ids on the same server the collection was read from.
  # A `test_server` supplied by the caller via `...` still takes precedence.
  # NOTE(review): relies on OMLFlow$new() accepting `test_server` - confirm.
  flow_args = list(...)
  if (!"test_server" %in% names(flow_args)) {
    flow_args[["test_server"]] = x$test_server
  }

  map(x$flow_ids, function(id) {
    as_learner(do.call(OMLFlow$new, c(list(id), flow_args)))
  })
}
0 commit comments