Skip to content

Commit ca94488

Browse files
authored
feat: add publishing functions (#123)
* feat: add publishing functions * fix bug that arose from bool -> factor transformation in parquet files
1 parent f1fed69 commit ca94488

24 files changed

+577
-81
lines changed

.github/workflows/r-cmd-check-dev.yml

Lines changed: 0 additions & 58 deletions
This file was deleted.

.github/workflows/r-cmd-check.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ jobs:
5050
needs: check
5151

5252
- uses: r-lib/actions/check-r-package@v2
53+
env:
54+
TESTOPENMLAPIKEY: ${{ secrets.TESTOPENMLAPIKEY }}
5355

5456
- uses: mxschmitt/action-tmate@v3
5557
if: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled }}

DESCRIPTION

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ Authors@R: c(
1010
Description: Provides an interface to 'OpenML.org' to list and download
1111
machine learning data, tasks and experiments. The 'OpenML' objects can
1212
be automatically converted to 'mlr3' objects. For a more
13-
sophisticated interface which also allows uploading to 'OpenML', see
13+
sophisticated interface with more upload options, see
1414
the 'OpenML' package.
1515
License: LGPL-3
1616
URL: https://mlr3oml.mlr-org.com, https://github.com/mlr-org/mlr3oml
@@ -39,7 +39,9 @@ Suggests:
3939
mlr3db (>= 0.5.0),
4040
qs,
4141
RWeka,
42-
testthat (>= 3.0.0)
42+
testthat (>= 3.0.0),
43+
xml2,
44+
httr
4345
Config/testthat/edition: 3
4446
Encoding: UTF-8
4547
NeedsCompilation: yes

NAMESPACE

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ export(odt)
3232
export(oflw)
3333
export(orn)
3434
export(otsk)
35+
export(publish_collection)
36+
export(publish_data)
37+
export(publish_task)
3538
export(read_arff)
3639
export(write_arff)
3740
import(checkmate)

NEWS.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
1-
# mlr3oml 0.8.0-9000
1+
# mlr3oml 0.9.0
22

33
* Fix: Parquet datasets now work where columns simultaneously have to be renamed
44
and converted.
5+
* Added upload functions:
6+
* `publish_data` to upload a dataset on OpenML
7+
* `publish_task` to create a task on OpenML
8+
* `publish_collection` to create a collection on OpenML
59

610
# mlr3oml 0.8.0
711

R/defaults.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# Default for the parquet setting: value of the "mlr3oml.parquet" option,
# or FALSE when that option is not set.
parquet_default = function() {
  getOption("mlr3oml.parquet", default = FALSE)
}
22
# Default for the test-server setting: value of the "mlr3oml.test_server" option,
# or FALSE when that option is not set.
test_server_default = function() {
  getOption("mlr3oml.test_server", default = FALSE)
}
3-
# Default for the cache setting: value of the "mlr3oml.cache" option,
# or FALSE when that option is not set.
cache_default = function() {
  getOption("mlr3oml.cache", default = FALSE)
}
43
# Default for the limit setting: value of the "mlr3oml.limit" option,
# or 5000L when that option is not set.
limit_default = function() {
  getOption("mlr3oml.limit", default = 5000L)
}
4+

R/publish_collection.R

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#' @title Publish a Collection to OpenML
#'
#' @description
#' Publish a collection to OpenML.
#' This can also be achieved through the [website](https://openml.org).
#'
#' @param ids (`integer()`)\cr
#'   The IDs to include in the collection.
#'   Depending on the main entity type, these can be task or run IDs.
#' @param main_entity_type (`character(1)`)\cr
#'   The main entity type of the collection. Can be either "task" or "run".
#' @param name (`character(1)`)\cr
#'   The name for the collection.
#' @param desc (`character(1)`)\cr
#'   The description of the collection.
#' @param alias (`character(1)`)\cr
#'   The alias for the collection.
#' @template param_test_server
#' @template param_api_key
#'
#' @return
#' On success, the ID of the created collection (`integer(1)`).
#' If the server answers with an HTTP error, a warning containing the server's error message is raised
#' and the `httr` response object is returned instead.
#'
#' @export
publish_collection = function(ids, name, desc, main_entity_type = "task", alias = NULL, api_key = NULL,
  test_server = test_server_default()) {
  require_namespaces(c("xml2", "httr"))
  assert_flag(test_server)
  if (is.null(api_key)) {
    api_key = get_api_key(get_server(test_server))
  } else {
    assert_string(api_key)
  }
  # Coerce to integer so that the IDs are written as plain digits: a double such as 100000 would
  # otherwise be converted to the text "1e+05" when it is inserted into the XML document.
  ids = assert_integerish(ids, lower = 1L, any.missing = FALSE, coerce = TRUE)
  assert_choice(main_entity_type, c("task", "run"))
  assert_string(name)
  assert_string(desc)
  assert_string(alias, null.ok = TRUE)

  doc = xml2::xml_new_document()
  collection = xml2::xml_add_child(doc, "oml:study", "xmlns:oml" = "http://openml.org/openml")

  # Order matters!
  if (!is.null(alias)) xml2::xml_add_child(.x = collection, .value = "oml:alias", alias)
  xml2::xml_add_child(.x = collection, .value = "oml:main_entity_type", main_entity_type)
  xml2::xml_add_child(.x = collection, .value = "oml:name", name)
  xml2::xml_add_child(.x = collection, .value = "oml:description", desc)

  # Container element is "oml:tasks" or "oml:runs", each entry "oml:task_id" or "oml:run_id".
  objects = xml2::xml_add_child(collection, .value = sprintf("oml:%ss", main_entity_type))
  for (id in ids) {
    xml2::xml_add_child(.x = objects, .value = sprintf("oml:%s_id", main_entity_type), id)
  }

  # The description is uploaded as a file, so it is written to a temporary file that is removed on exit.
  desc_path = tempfile(fileext = ".xml")
  withr::defer(unlink(desc_path))
  xml2::write_xml(x = doc, file = desc_path)

  response = httr::POST(
    url = sprintf("%s/study", get_server(test_server)),
    body = list(
      description = httr::upload_file(desc_path)
    ),
    query = list(api_key = api_key)
  )

  response_list = xml2::as_list(httr::content(response))
  if (httr::http_error(response)) {
    # The server text is passed as an argument, not as the format string: a literal "%" in the
    # message would otherwise be interpreted as a conversion specification.
    warningf(
      "%s",
      paste(response_list$error$message, response_list$error$additional_information, collapse = "\n")
    )
    return(response)
  }
  as.integer(response_list$study_upload$id[[1L]])
}

R/publish_data.R

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
#' @title Upload data to OpenML
#'
#' @description
#' Upload a dataset to OpenML.
#' This can also be achieved through the [website](https://openml.org).
#'
#' @param data ([`data.frame()`])\cr
#'   The data to upload.
#'   The (first) class of every column must be one of `"numeric"`, `"integer"`, `"factor"` or `"character"`.
#' @param name (`character(1)`)\cr
#'   The name of the dataset.
#' @param desc (`character(1)`)\cr
#'   The description of the dataset.
#' @param license (`character(1)`)\cr
#'   The license of the dataset.
#' @param default_target (`character(1)`)\cr
#'   The default target variable. Must be a column name of `data`.
#' @param citation (`character(1)`)\cr
#'   How to cite the dataset.
#' @param original_data_url (`character(1)`)\cr
#'   The URL of the original data set.
#' @param paper_url (`character(1)`)\cr
#'   The URL of the paper describing the data set.
#' @param row_identifier (`character(1)`)\cr
#'   The name of the column that is a row identifier. Must be a column name of `data`.
#' @param ignore_attribute (`character(1)`)\cr
#'   The name of the column to ignore during modeling. Must be a column name of `data`.
#' @template param_test_server
#' @template param_api_key
#'
#' @return
#' On success, the ID of the uploaded dataset (`integer(1)`).
#' If the server answers with an HTTP error, a warning containing the server's error message is raised
#' and the `httr` response object is returned instead.
#'
#' @export
publish_data = function(data, name, desc, license = NULL, default_target = NULL, citation = NULL,
  row_identifier = NULL, ignore_attribute = NULL, original_data_url = NULL, paper_url = NULL,
  test_server = test_server_default(), api_key = NULL) {
  require_namespaces(c("xml2", "httr"))
  assert_flag(test_server)
  if (is.null(api_key)) {
    api_key = get_api_key(get_server(test_server))
  } else {
    assert_string(api_key)
  }
  assert_data_frame(data)
  assert_subset(unique(map_chr(data, function(x) class(x)[[1L]])), c("numeric", "integer", "factor", "character"))
  assert_string(name)
  assert_string(desc)
  assert_string(license, null.ok = TRUE)
  assert_string(default_target, null.ok = TRUE)
  assert_choice(default_target, colnames(data), null.ok = TRUE)
  assert_choice(row_identifier, colnames(data), null.ok = TRUE)
  assert_choice(ignore_attribute, colnames(data), null.ok = TRUE)
  assert_string(citation, null.ok = TRUE)
  assert_string(original_data_url, null.ok = TRUE)
  assert_string(paper_url, null.ok = TRUE)

  doc = xml2::xml_new_document()
  dat = xml2::xml_add_child(doc, "oml:data_set_description", "xmlns:oml" = "http://openml.org/openml")

  # Appends <oml:NAME>value</oml:NAME> to the description; optional fields that are NULL are skipped.
  add = function(name, value) {
    if (!is.null(value)) {
      xml2::xml_add_child(.x = dat, .value = paste0("oml:", name), value)
    }
  }

  # Order matters!
  add("name", name)
  add("description", desc)
  add("format", "arff")
  add("licence", license)
  add("default_target_attribute", default_target)
  add("row_id_attribute", row_identifier)
  add("ignore_attribute", ignore_attribute)
  add("citation", citation)
  add("original_data_url", original_data_url)
  add("paper_url", paper_url)

  # Both the description and the data are uploaded as files; the temporary files are removed on exit.
  desc_path = tempfile(fileext = ".xml")
  withr::defer(unlink(desc_path))
  xml2::write_xml(x = doc, file = desc_path)

  # Use a real ".arff" extension (the first argument of tempfile() is only a name prefix).
  data_path = tempfile(fileext = ".arff")
  withr::defer(unlink(data_path))
  write_arff(data, data_path)

  response = httr::POST(
    url = sprintf("%s/data", get_server(test_server)),
    body = list(
      description = httr::upload_file(desc_path),
      dataset = httr::upload_file(data_path)
    ),
    query = list(api_key = api_key)
  )
  response_list = xml2::as_list(httr::content(response))

  if (httr::http_error(response)) {
    # The server text is passed as an argument, not as the format string: a literal "%" in the
    # message would otherwise be interpreted as a conversion specification.
    warningf(
      "%s",
      paste(response_list$error$message, response_list$error$additional_information, collapse = "\n")
    )
    return(response)
  }

  as.integer(response_list$upload_data_set$id[[1L]])
}

R/publish_task.R

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#' @title Publish a task on OpenML
#'
#' @description
#' Publish a task on OpenML.
#' This can also be achieved through the [website](https://openml.org).
#'
#' @param id (`integer(1)`)\cr
#'   The dataset id.
#' @param type (`character(1)` or `integer(1)`)\cr
#'   Can either be `"classif"` or `"regr"` or an integer indicating the task type.
#' @param estimation_procedure (`integer(1)`)\cr
#'   The id of the estimation procedure.
#' @param target (`character(1)`)\cr
#'   The target variable (if applicable). Pass `NULL` to create a task without a target feature.
#' @template param_api_key
#' @template param_test_server
#'
#' @return
#' On success, the ID of the created task (`integer(1)`).
#' If the server reports that the task already exists (error code 614), a message is printed and the
#' ID of the existing task is returned.
#' For any other HTTP error, a warning containing the server's error message is raised and the `httr`
#' response object is returned instead.
#'
#' @export
publish_task = function(id, type, estimation_procedure, target, api_key = NULL,
  test_server = test_server_default()) {
  require_namespaces(c("xml2", "httr"))
  assert_flag(test_server)
  if (is.null(api_key)) {
    api_key = get_api_key(get_server(test_server))
  } else {
    assert_string(api_key)
  }
  # Integer inputs are coerced so that they are written as plain digits: a double such as 100000
  # would otherwise be converted to the text "1e+05" when it is inserted into the XML document.
  id = assert_int(id, lower = 1L, coerce = TRUE)
  if (test_character(type, len = 1L)) {
    type = switch(type,
      regr = 2L,
      classif = 1L,
      stopf("Invalid type '%s'.", type)
    )
  } else {
    type = assert_int(type, lower = 1L, coerce = TRUE)
  }
  # The target is optional ("if applicable"); a NULL target is simply left out of the task inputs.
  assert_string(target, null.ok = TRUE)
  estimation_procedure = assert_int(estimation_procedure, coerce = TRUE)

  doc = xml2::xml_new_document()
  task = xml2::xml_add_child(doc, "oml:task_inputs", "xmlns:oml" = "http://openml.org/openml")

  # Appends <oml:input name="NAME">value</oml:input> to the task inputs; NULL values are skipped.
  add = function(name, value) {
    if (!is.null(value)) {
      xml2::xml_add_child(.x = task, "oml:input", name = name, value)
    }
  }

  xml2::xml_add_child(task, "oml:task_type_id", type)
  add("source_data", id)
  add("target_feature", target)
  add("estimation_procedure", estimation_procedure)

  # The description is uploaded as a file, so it is written to a temporary file that is removed on exit.
  desc_path = tempfile(fileext = ".xml")
  withr::defer(unlink(desc_path))
  xml2::write_xml(x = doc, file = desc_path)

  response = httr::POST(
    url = sprintf("%s/task", get_server(test_server)),
    body = list(
      description = httr::upload_file(desc_path)
    ),
    query = list(api_key = api_key)
  )

  response_list = xml2::as_list(httr::content(response))
  if (httr::http_error(response)) {
    if (isTRUE(response_list$error$code[[1L]] == "614")) { # Task already exists.
      info = response_list$error$additional_information[[1L]]
      # NOTE(review): the id is taken from character 17 up to the second-to-last character, which
      # presumably matches a text of the form "matched id(s): [<id>]" - confirm against the server.
      id = as.integer(substr(info, 17L, nchar(info) - 1L))
      messagef("Task already exists with id %s.", id)
      return(id)
    } else {
      # The server text is passed as an argument, not as the format string: a literal "%" in the
      # message would otherwise be interpreted as a conversion specification.
      warningf(
        "%s",
        paste(response_list$error$message, response_list$error$additional_information, collapse = "\n")
      )
      return(response)
    }
  }

  as.integer(response_list$upload_task$id[[1L]])
}
83+

0 commit comments

Comments
 (0)