Skip to content

Commit 35b36a4

Browse files
authored
fix(parquet): caching bug due to different partitions (#128)
* fix(parquet): caching bug due to different partitions Previously, the parquet files were first downloaded to a temporary directory and then a hard link was created. This failed when the directories were on different partitions and the file is now immediately downloaded to its final destination * also add caching to the CI
1 parent b4809f2 commit 35b36a4

File tree

8 files changed

+31
-16
lines changed

8 files changed

+31
-16
lines changed

.github/workflows/r-cmd-check.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,16 @@ jobs:
4444
with:
4545
r-version: ${{ matrix.config.r }}
4646

47+
48+
- name: Cache OpenML
49+
uses: actions/cache@v3
50+
id: openml-cache
51+
with:
52+
path: ~/openml
53+
# we will just (almost) always get the cache from the restore keys
54+
key: ${{ runner.os }}-openml-${{ hashFiles('.') }}
55+
restore-keys: ${{ runner.os }}-openml-
56+
4757
- uses: r-lib/actions/setup-r-dependencies@v2
4858
with:
4959
extra-packages: any::rcmdcheck

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,5 +46,5 @@ Config/testthat/edition: 3
4646
Encoding: UTF-8
4747
NeedsCompilation: yes
4848
Roxygen: list(markdown = TRUE)
49-
RoxygenNote: 7.2.3
49+
RoxygenNote: 7.2.3.9000
5050
Config/Needs/website: rmarkdown

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# mlr3oml 0.9.0-9000
22

3+
* Bugfix: Caching of parquet files failed when the temporary directory was on a
4+
different partition as the cache directory
35
* feat: Add option `mlr3oml.retries` to control number of retries when
46
downloading data from OpenML. The default is 3.
57

R/OMLData.R

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,8 +184,6 @@ OMLData = R6Class("OMLData",
184184
}
185185
if (is.null(private$.parquet_path)) {
186186
loadNamespace("mlr3db")
187-
# this function is already cached, it works a little different than the cached(f, ...)
188-
# because we cache it as .parquet and not as .qs
189187
private$.parquet_path = cached(
190188
download_parquet, "data_parquet", self$id, desc = self$desc, cache_dir = self$cache_dir,
191189
server = self$server, test_server = self$test_server, parquet = TRUE

R/OMLRun.R

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,9 @@ as_resample_result.OMLRun = function(x, store_backends = TRUE, ...) {
245245
resampling = list(resampling),
246246
iteration = seq_len(resampling$iters),
247247
prediction = predictions,
248-
uhash = uuid::UUIDgenerate()
248+
uhash = uuid::UUIDgenerate(),
249+
param_values = replicate(list(), n = 2),
250+
learner_hash = learner$hash
249251
)
250252

251253
ResampleResult$new(ResultData$new(data, store_backends = store_backends))

R/cache.R

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ cached = function(fun, type, id, test_server, parquet = FALSE, ..., cache_dir =
117117
return(fun(id, ...))
118118
}
119119

120-
path = file.path(cache_dir, ifelse(test_server, "test", "public"), type)
120+
path = file.path(cache_dir, if (test_server) "test" else "public", type)
121121
file = file.path(path, sprintf("%i.%s", id, if (parquet) "parquet" else "qs"))
122122

123123
if (file.exists(file)) {
@@ -138,12 +138,11 @@ cached = function(fun, type, id, test_server, parquet = FALSE, ..., cache_dir =
138138
dir.create(path, recursive = TRUE)
139139
}
140140

141-
obj = fun(id, ...)
142141
if (parquet) {
143-
lg$debug("Moving parquet data from tempfile to cache.", type = type, id = id, file = file)
144-
file.rename(obj, file)
142+
obj = fun(id, ..., file = file)
145143
return(file)
146144
}
145+
obj = fun(id, ...)
147146

148147
lg$debug("Storing compressed object in cache", type = type, id = id, file = file)
149148
qs::qsave(obj, file = file, nthreads = getOption("Ncpus", 1L))

R/download_parquet.R

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
1-
download_parquet = function(data_id, server, desc = download_desc_data(data_id, server)) {
2-
get_parquet(desc$minio_url, api_key = get_api_key(server))
1+
download_parquet = function(data_id, server, desc = download_desc_data(data_id, server), file = NULL) {
2+
get_parquet(desc$minio_url, api_key = get_api_key(server), file = file)
33
}
44

5-
get_parquet = function(url, ..., server, api_key = get_api_key(server), retries = getOption("mlr3oml.retries", 3L)) {
6-
path = tempfile(fileext = ".parquet")
5+
get_parquet = function(url, ..., server, api_key = get_api_key(server), retries = getOption("mlr3oml.retries", 3L), file = NULL) {
6+
file = file %??% tempfile(fileext = ".parquet")
77
url = sprintf(url, ...)
88

99
lg$info("Retrieving parquet.", url = url, authenticated = !is.na(api_key))
1010

1111
for (retry in seq_len(retries)) {
12-
response = download_file(url, path, api_key = api_key)
12+
response = download_file(url, file, api_key = api_key)
1313

1414
if (response$ok) {
15-
lg$debug("Downloaded parquet file.", path = path)
16-
return(path)
15+
lg$debug("Downloaded parquet file.", path = file)
16+
return(file)
1717
} else if (retry < retries && response$http_code >= 500L) {
1818
delay = max(rnorm(1L, mean = 10), 0)
1919
lg$debug("Server busy, retrying in %.2f seconds", delay, try = retry)

tests/testthat/setup.R

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,8 @@ lg = lgr::get_logger("mlr3oml")
99
old_threshold = lg$threshold
1010
lg$set_threshold("warn")
1111

12-
..old_opts = options(mlr3oml.verbose = FALSE, mlr3oml.retries = 20L)
12+
..old_opts = options(
13+
mlr3oml.verbose = FALSE,
14+
mlr3oml.retries = 20L,
15+
mlr3oml.cache = if (identical(Sys.getenv("GITHUB_ACTIONS"), "true")) "~/openml" else FALSE
16+
)

0 commit comments

Comments
 (0)