Skip to content

Commit 61dc13b

Browse files
committed
...
1 parent 06595c7 commit 61dc13b

File tree

5 files changed

+181
-26
lines changed

5 files changed

+181
-26
lines changed

R/DataBackendLazyTensors.R

Lines changed: 81 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ DataBackendLazyTensors = R6Class("DataBackendLazyTensors",
3636
cloneable = FALSE,
3737
inherit = DataBackendDataTable,
3838
public = list(
39+
chunk_size = NULL,
3940
#' @description
4041
#' Create a new instance of this [R6][R6::R6Class] class.
4142
#' @param data (`data.table`)\cr
@@ -48,10 +49,12 @@ DataBackendLazyTensors = R6Class("DataBackendLazyTensors",
4849
#' @param cache (`character()`)\cr
4950
#' Names of the columns that should be cached.
5051
#' Per default, all columns that are converted are cached.
51-
initialize = function(data, primary_key, converter, cache = names(converter)) {
52+
initialize = function(data, primary_key, converter, cache = names(converter), chunk_size = 100) {
5253
private$.converter = assert_list(converter, types = "function", any.missing = FALSE)
5354
assert_subset(names(converter), colnames(data))
55+
assert_subset(cache, names(converter), empty.ok = TRUE)
5456
private$.cached_cols = assert_subset(cache, names(converter))
57+
self$chunk_size = assert_int(chunk_size, lower = 1L)
5558
walk(names(private$.converter), function(nm) {
5659
if (!inherits(data[[nm]], "lazy_tensor")) {
5760
stopf("Column '%s' is not a lazy tensor.", nm)
@@ -69,18 +72,25 @@ DataBackendLazyTensors = R6Class("DataBackendLazyTensors",
6972
# no caching, no materialization as this is called in the training loop
7073
return(super$data(rows, cols))
7174
}
72-
if (all(cols %in% names(private$.data_cache))) {
73-
cache_hit = private$.data_cache[list(rows), cols, on = self$primary_key, with = FALSE]
75+
if (all(intersect(cols, private$.cached_cols) %in% names(private$.data_cache))) {
76+
expensive_cols = intersect(cols, private$.cached_cols)
77+
other_cols = setdiff(cols, expensive_cols)
78+
cache_hit = private$.data_cache[list(rows), expensive_cols, on = self$primary_key, with = FALSE]
7479
complete = complete.cases(cache_hit)
7580
cache_hit = cache_hit[complete]
7681
if (nrow(cache_hit) == length(rows)) {
77-
return(cache_hit)
82+
tbl = cbind(cache_hit, super$data(rows, other_cols))
83+
setcolorder(tbl, cols)
84+
return(tbl)
7885
}
79-
combined = rbindlist(list(cache_hit, private$.load_and_cache(rows[!complete], cols)))
86+
combined = rbindlist(list(cache_hit, private$.load_and_cache(rows[!complete], expensive_cols)))
8087
reorder = vector("integer", nrow(combined))
8188
reorder[complete] = seq_len(nrow(cache_hit))
8289
reorder[!complete] = nrow(cache_hit) + seq_len(nrow(combined) - nrow(cache_hit))
83-
return(combined[reorder])
90+
91+
tbl = cbind(combined[reorder], super$data(rows, other_cols))
92+
setcolorder(tbl, cols)
93+
return(tbl)
8494
}
8595

8696
private$.load_and_cache(rows, cols)
@@ -109,7 +119,17 @@ DataBackendLazyTensors = R6Class("DataBackendLazyTensors",
109119
tbl = super$data(rows, cols)
110120
cols_to_convert = intersect(names(private$.converter), names(tbl))
111121
tbl_to_mat = tbl[, cols_to_convert, with = FALSE]
112-
tbl_mat = materialize(tbl_to_mat, rbind = TRUE)
122+
# chunk the rows of tbl_to_mat into chunks of size self$chunk_size, apply materialize
123+
n = nrow(tbl_to_mat)
124+
chunks = split(seq_len(n), rep(seq_len(ceiling(n / self$chunk_size)), each = self$chunk_size, length.out = n))
125+
126+
tbl_mat = if (n == 0) {
127+
set_names(list(torch_empty(0)), names(tbl_to_mat))
128+
} else {
129+
set_names(lapply(transpose_list(lapply(chunks, function(chunk) {
130+
materialize(tbl_to_mat[chunk, ], rbind = TRUE)
131+
})), torch_cat, dim = 1L), names(tbl_to_mat))
132+
}
113133

114134
for (nm in cols_to_convert) {
115135
converted = private$.converter[[nm]](tbl_mat[[nm]])
@@ -135,13 +155,62 @@ as_data_backend.dataset = function(x, dataset_shapes, ...) {
135155
}
136156

137157
#' @export
138-
as_task_classif.dataset = function(x, dataset_shapes, target, ...) {
139-
# TODO
158+
as_task_classif.dataset = function(x, target, levels, converter = NULL, dataset_shapes = NULL, chunk_size = 100, cache = names(converter), ...) {
159+
if (length(x) < 2) {
160+
stopf("Dataset must have at least 2 rows.")
161+
}
162+
batch = dataloader(x, batch_size = 2)$.iter()$.next()
163+
if (is.null(converter)) {
164+
if (length(levels) == 2) {
165+
if (batch[[target]]$dtype != torch_float()) {
166+
stopf("Target must be a float tensor, but has dtype %s", batch[[target]]$dtype)
167+
}
168+
if (test_equal(batch[[target]]$shape, c(2L, 1L))) {
169+
converter = set_names(list(crate(function(x) factor(as.integer(x), levels = 0:1, labels = levels), levels)), target)
170+
} else {
171+
stopf("Target must be a float tensor of shape (batch_size, 1), but has shape (batch_size, %s)",
172+
paste(batch[[target]]$shape[-1L], collapse = ", "))
173+
}
174+
converter = set_names(list(crate(function(x) factor(as.integer(x), levels = 0:1, labels = levels), levels)), target)
175+
} else {
176+
if (batch[[target]]$dtype != torch_int()) {
177+
stopf("Target must be an integer tensor, but has dtype %s", batch[[target]]$dtype)
178+
}
179+
if (test_equal(batch[[target]]$shape, 2L)) {
180+
converter = set_names(list(crate(function(x) factor(as.integer(x), labels = levels), levels)), target)
181+
} else {
182+
stopf("Target must be an integer tensor of shape (batch_size), but has shape (batch_size, %s)",
183+
paste(batch[[target]]$shape[-1L], collapse = ", "))
184+
}
185+
converter = set_names(list(crate(function(x) factor(as.integer(x), labels = levels), levels)), target)
186+
}
187+
}
188+
be = as_data_backend(x, dataset_shapes, converter = converter, cache = cache, chunk_size = chunk_size)
189+
as_task_classif(be, target = target, ...)
140190
}
141191

142192
#' @export
143-
as_task_regr.dataset = function(x, dataset_shapes, target, converter, ...) {
144-
# TODO
193+
as_task_regr.dataset = function(x, target, converter = NULL, dataset_shapes = NULL, chunk_size = 100, cache = names(converter), ...) {
194+
if (length(x) < 2) {
195+
stopf("Dataset must have at least 2 rows.")
196+
}
197+
if (is.null(converter)) {
198+
converter = set_names(list(as.numeric), target)
199+
}
200+
batch = dataloader(x, batch_size = 2)$.iter()$.next()
201+
202+
if (batch[[target]]$dtype != torch_float()) {
203+
stopf("Target must be a float tensor, but has dtype %s", batch[[target]]$dtype)
204+
}
205+
206+
if (!test_equal(batch[[target]]$shape, c(2L, 1L))) {
207+
stopf("Target must be a float tensor of shape (batch_size, 1), but has shape (batch_size, %s)",
208+
paste(batch[[target]]$shape[-1L], collapse = ", "))
209+
}
210+
211+
dataset_shapes = get_or_check_dataset_shapes(x, dataset_shapes)
212+
be = as_data_backend(x, dataset_shapes, converter = converter, cache = cache, chunk_size = chunk_size)
213+
as_task_regr(be, target = target, ...)
145214
}
146215

147216
#' @export
@@ -177,4 +246,4 @@ check_lazy_tensors_backend = function(be, candidates, visited = character()) {
177246
}
178247
union(visited, intersect(candidates, be$colnames))
179248
}
180-
}
249+
}

R/materialize.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ materialize.lazy_tensor = function(x, device = "cpu", rbind = FALSE, ...) { # no
106106
materialize_internal(x = x, device = device, cache = NULL, rbind = rbind)
107107
}
108108

109-
get_input = function(ds, ids, varying_shapes, rbind) {
109+
get_input = function(ds, ids, varying_shapes) {
110110
if (is.null(ds$.getbatch)) { # .getindex is never NULL but a function that errs if it was not defined
111111
x = map(ids, function(id) map(ds$.getitem(id), function(x) x$unsqueeze(1)))
112112
if (varying_shapes) {
@@ -201,7 +201,7 @@ materialize_internal = function(x, device = "cpu", cache = NULL, rbind) {
201201
}
202202

203203
if (!do_caching || !input_hit) {
204-
input = get_input(ds, ids, varying_shapes, rbind)
204+
input = get_input(ds, ids, varying_shapes)
205205
}
206206

207207
if (do_caching && !input_hit) {

TODO.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,5 @@
2121
```
2222
* Add checks on usage of `DataBackendLazyTensors` in `task_dataset`
2323
* Add optimization that truth values don't have to be loaded twice during resampling, i.e.
24-
once for making the predictions and once for retrieving the truth column.
24+
once for making the predictions and once for retrieving the truth column.
25+
* only allow caching converter columns in `DataBackendLazyTensors` (probably just remove the `cache` parameter)

man/DataBackendLazyTensors.Rd

Lines changed: 2 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test_DataBackendLazyTensors.R

Lines changed: 94 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,3 @@
1-
test_that("correct input checks", {
2-
3-
})
4-
51
test_that("main API works", {
62
# regression target
73
ds = tensor_dataset(
@@ -102,11 +98,71 @@ test_that("classif target works", {
10298
})
10399

104100
test_that("errors when weird preprocessing", {
105-
# test following example pipeops:
106-
# - target trafo
107-
# - fix factors
108-
# - smote
101+
})
102+
103+
test_that("chunking works ", {
104+
ds = dataset(
105+
initialize = function() {
106+
self$x = torch_tensor(matrix(100:1, nrow = 100, ncol = 1))
107+
self$y = torch_tensor(as.matrix(1:100, nrow = 100, ncol = 1))
108+
self$counter = 0
109+
},
110+
.getbatch = function(i) {
111+
self$counter = self$counter + 1
112+
list(x = self$x[i, drop = FALSE], y = self$y[i, drop = FALSE])
113+
},
114+
.length = function() {
115+
nrow(self$x)
116+
}
117+
)()
109118

119+
be = as_data_backend(ds, dataset_shapes = list(x = c(NA, 1), y = c(NA, 1)), chunk_size = 3,
120+
converter = list(y = as.numeric))
121+
122+
counter_prev = ds$counter
123+
be$data(1:3, c("x", "y"))
124+
expect_equal(ds$counter, counter_prev + 1)
125+
counter_prev = ds$counter
126+
be$data(4:10, c("x", "y"))
127+
expect_equal(ds$counter, counter_prev + 3)
128+
})
129+
130+
test_that("can retrieve 0 rows", {
131+
ds = tensor_dataset(
132+
x = torch_tensor(matrix(100:1, nrow = 100, ncol = 1)),
133+
y = torch_tensor(as.matrix(1:100, nrow = 100, ncol = 1))
134+
)
135+
be = as_data_backend(ds, dataset_shapes = list(x = c(NA, 1), y = c(NA, 1)),
136+
converter = list(y = as.numeric))
137+
res = be$data(integer(0), c("x", "y", "row_id"))
138+
expect_data_table(res, nrows = 0, ncols = 3)
139+
expect_class(res$x, "lazy_tensor")
140+
expect_class(res$y, "numeric")
141+
expect_equal(res$row_id, integer(0))
142+
})
143+
144+
test_that("task converters work", {
145+
# regression target
146+
ds = tensor_dataset(
147+
x = torch_tensor(matrix(100:1, nrow = 100, ncol = 1))$float(),
148+
y = torch_tensor(as.matrix(1:100, nrow = 100, ncol = 1))$float()
149+
)
150+
task = as_task_regr(ds, target = "y", converter = list(y = as.numeric))
151+
task$data(integer(0))
152+
expect_equal(task$head(2)$y, 1:2)
153+
expect_equal(task$feature_names, "x")
154+
expect_equal(task$target_names, "y")
155+
expect_task(task)
156+
157+
158+
# binary classification
159+
ds = tensor_dataset(
160+
x = torch_tensor(matrix(100:1, nrow = 100, ncol = 1))$float(),
161+
y = torch_tensor(rep(0:1, times = 50))$float()$unsqueeze(2L)
162+
)
163+
task = as_task_classif(ds, target = "y", levels = c("yes", "no"))
164+
expect_task(task)
165+
expect_equal(task$head()$y, factor(rep(c("yes", "no"), times = 3), levels = c("yes", "no")))
110166
})
111167

112168
test_that("caching works", {
@@ -147,8 +203,8 @@ test_that("caching works", {
147203
# y is now in the cache, so .getitem() is not called on $data()
148204
check(be, ds, 1, "y", 0)
149205

150-
# but x is not cached, so we still need to call .getitem below
151-
check(be, ds, 1, c("x", "y"), 1)
206+
# everything is in the cache
207+
check(be, ds, 1, c("x", "y"), 0)
152208
# lazy tensor causes no materialization
153209
check(be, ds, 1, "x", 0)
154210

@@ -247,3 +303,31 @@ test_that("check_lazy_tensors_backend works", {
247303
expect_error(check_lazy_tensors_backend(task2$backend, c("x", "y")),
248304
regexp = "A converter column ('y')", fixed = TRUE)
249305
})
306+
307+
308+
test_that("...", {
309+
ds = dataset(
310+
initialize = function(x, y) {
311+
self$x = torch_randn(100, 3)
312+
self$y = torch_randn(100, 1)
313+
self$counter = 0
314+
},
315+
.getbatch = function(i) {
316+
print("hallo")
317+
self$counter = self$counter + 1L
318+
list(x = self$x[i, drop = FALSE], y = self$y[i, drop = FALSE])
319+
},
320+
.length = function() 100
321+
)()
322+
323+
task = as_task_regr(ds, target = "y")
324+
325+
counter = ds$counter
326+
task$head()
327+
print(ds$counter - counter)
328+
counter = ds$counter
329+
task$head()
330+
expect_equal(ds$counter, counter)
331+
print(ds$counter - counter)
332+
333+
})

0 commit comments

Comments
 (0)