diff --git a/benchmarks/lib/helpers.R b/benchmarks/lib/helpers.R
index 1eac9faf..5393ddd7 100644
--- a/benchmarks/lib/helpers.R
+++ b/benchmarks/lib/helpers.R
@@ -83,6 +83,31 @@ generate_bench_h5ad <- function(x_type, n_obs, n_vars, cache_dir) {
   path
 }
 
+#' Convert an H5AD bench file to a Zarr store and cache it
+#'
+#' Returns the cached store when it already exists. A store left behind by a
+#' failed conversion is removed so it is not mistaken for a valid cache entry.
+#'
+#' @param x_type Matrix type key (matches h5ad_paths names)
+#' @param h5ad_path Path to the corresponding H5AD file
+#' @param cache_dir Directory to cache generated stores
+#' @return Path to the generated Zarr store directory
+generate_bench_zarr <- function(x_type, h5ad_path, cache_dir) {
+  path <- file.path(cache_dir, paste0("bench_", x_type, ".zarr"))
+  if (dir.exists(path)) {
+    return(path)
+  }
+  # A store left behind by a failed conversion would pass the dir.exists()
+  # check above on the next run, so remove it unless the write completes.
+  completed <- FALSE
+  on.exit(if (!completed) unlink(path, recursive = TRUE), add = TRUE)
+  ad <- reticulate::import("anndata", convert = FALSE)
+  adata_py <- ad$read_h5ad(h5ad_path)
+  adata_py$write_zarr(path)
+  completed <- TRUE
+  path
+}
+
 # ---------------------------------------------------------------------------
 # bench::mark → BMF JSON conversion
 # ---------------------------------------------------------------------------
diff --git a/benchmarks/run_benchmarks.R b/benchmarks/run_benchmarks.R
index 1de4a4a3..42a401aa 100644
--- a/benchmarks/run_benchmarks.R
+++ b/benchmarks/run_benchmarks.R
@@ -73,6 +73,22 @@ h5ad_paths <- setNames(
 )
 cat("\n")
 
+cat("Generating Zarr test data (converting from H5AD)...\n")
+zarr_paths <- setNames(
+  vapply(
+    x_types,
+    function(xt) {
+      cat(sprintf(" %s... ", xt))
+      path <- generate_bench_zarr(xt, h5ad_paths[[xt]], cache_dir)
+      cat("done\n")
+      path
+    },
+    character(1)
+  ),
+  x_types
+)
+cat("\n")
+
 # ---------------------------------------------------------------------------
 # Run selected suites
 # ---------------------------------------------------------------------------
@@ -88,12 +104,12 @@ for (suite in suites_to_run) {
 
   suite_results <- switch(
     suite,
-    read = bench_read(h5ad_paths, opts$iterations, x_types),
-    write = bench_write(h5ad_paths, opts$iterations, x_types),
-    get = bench_get(h5ad_paths, opts$iterations),
-    set = bench_set(h5ad_paths, opts$iterations),
-    convert = bench_convert(h5ad_paths, opts$iterations, x_types),
-    subset = bench_subset(h5ad_paths, opts$iterations),
+    read = bench_read(h5ad_paths, opts$iterations, x_types, zarr_paths),
+    write = bench_write(h5ad_paths, opts$iterations, x_types, zarr_paths),
+    get = bench_get(h5ad_paths, opts$iterations, zarr_paths),
+    set = bench_set(h5ad_paths, opts$iterations, zarr_paths),
+    convert = bench_convert(h5ad_paths, opts$iterations, x_types, zarr_paths),
+    subset = bench_subset(h5ad_paths, opts$iterations, zarr_paths),
     {
       warning("Unknown suite: ", suite)
       list()
diff --git a/benchmarks/suites/bench_convert.R b/benchmarks/suites/bench_convert.R
index a679ea35..c01e1547 100644
--- a/benchmarks/suites/bench_convert.R
+++ b/benchmarks/suites/bench_convert.R
@@ -5,7 +5,7 @@
 # format conversions (InMemory↔SCE, InMemory↔Seurat).
# ============================================================================= -bench_convert <- function(h5ad_paths, iterations, x_types) { +bench_convert <- function(h5ad_paths, iterations, x_types, zarr_paths) { results <- list() # --- Backend conversions (per X type) --- @@ -47,6 +47,43 @@ bench_convert <- function(h5ad_paths, iterations, x_types) { ) } + # --- Zarr ↔ InMemory conversions (per X type) --- + for (xt in x_types) { + zarr_path <- zarr_paths[[xt]] + + # Zarr → InMemory + env <- new.env(parent = globalenv()) + env$.ad <- read_zarr(zarr_path, as = "ZarrAnnData") + + results <- c( + results, + run_one_benchmark( + name = paste0("convert_Zarr_to_InMemory_", xt), + expr = quote(.ad$as_InMemoryAnnData()), + iterations = iterations, + env = env + ) + ) + + # InMemory → Zarr + env2 <- new.env(parent = globalenv()) + env2$.ad <- read_zarr(zarr_path, as = "InMemoryAnnData") + + results <- c( + results, + run_one_benchmark( + name = paste0("convert_InMemory_to_Zarr_", xt), + expr = quote({ + .tmp <- tempfile() + .result <- .ad$as_ZarrAnnData(.tmp) + unlink(.tmp, recursive = TRUE) + }), + iterations = iterations, + env = env2 + ) + ) + } + # --- Format conversions (using float_csparse as representative) --- path <- h5ad_paths[["float_csparse"]] ad <- read_h5ad(path, as = "InMemoryAnnData") diff --git a/benchmarks/suites/bench_get.R b/benchmarks/suites/bench_get.R index 08c28114..946baa81 100644 --- a/benchmarks/suites/bench_get.R +++ b/benchmarks/suites/bench_get.R @@ -42,15 +42,24 @@ colnames = quote(colnames(.ad)) ) -bench_get <- function(h5ad_paths, iterations) { +bench_get <- function(h5ad_paths, iterations, zarr_paths) { results <- list() path <- h5ad_paths[["float_csparse"]] - for (backend in c("InMemoryAnnData", "HDF5AnnData")) { - short <- if (backend == "InMemoryAnnData") "InMemory" else "HDF5" + for (backend in c("InMemoryAnnData", "HDF5AnnData", "ZarrAnnData")) { + short <- switch( + backend, + InMemoryAnnData = "InMemory", + HDF5AnnData = "HDF5", 
+ ZarrAnnData = "Zarr" + ) # Open the AnnData - ad <- read_h5ad(path, as = backend) + ad <- if (backend == "ZarrAnnData") { + read_zarr(zarr_paths[["float_csparse"]], as = "ZarrAnnData") + } else { + read_h5ad(path, as = backend) + } # --- Slot getters --- for (slot in .bench_slots) { diff --git a/benchmarks/suites/bench_read.R b/benchmarks/suites/bench_read.R index 605f9f4a..bc6dcc6b 100644 --- a/benchmarks/suites/bench_read.R +++ b/benchmarks/suites/bench_read.R @@ -5,7 +5,7 @@ # across different X matrix types. # ============================================================================= -bench_read <- function(h5ad_paths, iterations, x_types) { +bench_read <- function(h5ad_paths, iterations, x_types, zarr_paths) { results <- list() for (xt in x_types) { @@ -37,5 +37,32 @@ bench_read <- function(h5ad_paths, iterations, x_types) { ) } + # Read from Zarr store + for (xt in x_types) { + path <- zarr_paths[[xt]] + + # Read Zarr → InMemoryAnnData + results <- c( + results, + run_one_benchmark( + name = paste0("read_zarr_InMemory_", xt), + expr = quote(read_zarr(.path, as = "InMemoryAnnData")), + setup = bquote(.path <- .(path)), + iterations = iterations + ) + ) + + # Open Zarr lazily → ZarrAnnData + results <- c( + results, + run_one_benchmark( + name = paste0("read_zarr_Zarr_", xt), + expr = quote(read_zarr(.path, as = "ZarrAnnData")), + setup = bquote(.path <- .(path)), + iterations = iterations + ) + ) + } + results } diff --git a/benchmarks/suites/bench_set.R b/benchmarks/suites/bench_set.R index 8f751ba1..785e8c89 100644 --- a/benchmarks/suites/bench_set.R +++ b/benchmarks/suites/bench_set.R @@ -4,7 +4,7 @@ # Benchmarks setting every AnnData slot on both InMemory and HDF5 backends. 
# =============================================================================
 
-bench_set <- function(h5ad_paths, iterations) {
+bench_set <- function(h5ad_paths, iterations, zarr_paths) {
   results <- list()
   path <- h5ad_paths[["float_csparse"]]
 
@@ -22,12 +22,25 @@ bench_set <- function(h5ad_paths, iterations) {
     "uns"
   )
 
-  for (backend in c("InMemoryAnnData", "HDF5AnnData")) {
-    short <- if (backend == "InMemoryAnnData") "InMemory" else "HDF5"
+  for (backend in c("InMemoryAnnData", "HDF5AnnData", "ZarrAnnData")) {
+    short <- switch(
+      backend,
+      InMemoryAnnData = "InMemory",
+      HDF5AnnData = "HDF5",
+      ZarrAnnData = "Zarr"
+    )
 
     for (slot in slots) {
-      # For HDF5, we need a fresh writable copy for each slot
-      if (backend == "HDF5AnnData") {
+      # Each backend needs a fresh writable instance per slot
+      if (backend == "ZarrAnnData") {
+        # Copy the store into its own temp parent dir so each slot writes to a private copy
+        zarr_path <- zarr_paths[["float_csparse"]]
+        tmp_parent <- tempfile()
+        dir.create(tmp_parent, recursive = TRUE)
+        file.copy(zarr_path, tmp_parent, recursive = TRUE)
+        tmp <- file.path(tmp_parent, basename(zarr_path))
+        ad <- read_zarr(tmp, as = "ZarrAnnData", mode = "r+")
+      } else if (backend == "HDF5AnnData") {
         tmp <- tempfile(fileext = ".h5ad")
         file.copy(path, tmp)
         ad <- suppressWarnings(
@@ -55,7 +68,10 @@ bench_set <- function(h5ad_paths, iterations) {
         )
       )
 
-      if (backend == "HDF5AnnData") {
+      if (backend == "ZarrAnnData") {
+        # Remove the temp parent dir too; unlinking only `tmp` leaves it behind
+        unlink(tmp_parent, recursive = TRUE)
+      } else if (backend == "HDF5AnnData") {
         ad$close()
         unlink(tmp)
       }
diff --git a/benchmarks/suites/bench_subset.R b/benchmarks/suites/bench_subset.R
index f6a8b5f6..ac6935c4 100644
--- a/benchmarks/suites/bench_subset.R
+++ b/benchmarks/suites/bench_subset.R
@@ -5,13 +5,22 @@
 # materialization back to concrete implementations.
# ============================================================================= -bench_subset <- function(h5ad_paths, iterations) { +bench_subset <- function(h5ad_paths, iterations, zarr_paths) { results <- list() path <- h5ad_paths[["float_csparse"]] - for (backend in c("InMemoryAnnData", "HDF5AnnData")) { - short <- if (backend == "InMemoryAnnData") "InMemory" else "HDF5" - ad <- read_h5ad(path, as = backend) + for (backend in c("InMemoryAnnData", "HDF5AnnData", "ZarrAnnData")) { + short <- switch( + backend, + InMemoryAnnData = "InMemory", + HDF5AnnData = "HDF5", + ZarrAnnData = "Zarr" + ) + ad <- if (backend == "ZarrAnnData") { + read_zarr(zarr_paths[["float_csparse"]], as = "ZarrAnnData") + } else { + read_h5ad(path, as = backend) + } n_obs <- ad$n_obs() n_vars <- ad$n_vars() @@ -123,7 +132,22 @@ bench_subset <- function(h5ad_paths, iterations) { ) ) - # Clean up + # --- Materialize view → Zarr --- + results <- c( + results, + run_one_benchmark( + name = paste0("materialize_to_Zarr_", short), + expr = quote({ + .tmp <- tempfile() + .result <- .view$as_ZarrAnnData(.tmp) + unlink(.tmp, recursive = TRUE) + }), + iterations = iterations, + env = env4 + ) + ) + + # Clean up (ZarrAnnData holds no persistent file handles) if (backend == "HDF5AnnData") { ad$close() } diff --git a/benchmarks/suites/bench_write.R b/benchmarks/suites/bench_write.R index 569b0788..e1acfddf 100644 --- a/benchmarks/suites/bench_write.R +++ b/benchmarks/suites/bench_write.R @@ -5,7 +5,7 @@ # with different compression settings and X matrix types. 
# ============================================================================= -bench_write <- function(h5ad_paths, iterations, x_types) { +bench_write <- function(h5ad_paths, iterations, x_types, zarr_paths) { results <- list() compressions <- c("none", "gzip") @@ -57,5 +57,50 @@ bench_write <- function(h5ad_paths, iterations, x_types) { } } + # Write to Zarr store + for (xt in x_types) { + path <- zarr_paths[[xt]] + + for (compression in compressions) { + # Write from InMemoryAnnData → Zarr + env <- new.env(parent = globalenv()) + env$.ad <- read_zarr(path, as = "InMemoryAnnData") + env$.compression <- compression + + results <- c( + results, + run_one_benchmark( + name = paste0("write_zarr_InMemory_", xt, "_", compression), + expr = quote({ + .tmp <- tempfile() + .ad$as_ZarrAnnData(.tmp, compression = .compression) + unlink(.tmp, recursive = TRUE) + }), + iterations = iterations, + env = env + ) + ) + + # Write from ZarrAnnData → Zarr + env2 <- new.env(parent = globalenv()) + env2$.ad <- read_zarr(path, as = "ZarrAnnData") + env2$.compression <- compression + + results <- c( + results, + run_one_benchmark( + name = paste0("write_zarr_Zarr_", xt, "_", compression), + expr = quote({ + .tmp <- tempfile() + .ad$as_ZarrAnnData(.tmp, compression = .compression) + unlink(.tmp, recursive = TRUE) + }), + iterations = iterations, + env = env2 + ) + ) + } + } + results }