anndataR/benchmarks/lib/helpers.R at 05d21263c2fc1f230cebb5604a1c8d1c1e45622c · scverse/anndataR · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
# =============================================================================
# Benchmark helpers: data generation, bench→BMF conversion, JSON output
# =============================================================================

# Load anndataR with internal functions available.
# In development: devtools::load_all() exposes unexported functions.
# In CI: the package is installed, so we use ::: for unexported functions.
#
# Development mode is detected by a DESCRIPTION file in the current working
# directory whose first line mentions "anndataR"; only that first line is
# inspected.
if (
  file.exists("DESCRIPTION") &&
    grepl("anndataR", readLines("DESCRIPTION", n = 1))
) {
  devtools::load_all(".", quiet = TRUE)
} else {
  # No matching DESCRIPTION here: fall back to the installed package.
  library(anndataR)
}
# Matrix is attached unconditionally, in both modes.
library(Matrix)

# ---------------------------------------------------------------------------
# Python dependency check
# ---------------------------------------------------------------------------

#' Verify the R and Python dependencies needed by the benchmarks
#'
#' Checks, in order, that the `reticulate` R package is installed and that
#' the Python modules `anndata` and `dummy_anndata` are available through
#' reticulate. Stops with an installation hint at the first one missing.
#'
#' @return `NULL`, invisibly, when every dependency is available.
check_bench_python_deps <- function() {
  has_reticulate <- requireNamespace("reticulate", quietly = TRUE)
  if (!has_reticulate) {
    stop(
      "reticulate is required for benchmarks.\n",
      "Install with: install.packages('reticulate')"
    )
  }

  # Python module name -> name to pass to py_install(); the two differ for
  # dummy_anndata (underscore in the module, hyphen in the install name).
  install_names <- c(anndata = "anndata", dummy_anndata = "dummy-anndata")
  for (module in names(install_names)) {
    if (reticulate::py_module_available(module)) {
      next
    }
    stop(
      "Python '",
      module,
      "' module is required for benchmarks.\n",
      "Install with: reticulate::py_install('",
      install_names[[module]],
      "')"
    )
  }
}

# ---------------------------------------------------------------------------
# Data generation (via Python dummy_anndata)
# ---------------------------------------------------------------------------

#' Generate and cache an H5AD file with all slots populated
#'
#' Uses Python's dummy_anndata to generate and write the H5AD file,
#' ensuring canonical encoding independent of anndataR's own writer.
#'
#' The cache file name depends on `x_type` only. If a file for that `x_type`
#' already exists it is returned as-is, and `n_obs` / `n_vars` are not
#' consulted; clear `cache_dir` when changing the dataset dimensions.
#'
#' @param x_type Matrix type for X (e.g. "float_csparse").
#'   Must be a key in `dummy_anndata.matrix_generators`.
#' @param n_obs Number of observations
#' @param n_vars Number of variables
#' @param cache_dir Directory to cache generated files. Created if missing.
#' @return Path to the generated H5AD file
generate_bench_h5ad <- function(x_type, n_obs, n_vars, cache_dir) {
  path <- file.path(cache_dir, paste0("bench_", x_type, ".h5ad"))
  if (file.exists(path)) {
    return(path)
  }

  # The writer does not need to create parent directories for us.
  dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE)

  da <- reticulate::import("dummy_anndata", convert = FALSE)

  adata_py <- da$generate_dataset(
    n_obs = as.integer(n_obs),
    n_vars = as.integer(n_vars),
    x_type = x_type,
    layer_types = list("float_csparse", "integer_csparse"),
    obs_types = list("categorical", "dense_array", "string_array"),
    var_types = list("string_array", "boolean_array", "dense_array"),
    obsm_types = list("float_matrix", "float_csparse"),
    varm_types = list("float_matrix"),
    obsp_types = list("float_csparse"),
    varp_types = list("float_csparse"),
    uns_types = list("string", "integer", "float"),
    nested_uns_types = list("string", "float")
  )

  # A write that fails part-way must not leave a file behind: the existence
  # check above would treat it as a valid cache entry on every later call.
  write_ok <- FALSE
  on.exit(
    if (!write_ok) {
      unlink(path)
    },
    add = TRUE
  )
  adata_py$write_h5ad(path)
  write_ok <- TRUE

  path
}

#' Convert an H5AD bench file to a Zarr store and cache it
#'
#' Reads `h5ad_path` with Python's anndata and writes it back out as a Zarr
#' store. The store name depends on `x_type` only; an existing store
#' directory for that `x_type` is returned as-is without reading `h5ad_path`.
#'
#' @param x_type Matrix type key (matches h5ad_paths names)
#' @param h5ad_path Path to the corresponding H5AD file
#' @param cache_dir Directory to cache generated stores. Created if missing.
#' @return Path to the generated Zarr store directory
generate_bench_zarr <- function(x_type, h5ad_path, cache_dir) {
  path <- file.path(cache_dir, paste0("bench_", x_type, ".zarr"))
  if (dir.exists(path)) {
    return(path)
  }

  dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE)

  ad <- reticulate::import("anndata", convert = FALSE)
  adata_py <- ad$read_h5ad(h5ad_path)

  # A store that is only partly written must be removed on failure;
  # otherwise the directory check above would serve it as a cache hit.
  write_ok <- FALSE
  on.exit(
    if (!write_ok) {
      unlink(path, recursive = TRUE)
    },
    add = TRUE
  )
  adata_py$write_zarr(path)
  write_ok <- TRUE

  path
}

# ---------------------------------------------------------------------------
# bench::mark → BMF JSON conversion
# ---------------------------------------------------------------------------

#' Convert a bench::mark result into a BMF JSON entry
#'
#' Latency is reported in nanoseconds: `value` is the median, `lower_value`
#' the minimum and `upper_value` the maximum. The maximum is taken from a
#' `max` column if `bm` has one, otherwise from the per-iteration timings in
#' the `time` list column, and only as a last resort falls back to the
#' median. A `memory` measure is added when `mem_alloc` is present and not
#' `NA`.
#'
#' @param bm A bench::mark result (tibble with one row)
#' @param name Benchmark name
#' @return A named list with BMF structure
bench_to_bmf <- function(bm, name) {
  # bench::mark returns bench_time objects; convert to nanoseconds
  to_ns <- function(x) as.numeric(x, units = "secs") * 1e9

  median_ns <- to_ns(bm$median)
  min_ns <- to_ns(bm$min)

  # bench::mark() has no `max` summary column, so the slowest run has to be
  # recovered from the raw per-iteration timings stored in `time`.
  fields <- names(bm)
  max_ns <- if ("max" %in% fields) {
    to_ns(bm$max)
  } else if (
    "time" %in% fields &&
      length(bm$time) > 0L &&
      length(bm$time[[1L]]) > 0L
  ) {
    max(to_ns(bm$time[[1L]]))
  } else {
    median_ns
  }

  entry <- list(
    latency = list(
      value = median_ns,
      lower_value = min_ns,
      upper_value = max_ns
    )
  )

  # Add memory allocation if available. The length check keeps a missing
  # mem_alloc column (zero-length after conversion) from breaking `if`.
  mem_bytes <- as.numeric(bm$mem_alloc)
  if (length(mem_bytes) == 1L && !is.na(mem_bytes)) {
    entry[["memory"]] <- list(value = mem_bytes)
  }

  stats::setNames(list(entry), name)
}

#' Write accumulated BMF results to a JSON file
#'
#' The parent directory of `path` is created when it does not exist. An
#' empty `results` is written as an empty JSON object (`{}`) so the file
#' keeps the same top-level shape as a non-empty result set.
#'
#' @param results Named list of BMF entries (as returned by bench_to_bmf)
#' @param path Output file path
#' @return `path`, invisibly.
write_bmf_json <- function(results, path) {
  out_dir <- dirname(path)
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
  }

  # An unnamed empty list would serialise as a JSON array; giving it an
  # (empty) names attribute makes jsonlite emit an object instead.
  if (length(results) == 0L) {
    results <- stats::setNames(list(), character(0))
  }

  # After accumulation via c(), results is already a flat named list:
  # list("bench_name1" = list(latency = ..., memory = ...),
  #      "bench_name2" = list(latency = ..., memory = ...), ...)
  # Write it directly as BMF JSON.
  jsonlite::write_json(results, path, auto_unbox = TRUE, pretty = TRUE)
  invisible(path)
}

# ---------------------------------------------------------------------------
# Safe benchmark runner
# ---------------------------------------------------------------------------

#' Time one quoted expression, tolerating failure
#'
#' Evaluates `setup` (when given) once in `env`, then times `expr` in `env`
#' with bench::mark() and converts the result with bench_to_bmf(). Any error
#' raised by the setup, the benchmark or the conversion is reported through
#' message() as a "[SKIP]" line and turned into a `NULL` return.
#'
#' @param name Benchmark name
#' @param expr Expression to benchmark (quoted)
#' @param setup Expression to run before benchmarking (quoted)
#' @param iterations Number of iterations
#' @param env Environment for evaluation
#' @return A BMF entry (list) or NULL if the benchmark failed
run_one_benchmark <- function(
  name,
  expr,
  setup = NULL,
  iterations = 3L,
  env = parent.frame()
) {
  # Report the failure and yield NULL instead of aborting the caller.
  skip_on_error <- function(cond) {
    message("  [SKIP] ", name, ": ", conditionMessage(cond))
    NULL
  }

  tryCatch(
    {
      has_setup <- !is.null(setup)
      if (has_setup) {
        eval(setup, envir = env)
      }
      timing <- bench::mark(
        eval(expr, envir = env),
        iterations = iterations,
        check = FALSE,
        filter_gc = FALSE
      )
      bench_to_bmf(timing, name)
    },
    error = skip_on_error
  )
}

# ---------------------------------------------------------------------------
# CLI helpers
# ---------------------------------------------------------------------------

#' Parse command-line arguments for the benchmark runner
#'
#' Recognised options are `--n-obs`, `--n-vars`, `--iterations` / `-n`,
#' `--suite` / `-s`, `--output` / `-o` and `--help` / `-h`. Each option other
#' than help consumes the following argument as its value. Unknown arguments
#' produce a warning and are skipped. `--help` prints usage and quits the R
#' session with status 0.
#'
#' @param args Character vector of arguments to parse. Defaults to the
#'   trailing command-line arguments of the current R process.
#' @return A list with elements `n_obs`, `n_vars`, `iterations` (integers),
#'   `suite` and `output` (strings), holding defaults for anything not given.
parse_bench_args <- function(args = commandArgs(trailingOnly = TRUE)) {
  opts <- list(
    n_obs = 2000L,
    n_vars = 1000L,
    iterations = 3L,
    suite = "all",
    output = "benchmarks/results.json"
  )

  n_args <- length(args)

  # Value following the flag at `pos`; a flag given last has no value and
  # would otherwise silently become NA.
  value_after <- function(pos) {
    if (pos >= n_args) {
      stop("Missing value for argument: ", args[pos], call. = FALSE)
    }
    args[pos + 1L]
  }

  # Same, but the value must parse as an integer instead of turning into NA.
  integer_after <- function(pos) {
    raw <- value_after(pos)
    parsed <- suppressWarnings(as.integer(raw))
    if (is.na(parsed)) {
      stop(
        "Expected an integer for ",
        args[pos],
        ", got: ",
        raw,
        call. = FALSE
      )
    }
    parsed
  }

  i <- 1L
  while (i <= n_args) {
    switch(
      args[i],
      "--n-obs" = {
        opts$n_obs <- integer_after(i)
        i <- i + 1L
      },
      "--n-vars" = {
        opts$n_vars <- integer_after(i)
        i <- i + 1L
      },
      "--iterations" = ,
      "-n" = {
        opts$iterations <- integer_after(i)
        i <- i + 1L
      },
      "--suite" = ,
      "-s" = {
        opts$suite <- value_after(i)
        i <- i + 1L
      },
      "--output" = ,
      "-o" = {
        opts$output <- value_after(i)
        i <- i + 1L
      },
      "--help" = ,
      "-h" = {
        cat(
          "Usage: Rscript benchmarks/run_benchmarks.R [OPTIONS]\n",
          "\n",
          "Options:\n",
          "  --n-obs N        Number of observations (default: 2000)\n",
          "  --n-vars N       Number of variables (default: 1000)\n",
          "  --iterations N   Iterations per benchmark (default: 3)\n",
          "  --suite NAME     Suite to run: all, read, write, get, set,\n",
          "                   convert, subset (default: all)\n",
          "  --output FILE    Output JSON path (default: benchmarks/results.json)\n",
          "  --help           Show this help\n",
          sep = ""
        )
        quit(status = 0)
      },
      {
        warning("Unknown argument: ", args[i])
      }
    )
    # Step past the flag itself (its value, if any, was skipped above).
    i <- i + 1L
  }
  opts
}