rformassspectrometry · philouail · Jan 22, 2026 · Oct 29, 2025 · Oct 29, 2025 · Jan 13, 2026
diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml
@@ -53,7 +53,7 @@ jobs:
       fail-fast: false
       matrix:
         config:
-          - { os: ubuntu-latest, r: 'devel', bioc: '3.22', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
+          - { os: ubuntu-latest, r: 'devel', bioc: '3.23', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
           - { os: macOS-latest, r: 'latest', bioc: '3.22'}
           - { os: windows-latest, r: 'latest', bioc: '3.22'}
     env:

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: Chromatograms
 Title: Infrastructure for Chromatographic Mass Spectrometry Data
-Version: 0.99.7
+Version: 1.1.1
 Description: The Chromatograms packages defines an efficient infrastructure
    for storing and handling of chromatographic mass spectrometry data. It
    provides different implementations of *backends* to store and represent the
@@ -41,6 +41,7 @@ Suggests:
     mzR (>= 2.41.4),
     MsBackendMetaboLights (>= 1.3.1),
     vdiffr,
+    IRanges,
     RColorBrewer
 License: Artistic-2.0
 Encoding: UTF-8

diff --git a/NAMESPACE b/NAMESPACE
@@ -3,6 +3,7 @@
 export(ChromBackendMemory)
 export(ChromBackendMzR)
 export(ChromBackendSpectra)
+export(chromSpectraIndex)
 export(coreChromVariables)
 export(corePeaksVariables)
 export(fillCoreChromVariables)
@@ -123,6 +124,7 @@ importFrom(stats,filter)
 importFrom(stats,loess)
 importFrom(stats,predict)
 importFrom(stats,sd)
+importFrom(stats,setNames)
 importFrom(stats,spline)
 importFrom(utils,capture.output)
 importFrom(utils,head)

diff --git a/NEWS.md b/NEWS.md
@@ -1,4 +1,16 @@
-# Version 0.99.7
+# Version 1.1
+
+## Changes in 1.1.1
+
+- Aligned the package with the Bioconductor 3.22 release.
+- Expanded the vignette to cover ChromBackendSpectra usage, chromatogram
+  extraction with `chromExtract()`, and imputation workflows via
+  `imputePeaksData()`.
+- Added a `spectraSortIndex` slot to `ChromBackendSpectra` that stores the
+  `dataOrigin`/retention-time order of the spectra, so that disk-backed
+  `Spectra` objects no longer need to be loaded into memory and reordered.
+
+# Version 0.99
 
 ## Changes in 0.99.7
 

diff --git a/R/ChromBackendMemory.R b/R/ChromBackendMemory.R
@@ -41,11 +41,7 @@ NULL
 #'
 #' @examples
 #'
-#' ## Create a ChromBackendMemory object
-#' cbm <- ChromBackendMemory()
-#'
-#' ## Initialize the ChromBackendMemory object with a data.frame of
-#' ## chromatographic data  and a list of data.frame of peaks data
+#' ## Method 1: Initialize backend directly
 #' cdata <- data.frame(
 #'     msLevel = c(1L, 1L, 1L),
 #'     mz = c(112.2, 123.3, 134.4),
@@ -67,9 +63,14 @@ NULL
 #'     )
 #' )
 #'
+#' cbm <- ChromBackendMemory()
 #' cbm <- backendInitialize(cbm, chromData = cdata, peaksData = pdata)
 #' cbm
 #'
+#' ## Method 2: Use Chromatograms constructor (recommended)
+#' chr <- Chromatograms(ChromBackendMemory(), chromData = cdata, peaksData = pdata)
+#' chr
+#'
 NULL
 
 #' @noRd

diff --git a/R/ChromBackendMzR.R b/R/ChromBackendMzR.R
@@ -24,7 +24,7 @@ NULL
 #'
 #' Implementing functionalities with the `ChromBackendMzR` backend should be
 #' simplified as much as possible and reuse the methods already implemented for
-#' `ChromBackendMemory` when possible.
+#' `ChromBackendMemory` when possible. 
 #'
 #' @param BPPARAM Parallel setup configuration. See [BiocParallel::bpparam()]
 #'        for more information.

diff --git a/R/ChromBackendSpectra.R b/R/ChromBackendSpectra.R
@@ -21,6 +21,14 @@ NULL
 #' condensing the `Spectra` data corresponding to each unique combination of
 #' the `factorize.by` variables.
 #'
+#' By "factorization" we mean the process of grouping spectra
+#' into chromatograms based on specified variables. For example, using
+#' `factorize.by = c("msLevel", "dataOrigin")` means that all MS1 spectra from
+#' file "A" form one chromatogram, all MS2 spectra from file "A" form another,
+#' and so on. Each unique combination of the factorization variables creates
+#' a separate chromatogram. This is essential for organizing spectral data into
+#' meaningful chromatographic traces that can be visualized and analyzed.
+#'
 #' The *dataOrigin* core chromatogram variable should reflect the *dataOrigin*
 #' of the `Spectra` object. The `factorize.by` parameter defines the variables
 #' for grouping `Spectra` data into chromatographic data. The default is
@@ -42,6 +50,24 @@ NULL
 #' replacement is unsupported — modifications are temporary to optimize memory.
 #' The `inMemory` slot indicates this with `TRUE`.
 #'
+#' **Spectra Sort Index**: The `ChromBackendSpectra` backend maintains a
+#' `spectraSortIndex` slot that stores a sort order for the internal `Spectra`
+#' object based on `dataOrigin` and `rtime`. To optimize performance, the sort
+#' index is only stored when the spectra are not already in that order; if they
+#' are (which is typical for most real-world data), `spectraSortIndex` remains
+#' empty (`integer()`). This avoids unnecessary subsetting operations. The sort
+#' index is automatically recalculated whenever the `factorize()` method is called,
+#' ensuring it remains valid and consistent. This approach avoids the need to
+#' physically reorder disk-backed `Spectra` objects, which would require loading
+#' all data into memory.
+#'
+#' **Factorize and Subsetting**: The `factorize()` method updates the
+#' `chromSpectraIndex` in both `chromData` and the `spectra` object to reflect
+#' the current grouping, and recalculates `spectraSortIndex` to maintain the
+#' correct sort order. The `[` subsetting operator properly handles subsetting
+#' of `chromData`, `peaksData`, and `spectra`, while updating the
+#' `spectraSortIndex` to reference valid positions in the subsetted data.
+#'
 #' `ChromBackendSpectra` should reuse `ChromBackendMemory` methods whenever
 #' possible to keep implementations simple.
 #'
@@ -53,8 +79,10 @@ NULL
 #'        `"dataOrigin"`.
 #'
 #' @param factorize.by A `character` vector of variables for grouping `Spectra`
-#'        data into chromatographic data.
-#'        Default: `c("msLevel", "dataOrigin")`.
+#'        data into chromatographic data (i.e., creating separate chromatograms
+#'        for each unique combination of these variables).
+#'        Default: `c("msLevel", "dataOrigin")`, which creates one chromatogram
+#'        per MS level per data file.
 #'        If `chromData` is provided, it must contain these columns.
 #'
 #' @param object A `ChromBackendSpectra` object.
@@ -131,7 +159,8 @@ ChromBackendSpectra <- setClass(
     slots = c(
         inMemory = "logical",
         spectra = "Spectra",
-        summaryFun = "function"
+        summaryFun = "function",
+        spectraSortIndex = "integer"
     ),
     prototype = prototype(
         chromData = fillCoreChromVariables(data.frame()),
@@ -140,7 +169,8 @@ ChromBackendSpectra <- setClass(
         spectra = Spectra(),
         version = "0.1",
         inMemory = FALSE,
-        summaryFun = sumi
+        summaryFun = sumi,
+        spectraSortIndex = integer()
     )
 )
 
@@ -166,12 +196,10 @@ setMethod("backendInitialize", "ChromBackendSpectra",
               if (!is(spectra, "Spectra"))
                   stop("'spectra' must be a 'Spectra' object.")
               if (!length(spectra)) return(object)
-
               if (!all(factorize.by %in% spectraVariables(spectra)))
                   stop("All 'factorize.by' variables must exist in 'spectra'.")
               if (!is.data.frame(chromData))
                   stop("'chromData' must be a 'data.frame'.")
-
               if(!nrow(chromData))
                   chromData <- fillCoreChromVariables(data.frame())
               else  validChromData(chromData)
@@ -181,17 +209,19 @@ setMethod("backendInitialize", "ChromBackendSpectra",
                        "it needs to be part of the `coreChromVariables()` ",
                        "available.")
               ## Spectra object are not expected to be ordered by rtime,
-              ## so we fix that below.
-              spectra <- lapply(split(spectra, spectra$dataOrigin),
-                                function(x) {
-                  x[order(x$rtime)]
-              })
-              spectra <- concatenateSpectra(spectra)
+              ## so we store a sort index instead of concatenating.
+              ## This allows us to keep disk-backed backends intact.
+              ## Only store sort index if data is actually unsorted (optimization).
+              sort_idx <- order(
+                  spectra$dataOrigin,
+                  spectra$rtime
+              )
+              if (!identical(sort_idx, seq_along(spectra))) {
+                  object@spectraSortIndex <- sort_idx
+              }
               object@chromData <- chromData
               object@spectra <- spectra
-
               object <- factorize(object, factorize.by = factorize.by)
-
               ## map additional spectraVariables if any
               if (length(spectraVariables)) {
                   object <- .map_spectra_vars(object,
@@ -210,7 +240,7 @@ setMethod("show", "ChromBackendSpectra", function(object) {
 })
 
 #' @rdname ChromBackendSpectra
-#' @note ensure that it returns a factor
+#' @export
 chromSpectraIndex <- function(object) {
     if (!is(object, "ChromBackendSpectra"))
         stop("The object must be a 'ChromBackendSpectra' object.")
@@ -228,32 +258,52 @@ setMethod("factorize", "ChromBackendSpectra",
                      spectraVariables(.spectra(object))))
                   stop("All 'factorize.by' variables must be in the ",
                        "Spectra object.")
-           spectra_f <- interaction(as.list(
+            spectra_f <- interaction(as.list(
                spectraData(.spectra(object))[,
-                                                    factorize.by, drop = FALSE]),
+                                            factorize.by, drop = FALSE]),
                drop = TRUE, sep = "_")
-
-           cd <- .chromData(object)
-          if (nrow(cd)) {
-              if (!all(factorize.by %in% chromVariables(object)))
-                  stop("All 'factorize.by' variables must be in chromData.")
-              cd$chromSpectraIndex <- interaction(cd[, factorize.by,
-                                                     drop = FALSE],
-                                                  drop = TRUE, sep = "_")
-              levels(spectra_f) <- levels(cd$chromSpectraIndex)
-              object@spectra$chromSpectraIndex <- droplevels(spectra_f)
-              object@chromData <- .ensure_rt_mz_columns(cd,
-                                                        .spectra(object),
-                                                        spectra_f)
-          } else {
-              object@spectra$chromSpectraIndex <- spectra_f
-              full_sp <- do.call(rbindFill,
-                                 lapply(split(.spectra(object), spectra_f),
-                                        .spectra_format_chromData))
-              rownames(full_sp) <- NULL
-              object@chromData <- full_sp
-              }
-          object
+            cd <- .chromData(object)
+
+            if (nrow(cd)) {
+                ## chromData exists: validate and align spectra to it
+                if (!all(factorize.by %in% chromVariables(object)))
+                    stop("All 'factorize.by' variables must be in chromData.")
+                cd$chromSpectraIndex <- interaction(cd[, factorize.by,
+                                                        drop = FALSE],
+                                                     drop = TRUE, sep = "_")
+                object@spectra$chromSpectraIndex <- factor(as.character(spectra_f),
+                                                           levels = levels(cd$chromSpectraIndex))
+                ## Apply sort index for processing if needed
+                if (length(object@spectraSortIndex)) {
+                    sorted_spectra <- .spectra(object)[object@spectraSortIndex]
+                    sorted_spectra_f <- spectra_f[object@spectraSortIndex]
+                } else {
+                    sorted_spectra <- .spectra(object)
+                    sorted_spectra_f <- spectra_f
+                }
+                object@chromData <- .ensure_rt_mz_columns(cd,
+                                                          sorted_spectra,
+                                                          sorted_spectra_f)
+            } else {
+                ## chromData is empty: create it from spectra
+                object@spectra$chromSpectraIndex <- spectra_f
+                full_sp <- do.call(rbindFill,
+                                   lapply(split(.spectra(object), spectra_f),
+                                          .spectra_format_chromData))
+                rownames(full_sp) <- NULL
+                object@chromData <- full_sp
+            }
+            ## Recalculate sort index: only store if data is unsorted (optimization)
+            sort_idx <- order(
+                object@spectra$dataOrigin,
+                object@spectra$rtime
+            )
+            if (!identical(sort_idx, seq_along(object@spectra))) {
+                object@spectraSortIndex <- sort_idx
+            } else {
+                object@spectraSortIndex <- integer()
+            }
+            object
           })
 
 #' @rdname hidden_aliases
@@ -276,9 +326,15 @@ setMethod(
         }
         ## Ensure chromSpectraIndex only contains relevant levels needed
         valid_f <- chromSpectraIndex(object)
-        current_vals <- as.character(.spectra(object)$chromSpectraIndex)
+        ## Apply the sort index to spectra for processing (only if unsorted)
+        if (length(object@spectraSortIndex)) {
+            sorted_spectra <- .spectra(object)[object@spectraSortIndex]
+        } else {
+            sorted_spectra <- .spectra(object)
+        }
+        current_vals <- as.character(sorted_spectra$chromSpectraIndex)
         if (!setequal(unique(current_vals), levels(valid_f))) {
-            object@spectra$chromSpectraIndex <- factor(
+            sorted_spectra$chromSpectraIndex <- factor(
                 current_vals,
                 levels = levels(valid_f)
             )
@@ -287,8 +343,8 @@ setMethod(
         pd <- mapply(.process_peaks_data,
             cd = split(chromData(object), valid_f),
             s = split(
-                .spectra(object),
-                .spectra(object)$chromSpectraIndex
+                sorted_spectra,
+                sorted_spectra$chromSpectraIndex
             ),
             MoreArgs = list(
                 columns = columns,
@@ -323,11 +379,37 @@ setMethod(
 
 #' @rdname hidden_aliases
 #' @importMethodsFrom S4Vectors [ [[
+#' @importFrom MsCoreUtils i2index
+#' @importFrom stats setNames
 #' @export
 setMethod("[", "ChromBackendSpectra", function(x, i, j, ...) {
+    ## Subset chromData and peaksData by `i`; keep only the linked spectra.
     if (!length(i))
         return(ChromBackendSpectra())
-    callNextMethod()
+
+    i <- i2index(i, length = length(x))
+    kept_indices <- chromSpectraIndex(x)[i]
+    x@chromData <- .chromData(x)[i, , drop = FALSE]
+    x@peaksData <- .peaksData(x)[i]
+    spectra_keep <- x@spectra$chromSpectraIndex %in% kept_indices
+    x@spectra <- x@spectra[spectra_keep]
+
+    ## spectraSortIndex refers to pre-subset positions: renumber what is left.
+    if (length(x@spectraSortIndex)) {
+        old_positions_kept <- which(spectra_keep)
+        ## Lookup named by old position, e.g. kept c(2, 5, 7) -> c(1, 2, 3).
+        position_mapping <- setNames(seq_along(old_positions_kept), 
+                                     old_positions_kept)
+        ## Drop sort entries of removed spectra; the rest keep their order.
+        kept_sort_positions <- x@spectraSortIndex %in% old_positions_kept
+        x@spectraSortIndex <- as.integer(
+            position_mapping[as.character(x@spectraSortIndex[kept_sort_positions])]
+        )
+    }
+
+    x@chromData$chromSpectraIndex <- droplevels(x@chromData$chromSpectraIndex)
+    x@spectra$chromSpectraIndex <- droplevels(x@spectra$chromSpectraIndex)
+    x
 })
 
 #' @rdname hidden_aliases