-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathChromBackendSpectra.R
More file actions
458 lines (432 loc) · 18.6 KB
/
ChromBackendSpectra.R
File metadata and controls
458 lines (432 loc) · 18.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
#' @include helpers.R
#' @include hidden_aliases.R
#' @include ChromBackend.R
NULL
#' @title Chromatographic Data Backend for Spectra Objects
#'
#' @name ChromBackendSpectra
#'
#' @description
#' The `ChromBackendSpectra` class extends `ChromBackendMemory`, inheriting
#' all its slots and methods while providing additional functionality for
#' summarizing chromatographic data from [Spectra::Spectra()] objects.
#'
#' It can be initialized with a `Spectra` object, which is stored in the
#' `spectra` slot of the backend. Users can also provide a `data.frame`
#' containing chromatographic metadata, stored in `chromData`. This metadata
#' filters the `Spectra` object and generates `peaksData`. If `chromData` is
#' not provided, a default `data.frame` is created from the `Spectra` data.
#' An "rtMin", "rtMax", "mzMin", and "mzMax" column will be created by
#' condensing the `Spectra` data corresponding to each unique combination of
#' the `factorize.by` variables.
#'
#' By "factorization" we mean the process of grouping spectra
#' into chromatograms based on specified variables. For example, using
#' `factorize.by = c("msLevel", "dataOrigin")` means that all MS1 spectra from
#' file "A" form one chromatogram, all MS2 spectra from file "A" form another,
#' and so on. Each unique combination of the factorization variables creates
#' a separate chromatogram. This is essential for organizing spectral data into
#' meaningful chromatographic traces that can be visualized and analyzed.
#'
#' The *dataOrigin* core chromatogram variable should reflect the *dataOrigin*
#' of the `Spectra` object. The `factorize.by` parameter defines the variables
#' for grouping `Spectra` data into chromatographic data. The default is
#' `c("msLevel", "dataOrigin")`, which will define separate chromatograms for
#' each combination of `msLevel` and `dataOrigin`. These variables must be in
#' both `Spectra` and `chromData` (if provided).
#'
#' The `summarize.method` parameter defines how spectral data intensity is
#' summarized:
#' - **"sum"**: Sums intensity to create a Total Ion Chromatogram (TIC).
#' - **"max"**: Takes max intensity for a Base Peak Chromatogram (BPC).
#'
#' If `chromData` or its factorization columns are modified, the `factorize()`
#' method must be called to update `chromSpectraIndex`.
#'
#' @details
#' No `peaksData` is stored until the user calls a function that generates it
#' (e.g., `rtime()`, `peaksData()`, `intensity()`). The `peaksData` slot
#' replacement is unsupported — modifications are temporary to optimize memory.
#' The `inMemory` slot indicates this with `TRUE`.
#'
#' **Spectra Sort Index**: The `ChromBackendSpectra` backend maintains a
#' `spectraSortIndex` slot that stores a sort order for the internal `Spectra`
#' object based on `dataOrigin` and `rtime`. To optimize performance, the sort
#' index is only computed and stored when the spectra are unsorted; if already
#' sorted (which is typical for most real-world data), `spectraSortIndex` remains
#' empty (`integer()`). This avoids unnecessary subsetting operations. The sort
#' index is automatically recalculated whenever the `factorize()` method is called,
#' ensuring it remains valid and consistent. This approach avoids the need to
#' physically reorder disk-backed `Spectra` objects, which would require loading
#' all data into memory.
#'
#' **Factorize and Subsetting**: The `factorize()` method updates the
#' `chromSpectraIndex` in both `chromData` and the `spectra` object to reflect
#' the current grouping, and recalculates `spectraSortIndex` to maintain the
#' correct sort order. The `[` subsetting operator properly handles subsetting
#' of both `chromData`, `peaksData`, and `spectra`, while updating the
#' `spectraSortIndex` to reference valid positions in the subsetted data.
#'
#' `ChromBackendSpectra` should reuse `ChromBackendMemory` methods whenever
#' possible to keep implementations simple.
#'
#' @param chromData A `data.frame` with chromatographic data for use in
#' `backendInitialize()`. If missing, a default is generated. Columns
#' like `rtMin`, `rtMax`, `mzMin`, and `mzMax` must be provided and not
#' contain `NA` values. Use `-Inf/Inf` for unspecified values. The
#' `"dataOrigin"` column must match the `Spectra` object's
#' `"dataOrigin"`.
#'
#' @param factorize.by A `character` vector of variables for grouping `Spectra`
#' data into chromatographic data (i.e., creating separate chromatograms
#' for each unique combination of these variables).
#' Default: `c("msLevel", "dataOrigin")`, which creates one chromatogram
#' per MS level per data file.
#' If `chromData` is provided, it must contain these columns.
#'
#' @param object A `ChromBackendSpectra` object.
#'
#' @param spectra A `Spectra` object.
#'
#' @param spectraVariables A `character` vector specifying which variables
#' from the `Spectra` object should be added to the chromData. These
#' will be mapped using the `chromSpectraIndex` variable.
#'
#' @param summarize.method A `character` string specifying intensity summary:
#' `"sum"` (default) or `"max"`.
#'
#' @param ... Additional parameters.
#'
#' @author Philippine Louail, Johannes Rainer.
#'
#' @exportClass ChromBackendSpectra
#'
#' @return Refer to the individual function description for information on the
#' return value.
#'
#' @importClassesFrom Spectra Spectra
#' @importFrom Spectra Spectra spectraVariables spectraData concatenateSpectra
#'
#' @examples
#' library(Spectra)
#' library(MsBackendMetaboLights)
#'
#' ## Get Spectra data from MetaboLights
#' be <- backendInitialize(MsBackendMetaboLights(),
#' mtblsId = "MTBLS39",
#' filePattern = c("63B.cdf")
#' )
#' s <- Spectra(be)
#'
#' s <- setBackend(s, MsBackendMemory())
#'
#' ## Initialize ChromBackendSpectra
#' be_empty <- new("ChromBackendSpectra")
#' be <- backendInitialize(be_empty, s)
#'
#' ## replace the msLevel data
#' msLevel(be) <- c(1L, 2L, 3L)
#'
#' ## re-factorize the data
#' be <- factorize(be)
#'
#' ## Create BPC : we summarize the intensity present in the Spectra object
#' ## by the maximum value, thus creating a Base Peak Chromatogram.
#' be <- backendInitialize(be_empty, s, summarize.method = "max")
#'
#' ## Can now see the details of this bpc by looking at the chromData of our
#' ## object
#' chromData(be)
#'
#' ## Another possibilities is to create eics from the Spectra object.
#' ## Here we create an EIC with a specific m/z and retention time window.
#' df <- data.frame(mzMin = 100.01, mzMax = 100.02 , rtMin = 50, rtMax = 100)
#' be <- backendInitialize(be_empty, s, summarize.method = "sum")
#' chromData(be) <- cbind(chromData(be), df)
#'
#' ## now when we call the peaksData function, we will get the intensity
#' ## of the spectra object that are in the m/z and retention time window
#' ## defined in the chromData.
#' peaksData(be)
#'
NULL
#' @noRd
ChromBackendSpectra <- setClass(
"ChromBackendSpectra",
contains = "ChromBackendMemory",
slots = c(
inMemory = "logical",
spectra = "Spectra",
summaryFun = "function",
spectraSortIndex = "integer"
),
prototype = prototype(
chromData = fillCoreChromVariables(data.frame()),
peaksData = list(.EMPTY_PEAKS_DATA),
readonly = TRUE,
spectra = Spectra(),
version = "0.1",
inMemory = FALSE,
summaryFun = sumi,
spectraSortIndex = integer()
)
)
#' @rdname ChromBackendSpectra
#' @importFrom methods new
#' @export ChromBackendSpectra
ChromBackendSpectra <- function() {
new("ChromBackendSpectra")
}
#' @rdname ChromBackendSpectra
#' @importFrom methods callNextMethod
#' @importFrom MsCoreUtils rbindFill sumi maxi
setMethod("backendInitialize", "ChromBackendSpectra",
function(object, spectra = Spectra(),
factorize.by = c("msLevel" , "dataOrigin"),
summarize.method = c("sum", "max"),
chromData = fillCoreChromVariables(),
spectraVariables = character(),
...) {
summarize.method <- match.arg(summarize.method)
object@summaryFun <- if (summarize.method == "sum") sumi else maxi
if (!is(spectra, "Spectra"))
stop("'spectra' must be a 'Spectra' object.")
if (!length(spectra)) return(object)
if (!all(factorize.by %in% spectraVariables(spectra)))
stop("All 'factorize.by' variables must exist in 'spectra'.")
if (!is.data.frame(chromData))
stop("'chromData' must be a 'data.frame'.")
if(!nrow(chromData))
chromData <- fillCoreChromVariables(data.frame())
else validChromData(chromData)
if (!all(factorize.by %in% colnames(chromData)))
stop("All 'factorize.by' variables must exist ",
"in 'chromData'. If no chromData was provided, ",
"it needs to be part of the `coreChromVariables()` ",
"available.")
## Spectra object are not expected to be ordered by rtime,
## so we store a sort index instead of concatenating.
## This allows us to keep disk-backed backends intact.
## Only store sort index if data is actually unsorted (optimization).
sort_idx <- order(
spectra$dataOrigin,
spectra$rtime
)
if (!identical(sort_idx, seq_along(spectra))) {
object@spectraSortIndex <- sort_idx
}
object@chromData <- chromData
object@spectra <- spectra
object <- factorize(object, factorize.by = factorize.by)
## map additional spectraVariables if any
if (length(spectraVariables)) {
object <- .map_spectra_vars(object,
spectraVariables = spectraVariables)
}
callNextMethod(object, chromData = .chromData(object))
}
)
#' @rdname hidden_aliases
#' @importFrom methods callNextMethod
setMethod("show", "ChromBackendSpectra", function(object) {
callNextMethod()
cat("\nThe Spectra object contains", length(object@spectra), "spectra\n")
if (.inMemory(object)) cat("\nPeaks data is cached in memory\n")
})
#' @rdname ChromBackendSpectra
#' @export
chromSpectraIndex <- function(object) {
if (!is(object, "ChromBackendSpectra"))
stop("The object must be a 'ChromBackendSpectra' object.")
cd <- chromData(object, columns = "chromSpectraIndex", drop = TRUE)
if (!is.factor(cd))
cd <- factor(cd)
cd <- droplevels(cd)
cd
}
#' @rdname hidden_aliases
setMethod("factorize", "ChromBackendSpectra",
function(object, factorize.by = c("msLevel", "dataOrigin"),...) {
if (!all(factorize.by %in%
spectraVariables(.spectra(object))))
stop("All 'factorize.by' variables must be in the ",
"Spectra object.")
spectra_f <- interaction(as.list(
spectraData(.spectra(object))[,
factorize.by, drop = FALSE]),
drop = TRUE, sep = "_")
cd <- .chromData(object)
if (nrow(cd)) {
## chromData exists: validate and align spectra to it
if (!all(factorize.by %in% chromVariables(object)))
stop("All 'factorize.by' variables must be in chromData.")
cd$chromSpectraIndex <- interaction(cd[, factorize.by,
drop = FALSE],
drop = TRUE, sep = "_")
object@spectra$chromSpectraIndex <- factor(as.character(spectra_f),
levels = levels(cd$chromSpectraIndex))
## Apply sort index for processing if needed
if (length(object@spectraSortIndex)) {
sorted_spectra <- .spectra(object)[object@spectraSortIndex]
sorted_spectra_f <- spectra_f[object@spectraSortIndex]
} else {
sorted_spectra <- .spectra(object)
sorted_spectra_f <- spectra_f
}
object@chromData <- .ensure_rt_mz_columns(cd,
sorted_spectra,
sorted_spectra_f)
} else {
## chromData is empty: create it from spectra
object@spectra$chromSpectraIndex <- spectra_f
full_sp <- do.call(rbindFill,
lapply(split(.spectra(object), spectra_f),
.spectra_format_chromData))
rownames(full_sp) <- NULL
object@chromData <- full_sp
}
## Recalculate sort index: only store if data is unsorted (optimization)
sort_idx <- order(
object@spectra$dataOrigin,
object@spectra$rtime
)
if (!identical(sort_idx, seq_along(object@spectra))) {
object@spectraSortIndex <- sort_idx
} else {
object@spectraSortIndex <- integer()
}
object
})
#' @rdname hidden_aliases
#' @importMethodsFrom ProtGenerics backendParallelFactor
setMethod("backendParallelFactor", "ChromBackendSpectra", function(object, ...)
factor()
)
#' @rdname hidden_aliases
#' @export
setMethod("isReadOnly", "ChromBackendSpectra", function(object) TRUE)
#' @rdname hidden_aliases
setMethod(
"peaksData", "ChromBackendSpectra",
function(object, columns = peaksVariables(object),
drop = FALSE, ...) {
if (.inMemory(object) || !length(object)) {
return(callNextMethod())
}
## Ensure chromSpectraIndex only contains relevant levels needed
valid_f <- chromSpectraIndex(object)
## Apply the sort index to spectra for processing (only if unsorted)
if (length(object@spectraSortIndex)) {
sorted_spectra <- .spectra(object)[object@spectraSortIndex]
} else {
sorted_spectra <- .spectra(object)
}
current_vals <- as.character(sorted_spectra$chromSpectraIndex)
if (!setequal(unique(current_vals), levels(valid_f))) {
sorted_spectra$chromSpectraIndex <- factor(
current_vals,
levels = levels(valid_f)
)
}
## Process peaks data
pd <- mapply(.process_peaks_data,
cd = split(chromData(object), valid_f),
s = split(
sorted_spectra,
sorted_spectra$chromSpectraIndex
),
MoreArgs = list(
columns = columns,
fun = object@summaryFun,
drop = drop
),
SIMPLIFY = FALSE
)
unlist(pd, use.names = FALSE, recursive = FALSE)
}
)
#' @rdname hidden_aliases
setReplaceMethod("peaksData", "ChromBackendSpectra", function(object, value) {
message(
"The `peaksData` slot will be modified but the changes will not",
" affect the Spectra object."
)
object <- callNextMethod()
object@inMemory <- TRUE
object
})
#' @rdname hidden_aliases
#' @export
setMethod(
"supportsSetBackend", "ChromBackendSpectra",
function(object, ...) FALSE
)
#' @rdname hidden_aliases
#' @importMethodsFrom S4Vectors [ [[
#' @importFrom MsCoreUtils i2index
#' @importFrom stats setNames
#' @export
setMethod("[", "ChromBackendSpectra", function(x, i, j, ...) {
if (!length(i))
return(ChromBackendSpectra())
i <- i2index(i, length = length(x))
kept_indices <- chromSpectraIndex(x)[i]
x@chromData <- .chromData(x)[i, , drop = FALSE]
x@peaksData <- .peaksData(x)[i]
spectra_keep <- x@spectra$chromSpectraIndex %in% kept_indices
x@spectra <- x@spectra[spectra_keep]
## Update spectraSortIndex: remap old positions to new positions
if (length(x@spectraSortIndex)) {
old_positions_kept <- which(spectra_keep)
## Create mapping from old position to new position
## e.g., if we kept positions c(2, 5, 7), they become c(1, 2, 3)
position_mapping <- setNames(seq_along(old_positions_kept),
old_positions_kept)
## Keep only sort indices that reference kept positions
kept_sort_positions <- x@spectraSortIndex %in% old_positions_kept
x@spectraSortIndex <- as.integer(
position_mapping[as.character(x@spectraSortIndex[kept_sort_positions])]
)
}
x@chromData$chromSpectraIndex <- droplevels(x@chromData$chromSpectraIndex)
x@spectra$chromSpectraIndex <- droplevels(x@spectra$chromSpectraIndex)
x
})
#' @rdname hidden_aliases
setMethod("chromExtract", "ChromBackendSpectra",
function(object, peak.table, by, ...) {
required_cols <- c("rtMin", "rtMax", "mzMin", "mzMax", by)
.validate_chromExtract_input(
object = object,
peak.table = peak.table,
by = by, required_cols = required_cols
)
matched <- .match_chromdata_peaktable(
object = object,
peak.table = peak.table,
by = by
)
cd <- .chromData(matched$object)
chrom_keys <- matched$chrom_keys
peak_keys <- matched$peak_keys
cd_split <- split(cd, chrom_keys) ## UT need to check that
pk_split <- split(peak.table, peak_keys)
## Check overlapping columns
overl_cols <- .check_overl_columns(peak.table = peak.table,
object = object,
required_cols = required_cols)
## Merge peak.table into chromData safely.
new_cdata <- mapply(function(cd_row, pks) { ## could switch to bpmapply ?
d <- suppressWarnings(cbind(cd_row, pks[!overl_cols]))
d[, names(peak.table)[overl_cols]] <- pks[, overl_cols]
d
}, cd_row = cd_split,
pks = pk_split, SIMPLIFY = FALSE)
new_cdata <- do.call(rbind, new_cdata)
rownames(new_cdata) <- NULL
object@chromData <- new_cdata
object@peaksData <- replicate(nrow(new_cdata),
.EMPTY_PEAKS_DATA,
simplify = FALSE)
object
})