From 5fe5b8979b96246c0e6903627ed7436883930670 Mon Sep 17 00:00:00 2001 From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com> Date: Thu, 13 Apr 2023 18:07:02 +0200 Subject: [PATCH 01/24] updates for Vizgen --- R/preprocessing.R | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/R/preprocessing.R b/R/preprocessing.R index 45baa4254..63a6e6cc7 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -1,5 +1,7 @@ #' @include generics.R #' @importFrom progressr progressor +#' @import dplyr +#' @importFrom magrittr %>% %<>% #' NULL @@ -2393,6 +2395,21 @@ ReadVitessce <- function( #' \item \dQuote{fov}: cell's fov #' } #' @param z Z-index to load; must be between 0 and 6, inclusive +#' @param use.BiocParallel If to use \code{BiocParallel::bplapply()}, +#' default is \code{TRUE}, if \code{FALSE}, uses \code{future} library +#' @param workers.total Number of cores to use for \code{BiocParallel::bplapply()} +#' @param DTthreads.pct Set percentage eg \code{50} of total threads to use for \code{data.table::fread}, +#' if set to \code{NULL} will use default setting as in \code{data.table::getDTthreads(verbose = T)} +#' @param coord.space; +#' choose one or more of: +#' \itemize{ +#' \item \dQuote{pixel}: molecule coordinates in pixel space +#' \item \dQuote{micron}: molecule coordinates in micron space +#' } +#' @param use.cellpose.out If TRUE, and ./Cellpose folder exists, will load results from current MERSCOPE Instrument output. Default to TRUE. Set to FALSE if to use previous outputs (ie. non-Cellpose). +#' @param add.zIndex If to add \code{z} slice index to a cell +#' @param update.object If to update final object, default to TRUE. +#' @param ... Arguments passed to \code{ReadVizgen()} #' #' @return \code{ReadVizgen}: A list with some combination of the #' following values: @@ -2435,6 +2452,12 @@ ReadVizgen <- function( molecules = NULL, type = 'segmentations', mol.type = 'microns', + use.cellpose.out = TRUE, + coord.space = "micron", + metadata = NULL, + use.BiocParallel = TRUE, + workers.total = 12, + DTthreads.pct = NULL, metadata = NULL, filter = NA_character_, z = 3L @@ -2443,7 +2466,22 @@ ReadVizgen <- function( if (!requireNamespace("data.table", quietly = TRUE)) { stop("Please install 'data.table' for this function") } - # hdf5r is only used for loading polygon boundaries + if (!requireNamespace("BiocParallel", quietly = TRUE)) { + stop("Please install 'BiocParallel' for parallelization", call. = FALSE) + } + + if (!requireNamespace("sfarrow", quietly = TRUE)) { + stop("Please install 'sfarrow' for reading cell boundaries from `.parquet` files ", + call. 
= FALSE) + } + + + + + + + + # hdf5r is only used for loading polygon boundaries # Not needed for all Vizgen input hdf5 <- requireNamespace("hdf5r", quietly = TRUE) # Argument checking From aa32cc66c3258b0fe16b73919648a62f30015846 Mon Sep 17 00:00:00 2001 From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com> Date: Thu, 13 Apr 2023 18:12:49 +0200 Subject: [PATCH 02/24] updates for Vizgen --- R/convenience.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/convenience.R b/R/convenience.R index ff0e91900..775591ff7 100644 --- a/R/convenience.R +++ b/R/convenience.R @@ -1,5 +1,6 @@ #' @include generics.R #' @include visualization.R +#' @importFrom magrittr %>% %<>% #' NULL From 56d983c3e5e9b2106e3fac26384d9857b2aadc66 Mon Sep 17 00:00:00 2001 From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com> Date: Thu, 13 Apr 2023 19:23:25 +0200 Subject: [PATCH 03/24] updates for `ReadVizgen()` --- R/preprocessing.R | 560 +++++++++++++++++++++++++++------------------- 1 file changed, 334 insertions(+), 226 deletions(-) diff --git a/R/preprocessing.R b/R/preprocessing.R index 63a6e6cc7..d65642402 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -2446,42 +2446,59 @@ ReadVitessce <- function( #' @template note-reqdpkg #' ReadVizgen <- function( - data.dir, - transcripts = NULL, - spatial = NULL, - molecules = NULL, - type = 'segmentations', - mol.type = 'microns', - use.cellpose.out = TRUE, - coord.space = "micron", - metadata = NULL, - use.BiocParallel = TRUE, - workers.total = 12, - DTthreads.pct = NULL, - metadata = NULL, - filter = NA_character_, - z = 3L + data.dir, + transcripts = NULL, + spatial = NULL, + molecules = NULL, + type = 'segmentations', + mol.type = 'microns', + use.cellpose.out = TRUE, + coord.space = "micron", + metadata = NULL, + use.BiocParallel = TRUE, + workers.total = 12, + DTthreads.pct = NULL, + metadata = NULL, + filter = NA_character_, + z = 3L ) { + # TODO: handle multiple segmentations per z-plane + if (!requireNamespace("data.table", quietly = TRUE)) { stop("Please install 'data.table' for this function") } if (!requireNamespace("BiocParallel", quietly = TRUE)) { - stop("Please install 'BiocParallel' for parallelization", call. = FALSE) - } - - if (!requireNamespace("sfarrow", quietly = TRUE)) { - stop("Please install 'sfarrow' for reading cell boundaries from `.parquet` files ", - call. 
= FALSE) - } - - - - - - - - # hdf5r is only used for loading polygon boundaries + stop("Please install 'BiocParallel' for parallelization") + } + + if (!requireNamespace("sfarrow", quietly = TRUE)) { + stop("Please install 'sfarrow' for reading cell boundaries from `.parquet` files") + } + + # setting workers to use for parallel computing - `parallel` ---- + if (use.BiocParallel) { + message("Using parallelization with: `BiocParallel`") + if (is.null(workers.total)) { + workers.total <- quantile(BiocParallel::multicoreWorkers() %>% seq)[4] %>% ceiling + message(workers.total, " of total workers available will be used") + } else { message("Setting total workers to: ", workers.total) } + } else { message("Using parallelization with: `future`", "\n", + "NOTE: set workers for parallelization, eg: `future::plan('multisession', workers = 10)`") + } + + # setting cores to use for parallel computing - `data.table` + if (!is.null(DTthreads.pct)) { + message("Using parallelization with: `data.table`", "\n", + "..for `data.table::fread`") + data.table::setDTthreads(threads = 0) # all cores + DTthreads <- data.table::getDTthreads() # max cores + DTthreads <- c((DTthreads * DTthreads.pct) / 100) %>% ceiling # percentage from total threads + message("Setting DTthreads to: ", DTthreads, " (", paste0(DTthreads.pct, "%"), ")") + data.table::setDTthreads(threads = DTthreads) # set + } + + # hdf5r is only used for loading polygon boundaries # Not needed for all Vizgen input hdf5 <- requireNamespace("hdf5r", quietly = TRUE) # Argument checking @@ -2515,12 +2532,26 @@ ReadVizgen <- function( if (use.dir && !dir.exists(paths = data.dir)) { stop("Cannot find Vizgen directory ", data.dir) } - # Identify input files - files <- c( - transcripts = transcripts %||% 'cell_by_gene[_a-zA-Z0-9]*.csv', - spatial = spatial %||% 'cell_metadata[_a-zA-Z0-9]*.csv', - molecules = molecules %||% 'detected_transcripts[_a-zA-Z0-9]*.csv' - ) + + # use Cellpose output ---- + if (use.cellpose.out && grep("Cellpose", list.files(data.dir)) %>% any) { + message("Cellpose output will be used..") + # Identify input files + files <- c(transcripts = transcripts %||% paste0(data.dir, + paste0("/", list.files(data.dir, pattern = "Cellpose"), + "/cellpose_cell_by_gene.csv")), + + spatial = spatial %||% paste0(data.dir, + paste0("/", list.files(data.dir, pattern = "Cellpose"), + "/cellpose_cell_metadata.csv")), + + molecules = molecules %||% "detected_transcripts[_a-zA-Z0-9]*.csv") + } else { + files <- c(transcripts = transcripts %||% "cell_by_gene[_a-zA-Z0-9]*.csv", + spatial = spatial %||% "cell_metadata[_a-zA-Z0-9]*.csv", + molecules = molecules %||% "detected_transcripts[_a-zA-Z0-9]*.csv") + } + files[is.na(x = files)] <- NA_character_ h5dir <- file.path( ifelse( @@ -2575,16 +2606,28 @@ ReadVizgen <- function( sp <- sp[, -1, drop = FALSE] # Check to see if we should load segmentations if ('segmentations' %in% type) { - poly <- if (isFALSE(x = hdf5)) { + poly <- if (isFALSE(x = hdf5) && !use.cellpose.out) { warning( "Cannot find hdf5r; unable to load segmentation vertices", immediate. = TRUE ) FALSE - } else if (!dir.exists(paths = h5dir)) { + } else if (!dir.exists(paths = h5dir) && !use.cellpose.out) { warning("Cannot find cell boundary H5 files", immediate. 
= TRUE) FALSE - } else { + } else if (use.cellpose.out) { # added for Cellpose output ---- + files2scan <- + list.files(data.dir, pattern = ".parquet$", + all.files = TRUE, + full.names = TRUE, + recursive = TRUE) + if (length(files2scan)) { + message("Cell segmentations are found in `.parquet` file(s)", "\n", + "..using ", coord.space, " space coordinates") + } + TRUE + } + else { TRUE } if (isFALSE(x = poly)) { @@ -2632,199 +2675,264 @@ ReadVizgen <- function( files <- files[setdiff(x = names(x = files), y = 'molecules')] } files <- files[!is.na(x = files)] - # Read input data + + # Read input data ---- outs <- vector(mode = 'list', length = length(x = files)) names(x = outs) <- names(x = files) if (!is.null(metadata)) { outs <- c(outs, list(metadata = NULL)) } for (otype in names(x = outs)) { - outs[[otype]] <- switch( - EXPR = otype, - 'transcripts' = { - ptx <- progressor() - ptx(message = 'Reading counts matrix', class = 'sticky', amount = 0) - tx <- data.table::fread( - file = files[[otype]], - sep = ',', - data.table = FALSE, - verbose = FALSE - ) - rownames(x = tx) <- as.character(x = tx[, 1]) - tx <- t(x = as.matrix(x = tx[, -1, drop = FALSE])) - if (!is.na(x = filter)) { - ptx( - message = paste("Filtering genes with pattern", filter), - class = 'sticky', - amount = 0 - ) - tx <- tx[!grepl(pattern = filter, x = rownames(x = tx)), , drop = FALSE] - } - ratio <- getOption(x = 'Seurat.input.sparse_ratio', default = 0.4) - if ((sum(tx == 0) / length(x = tx)) > ratio) { - ptx( - message = 'Converting counts to sparse matrix', - class = 'sticky', - amount = 0 - ) - tx <- as.sparse(x = tx) - } - ptx(type = 'finish') - tx - }, - 'centroids' = { - pcents <- progressor() - pcents( - message = 'Creating centroid coordinates', - class = 'sticky', - amount = 0 - ) - pcents(type = 'finish') - data.frame( - x = sp$center_x, - y = sp$center_y, - cell = rownames(x = sp), - stringsAsFactors = FALSE - ) - }, - 'segmentations' = { - ppoly <- progressor(steps = length(x = unique(x = sp$fov))) - ppoly( - message = "Creating polygon coordinates", - class = 'sticky', - amount = 0 - ) - pg <- future_lapply( - X = unique(x = sp$fov), - FUN = function(f, ...) { - fname <- file.path(h5dir, paste0('feature_data_', f, '.hdf5')) - if (!file.exists(fname)) { - warning( - "Cannot find HDF5 file for field of view ", - f, - immediate. = TRUE - ) - return(NULL) - } - hfile <- hdf5r::H5File$new(filename = fname, mode = 'r') - on.exit(expr = hfile$close_all()) - cells <- rownames(x = subset(x = sp, subset = fov == f)) - df <- lapply( - X = cells, - FUN = function(x) { - return(tryCatch( - expr = { - cc <- hfile[['featuredata']][[x]][[zidx]][['p_0']][['coordinates']]$read() - cc <- as.data.frame(x = t(x = cc)) - colnames(x = cc) <- c('x', 'y') - cc$cell <- x - cc - }, - error = function(...) { - return(NULL) - } - )) - } - ) - ppoly() - return(do.call(what = 'rbind', args = df)) - } - ) - ppoly(type = 'finish') - pg <- do.call(what = 'rbind', args = pg) - npg <- length(x = unique(x = pg$cell)) - if (npg < nrow(x = sp)) { - warning( - nrow(x = sp) - npg, - " cells missing polygon information", - immediate. 
= TRUE - ) - } - pg - }, - 'boxes' = { - pbox <- progressor(steps = nrow(x = sp)) - pbox(message = "Creating box coordinates", class = 'sticky', amount = 0) - bx <- future_lapply( - X = rownames(x = sp), - FUN = function(cell) { - row <- sp[cell, ] - df <- expand.grid( - x = c(row$min_x, row$max_x), - y = c(row$min_y, row$max_y), - cell = cell, - KEEP.OUT.ATTRS = FALSE, - stringsAsFactors = FALSE - ) - df <- df[c(1, 3, 4, 2), , drop = FALSE] - pbox() - return(df) - } - ) - pbox(type = 'finish') - do.call(what = 'rbind', args = bx) - }, - 'metadata' = { - pmeta <- progressor() - pmeta( - message = 'Loading metadata', - class = 'sticky', - amount = 0 - ) - pmeta(type = 'finish') - sp[, metadata, drop = FALSE] - }, - 'pixels' = { - ppixels <- progressor() - ppixels( - message = 'Creating pixel-level molecule coordinates', - class = 'sticky', - amount = 0 - ) - df <- data.frame( - x = mx$x, - y = mx$y, - gene = mx$gene, - stringsAsFactors = FALSE - ) - # if (!is.na(x = filter)) { - # ppixels( - # message = paste("Filtering molecules with pattern", filter), - # class = 'sticky', - # amount = 0 - # ) - # df <- df[!grepl(pattern = filter, x = df$gene), , drop = FALSE] - # } - ppixels(type = 'finish') - df - }, - 'microns' = { - pmicrons <- progressor() - pmicrons( - message = "Creating micron-level molecule coordinates", - class = 'sticky', - amount = 0 - ) - df <- data.frame( - x = mx$global_x, - y = mx$global_y, - gene = mx$gene, - stringsAsFactors = FALSE - ) - # if (!is.na(x = filter)) { - # pmicrons( - # message = paste("Filtering molecules with pattern", filter), - # class = 'sticky', - # amount = 0 - # ) - # df <- df[!grepl(pattern = filter, x = df$gene), , drop = FALSE] - # } - pmicrons(type = 'finish') - df - }, - stop("Unknown MERFISH input type: ", type) - ) - } + outs[[otype]] <- + switch(EXPR = otype, + transcripts = { + ptx <- progressor() + ptx(message = "Reading counts matrix", class = "sticky", + amount = 0) + tx <- data.table::fread(file = files[[otype]], sep = ",", + data.table = FALSE, verbose = FALSE) + rownames(x = tx) <- as.character(x = tx[, 1]) + tx <- t(x = as.matrix(x = tx[, -1, drop = FALSE])) + if (!is.na(x = filter)) { + ptx(message = paste("Filtering genes with pattern", + filter), class = "sticky", amount = 0) + tx <- tx[!grepl(pattern = filter, x = rownames(x = tx)), + , drop = FALSE] + } + ratio <- getOption(x = "Seurat.input.sparse_ratio", + default = 0.4) + if ((sum(tx == 0)/length(x = tx)) > ratio) { + ptx(message = "Converting counts to sparse matrix", + class = "sticky", amount = 0) + tx <- as.sparse(x = tx) + } + ptx(type = "finish") + tx + + }, centroids = { + pcents <- progressor() + pcents(message = "Creating centroid coordinates", + class = "sticky", amount = 0) + pcents(type = "finish") + data.frame(x = sp$center_x, y = sp$center_y, cell = rownames(x = sp), + stringsAsFactors = FALSE) + + # use Cellpose segmentations + }, segmentations = { + if (use.cellpose.out) { + files2scan <- + list.files(data.dir, pattern = ".parquet$", + all.files = TRUE, + full.names = TRUE, + recursive = TRUE) + if (length(files2scan)) { + file2read <- files2scan %>% + grep(coord.space, ., value = TRUE) %>% + grep(".parquet$", ., value = TRUE) + } + + # read .parquet file.. 
+ parq <- sfarrow::st_read_parquet(file2read)

                   # get all cell segmentations
                   segs <- filter(parq, ZIndex == z) %>% pull(Geometry)
                   # check if any cells have > 1 segmentation boundary
                   test.segs <-
                     lapply(segs %>% seq, function(i) segs[[i]][[1]] %>% length)
                   if (which(unlist(test.segs) > 1) %>% any) {
                     segs.art.index <- which(unlist(test.segs) > 1)
                     message(segs.art.index %>% length,
                             " Cells have > 1 segmentation boundary artifacts", "\n",
                             "..removing artifacts", "\n",
                             "..keeping cell boundaries with maximum coords")
                     # usually artifacts have small boundaries/coords
                     # find cell boundaries with maximum coords
                     for (i in segs.art.index %>% seq) {
                       dims <- lapply(segs[[segs.art.index[i]]][[1]] %>% seq(),
                                      function(d) { dim(segs[[segs.art.index[i]]][[1]][[d]])[1] } )
                       # get & keep boundaries with maximum coords
                       maxs.segs <- which(unlist(dims) == max(unlist(dims)))
                       segs[[segs.art.index[i]]][[1]] <- segs[[segs.art.index[i]]][[1]][maxs.segs]
                     }
                   } else { message("All cells have 1 segmentation boundary (no artifacts)") }
                   # add cell names
                   names(segs) <- filter(parq, ZIndex == z) %>% pull(EntityID) %>% as.character

                   # TODO: (optionally) resample & make cell boundaries equidistant!
                   if (use.BiocParallel) {
                     # extract cell boundaries per cells (faster)
                     gc() %>% invisible() # free up memory
                     message("Using fast extraction of cell boundaries..")
                     segs_list <-
                       BiocParallel::bplapply(segs %>% seq,
                                              function(i) {
                                                segs[[i]][[1]] %>%
                                                  data.table::as.data.table(x = .) %>%
                                                  as.data.frame %>%
                                                  mutate(cell = names(segs)[i]) },
                                              BPPARAM = BiocParallel::MulticoreParam(workers.total, tasks = 100L,
                                                                                     force.GC = FALSE,
                                                                                     progressbar = TRUE)
                       )
                   } else {
                     # extract cell boundaries per cells
                     message("Extracting cell boundaries..")
                     segs_list <-
                       lapply(segs %>% seq,
                              function(i) {
                                segs[[i]][[1]] %>%
                                  data.table::as.data.table(x = .) %>%
                                  as.data.frame %>%
                                  mutate(cell = names(segs)[i])
                              }
                       )
                   }

                   #segs_list %>% length
                   # df of all cell segmentations
                   gc() %>% invisible() # free up memory
                   #segs <- do.call("rbind", segs_list)
                   segs <- data.table::rbindlist(segs_list)
                   names(segs)[1:2] <- c("x", "y")
                   message("All cell boundaries (with no artifacts) are loaded..")
                   segs
                 } else { # use non-Cellpose segmentations
                   ppoly <- progressor(steps = length(x = unique(x = sp$fov)))
                   ppoly(message = "Creating polygon coordinates", class = "sticky",
                         amount = 0)
                   # use parallel or future
                   if (use.BiocParallel) {
                     pg <- BiocParallel::bplapply(X = unique(x = sp$fov), FUN = function(f, ...) {
                       fname <- file.path(h5dir, paste0("feature_data_", f, ".hdf5"))
                       if (!file.exists(fname)) {
                         warning("Cannot find HDF5 file for field of view ",
                                 f, immediate. = TRUE)
                         return(NULL)
                       }
                       # reading hdf5 files
                       hfile <- hdf5r::H5File$new(filename = fname,
                                                  mode = "r")
                       on.exit(expr = hfile$close_all())
                       cells <- rownames(x = subset(x = sp, subset = fov == f))
                       # creating df for cell boundaries
                       df <- BiocParallel::bplapply(X = cells, FUN = function(x) {
                         return(tryCatch(expr = {
                           cc <- hfile[["featuredata"]][[x]][[zidx]][["p_0"]][["coordinates"]]$read()
                           cc <- as.data.frame(x = t(x = cc))
                           colnames(x = cc) <- c("x", "y")
                           cc$cell <- x
                           cc
                         }, error = function(...)
{
                           return(NULL)
                         }))
                       }, BPPARAM = BiocParallel::MulticoreParam(workers.total, tasks = 100L,
                                                                 force.GC = FALSE,
                                                                 progressbar = TRUE))
                       ppoly()
                       return(do.call(what = "rbind", args = df))
                     }, BPPARAM = BiocParallel::MulticoreParam(workers.total, tasks = 100L,
                                                               force.GC = FALSE,
                                                               progressbar = TRUE))
                   } else {
                     pg <-
                       future.apply::future_lapply(X = unique(x = sp$fov),
                                                   FUN = function(f, ...) {
                                                     fname <- file.path(h5dir, paste0("feature_data_",
                                                                                      f, ".hdf5"))
                                                     if (!file.exists(fname)) {
                                                       warning("Cannot find HDF5 file for field of view ",
                                                               f, immediate. = TRUE)
                                                       return(NULL)
                                                     }
                                                     hfile <- hdf5r::H5File$new(filename = fname,
                                                                                mode = "r")
                                                     on.exit(expr = hfile$close_all())
                                                     cells <- rownames(x = subset(x = sp, subset = fov == f))
                                                     df <- lapply(X = cells, FUN = function(x) {
                                                       return(tryCatch(expr = {
                                                         cc <- hfile[["featuredata"]][[x]][[zidx]][["p_0"]][["coordinates"]]$read()
                                                         cc <- as.data.frame(x = t(x = cc))
                                                         colnames(x = cc) <- c("x", "y")
                                                         cc$cell <- x
                                                         cc
                                                       }, error = function(...) {
                                                         return(NULL)
                                                       }))
                                                     })
                                                     ppoly()
                                                     return(do.call(what = "rbind", args = df))
                                                   }
                       )
                   }
                   ppoly(type = "finish")
                   # cell polygons
                   pg <- do.call(what = "rbind", args = pg)
                   npg <- length(x = unique(x = pg$cell))
                   if (npg < nrow(x = sp)) {
                     warning(nrow(x = sp) - npg, " cells missing polygon information",
                             immediate. = TRUE)
                   }
                   pg # final segmentations out..
                 }
               }, boxes = {
                 pbox <- progressor(steps = nrow(x = sp))
                 pbox(message = "Creating box coordinates", class = "sticky",
                      amount = 0)
                 # use parallel or future
                 if (use.BiocParallel) {
                   message("Creating box coordinates..")
                   bx <- BiocParallel::bplapply(X = rownames(x = sp), FUN = function(cell) {
                     row <- sp[cell, ]
                     df <- expand.grid(x = c(row$min_x, row$max_x),
                                       y = c(row$min_y, row$max_y), cell = cell, KEEP.OUT.ATTRS = FALSE,
                                       stringsAsFactors = FALSE)
                     df <- df[c(1, 3, 4, 2), , drop = FALSE]
                     pbox()
                     return(df)
                   }, BPPARAM = BiocParallel::MulticoreParam(workers.total, tasks = 100L,
                                                             force.GC = FALSE, progressbar = TRUE)
                   )
                 } else {
                   bx <- future.apply::future_lapply(X = rownames(x = sp), FUN = function(cell) {
                     row <- sp[cell, ]
                     df <- expand.grid(x = c(row$min_x, row$max_x),
                                       y = c(row$min_y, row$max_y), cell = cell, KEEP.OUT.ATTRS = FALSE,
                                       stringsAsFactors = FALSE)
                     df <- df[c(1, 3, 4, 2), , drop = FALSE]
                     pbox()
                     return(df)
                   })
                 }
                 pbox(type = "finish")
                 do.call(what = "rbind", args = bx)
               }, metadata = {
                 pmeta <- progressor()
                 pmeta(message = "Loading metadata", class = "sticky",
                       amount = 0)
                 pmeta(type = "finish")
                 sp[, metadata, drop = FALSE]
               }, pixels = {
                 ppixels <- progressor()
                 ppixels(message = "Creating pixel-level molecule coordinates",
                         class = "sticky", amount = 0)
                 df <- data.frame(x = mx$x, y = mx$y, gene = mx$gene,
                                  stringsAsFactors = FALSE)
                 ppixels(type = "finish")
                 df
               }, microns = {
                 pmicrons <- progressor()
                 pmicrons(message = "Creating micron-level molecule coordinates",
                          class = "sticky", amount = 0)
                 df <- data.frame(x = mx$global_x, y = mx$global_y,
                                  gene = mx$gene, stringsAsFactors = FALSE)
                 pmicrons(type = "finish")
                 df
               }, stop("Unknown MERFISH input type: ", type))
  }

  # add z-slice index for cells ----
  outs$zIndex <- data.frame(z = rep_len(z, length.out = outs$centroids$cell %>% length), cell = outs$centroids$cell)

  return(outs)
}

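A minimal usage sketch of `ReadVizgen()` as of this patch; the directory path
and worker count are illustrative placeholders, and a MERSCOPE output folder
containing Cellpose `.parquet` results is assumed:

    vz <- ReadVizgen(
      data.dir = "/path/to/merscope_output",  # placeholder path
      type = "segmentations",                 # also accepts "centroids" / "boxes"
      mol.type = "microns",
      filter = "^Blank-",                     # drop blank probes from the counts
      z = 3L,                                 # middle z-slice
      use.BiocParallel = TRUE,                # boundary extraction via bplapply()
      workers.total = 8,                      # placeholder core count
      DTthreads.pct = 50,                     # give data.table::fread 50% of threads
      coord.space = "micron",
      use.cellpose.out = TRUE                 # prefer ./Cellpose .parquet segmentations
    )
    head(vz$segmentations)  # x/y boundary vertices keyed by cell
    head(vz$zIndex)         # per-cell z-slice index added by this patch
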
From 115e45d0584d14d7a26c4e7d24ec21b572a7d96f Mon Sep 17 00:00:00 2001
From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com>
Date: Thu, 13 Apr 2023 19:39:34 +0200
Subject: [PATCH 04/24] updates for `LoadVizgen`

---
 R/convenience.R | 91 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 62 insertions(+), 29 deletions(-)

diff --git a/R/convenience.R b/R/convenience.R
index 775591ff7..b06deb1c1 100644
--- a/R/convenience.R
+++ b/R/convenience.R
@@ -137,37 +137,70 @@ LoadNanostring <- function(data.dir, fov, assay = 'Nanostring') {
 #'
 #' @rdname ReadVizgen
 #'
-LoadVizgen <- function(data.dir, fov, assay = 'Vizgen', z = 3L) {
-  data <- ReadVizgen(
-    data.dir = data.dir,
-    filter = "^Blank-",
-    type = c("centroids", "segmentations"),
-    z = z
-  )
-  segs <- CreateSegmentation(data$segmentations)
-  cents <- CreateCentroids(data$centroids)
-  segmentations.data <- list(
-    "centroids" = cents,
-    "segmentation" = segs
-  )
-  coords <- CreateFOV(
-    coords = segmentations.data,
-    type = c("segmentation", "centroids"),
-    molecules = data$microns,
-    assay = assay
-  )
-  obj <- CreateSeuratObject(counts = data$transcripts, assay = assay)
-  # only consider the cells we have counts and a segmentation for
-  # Cells which don't have a segmentation are probably found in other z slices.
-  coords <- subset(
-    x = coords,
-    cells = intersect(
-      x = Cells(x = coords[["segmentation"]]),
-      y = Cells(x = obj)
-    )
-  )
+LoadVizgen <- function(data.dir, fov = "vz", assay = 'Vizgen',
+                       mol.type = 'microns', # must match ReadVizgen()'s 'mol.type'; data[[mol.type]] is used below
+                       add.zIndex = TRUE, update.object = TRUE,
+                       ...) {
+  data <- ReadVizgen(data.dir = data.dir, mol.type = mol.type, ...)
+
+  # if "segmentations" are not present, use cell bounding boxes instead
+  if (!"segmentations" %in% names(data)) {
+    bound.boxes <- CreateSegmentation(data[["boxes"]])
+    cents <- CreateCentroids(data[["centroids"]])
+    bound.boxes.data <- list(centroids = cents, boxes = bound.boxes)
+    message("Creating FOVs", "\n",
+            "..using box coordinates instead of segmentations")
+    coords <- CreateFOV(coords = bound.boxes.data, type = c("boxes",
+                        "centroids"), molecules = data[[mol.type]], assay = assay)
+    message("Creating Seurat object")
+    obj <- CreateSeuratObject(counts = data[["transcripts"]], assay = assay)
+    # only consider the cells we have counts and a segmentation for
+    # Cells which don't have a segmentation are probably found in other z slices.
+    coords <- subset(x = coords,
+                     cells = intersect(x = Cells(x = coords[["boxes"]]),
+                                       y = Cells(x = obj)))
+  } else {
+    segs <- CreateSegmentation(data[["segmentations"]])
+    cents <- CreateCentroids(data[["centroids"]])
+    segmentations.data <- list(centroids = cents, segmentation = segs)
+    message("Creating FOVs", "\n",
+            "..using segmentations")
+    coords <- CreateFOV(coords = segmentations.data, type = c("segmentation",
+                        "centroids"), molecules = data[[mol.type]], assay = assay)
+    message("Creating Seurat object..")
+    obj <- CreateSeuratObject(counts = data[["transcripts"]], assay = assay)
+    # only consider the cells we have counts and a segmentation for
+    # Cells which don't have a segmentation are probably found in other z slices.
+    coords <- subset(x = coords,
+                     cells = intersect(x = Cells(x = coords[["segmentation"]]),
+                                       y = Cells(x = obj)))
+  }
+
+  # add z-stack index for cells
+  if (add.zIndex) { obj$z <- data$zIndex$z }
+
+  # add metadata vars
+  message("..adding metadata info")
+  if (c("metadata" %in% names(data))) {
+    metadata <- match.arg(arg = "metadata", choices = names(data), several.ok = TRUE)
+    meta.vars <- names(data[[metadata]])
+    for (i in meta.vars %>% seq) {
+      obj %<>% AddMetaData(metadata = data[[metadata]][[meta.vars[i]]],
+                           col.name = meta.vars[i])
+    }
+  }
+
+  # sanity on fov name
+  fov %<>% gsub("_|-", ".", .)
+ + #message("..adding FOV") # add coords to seurat object obj[[fov]] <- coords + + if (update.object) { + message("Updating object..") + obj %<>% UpdateSeuratObject() } + + #message("Object is ready!") return(obj) } From eff89e97fabc1f13981dedcb24ba97cf4787f635 Mon Sep 17 00:00:00 2001 From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com> Date: Thu, 13 Apr 2023 20:04:58 +0200 Subject: [PATCH 05/24] fix for argument 'metadata' --- R/preprocessing.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/preprocessing.R b/R/preprocessing.R index d65642402..b4000e728 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -2458,7 +2458,6 @@ ReadVizgen <- function( use.BiocParallel = TRUE, workers.total = 12, DTthreads.pct = NULL, - metadata = NULL, filter = NA_character_, z = 3L ) { From 87d9303481d72bc6602681d663f7d23bb63970ec Mon Sep 17 00:00:00 2001 From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com> Date: Thu, 13 Apr 2023 20:35:48 +0200 Subject: [PATCH 06/24] param args for `LoadVizgen` --- R/convenience.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/convenience.R b/R/convenience.R index b06deb1c1..5e5c16bfb 100644 --- a/R/convenience.R +++ b/R/convenience.R @@ -130,6 +130,10 @@ LoadNanostring <- function(data.dir, fov, assay = 'Nanostring') { #' @return \code{LoadVizgen}: A \code{\link[SeuratObject]{Seurat}} object #' +#' @param add.zIndex If to add \code{z} slice index to a cell +#' @param update.object If to update final object, default to TRUE. +#' @param ... Arguments passed to \code{ReadVizgen} +#' #' @importFrom SeuratObject Cells CreateCentroids CreateFOV #' CreateSegmentation CreateSeuratObject #' From 089740a5658b6325ee0856a7273e09093ef21e78 Mon Sep 17 00:00:00 2001 From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com> Date: Thu, 13 Apr 2023 20:48:35 +0200 Subject: [PATCH 07/24] fix args for `ReadVizgen` --- R/preprocessing.R | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/R/preprocessing.R b/R/preprocessing.R index b4000e728..9845d727e 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -2395,21 +2395,20 @@ ReadVitessce <- function( #' \item \dQuote{fov}: cell's fov #' } #' @param z Z-index to load; must be between 0 and 6, inclusive -#' @param use.BiocParallel If to use \code{BiocParallel::bplapply()}, +#' @param use.BiocParallel If to use \code{BiocParallel::bplapply}, #' default is \code{TRUE}, if \code{FALSE}, uses \code{future} library -#' @param workers.total Number of cores to use for \code{BiocParallel::bplapply()} +#' @param workers.total Number of cores to use for \code{BiocParallel::bplapply} #' @param DTthreads.pct Set percentage eg \code{50} of total threads to use for \code{data.table::fread}, #' if set to \code{NULL} will use default setting as in \code{data.table::getDTthreads(verbose = T)} -#' @param coord.space; +#' @param coord.space Type of molecule spatial coordinate space to use; #' choose one or more of: #' \itemize{ #' \item \dQuote{pixel}: molecule coordinates in pixel space #' \item \dQuote{micron}: molecule coordinates in micron space #' } -#' @param use.cellpose.out If TRUE, and ./Cellpose folder exists, will load results from current MERSCOPE Instrument output. Default to TRUE. Set to FALSE if to use previous outputs (ie. non-Cellpose). -#' @param add.zIndex If to add \code{z} slice index to a cell -#' @param update.object If to update final object, default to TRUE. -#' @param ... 
Arguments passed to \code{ReadVizgen()} +#' @param use.cellpose.out If \code{TRUE}, and \code{./Cellpose} folder exists, +#' will load results from current MERSCOPE Instrument output. Default to \code{TRUE}; +#' set it to \code{FALSE} if to use previous outputs (ie. non-Cellpose). #' #' @return \code{ReadVizgen}: A list with some combination of the #' following values: @@ -2452,14 +2451,14 @@ ReadVizgen <- function( molecules = NULL, type = 'segmentations', mol.type = 'microns', - use.cellpose.out = TRUE, - coord.space = "micron", - metadata = NULL, - use.BiocParallel = TRUE, - workers.total = 12, + metadata = NULL, + filter = NA_character_, + z = 3L, + use.BiocParallel = TRUE, + workers.total = 12, DTthreads.pct = NULL, - filter = NA_character_, - z = 3L + coord.space = "micron", + use.cellpose.out = TRUE ) { # TODO: handle multiple segmentations per z-plane From 547000e3c8b462d448a4e50066a370d65eb717d9 Mon Sep 17 00:00:00 2001 From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com> Date: Wed, 19 Apr 2023 18:03:58 +0200 Subject: [PATCH 08/24] add support for `future.apply` --- R/preprocessing.R | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/R/preprocessing.R b/R/preprocessing.R index 9845d727e..e83b4bb7c 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -2729,7 +2729,7 @@ ReadVizgen <- function( grep(".parquet$", ., value = TRUE) } - # read .parquet file.. + # read .parquet file ---- parq <- sfarrow::st_read_parquet(file2read) # get all cell segmentations @@ -2766,7 +2766,7 @@ ReadVizgen <- function( function(i) { segs[[i]][[1]] %>% data.table::as.data.table(x = .) %>% - as.data.frame %>% + #as.data.frame %>% mutate(cell = names(segs)[i]) }, BPPARAM = BiocParallel::MulticoreParam(workers.total, tasks = 100L, force.GC = FALSE, @@ -2776,14 +2776,14 @@ ReadVizgen <- function( # extract cell boundaries per cells message("Extracting of cell boundaries..") segs_list <- - lapply(segs %>% seq, - function(i) { - segs[[i]][[1]] %>% - data.table::as.data.table(x = .) %>% - as.data.frame %>% - mutate(cell = names(segs)[i]) - } - ) + future.apply::future_lapply(segs %>% seq, + function(i) { + segs[[i]][[1]] %>% + data.table::as.data.table(x = .) 
%>% + #as.data.frame %>% + mutate(cell = names(segs)[i]) + } + ) } #segs_list %>% length From f73dbdba57c31d209cbff2d95d2e400052107d1e Mon Sep 17 00:00:00 2001 From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com> Date: Wed, 24 May 2023 14:24:14 +0200 Subject: [PATCH 09/24] Vizgen support single `.parquet` file --- R/preprocessing.R | 622 +++++++++++++++++++++++----------------------- 1 file changed, 313 insertions(+), 309 deletions(-) diff --git a/R/preprocessing.R b/R/preprocessing.R index e83b4bb7c..cfbb2eaaf 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -58,11 +58,11 @@ globalVariables( #' CalculateBarcodeInflections(pbmc_small, group.column = 'groups') #' CalculateBarcodeInflections <- function( - object, - barcode.column = "nCount_RNA", - group.column = "orig.ident", - threshold.low = NULL, - threshold.high = NULL + object, + barcode.column = "nCount_RNA", + group.column = "orig.ident", + threshold.low = NULL, + threshold.high = NULL ) { ## Check that barcode.column exists in meta.data if (!(barcode.column %in% colnames(x = object[[]]))) { @@ -203,15 +203,15 @@ CalculateBarcodeInflections <- function( #' } #' HTODemux <- function( - object, - assay = "HTO", - positive.quantile = 0.99, - init = NULL, - nstarts = 100, - kfunc = "clara", - nsamples = 100, - seed = 42, - verbose = TRUE + object, + assay = "HTO", + positive.quantile = 0.99, + init = NULL, + nstarts = 100, + kfunc = "clara", + nsamples = 100, + seed = 42, + verbose = TRUE ) { if (!is.null(x = seed)) { set.seed(seed = seed) @@ -370,14 +370,14 @@ HTODemux <- function( #' pbmc_small <- GetResidual(object = pbmc_small, features = c('MS4A1', 'TCL1A')) #' GetResidual <- function( - object, - features, - assay = NULL, - umi.assay = NULL, - clip.range = NULL, - replace.value = FALSE, - na.rm = TRUE, - verbose = TRUE + object, + features, + assay = NULL, + umi.assay = NULL, + clip.range = NULL, + replace.value = FALSE, + na.rm = TRUE, + verbose = TRUE ) { assay <- assay %||% DefaultAssay(object = object) if (IsSCT(assay = object[[assay]])) { @@ -502,14 +502,14 @@ GetResidual <- function( #' } #' Load10X_Spatial <- function( - data.dir, - filename = 'filtered_feature_bc_matrix.h5', - assay = 'Spatial', - slice = 'slice1', - filter.matrix = TRUE, - to.upper = FALSE, - image = NULL, - ... + data.dir, + filename = 'filtered_feature_bc_matrix.h5', + assay = 'Spatial', + slice = 'slice1', + filter.matrix = TRUE, + to.upper = FALSE, + image = NULL, + ... ) { if (length(x = data.dir) > 1) { warning("'Load10X_Spatial' accepts only one 'data.dir'", immediate. 
= TRUE) @@ -522,9 +522,9 @@ Load10X_Spatial <- function( object <- CreateSeuratObject(counts = data, assay = assay) if (is.null(x = image)) { image <- Read10X_Image( - image.dir = file.path(data.dir, 'spatial'), - filter.matrix = filter.matrix - ) + image.dir = file.path(data.dir, 'spatial'), + filter.matrix = filter.matrix + ) } else { if (!inherits(x = image, what = "VisiumV1")) stop("Image must be an object of class 'VisiumV1'.") @@ -557,13 +557,13 @@ Load10X_Spatial <- function( #' @concept preprocessing #' LoadSTARmap <- function( - data.dir, - counts.file = "cell_barcode_count.csv", - gene.file = "genes.csv", - qhull.file = "qhulls.tsv", - centroid.file = "centroids.tsv", - assay = "Spatial", - image = "image" + data.dir, + counts.file = "cell_barcode_count.csv", + gene.file = "genes.csv", + qhull.file = "qhulls.tsv", + centroid.file = "centroids.tsv", + assay = "Spatial", + image = "image" ) { if (!dir.exists(paths = data.dir)) { stop("Cannot find directory ", data.dir, call. = FALSE) @@ -669,13 +669,13 @@ LogNormalize <- function(data, scale.factor = 1e4, verbose = TRUE) { #' } #' MULTIseqDemux <- function( - object, - assay = "HTO", - quantile = 0.7, - autoThresh = FALSE, - maxiter = 5, - qrange = seq(from = 0.1, to = 0.9, by = 0.05), - verbose = TRUE + object, + assay = "HTO", + quantile = 0.7, + autoThresh = FALSE, + maxiter = 5, + qrange = seq(from = 0.1, to = 0.9, by = 0.05), + verbose = TRUE ) { assay <- assay %||% DefaultAssay(object = object) multi_data_norm <- t(x = GetAssayData( @@ -787,11 +787,11 @@ MULTIseqDemux <- function( #' } #' Read10X <- function( - data.dir, - gene.column = 2, - cell.column = 1, - unique.features = TRUE, - strip.suffix = FALSE + data.dir, + gene.column = 2, + cell.column = 1, + unique.features = TRUE, + strip.suffix = FALSE ) { full.data <- list() has_dt <- requireNamespace("data.table", quietly = TRUE) && requireNamespace("R.utils", quietly = TRUE) @@ -828,7 +828,7 @@ Read10X <- function( } else { cell.barcodes <- read.table(file = barcode.loc, header = FALSE, sep = '\t', row.names = NULL) } - + if (ncol(x = cell.barcodes) > 1) { cell.names <- cell.barcodes[, cell.column] } else { @@ -851,7 +851,7 @@ Read10X <- function( } else { colnames(x = data) <- paste0(names(x = data.dir)[i], "_", cell.names) } - + if (has_dt) { feature.names <- as.data.frame(data.table::fread(ifelse(test = pre_ver_3, yes = gene.loc, no = features.loc), header = FALSE)) } else { @@ -861,7 +861,7 @@ Read10X <- function( stringsAsFactors = FALSE ) } - + if (any(is.na(x = feature.names[, gene.column]))) { warning( 'Some features names are NA. 
Replacing NA names with ID from the opposite column requested', @@ -1108,10 +1108,10 @@ Read10X_Image <- function(image.dir, image.name = "tissue_lowres_image.png", fil #' @template note-reqdpkg #' ReadAkoya <- function( - filename, - type = c('inform', 'processor', 'qupath'), - filter = 'DAPI|Blank|Empty', - inform.quant = c('mean', 'total', 'min', 'max', 'std') + filename, + type = c('inform', 'processor', 'qupath'), + filter = 'DAPI|Blank|Empty', + inform.quant = c('mean', 'total', 'min', 'max', 'std') ) { if (!requireNamespace("data.table", quietly = TRUE)) { stop("Please install 'data.table' for this function") @@ -1429,18 +1429,18 @@ ReadAkoya <- function( #' } #' ReadMtx <- function( - mtx, - cells, - features, - cell.column = 1, - feature.column = 2, - cell.sep = "\t", - feature.sep = "\t", - skip.cell = 0, - skip.feature = 0, - mtx.transpose = FALSE, - unique.features = TRUE, - strip.suffix = FALSE + mtx, + cells, + features, + cell.column = 1, + feature.column = 2, + cell.sep = "\t", + feature.sep = "\t", + skip.cell = 0, + skip.feature = 0, + mtx.transpose = FALSE, + unique.features = TRUE, + strip.suffix = FALSE ) { all.files <- list( "expression matrix" = mtx, @@ -1539,7 +1539,7 @@ ReadMtx <- function( feature.column, ". Try specifiying a different column.", call. = FALSE - ) + ) } else { warning( "Some features names are NA in column ", @@ -1548,7 +1548,7 @@ ReadMtx <- function( replacement.column, ".", call. = FALSE - ) + ) } feature.names[na.features, feature.column] <- feature.names[na.features, replacement.column] } @@ -1572,7 +1572,7 @@ ReadMtx <- function( no = "" ), call. = FALSE - ) + ) } if (length(x = feature.names) != nrow(x = data)) { stop( @@ -1586,9 +1586,9 @@ ReadMtx <- function( no = "" ), call. = FALSE - ) + ) } - + colnames(x = data) <- cell.names rownames(x = data) <- feature.names data <- as.sparse(x = data) @@ -1672,24 +1672,24 @@ ReadMtx <- function( #' @template note-reqdpkg #' ReadNanostring <- function( - data.dir, - mtx.file = NULL, - metadata.file = NULL, - molecules.file = NULL, - segmentations.file = NULL, - type = 'centroids', - mol.type = 'pixels', - metadata = NULL, - mols.filter = NA_character_, - genes.filter = NA_character_, - fov.filter = NULL, - subset.counts.matrix = NULL, - cell.mols.only = TRUE + data.dir, + mtx.file = NULL, + metadata.file = NULL, + molecules.file = NULL, + segmentations.file = NULL, + type = 'centroids', + mol.type = 'pixels', + metadata = NULL, + mols.filter = NA_character_, + genes.filter = NA_character_, + fov.filter = NULL, + subset.counts.matrix = NULL, + cell.mols.only = TRUE ) { if (!requireNamespace("data.table", quietly = TRUE)) { stop("Please install 'data.table' for this function") } - + # Argument checking type <- match.arg( arg = type, @@ -1712,7 +1712,7 @@ ReadNanostring <- function( several.ok = TRUE ) } - + use.dir <- all(vapply( X = c(mtx.file, metadata.file, molecules.file), FUN = function(x) { @@ -1720,7 +1720,7 @@ ReadNanostring <- function( }, FUN.VALUE = logical(length = 1L) )) - + if (use.dir && !dir.exists(paths = data.dir)) { stop("Cannot find Nanostring directory ", data.dir) } @@ -1731,7 +1731,7 @@ ReadNanostring <- function( molecules.file = molecules.file %||% '[_a-zA-Z0-9]*_tx_file.csv', segmentations.file = segmentations.file %||% '[_a-zA-Z0-9]*-polygons.csv' ) - + files <- vapply( X = files, FUN = function(x) { @@ -1752,7 +1752,7 @@ ReadNanostring <- function( USE.NAMES = TRUE ) files[!file.exists(files)] <- NA_character_ - + if (all(is.na(x = files))) { stop("Cannot find Nanostring 
input files in ", data.dir) } @@ -1770,7 +1770,7 @@ ReadNanostring <- function( data.table = FALSE, verbose = FALSE ) - + # filter metadata file by FOVs if (!is.null(x = fov.filter)) { md <- md[md$fov %in% fov.filter,] @@ -1790,7 +1790,7 @@ ReadNanostring <- function( data.table = FALSE, verbose = FALSE ) - + # filter metadata file by FOVs if (!is.null(x = fov.filter)) { segs <- segs[segs$fov %in% fov.filter,] @@ -1810,17 +1810,17 @@ ReadNanostring <- function( sep = ',', verbose = FALSE ) - + # filter molecules file by FOVs if (!is.null(x = fov.filter)) { mx <- mx[mx$fov %in% fov.filter,] } - + # Molecules outside of a cell have a cell_ID of 0 if (cell.mols.only) { mx <- mx[mx$cell_ID != 0,] } - + if (!is.na(x = mols.filter)) { ppremol( message = paste("Filtering molecules with pattern", mols.filter), @@ -1836,7 +1836,7 @@ ReadNanostring <- function( files <- files[setdiff(x = names(x = files), y = 'molecules.file')] } files <- files[!is.na(x = files)] - + outs <- list("matrix"=NULL, "pixels"=NULL, "centroids"=NULL) if (!is.null(metadata)) { outs <- append(outs, list("metadata" = NULL)) @@ -1844,7 +1844,7 @@ ReadNanostring <- function( if ("segmentations" %in% type) { outs <- append(outs, list("segmentations" = NULL)) } - + for (otype in names(x = outs)) { outs[[otype]] <- switch( EXPR = otype, @@ -1871,7 +1871,7 @@ ReadNanostring <- function( } tx <- subset(tx, select = -c(fov, cell_ID)) } - + tx <- as.data.frame(t(x = as.matrix(x = tx[, -1, drop = FALSE]))) if (!is.na(x = genes.filter)) { ptx( @@ -1884,7 +1884,7 @@ ReadNanostring <- function( # only keep cells with counts greater than 0 tx <- tx[, which(colSums(tx) != 0)] ratio <- getOption(x = 'Seurat.input.sparse_ratio', default = 0.4) - + if ((sum(tx == 0) / length(x = tx)) > ratio) { ptx( message = 'Converting counts to sparse matrix', @@ -1893,9 +1893,9 @@ ReadNanostring <- function( ) tx <- as.sparse(x = tx) } - + ptx(type = 'finish') - + tx }, 'centroids' = { @@ -2014,10 +2014,10 @@ ReadNanostring <- function( #' @concept preprocessing #' ReadXenium <- function( - data.dir, - outs = c("matrix", "microns"), - type = "centroids", - mols.qv.threshold = 20 + data.dir, + outs = c("matrix", "microns"), + type = "centroids", + mols.qv.threshold = 20 ) { # Argument checking type <- match.arg( @@ -2025,17 +2025,17 @@ ReadXenium <- function( choices = c("centroids", "segmentations"), several.ok = TRUE ) - + outs <- match.arg( arg = outs, choices = c("matrix", "microns"), several.ok = TRUE ) - + outs <- c(outs, type) - + has_dt <- requireNamespace("data.table", quietly = TRUE) && requireNamespace("R.utils", quietly = TRUE) - + data <- sapply(outs, function(otype) { switch( EXPR = otype, @@ -2074,7 +2074,7 @@ ReadXenium <- function( class = 'sticky', amount = 0 ) - + # load cell boundaries if (has_dt) { cell_boundaries_df <- as.data.frame(data.table::fread(file.path(data.dir, "cell_boundaries.csv.gz"))) @@ -2092,7 +2092,7 @@ ReadXenium <- function( class = 'sticky', amount = 0 ) - + # molecules if (has_dt) { tx_dt <- as.data.frame(data.table::fread(file.path(data.dir, "transcripts.csv.gz"))) @@ -2101,7 +2101,7 @@ ReadXenium <- function( transcripts <- read.csv(file.path(data.dir, "transcripts.csv.gz")) transcripts <- subset(transcripts, qv >= mols.qv.threshold) } - + df <- data.frame( x = transcripts$x_location, @@ -2219,11 +2219,11 @@ ReadSlideSeq <- function(coord.file, assay = 'Spatial') { #' } #' ReadVitessce <- function( - counts = NULL, - coords = NULL, - molecules = NULL, - type = c('segmentations', 'centroids'), - filter = 
NA_character_ + counts = NULL, + coords = NULL, + molecules = NULL, + type = c('segmentations', 'centroids'), + filter = NA_character_ ) { if (!requireNamespace('jsonlite', quietly = TRUE)) { stop("Please install 'jsonlite' for this function") @@ -2724,11 +2724,15 @@ ReadVizgen <- function( full.names = TRUE, recursive = TRUE) if (length(files2scan)) { - file2read <- files2scan %>% + file2read <- + files2scan %>% grep(coord.space, ., value = TRUE) %>% - grep(".parquet$", ., value = TRUE) + { if (is.empty(.)) { + # support only single .parquet file + files2scan %>% + grep(".parquet$", ., value = TRUE) + } else { (.) } } } - # read .parquet file ---- parq <- sfarrow::st_read_parquet(file2read) @@ -2989,9 +2993,9 @@ RelativeCounts <- function(data, scale.factor = 1, verbose = TRUE) { #' @concept preprocessing #' RunMarkVario <- function( - spatial.location, - data, - ... + spatial.location, + data, + ... ) { pp <- ppp( x = spatial.location[, 1], @@ -3107,10 +3111,10 @@ RunMoransI <- function(data, pos, verbose = TRUE) { #' head(x = downsampled) #' SampleUMI <- function( - data, - max.umi = 1000, - upsample = FALSE, - verbose = FALSE + data, + max.umi = 1000, + upsample = FALSE, + verbose = FALSE ) { data <- as.sparse(x = data) if (length(x = max.umi) == 1) { @@ -3195,24 +3199,24 @@ SampleUMI <- function( #' SCTransform(object = pbmc_small) #' SCTransform <- function( - object, - assay = 'RNA', - new.assay.name = 'SCT', - reference.SCT.model = NULL, - do.correct.umi = TRUE, - ncells = 5000, - residual.features = NULL, - variable.features.n = 3000, - variable.features.rv.th = 1.3, - vars.to.regress = NULL, - do.scale = FALSE, - do.center = TRUE, - clip.range = c(-sqrt(x = ncol(x = object[[assay]]) / 30), sqrt(x = ncol(x = object[[assay]]) / 30)), - conserve.memory = FALSE, - return.only.var.genes = TRUE, - seed.use = 1448145, - verbose = TRUE, - ... + object, + assay = 'RNA', + new.assay.name = 'SCT', + reference.SCT.model = NULL, + do.correct.umi = TRUE, + ncells = 5000, + residual.features = NULL, + variable.features.n = 3000, + variable.features.rv.th = 1.3, + vars.to.regress = NULL, + do.scale = FALSE, + do.center = TRUE, + clip.range = c(-sqrt(x = ncol(x = object[[assay]]) / 30), sqrt(x = ncol(x = object[[assay]]) / 30)), + conserve.memory = FALSE, + return.only.var.genes = TRUE, + seed.use = 1448145, + verbose = TRUE, + ... 
) { if (!is.null(x = seed.use)) { set.seed(seed = seed.use) @@ -3242,7 +3246,7 @@ SCTransform <- function( if (reference.SCT.model$model_str != 'y ~ log_umi') { stop('reference.SCT.model must be derived using default SCT regression formula, `y ~ log_umi`') } - + } # check for latent_var in meta data if ('latent_var' %in% names(x = vst.args)) { @@ -3281,7 +3285,7 @@ SCTransform <- function( vst.args[['n_cells']] <- min(ncells, ncol(x = umi)) residual.type <- vst.args[['residual_type']] %||% 'pearson' res.clip.range <- vst.args[['res_clip_range']] %||% c(-sqrt(x = ncol(x = umi)), sqrt(x = ncol(x = umi))) - # set sct normalization method + # set sct normalization method if (!is.null( reference.SCT.model)) { sct.method <- "reference.model" } else if (!is.null(x = residual.features)) { @@ -3291,7 +3295,7 @@ SCTransform <- function( } else { sct.method <- "default" } - + # set vst model vst.out <- switch( EXPR = sct.method, @@ -3348,7 +3352,7 @@ SCTransform <- function( vst.out <- do.call(what = 'vst', args = vst.args) vst.out }) - + feature.variance <- vst.out$gene_attr[,"residual_variance"] names(x = feature.variance) <- rownames(x = vst.out$gene_attr) if (verbose) { @@ -3360,7 +3364,7 @@ SCTransform <- function( } else { top.features <- names(x = feature.variance)[feature.variance >= variable.features.rv.th] } - + # get residuals vst.out <- switch( EXPR = sct.method, @@ -3570,16 +3574,16 @@ SubsetByBarcodeInflections <- function(object) { #' @export #' FindVariableFeatures.default <- function( - object, - selection.method = "vst", - loess.span = 0.3, - clip.max = 'auto', - mean.function = FastExpMean, - dispersion.function = FastLogVMR, - num.bin = 20, - binning.method = "equal_width", - verbose = TRUE, - ... + object, + selection.method = "vst", + loess.span = 0.3, + clip.max = 'auto', + mean.function = FastExpMean, + dispersion.function = FastLogVMR, + num.bin = 20, + binning.method = "equal_width", + verbose = TRUE, + ... ) { CheckDots(...) if (!inherits(x = object, 'Matrix')) { @@ -3670,19 +3674,19 @@ FindVariableFeatures.default <- function( #' @method FindVariableFeatures Assay #' FindVariableFeatures.Assay <- function( - object, - selection.method = "vst", - loess.span = 0.3, - clip.max = 'auto', - mean.function = FastExpMean, - dispersion.function = FastLogVMR, - num.bin = 20, - binning.method = "equal_width", - nfeatures = 2000, - mean.cutoff = c(0.1, 8), - dispersion.cutoff = c(1, Inf), - verbose = TRUE, - ... + object, + selection.method = "vst", + loess.span = 0.3, + clip.max = 'auto', + mean.function = FastExpMean, + dispersion.function = FastLogVMR, + num.bin = 20, + binning.method = "equal_width", + nfeatures = 2000, + mean.cutoff = c(0.1, 8), + dispersion.cutoff = c(1, Inf), + verbose = TRUE, + ... ) { if (length(x = mean.cutoff) != 2 || length(x = dispersion.cutoff) != 2) { stop("Both 'mean.cutoff' and 'dispersion.cutoff' must be two numbers") @@ -3749,9 +3753,9 @@ FindVariableFeatures.Assay <- function( #' @method FindVariableFeatures SCTAssay #' FindVariableFeatures.SCTAssay <- function( - object, - nfeatures = 2000, - ... + object, + nfeatures = 2000, + ... ) { if (length(x = slot(object = object, name = "SCTModel.list")) > 1) { stop("SCT assay is comprised of multiple SCT models. To change the variable features, please set manually with VariableFeatures<-", call. 
= FALSE) @@ -3771,20 +3775,20 @@ FindVariableFeatures.SCTAssay <- function( #' @method FindVariableFeatures Seurat #' FindVariableFeatures.Seurat <- function( - object, - assay = NULL, - selection.method = "vst", - loess.span = 0.3, - clip.max = 'auto', - mean.function = FastExpMean, - dispersion.function = FastLogVMR, - num.bin = 20, - binning.method = "equal_width", - nfeatures = 2000, - mean.cutoff = c(0.1, 8), - dispersion.cutoff = c(1, Inf), - verbose = TRUE, - ... + object, + assay = NULL, + selection.method = "vst", + loess.span = 0.3, + clip.max = 'auto', + mean.function = FastExpMean, + dispersion.function = FastLogVMR, + num.bin = 20, + binning.method = "equal_width", + nfeatures = 2000, + mean.cutoff = c(0.1, 8), + dispersion.cutoff = c(1, Inf), + verbose = TRUE, + ... ) { assay <- assay %||% DefaultAssay(object = object) assay.data <- GetAssay(object = object, assay = assay) @@ -3840,14 +3844,14 @@ FindVariableFeatures.Seurat <- function( #' #' FindSpatiallyVariableFeatures.default <- function( - object, - spatial.location, - selection.method = c('markvariogram', 'moransi'), - r.metric = 5, - x.cuts = NULL, - y.cuts = NULL, - verbose = TRUE, - ... + object, + spatial.location, + selection.method = c('markvariogram', 'moransi'), + r.metric = 5, + x.cuts = NULL, + y.cuts = NULL, + verbose = TRUE, + ... ) { # error check dimensions if (ncol(x = object) != nrow(x = spatial.location)) { @@ -3892,17 +3896,17 @@ FindSpatiallyVariableFeatures.default <- function( #' @export #' FindSpatiallyVariableFeatures.Assay <- function( - object, - slot = "scale.data", - spatial.location, - selection.method = c('markvariogram', 'moransi'), - features = NULL, - r.metric = 5, - x.cuts = NULL, - y.cuts = NULL, - nfeatures = nfeatures, - verbose = TRUE, - ... + object, + slot = "scale.data", + spatial.location, + selection.method = c('markvariogram', 'moransi'), + features = NULL, + r.metric = 5, + x.cuts = NULL, + y.cuts = NULL, + nfeatures = nfeatures, + verbose = TRUE, + ... ) { features <- features %||% rownames(x = object) if (selection.method == "markvariogram" && "markvariogram" %in% names(x = Misc(object = object))) { @@ -3984,18 +3988,18 @@ FindSpatiallyVariableFeatures.Assay <- function( #' @export #' FindSpatiallyVariableFeatures.Seurat <- function( - object, - assay = NULL, - slot = "scale.data", - features = NULL, - image = NULL, - selection.method = c('markvariogram', 'moransi'), - r.metric = 5, - x.cuts = NULL, - y.cuts = NULL, - nfeatures = 2000, - verbose = TRUE, - ... + object, + assay = NULL, + slot = "scale.data", + features = NULL, + image = NULL, + selection.method = c('markvariogram', 'moransi'), + r.metric = 5, + x.cuts = NULL, + y.cuts = NULL, + nfeatures = 2000, + verbose = TRUE, + ... ) { assay <- assay %||% DefaultAssay(object = object) features <- features %||% rownames(x = object[[assay]]) @@ -4043,13 +4047,13 @@ FindSpatiallyVariableFeatures.Seurat <- function( #' @export #' NormalizeData.default <- function( - object, - normalization.method = "LogNormalize", - scale.factor = 1e4, - margin = 1, - block.size = NULL, - verbose = TRUE, - ... + object, + normalization.method = "LogNormalize", + scale.factor = 1e4, + margin = 1, + block.size = NULL, + verbose = TRUE, + ... ) { CheckDots(...) if (is.null(x = normalization.method)) { @@ -4150,12 +4154,12 @@ NormalizeData.default <- function( #' @method NormalizeData Assay #' NormalizeData.Assay <- function( - object, - normalization.method = "LogNormalize", - scale.factor = 1e4, - margin = 1, - verbose = TRUE, - ... 
+ object, + normalization.method = "LogNormalize", + scale.factor = 1e4, + margin = 1, + verbose = TRUE, + ... ) { object <- SetAssayData( object = object, @@ -4187,13 +4191,13 @@ NormalizeData.Assay <- function( #' } #' NormalizeData.Seurat <- function( - object, - assay = NULL, - normalization.method = "LogNormalize", - scale.factor = 1e4, - margin = 1, - verbose = TRUE, - ... + object, + assay = NULL, + normalization.method = "LogNormalize", + scale.factor = 1e4, + margin = 1, + verbose = TRUE, + ... ) { assay <- assay %||% DefaultAssay(object = object) assay.data <- GetAssay(object = object, assay = assay) @@ -4243,20 +4247,20 @@ NormalizeData.Seurat <- function( #' @export #' ScaleData.default <- function( - object, - features = NULL, - vars.to.regress = NULL, - latent.data = NULL, - split.by = NULL, - model.use = 'linear', - use.umi = FALSE, - do.scale = TRUE, - do.center = TRUE, - scale.max = 10, - block.size = 1000, - min.cells.to.block = 3000, - verbose = TRUE, - ... + object, + features = NULL, + vars.to.regress = NULL, + latent.data = NULL, + split.by = NULL, + model.use = 'linear', + use.umi = FALSE, + do.scale = TRUE, + do.center = TRUE, + scale.max = 10, + block.size = 1000, + min.cells.to.block = 3000, + verbose = TRUE, + ... ) { CheckDots(...) features <- features %||% rownames(x = object) @@ -4484,20 +4488,20 @@ ScaleData.default <- function( #' @method ScaleData Assay #' ScaleData.Assay <- function( - object, - features = NULL, - vars.to.regress = NULL, - latent.data = NULL, - split.by = NULL, - model.use = 'linear', - use.umi = FALSE, - do.scale = TRUE, - do.center = TRUE, - scale.max = 10, - block.size = 1000, - min.cells.to.block = 3000, - verbose = TRUE, - ... + object, + features = NULL, + vars.to.regress = NULL, + latent.data = NULL, + split.by = NULL, + model.use = 'linear', + use.umi = FALSE, + do.scale = TRUE, + do.center = TRUE, + scale.max = 10, + block.size = 1000, + min.cells.to.block = 3000, + verbose = TRUE, + ... ) { use.umi <- ifelse(test = model.use != 'linear', yes = TRUE, no = use.umi) slot.use <- ifelse(test = use.umi, yes = 'counts', no = 'data') @@ -4542,20 +4546,20 @@ ScaleData.Assay <- function( #' @method ScaleData Seurat #' ScaleData.Seurat <- function( - object, - features = NULL, - assay = NULL, - vars.to.regress = NULL, - split.by = NULL, - model.use = 'linear', - use.umi = FALSE, - do.scale = TRUE, - do.center = TRUE, - scale.max = 10, - block.size = 1000, - min.cells.to.block = 3000, - verbose = TRUE, - ... + object, + features = NULL, + assay = NULL, + vars.to.regress = NULL, + split.by = NULL, + model.use = 'linear', + use.umi = FALSE, + do.scale = TRUE, + do.center = TRUE, + scale.max = 10, + block.size = 1000, + min.cells.to.block = 3000, + verbose = TRUE, + ... 
) { assay <- assay %||% DefaultAssay(object = object) assay.data <- GetAssay(object = object, assay = assay) @@ -4955,13 +4959,13 @@ FindThresh <- function(call.list) { #' @importFrom sctransform get_residuals # GetResidualSCTModel <- function( - object, - assay, - SCTModel, - new_features, - clip.range, - replace.value, - verbose + object, + assay, + SCTModel, + new_features, + clip.range, + replace.value, + verbose ) { clip.range <- clip.range %||% SCTResults(object = object[[assay]], slot = "clips", model = SCTModel)$sct model.features <- rownames(x = SCTResults(object = object[[assay]], slot = "feature.attributes", model = SCTModel)) @@ -4970,14 +4974,14 @@ GetResidualSCTModel <- function( sct.method <- SCTResults(object = object[[assay]], slot = "arguments", model = SCTModel)$sct.method %||% "default" scale.data.cells <- colnames(x = GetAssayData(object = object, assay = assay, slot = "scale.data")) if (length(x = setdiff(x = model.cells, y = scale.data.cells)) == 0) { - existing_features <- names(x = which(x = ! apply( - X = GetAssayData(object = object, assay = assay, slot = "scale.data")[, model.cells], - MARGIN = 1, - FUN = anyNA) - )) - } else { - existing_features <- character() - } + existing_features <- names(x = which(x = ! apply( + X = GetAssayData(object = object, assay = assay, slot = "scale.data")[, model.cells], + MARGIN = 1, + FUN = anyNA) + )) + } else { + existing_features <- character() + } if (replace.value) { features_to_compute <- new_features } else { @@ -4991,7 +4995,7 @@ GetResidualSCTModel <- function( } if (!umi.assay %in% Assays(object = object)) { warning("The umi assay (", umi.assay, ") is not present in the object. ", - "Cannot compute additional residuals.", call. = FALSE, immediate. = TRUE) + "Cannot compute additional residuals.", call. = FALSE, immediate. = TRUE) return(NULL) } diff_features <- setdiff(x = features_to_compute, y = model.features) @@ -5129,12 +5133,12 @@ NBResiduals <- function(fmla, regression.mat, gene, return.mode = FALSE) { #' @importFrom utils txtProgressBar setTxtProgressBar # RegressOutMatrix <- function( - data.expr, - latent.data = NULL, - features.regress = NULL, - model.use = NULL, - use.umi = FALSE, - verbose = TRUE + data.expr, + latent.data = NULL, + features.regress = NULL, + model.use = NULL, + use.umi = FALSE, + verbose = TRUE ) { # Do we bypass regression and simply return data.expr? 
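  # (i.e., skip the model fitting below and hand back the input matrix untouched)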
bypass <- vapply(
From 5ca10699f0d45cfd7adc9d844e24b96f508946ce Mon Sep 17 00:00:00 2001
From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com>
Date: Wed, 31 May 2023 22:26:56 +0200
Subject: [PATCH 10/24] major fix for `.parquet` segmentations

---
 R/preprocessing.R | 223 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 158 insertions(+), 65 deletions(-)

diff --git a/R/preprocessing.R b/R/preprocessing.R
index cfbb2eaaf..9bd1a8ef1 100644
--- a/R/preprocessing.R
+++ b/R/preprocessing.R
@@ -1,7 +1,5 @@
 #' @include generics.R
 #' @importFrom progressr progressor
-#' @import dplyr
-#' @importFrom magrittr %>% %<>%
 #'
 NULL
 
@@ -2397,8 +2395,8 @@ ReadVitessce <- function(
 #' @param z Z-index to load; must be between 0 and 6, inclusive
 #' @param use.BiocParallel If to use \code{BiocParallel::bplapply},
 #' default is \code{TRUE}, if \code{FALSE}, uses \code{future} library
-#' @param workers.total Number of cores to use for \code{BiocParallel::bplapply}
-#' @param DTthreads.pct Set percentage eg \code{50} of total threads to use for \code{data.table::fread},
+#' @param workers.MulticoreParam Number of cores to use for \code{BiocParallel::bplapply}
+#' @param DTthreads.pct Optional, set percentage eg \code{50} of total threads to use for \code{data.table::fread},
 #' if set to \code{NULL} will use default setting as in \code{data.table::getDTthreads(verbose = T)}
 #' @param coord.space Type of molecule spatial coordinate space to use;
 #' choose one or more of:
@@ -2406,6 +2404,8 @@ ReadVitessce <- function(
 #'  \item \dQuote{pixel}: molecule coordinates in pixel space
 #'  \item \dQuote{micron}: molecule coordinates in micron space
 #' }
+#' @param use.furrr When working with lists, use \code{furrr} with \code{future} parallelization;
+#' if \code{FALSE} standard \code{purrr} will be used
 #' @param use.cellpose.out If \code{TRUE}, and \code{./Cellpose} folder exists,
 #' will load results from current MERSCOPE Instrument output. Default to \code{TRUE};
 #' set it to \code{FALSE} if to use previous outputs (ie. non-Cellpose).
@@ -2431,6 +2431,13 @@ ReadVitessce <- function(
 #' }
 #'
 #' @importFrom future.apply future_lapply
+#' @import dplyr
+#' @import tibble
+#' @import purrr
+#' @import furrr
+#' @import magrittr
+#' @importFrom spatstat.geom is.empty
+#' @importFrom progressr progressor
 #'
 #' @export
 #'
@@ -2449,38 +2456,38 @@ ReadVizgen <- function(
   transcripts = NULL,
   spatial = NULL,
   molecules = NULL,
-  type = 'segmentations',
+  type = c('segmentations', 'centroids'),
   mol.type = 'microns',
   metadata = NULL,
   filter = NA_character_,
   z = 3L,
   use.BiocParallel = TRUE,
-  workers.total = 12,
+  workers.MulticoreParam = 12,
   DTthreads.pct = NULL,
-  coord.space = "micron",
+  coord.space = 'micron',
+  use.furrr = TRUE,
   use.cellpose.out = TRUE
 ) {
   # TODO: handle multiple segmentations per z-plane
+  # NOTE: this is only needed when segmentations differ between z-planes

   # packages that need to be installed a priori
   pkgs <- c("data.table", "arrow", "sfarrow", 
             "tidyverse", "furrr", "BiocParallel")
   lapply(pkgs %>% length %>% seq, function(i) 
    { !requireNamespace(pkgs[i], quietly = TRUE) } ) %>% 
    unlist %>% 
    { if (c(which(.) 
> 0) %>% any()) + { c("Please install ->", "\n", + paste0("'", pkgs[which(.)], "'", collapse = ", "), " for this function") %>% + stop(., call. = FALSE) } } # setting workers to use for parallel computing - `parallel` ---- if (use.BiocParallel) { message("Using parallelization with: `BiocParallel`") - if (is.null(workers.total)) { - workers.total <- quantile(BiocParallel::multicoreWorkers() %>% seq)[4] %>% ceiling - message(workers.total, " of total workers available will be used") - } else { message("Setting total workers to: ", workers.total) } + if (is.null(workers.MulticoreParam)) { + workers.MulticoreParam <- quantile(BiocParallel::multicoreWorkers() %>% seq)[4] %>% ceiling + message(workers.MulticoreParam, " of total workers available will be used") + } else { message("Setting total workers to: ", workers.MulticoreParam) } } else { message("Using parallelization with: `future`", "\n", "NOTE: set workers for parallelization, eg: `future::plan('multisession', workers = 10)`") } @@ -2529,19 +2536,25 @@ ReadVizgen <- function( )) if (use.dir && !dir.exists(paths = data.dir)) { stop("Cannot find Vizgen directory ", data.dir) + } else { + message("Reading data from:" , "\n", data.dir) } + # TODO: + # - optimize to find and read only ".parquet$" files + # - else read .hdf5 file from ./cell_boundaries dir + # use Cellpose output ---- if (use.cellpose.out && grep("Cellpose", list.files(data.dir)) %>% any) { message("Cellpose output will be used..") # Identify input files files <- c(transcripts = transcripts %||% paste0(data.dir, paste0("/", list.files(data.dir, pattern = "Cellpose"), - "/cellpose_cell_by_gene.csv")), + "/*cell_by_gene.csv")), spatial = spatial %||% paste0(data.dir, paste0("/", list.files(data.dir, pattern = "Cellpose"), - "/cellpose_cell_metadata.csv")), + "/*cell_metadata.csv")), molecules = molecules %||% "detected_transcripts[_a-zA-Z0-9]*.csv") } else { @@ -2610,9 +2623,6 @@ ReadVizgen <- function( immediate. = TRUE ) FALSE - } else if (!dir.exists(paths = h5dir) && !use.cellpose.out) { - warning("Cannot find cell boundary H5 files", immediate. = TRUE) - FALSE } else if (use.cellpose.out) { # added for Cellpose output ---- files2scan <- list.files(data.dir, pattern = ".parquet$", @@ -2621,7 +2631,7 @@ ReadVizgen <- function( recursive = TRUE) if (length(files2scan)) { message("Cell segmentations are found in `.parquet` file(s)", "\n", - "..using ", coord.space, " space coordinates") + ">>> using ", coord.space, " space coordinates") } TRUE } @@ -2733,70 +2743,151 @@ ReadVizgen <- function( grep(".parquet$", ., value = TRUE) } else { (.) 
} } }
      
      # Read .parquet file
      parq <- sfarrow::st_read_parquet(file2read)
      
      # get all cell segmentations
      segs <- filter(parq, ZIndex == z) %>% 
        pull(Geometry) %>%
        # add cell ID
        set_names(filter(parq, ZIndex == z) %>% 
                    pull(EntityID) %>% as.character)
      
      ## Sanity checks on segmentation polygons - part 1 ----
      test.segs <- lapply(segs %>% seq, 
                          function(i) segs[[i]] %>% length) %>% unlist()
      if (any(test.segs > 1)) {
        segs.art.index <- which(test.segs > 1)
        segs.empty.index <- which(test.segs < 1)
        message("Sanity checks on cell segmentation polygons:", "\n",
                ">>> found ", segs.art.index %>% length, 
                " cells with > 1 (nested) polygon lists:", "\n",
                ">>> flattening polygon lists", "\n",
                ">>> removing ", segs.empty.index %>% length,
                " empty polygon lists")
        
        # step 1 - find polygon nested lists with > 1 length
        # (if exists) get originally planned future session
        if (is(future::plan(), "multisession")) {
          orig.plan <- future::plan()
        }
        segs.1 <- segs %>%
          purrr::keep(., c(purrr::map(., length) %>% unlist > 1)) %>%
          # flatten each list
          {
            if (use.furrr) {
              # set temporary workers
              f.plan <- future::plan("multisession", workers = 4L, gc = TRUE)
              #on.exit(f.plan %>% future::plan()) # exiting doesn't work with furrr
              # using furrr (ie, faster purrr with future)
              furrr::future_map(., purrr::flatten)
            } else {
              purrr::map(., purrr::flatten)
            }
            # since furrr loads future, hide startup messages
          } %>% suppressPackageStartupMessages()
        # set back originally planned session
        if (exists("orig.plan")) {
          future::plan(orig.plan)
        } else {
          #..or plan sequential
          future::plan("sequential")
        }
        
        # step 2 - apply multiple filtering
        segs.2 <- 
          segs %>%
          # remove empty elements
          purrr::keep(., !c(purrr::map(., length) %>% unlist < 1)) %>%
          # keep lists with length == 1 polygon information
          purrr::keep(., c(purrr::map(., length) %>% unlist == 1)) %>%
          # collapse to a list
          purrr::flatten(.) 
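        # [editor's note, hedged] A minimal standalone sketch of the
        # keep()/flatten() normalization performed in steps 1-2 above;
        # `toy.segs` and its matrices are hypothetical stand-ins for `segs`,
        # not objects created by this patch.
        toy.segs <- list(
          cellA = list(list(matrix(1:8, ncol = 2)),
                       list(matrix(1:6, ncol = 2))),  # length 2 -> nested, gets flattened
          cellB = list(),                             # length 0 -> empty, gets dropped
          cellC = list(matrix(1:10, ncol = 2))        # length 1 -> kept as-is
        )
        toy.1 <- purrr::map(purrr::keep(toy.segs, ~ length(.x) > 1), purrr::flatten)
        toy.2 <- purrr::keep(toy.segs, ~ length(.x) == 1)
        toy.clean <- c(toy.1, toy.2)  # analogue of `segs <- c(segs.1, segs.2)` in step 3 below
        lengths(toy.clean)            # cellA = 2 polygons, cellC = 1 polygon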
+
+      # step 3 - combine segmentation lists
+      segs <- c(segs.1, segs.2)
+      }
+      
+      ## Sanity checks on segmentation polygons - part 2 ----
       # check if any cells have > 1 segmentation boundary
-      test.segs <-
-        lapply(segs %>% seq, function(i) segs[[i]][[1]] %>% length)
-      if (which(unlist(test.segs) > 1) %>% any) {
-        segs.art.index <- which(unlist(test.segs) > 1)
-        message(segs.art.index %>% length,
-                " Cells have > 1 segmentaion boundary artifacts", "\n",
-                "..removing artifacts", "\n",
-                "..keeping cell boundaries with maximum coords")
+      test.segs <- lapply(segs %>% seq, function(i) segs[[i]] %>% length) %>% unlist()
+      if (any(test.segs %>% unlist > 1)) {
+        segs.art.index <- which(test.segs %>% unlist > 1)
+        message("Found ", segs.art.index %>% length,
+                c(" cells with > 1 polygon artifacts:", 
+                  ">>> removing artifacts",
+                  ">>> keeping cell boundary with maximum coords") %>% 
+                  paste0(., "\n"))
         # usually artifacts have small boundaries/coords
         # find cell boundaries with maximum coords
+        
+        # TODO: (optionally)
+        # - calculate geometric properties: 
+        # eg, use sphericity to keep single polygon with high sphericity values
+        
         for (i in segs.art.index %>% seq) {
-          dims <- lapply(segs[[segs.art.index[i]]][[1]] %>% seq(),
-                         function(d) { dim(segs[[segs.art.index[i]]][[1]][[d]])[1] } )
+          dims <- lapply(segs[[segs.art.index[i]]] %>% seq(),
+                         function(d) { dim(segs[[segs.art.index[i]]][[d]])[1] } )
           # get & keep boundaries with maximum coords
-          maxs.segs <- which(unlist(dims) == max(unlist(dims)))
-          segs[[segs.art.index[i]]][[1]] <- segs[[segs.art.index[i]]][[1]][maxs.segs]
+          maxs.segs <- which(unlist(dims) == max(unlist(dims)))  
+          segs[[segs.art.index[i]]] <- segs[[segs.art.index[i]]][maxs.segs]
         }
-      } else { message("All cells have 1 segmentaion boundary (no artifacts)") }
-      # add cell names
-      names(segs) <- filter(parq, ZIndex == z) %>% pull(EntityID) %>% as.character
+      } else { message("All cells have single polygon boundary (no artifacts)") }
+      # some cells might have > 1 polygon boundary with identical length
+      #..in this case, keep only the 1st polygon boundary
+      test.segs <- lapply(segs %>% seq, function(i) segs[[i]] %>% length) %>% unlist()
+      if (any(test.segs > 1)) {
+        segs.art.index <- which(test.segs > 1)
+        message("Additionally found ", segs.art.index %>% length,
+                c(" cells with > 1 polygons (identical length):", 
+                  ">>> only the 1st polygon boundary will be kept") %>% 
+                  paste0(., "\n"))
+        for (i in segs.art.index %>% seq) {
+          # TODO: (optionally)
+          # - calculate geometric properties: 
+          # here too, use sphericity to keep single polygon with high sphericity values
+          segs[[segs.art.index[i]]] <- segs[[segs.art.index[i]]][1]
+        }
+      }
+      
+      ## Extract cell boundaries per cells
       # TODO: (optionally) resample & make cell boundaries equidistant!
+
       if (use.BiocParallel) {
-        # extract cell boundaries per cells (faster)
         gc() %>% invisible() # free up memory
-        message("Using fast extraction of cell boundaries..")
+        message("Extracting cell segmentations - using `BiocParallel`")
         segs_list <-
          BiocParallel::bplapply(segs %>% seq, function(i)
          {
            segs[[i]][[1]] %>%
-             data.table::as.data.table(x = .) %>%
+             data.table::as.data.table(.) 
%>% #as.data.frame %>% mutate(cell = names(segs)[i]) }, - BPPARAM = BiocParallel::MulticoreParam(workers.total, tasks = 100L, + BPPARAM = BiocParallel::MulticoreParam(workers.MulticoreParam, + tasks = 50L, force.GC = FALSE, progressbar = TRUE) ) } else { - # extract cell boundaries per cells - message("Extracting of cell boundaries..") + message("Extracting cell segmentations - using `future`") segs_list <- future.apply::future_lapply(segs %>% seq, function(i) { - segs[[i]][[1]] %>% - data.table::as.data.table(x = .) %>% + segs[[i]] %>% + data.table::as.data.table(.) %>% #as.data.frame %>% mutate(cell = names(segs)[i]) - } - ) + } + ) } - #segs_list %>% length # df of all cell segmentations - gc() %>% invisible() # free up memory - #segs <- do.call("rbind", segs_list) - segs <- data.table::rbindlist(segs_list) - names(segs)[1:2] <- c("x", "y") - message("All cell boundaries (with no artifacts) are loaded..") + segs <- + data.table::rbindlist(segs_list) %>% + data.table::setnames(c("x", "y", "cell")) + message("All cell segmentations are loaded..") segs } else { # use non-Cellpose segmentations ppoly <- progressor(steps = length(x = unique(x = sp$fov))) @@ -2827,12 +2918,12 @@ ReadVizgen <- function( }, error = function(...) { return(NULL) })) - }, BPPARAM = BiocParallel::MulticoreParam(workers.total, tasks = 100L, + }, BPPARAM = BiocParallel::MulticoreParam(workers.MulticoreParam, tasks = 50L, force.GC = FALSE, progressbar = TRUE)) ppoly() return(do.call(what = "rbind", args = df)) - }, BPPARAM = BiocParallel::MulticoreParam(workers.total, tasks = 100L, + }, BPPARAM = BiocParallel::MulticoreParam(workers.MulticoreParam, tasks = 50L, force.GC = FALSE, progressbar = TRUE)) } else { @@ -2891,7 +2982,7 @@ ReadVizgen <- function( df <- df[c(1, 3, 4, 2), , drop = FALSE] pbox() return(df) - }, BPPARAM = BiocParallel::MulticoreParam(workers.total, tasks = 100L, + }, BPPARAM = BiocParallel::MulticoreParam(workers.MulticoreParam, tasks = 50L, force.GC = FALSE, progressbar = TRUE) ) } else { @@ -2933,7 +3024,9 @@ ReadVizgen <- function( } # add z-slice index for cells ---- - outs$zIndex <- data.frame(z = rep_len(z, length.out = outs$centroids$cell %>% length), cell = outs$centroids$cell) + outs$zIndex <- + data.frame(z = rep_len(z, length.out = outs$centroids %>% pull(cell) %>% length), + cell = outs$centroids %>% pull(cell)) return(outs) } From 9389ce99c6ed892ef8a7019fa4a530c3b7a6bf09 Mon Sep 17 00:00:00 2001 From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com> Date: Wed, 31 May 2023 22:28:33 +0200 Subject: [PATCH 11/24] fix for `LoadVizgen` --- R/convenience.R | 165 +++++++++++++++++++++++++++--------------------- 1 file changed, 94 insertions(+), 71 deletions(-) diff --git a/R/convenience.R b/R/convenience.R index 5e5c16bfb..3e99f0dbd 100644 --- a/R/convenience.R +++ b/R/convenience.R @@ -1,6 +1,5 @@ #' @include generics.R #' @include visualization.R -#' @importFrom magrittr %>% %<>% #' NULL @@ -21,11 +20,11 @@ NULL #' @rdname ReadAkoya #' LoadAkoya <- function( - filename, - type = c('inform', 'processor', 'qupath'), - fov, - assay = 'Akoya', - ... + filename, + type = c('inform', 'processor', 'qupath'), + fov, + assay = 'Akoya', + ... 
) { # read in matrix and centroids data <- ReadAkoya(filename = filename, type = type) @@ -116,7 +115,7 @@ LoadNanostring <- function(data.dir, fov, assay = 'Nanostring') { assay = assay ) obj <- CreateSeuratObject(counts = data$matrix, assay = assay) - + # subset both object and coords based on the cells shared by both cells <- intersect( Cells(x = coords, boundary = "segmentation"), @@ -136,6 +135,7 @@ LoadNanostring <- function(data.dir, fov, assay = 'Nanostring') { #' #' @importFrom SeuratObject Cells CreateCentroids CreateFOV #' CreateSegmentation CreateSeuratObject +#' @import dplyr #' #' @export #' @@ -146,17 +146,32 @@ LoadVizgen <- function(data.dir, fov = "vz", assay = 'Vizgen', ...) { data <- ReadVizgen(data.dir = data.dir, ...) - # if "segmentations" are not present, use cell bounding boxes instead - if (!"segmentations" %in% names(data)) { - bound.boxes <- CreateSegmentation(data[["boxes"]]) - cents <- CreateCentroids(data[["centroids"]]) - bound.boxes.data <- list(centroids = cents, boxes = bound.boxes) - message("Creating FOVs", "\n", - "..using box coordinates instead of segmentations") - coords <- CreateFOV(coords = bound.boxes.data, type = c("boxes", - "centroids"), molecules = data[[mol.type]], assay = assay) - message("Creating Seurat object") - obj <- CreateSeuratObject(counts = data[["transcripts"]], assay = assay) + #gc() %>% invisible() + message("Creating Seurat object..") + obj <- CreateSeuratObject(counts = data[["transcripts"]], assay = assay) + + # in case no segmentation is present, use boxes + if (!"segmentations" %in% names(data)) { + if ("boxes" %in% names(data)) { + bound.boxes <- CreateSegmentation(data[["boxes"]]) + cents <- CreateCentroids(data[["centroids"]]) + bound.boxes.data <- list(centroids = cents, + boxes = bound.boxes) + message("Creating FOVs..", "\n", + ">>> using box coordinates instead of segmentations") + coords <- CreateFOV(coords = bound.boxes.data, + type = c("boxes", "centroids"), + molecules = data[[mol.type]], + assay = assay) + } else { # in case no segmentation & no boxes are present, use centroids only + cents <- CreateCentroids(data[["centroids"]]) + message("Creating FOVs..", "\n", + ">>> using only centroids") + coords <- CreateFOV(coords = list(centroids = cents), + type = c("centroids"), + molecules = data[[mol.type]], + assay = assay) + } # only consider the cells we have counts and a segmentation for # Cells which don't have a segmentation are probably found in other z slices. coords <- subset(x = coords, @@ -166,24 +181,22 @@ LoadVizgen <- function(data.dir, fov = "vz", assay = 'Vizgen', segs <- CreateSegmentation(data[["segmentations"]]) cents <- CreateCentroids(data[["centroids"]]) segmentations.data <- list(centroids = cents, segmentation = segs) - message("Creating FOVs", "\n", - "..using segmentations") - coords <- CreateFOV(coords = segmentations.data, type = c("segmentation", - "centroids"), molecules = data[[mol.type]], assay = assay) - message("Creating Seurat object..") - obj <- CreateSeuratObject(counts = data[["transcripts"]], assay = assay) - # only consider the cells we have counts and a segmentation for - # Cells which don't have a segmentation are probably found in other z slices. 
+ message("Creating FOVs..", "\n", + ">>> using segmentations") + coords <- CreateFOV(coords = segmentations.data, + type = c("segmentation", "centroids"), + molecules = data[[mol.type]], + assay = assay) coords <- subset(x = coords, cells = intersect(x = Cells(x = coords[["segmentation"]]), y = Cells(x = obj))) } # add z-stack index for cells - if (add.zIndex) { obj$z <- data$zIndex$z } + if (add.zIndex) { obj$z <- data$zIndex %>% pull(z) } # add metadata vars - message("..adding metadata infos") + message(">>> adding metadata infos") if (c("metadata" %in% names(data))) { metadata <- match.arg(arg = "metadata", choices = names(data), several.ok = TRUE) meta.vars <- names(data[[metadata]]) @@ -196,16 +209,26 @@ LoadVizgen <- function(data.dir, fov = "vz", assay = 'Vizgen', # sanity on fov name fov %<>% gsub("_|-", ".", .) - #message("..adding FOV") - # add coords to seurat object + message(">>> adding FOV") obj[[fov]] <- coords + ## filter - keep cells with counts > 0 + # small helper function to return metadata + callmeta <- function (object = NULL) { return(object@meta.data) } + nCount <- grep("nCount", callmeta(obj) %>% names, value = TRUE) + if (any(obj[[nCount]] > 0)) { + message(">>> filtering object - keep cells with counts > 0") + obj %<>% subset(subset = !!base::as.symbol(nCount) > 0) + } + if (update.object) { message("Updating object..") obj %<>% UpdateSeuratObject() } - #message("Object is ready!") + message("Object is ready!") return(obj) + + gc() %>% invisible() } #' @return \code{LoadXenium}: A \code{\link[SeuratObject]{Seurat}} object @@ -226,7 +249,7 @@ LoadXenium <- function(data.dir, fov = 'fov', assay = 'Xenium') { data.dir = data.dir, type = c("centroids", "segmentations"), ) - + segmentations.data <- list( "centroids" = CreateCentroids(data$centroids), "segmentation" = CreateSegmentation(data$segmentations) @@ -237,7 +260,7 @@ LoadXenium <- function(data.dir, fov = 'fov', assay = 'Xenium') { molecules = data$microns, assay = assay ) - + xenium.obj <- CreateSeuratObject(counts = data$matrix[["Gene Expression"]], assay = assay) if("Blank Codeword" %in% names(data$matrix)) xenium.obj[["BlankCodeword"]] <- CreateAssayObject(counts = data$matrix[["Blank Codeword"]]) @@ -245,7 +268,7 @@ LoadXenium <- function(data.dir, fov = 'fov', assay = 'Xenium') { xenium.obj[["BlankCodeword"]] <- CreateAssayObject(counts = data$matrix[["Unassigned Codeword"]]) xenium.obj[["ControlCodeword"]] <- CreateAssayObject(counts = data$matrix[["Negative Control Codeword"]]) xenium.obj[["ControlProbe"]] <- CreateAssayObject(counts = data$matrix[["Negative Control Probe"]]) - + xenium.obj[[fov]] <- coords return(xenium.obj) } @@ -279,27 +302,27 @@ PCAPlot <- function(object, ...) 
{
 #' @export
 #'
 SpatialDimPlot <- function(
-  object,
-  group.by = NULL,
-  images = NULL,
-  cols = NULL,
-  crop = TRUE,
-  cells.highlight = NULL,
-  cols.highlight = c('#DE2D26', 'grey50'),
-  facet.highlight = FALSE,
-  label = FALSE,
-  label.size = 7,
-  label.color = 'white',
-  repel = FALSE,
-  ncol = NULL,
-  combine = TRUE,
-  pt.size.factor = 1.6,
-  alpha = c(1, 1),
-  image.alpha = 1,
-  stroke = 0.25,
-  label.box = TRUE,
-  interactive = FALSE,
-  information = NULL
+  object,
+  group.by = NULL,
+  images = NULL,
+  cols = NULL,
+  crop = TRUE,
+  cells.highlight = NULL,
+  cols.highlight = c('#DE2D26', 'grey50'),
+  facet.highlight = FALSE,
+  label = FALSE,
+  label.size = 7,
+  label.color = 'white',
+  repel = FALSE,
+  ncol = NULL,
+  combine = TRUE,
+  pt.size.factor = 1.6,
+  alpha = c(1, 1),
+  image.alpha = 1,
+  stroke = 0.25,
+  label.box = TRUE,
+  interactive = FALSE,
+  information = NULL
 ) {
   return(SpatialPlot(
     object = object,
@@ -332,22 +355,22 @@ SpatialDimPlot <- function(
 #' @export
 #'
 SpatialFeaturePlot <- function(
-  object,
-  features,
-  images = NULL,
-  crop = TRUE,
-  slot = 'data',
-  keep.scale = "feature",
-  min.cutoff = NA,
-  max.cutoff = NA,
-  ncol = NULL,
-  combine = TRUE,
-  pt.size.factor = 1.6,
-  alpha = c(1, 1),
-  image.alpha = 1,
-  stroke = 0.25,
-  interactive = FALSE,
-  information = NULL
+  object,
+  features,
+  images = NULL,
+  crop = TRUE,
+  slot = 'data',
+  keep.scale = "feature",
+  min.cutoff = NA,
+  max.cutoff = NA,
+  ncol = NULL,
+  combine = TRUE,
+  pt.size.factor = 1.6,
+  alpha = c(1, 1),
+  image.alpha = 1,
+  stroke = 0.25,
+  interactive = FALSE,
+  information = NULL
 ) {
   return(SpatialPlot(
     object = object,
From dc029acbe9066a0a41f5fbdd786c14691bb55410 Mon Sep 17 00:00:00 2001
From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com>
Date: Tue, 6 Jun 2023 11:22:08 +0200
Subject: [PATCH 12/24] major fix for `ReadVizgen()` ..reads old-new data,
 optimized parallelization, few bugs fixed

---
 R/preprocessing.R | 345 +++++++++++++++++++++++++++------------------
 1 file changed, 207 insertions(+), 138 deletions(-)

diff --git a/R/preprocessing.R b/R/preprocessing.R
index 1d67359f0..9bd1a8ef1 100644
--- a/R/preprocessing.R
+++ b/R/preprocessing.R
@@ -2398,17 +2398,9 @@ ReadVitessce <- function(
 #' @param z Z-index to load; must be between 0 and 6, inclusive
 #' @param use.BiocParallel If to use \code{BiocParallel::bplapply},
 #' default is \code{TRUE}, if \code{FALSE}, uses \code{future} library
-#' @param workers.MulticoreParam Number of cores to use for \code{BiocParallel::bplapply}
-#' @param DTthreads.pct Set percentage eg \code{50} of total threads to use for \code{data.table::fread},
+#' @param workers.MulticoreParam Number of cores to use for \code{BiocParallel::bplapply}
+#' @param DTthreads.pct Optional, set percentage eg \code{50} of total threads to use for \code{data.table::fread},
 #' if set to \code{NULL} will use default setting as in \code{data.table::getDTthreads(verbose = T)}
-#' @param coord.space Type of molecule spatial coordinate space to use;
-#' choose one or more of:
-#' \itemize{
-#'  \item \dQuote{pixel}: molecule coordinates in pixel space
-#'  \item \dQuote{micron}: molecule coordinates in micron space
-#' }
 #' @param use.furrr When working with lists, use \code{furrr} with \code{future} parallelization;
 #' if \code{FALSE} standard \code{purrr} will be used
+#' @param verbose If to print processing messages using \code{message}; default is to \code{FALSE} #' #' @return \code{ReadVizgen}: A list with some combination of the #' following values: @@ -2456,7 +2448,7 @@ ReadVizgen <- function( transcripts = NULL, spatial = NULL, molecules = NULL, - type = c('segmentations', 'centroids'), + type = c('centroids', 'segmentations'), mol.type = 'microns', metadata = NULL, filter = NA_character_, @@ -2464,11 +2456,11 @@ ReadVizgen <- function( use.BiocParallel = TRUE, workers.MulticoreParam = 12, DTthreads.pct = NULL, - coord.space = 'micron', use.furrr = TRUE, - use.cellpose.out = TRUE + verbose = FALSE ) { # TODO: handle multiple segmentations per z-plane + # NOTE: this is only needed when segmentations differ between z-planes # packages that needs to be installed a priori pkgs <- c("data.table", "arrow", "sfarrow", @@ -2481,25 +2473,37 @@ ReadVizgen <- function( paste0("'", pkgs[which(.)], "'", collapse = ", "), " for this function") %>% stop(., call. = FALSE) } } - # setting workers to use for parallel computing - `parallel` ---- + # setting workers to use for parallelization - `BiocParallel` ---- if (use.BiocParallel) { - message("Using parallelization with: `BiocParallel`") + if (verbose) { message("Using parallelization with: `BiocParallel`") } if (is.null(workers.MulticoreParam)) { workers.MulticoreParam <- quantile(BiocParallel::multicoreWorkers() %>% seq)[4] %>% ceiling - message(workers.MulticoreParam, " of total workers available will be used") - } else { message("Setting total workers to: ", workers.MulticoreParam) } - } else { message("Using parallelization with: `future`", "\n", - "NOTE: set workers for parallelization, eg: `future::plan('multisession', workers = 10)`") - } + if (verbose) { message(workers.MulticoreParam, " of total workers available will be used") } + } else { if (verbose) { message("Setting total workers to: ", workers.MulticoreParam) } } + } else { if (verbose) { message("Using parallelization with: `future`", "\n", + "NOTE: set workers for parallelization, eg: `future::plan('multisession', workers = 10)`") } + } + # support parallelization on unix and windows + # credit to https://github.com/Bioconductor/BiocParallel/issues/98 + BPParam <- + if (.Platform$OS.type == "windows") { + if (verbose) { message("Using parallelization for Windows with: `BiocParallel::SerialParam`") } + BiocParallel::SerialParam(force.GC = FALSE, progressbar = TRUE) + } else { + BiocParallel::MulticoreParam(workers.MulticoreParam, tasks = 50L, + force.GC = FALSE, progressbar = TRUE) + } - # setting cores to use for parallel computing - `data.table` + # (optional) + # setting additional cores to use for parallelization in `data.table` + # might allow to read large files faster if (!is.null(DTthreads.pct)) { - message("Using parallelization with: `data.table`", "\n", - "..for `data.table::fread`") + if (verbose) { message("Using parallelization with: `data.table`", "\n", + "..for `data.table::fread`") } data.table::setDTthreads(threads = 0) # all cores DTthreads <- data.table::getDTthreads() # max cores DTthreads <- c((DTthreads * DTthreads.pct) / 100) %>% ceiling # percentage from total threads - message("Setting DTthreads to: ", DTthreads, " (", paste0(DTthreads.pct, "%"), ")") + if (verbose) { message("Setting DTthreads to: ", DTthreads, " (", paste0(DTthreads.pct, "%"), ")") } data.table::setDTthreads(threads = DTthreads) # set } @@ -2537,31 +2541,66 @@ ReadVizgen <- function( if (use.dir && !dir.exists(paths = data.dir)) { stop("Cannot find 
Vizgen directory ", data.dir) } else { - message("Reading data from:" , "\n", data.dir) + if (verbose) { message("Reading data from:" , "\n", data.dir) } } - # TODO: - # - optimize to find and read only ".parquet$" files - # - else read .hdf5 file from ./cell_boundaries dir + # Use segmentation output from ".parquet" file ---- + # check if file exists + parq <- + # look in the current directory + list.files(data.dir, + pattern = ".parquet$", + full.names = TRUE) %>% + { if (is.empty(.)) { + # look in the sub directory (if nothing is found) + list.files(data.dir, + pattern = ".parquet$", + full.names = TRUE, + recursive = TRUE) + } else { (.) }} + # set to use .parquet" file if present + use.parquet <- + parq %>% length() %>% any - # use Cellpose output ---- - if (use.cellpose.out && grep("Cellpose", list.files(data.dir)) %>% any) { - message("Cellpose output will be used..") - # Identify input files - files <- c(transcripts = transcripts %||% paste0(data.dir, - paste0("/", list.files(data.dir, pattern = "Cellpose"), - "/*cell_by_gene.csv")), - - spatial = spatial %||% paste0(data.dir, - paste0("/", list.files(data.dir, pattern = "Cellpose"), - "/*cell_metadata.csv")), - - molecules = molecules %||% "detected_transcripts[_a-zA-Z0-9]*.csv") - } else { - files <- c(transcripts = transcripts %||% "cell_by_gene[_a-zA-Z0-9]*.csv", - spatial = spatial %||% "cell_metadata[_a-zA-Z0-9]*.csv", - molecules = molecules %||% "detected_transcripts[_a-zA-Z0-9]*.csv") - } + if (use.parquet) { + if (verbose) { message("Cell segmentations are found in `.parquet` file", "\n", + ">>> using ", gsub("s", "", mol.type), " space coordinates") }} + # Identify input files.. + # if no files are found in the current directory.. + #..look for them in the sub directory + files <- c(transcripts = transcripts %||% + list.files(data.dir, + pattern = "cell_by_gene", + full.names = TRUE) %>% + { if (is.empty(.)) { + list.files(data.dir, + pattern = "cell_by_gene", + full.names = TRUE, + recursive = TRUE) + } else { (.) }}, + + spatial = spatial %||% + list.files(data.dir, + pattern = "cell_metadata", + full.names = TRUE) %>% + { if (is.empty(.)) { + list.files(data.dir, + pattern = "cell_metadata", + full.names = TRUE, + recursive = TRUE) + } else { (.) }}, + + molecules = molecules %||% + list.files(data.dir, + pattern = "detected_transcripts", + full.names = TRUE) %>% + { if (is.empty(.)) { + list.files(data.dir, + pattern = "detected_transcripts", + full.names = TRUE, + recursive = TRUE) + } else { (.) }} + ) files[is.na(x = files)] <- NA_character_ h5dir <- file.path( @@ -2614,24 +2653,25 @@ ReadVizgen <- function( ) pprecoord(type = 'finish') rownames(x = sp) <- as.character(x = sp[, 1]) - sp <- sp[, -1, drop = FALSE] + #sp <- sp[, -1, drop = FALSE] + if ((names(sp) == "transcript_count") %>% any) { + if (verbose) { message(">>> filtering `cell_metadata` - keep cells with `transcript_count` > 0") } + sp %<>% select(-1) %>% filter(transcript_count > 0) + } else { sp %<>% select(-1) } + # Check to see if we should load segmentations if ('segmentations' %in% type) { - poly <- if (isFALSE(x = hdf5) && !use.cellpose.out) { + poly <- if (isFALSE(x = hdf5) && !use.parquet) { warning( "Cannot find hdf5r; unable to load segmentation vertices", immediate. 
= TRUE ) FALSE - } else if (use.cellpose.out) { # added for Cellpose output ---- - files2scan <- - list.files(data.dir, pattern = ".parquet$", - all.files = TRUE, - full.names = TRUE, - recursive = TRUE) - if (length(files2scan)) { - message("Cell segmentations are found in `.parquet` file(s)", "\n", - ">>> using ", coord.space, " space coordinates") + } else if (!dir.exists(paths = h5dir) && !use.parquet) { + warning("Cannot find cell boundary H5 files", immediate. = TRUE) + FALSE + } else if (use.parquet) { # for non .hdf5 files + if (length(parq)) { } TRUE } @@ -2700,19 +2740,37 @@ ReadVizgen <- function( tx <- data.table::fread(file = files[[otype]], sep = ",", data.table = FALSE, verbose = FALSE) rownames(x = tx) <- as.character(x = tx[, 1]) - tx <- t(x = as.matrix(x = tx[, -1, drop = FALSE])) - if (!is.na(x = filter)) { - ptx(message = paste("Filtering genes with pattern", - filter), class = "sticky", amount = 0) - tx <- tx[!grepl(pattern = filter, x = rownames(x = tx)), - , drop = FALSE] + # avoid converting to dense matrix? + #tx <- t(x = as.matrix(x = tx[, -1, drop = FALSE])) + + # keep cells with `transcript_count` > 0 + if (verbose) { message(">>> filtering `cell_by_gene` - keep cells with counts > 0") } + tx %<>% select(-1) %>% + filter_all(any_vars(. > 0)) %>% { + if ((names(sp) == "transcript_count") %>% any) { + # match cells to filtered data from spatial (cell_metadata) + filter(., rownames(.) %in% rownames(sp)) + } else { (.) } } %>% # return filtered count data + t() %>% as.sparse() + + # filter cell metadata df + # match filtered cell IDs from count matrix to cell metadata df + if (!(names(sp) == "transcript_count") %>% any) { + sp %<>% filter(rownames(.) %in% colnames(tx)) } + ratio <- getOption(x = "Seurat.input.sparse_ratio", default = 0.4) if ((sum(tx == 0)/length(x = tx)) > ratio) { - ptx(message = "Converting counts to sparse matrix", + ptx(message = "Counts are converted to sparse matrix", class = "sticky", amount = 0) - tx <- as.sparse(x = tx) + #tx <- as.sparse(x = tx) + } + if (!is.na(x = filter)) { + ptx(message = paste("Filtering genes with pattern", + filter), class = "sticky", amount = 0) + tx <- tx[!grepl(pattern = filter, x = rownames(x = tx)), + , drop = FALSE] } ptx(type = "finish") tx @@ -2725,27 +2783,32 @@ ReadVizgen <- function( data.frame(x = sp$center_x, y = sp$center_y, cell = rownames(x = sp), stringsAsFactors = FALSE) - # use Cellpose segmentations + # use segmentations from ".parquet" }, segmentations = { - if (use.cellpose.out) { - files2scan <- - list.files(data.dir, pattern = ".parquet$", - all.files = TRUE, - full.names = TRUE, - recursive = TRUE) - if (length(files2scan)) { - file2read <- - files2scan %>% - grep(coord.space, ., value = TRUE) %>% + if (use.parquet) { + if (length(parq) > 1) { + # eg, if two files are present: + # `cellpose_micron_space.parquet` + # `cellpose_mosaic_space.parquet` + parq %<>% { + if (mol.type == "pixels") { + # for `cellpose_mosaic_space.parquet` + grep("mosaic", ., value = TRUE) + } else if (mol.type == "microns") { + # for `cellpose_micron_space.parquet` + grep(gsub("s", "", mol.type), ., value = TRUE) + } + } %>% { if (is.empty(.)) { - # support only single .parquet file - files2scan %>% + # only if single ".parquet" file present + # eg, `cell_boundaries.parquet` + parq %>% grep(".parquet$", ., value = TRUE) } else { (.) } } } # Read .parquet file - parq <- sfarrow::st_read_parquet(file2read) + parq %<>% sfarrow::st_read_parquet(.) 
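      # [editor's note, hedged] For orientation: `sfarrow::st_read_parquet()`
      # returns an `sf`-style data frame; MERSCOPE boundary files hold one row
      # per cell per z-plane, with `EntityID`, `ZIndex` and a `Geometry`
      # list-column (the columns used just below). A standalone sketch --
      # "cell_boundaries.parquet" is a hypothetical path, not a file produced
      # by this patch:
      library(dplyr)
      boundaries <- sfarrow::st_read_parquet("cell_boundaries.parquet")
      z.use <- 3L
      segs.demo <- boundaries %>%
        filter(ZIndex == z.use) %>%   # one z-plane, as with `z` below
        pull(Geometry)                # list of cell polygons
      names(segs.demo) <- boundaries %>%
        filter(ZIndex == z.use) %>%
        pull(EntityID) %>%
        as.character()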
# get all cell segmentations
       segs <- filter(parq, ZIndex == z) %>% 
         pull(Geometry) %>%
         # add cell ID
         set_names(filter(parq, ZIndex == z) %>% 
                     pull(EntityID) %>% as.character)
       
       ## Sanity checks on segmentation polygons - part 1 ----
       # check if any cells have > 1 segmentation boundary
       test.segs <- lapply(segs %>% seq, 
                           function(i) segs[[i]] %>% length) %>% unlist()
       if (any(test.segs > 1)) {
         segs.art.index <- which(test.segs > 1)
         segs.empty.index <- which(test.segs < 1)
-        message("Sanity checks on cell segmentation polygons:", "\n",
-                ">>> found ", segs.art.index %>% length, 
-                " cells with > 1 (nested) polygon lists:", "\n",
-                ">>> flattening polygon lists", "\n",
-                ">>> removing ", segs.empty.index %>% length,
-                " empty polygon lists")
+        if (verbose) { message("Sanity checks on cell segmentation polygons:", "\n",
+                               ">>> found ", segs.art.index %>% length, 
+                               " cells with > 1 (nested) polygon lists:", "\n",
+                               ">>> flattening polygon lists", "\n",
+                               if (c(segs.empty.index %>% length) > 0) { 
+                                 paste0(">>> removing ", 
+                                        segs.empty.index %>% length,
+                                        " empty polygon lists") }
+        ) }
         
         # step 1 - find polygon nested lists with > 1 length
-        # (if exists) get originally planned future session
+        # get original planned session if exists
         if (is(future::plan(), "multisession")) {
           orig.plan <- future::plan()
         }
         segs.1 <- segs %>%
           purrr::keep(., c(purrr::map(., length) %>% unlist > 1)) %>%
           # flatten each list
           {
             if (use.furrr) {
               # set temporary workers
               f.plan <- future::plan("multisession", workers = 4L, gc = TRUE)
               #on.exit(f.plan %>% future::plan()) # exiting doesn't work with furrr
-              # using furrr (ie, faster purrr with future)
+              # using furrr (faster purrr with future)
               furrr::future_map(., purrr::flatten)
             } else {
               purrr::map(., purrr::flatten)
             }
-            # since furrr loads future, hide startup messages
           } %>% suppressPackageStartupMessages()
-        # set back originally planned session
+        # set originally planned session back
         if (exists("orig.plan")) {
           future::plan(orig.plan)
         } else {
-          #..or plan sequential
+          # or plan sequential
           future::plan("sequential")
         }
 
@@ -2887,14 +2876,23 @@ ReadVizgen <- function(
 
       ## Sanity checks on segmentation polygons - part 2 ----
       # check if any cells have > 1 segmentation boundary
+      # TODO: (optionally) optimize sanity code using purrr?
       test.segs <- lapply(segs %>% seq, function(i) segs[[i]] %>% length) %>% unlist()
       if (any(test.segs %>% unlist > 1)) {
         segs.art.index <- which(test.segs %>% unlist > 1)
-        message("Found ", segs.art.index %>% length,
-                c(" cells with > 1 polygon artifacts:", 
-                  ">>> removing artifacts",
-                  ">>> keeping cell boundary with maximum coords") %>% 
-                  paste0(., "\n"))
+        if (verbose) { 
+          message("Found ", segs.art.index %>% length,
+                  c(" cells with > 1 polygon artifacts:", 
+                    ">>> removing artifacts",
+                    ">>> keeping cell boundary with maximum coords") %>% 
+                    paste0(., "\n")) 
+        }
         # usually artifacts have small boundaries/coords
         # find cell boundaries with maximum coords
         
         # TODO: (optionally)
-        # - calculate geometric properties: 
-        # eg, use sphericity to keep single polygon with high sphericity values
+        # - calculate geometric properties, eg circularity:
+        # - keep single polygon with high circularity values (likely a cell?)
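        # [editor's note, hedged] The loop below keeps, per affected cell, the
        # polygon with the most vertices (artifacts tend to be small). A toy
        # version of that heuristic; `toy.polys` is hypothetical, not data
        # from this patch:
        toy.polys <- list(matrix(runif(6), ncol = 2),    # 3 vertices  -> likely artifact
                          matrix(runif(40), ncol = 2))   # 20 vertices -> real boundary
        n.vertices <- vapply(toy.polys, nrow, integer(1L))
        toy.polys <- toy.polys[which(n.vertices == max(n.vertices))]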
for (i in segs.art.index %>% seq) {
           dims <- lapply(segs[[segs.art.index[i]]] %>% seq(),
                          function(d) { dim(segs[[segs.art.index[i]]][[d]])[1] } )
           # get & keep boundaries with maximum coords
-          maxs.segs <- which(unlist(dims) == max(unlist(dims)))  
+          maxs.segs <- which(unlist(dims) == max(unlist(dims)))
           segs[[segs.art.index[i]]] <- segs[[segs.art.index[i]]][maxs.segs]
         }
-      } else { message("All cells have single polygon boundary (no artifacts)") }
+      } else { if (verbose) { message("All cells have single polygon boundary (no artifacts)") } }
       # some cells might have > 1 polygon boundary with identical length
       #..in this case, keep only the 1st polygon boundary
       test.segs <- lapply(segs %>% seq, function(i) segs[[i]] %>% length) %>% unlist()
       if (any(test.segs > 1)) {
         segs.art.index <- which(test.segs > 1)
-        message("Additionally found ", segs.art.index %>% length,
-                c(" cells with > 1 polygons (identical length):", 
-                  ">>> only the 1st polygon boundary will be kept") %>% 
-                  paste0(., "\n"))
+        if (verbose) { message("Additionally found ", segs.art.index %>% length,
+                               c(" cells with > 1 polygons (identical length):", 
+                                 ">>> only the 1st polygon boundary will be kept") %>% 
+                                 paste0(., "\n")) }
         for (i in segs.art.index %>% seq) {
           # TODO: (optionally)
-          # - calculate geometric properties: 
-          # here too, use sphericity to keep single polygon with high sphericity values
+          # - calculate geometric properties, eg circularity:
+          # - keep single polygon with high circularity values (likely a cell?)
+          
           segs[[segs.art.index[i]]] <- segs[[segs.art.index[i]]][1]
         }
       }
       
       ## Extract cell boundaries per cells
-      # TODO: (optionally) resample & make cell boundaries equidistant!
-      
+      # TODO: (optionally) resample & make cell boundaries equidistant?
+      # TODO: (optionally) optimize with purrr::map to get df per list instance?
       if (use.BiocParallel) {
         gc() %>% invisible() # free up memory
-        message("Using fast extraction of cell boundaries..")
+        if (verbose) { message("Extracting cell segmentations - using `BiocParallel`") }
         segs_list <-
          BiocParallel::bplapply(segs %>% seq, function(i)
          {
            segs[[i]][[1]] %>%
              data.table::as.data.table(.) %>%
-             #as.data.frame %>%
              mutate(cell = names(segs)[i])
          },
-         BPPARAM = BiocParallel::MulticoreParam(workers.MulticoreParam, 
-                                                tasks = 50L,
-                                                force.GC = FALSE, 
-                                                progressbar = TRUE)
-         )
+         BPPARAM = BPParam)
       } else {
-        message("Extracting cell segmentations - using `future`")
+        if (verbose) { message("Extracting cell segmentations - using `future`") }
        segs_list <-
          future.apply::future_lapply(segs %>% seq, function(i)
          {
            segs[[i]] %>%
              data.table::as.data.table(.) %>%
-             #as.data.frame %>%
              mutate(cell = names(segs)[i])
-          }
-          )
+          } 
+          )  
       }
       # df of all cell segmentations
       segs <- 
        data.table::rbindlist(segs_list) %>%
        data.table::setnames(c("x", "y", "cell"))
-      message("All cell segmentations are loaded..")
+      #names(segs)[1:2] <- c("x", "y")
+      if (verbose) { message("All cell segmentations are loaded..") }
       segs
-    } else { # use non-Cellpose segmentations
+    } else {
+      # else use ".hdf5" files from ./cell_boundaries (older version)
       ppoly <- progressor(steps = length(x = unique(x = sp$fov)))
       ppoly(message = "Creating polygon coordinates", class = "sticky",
             amount = 0)
-      # use parallel or future
-      if (use.BiocParallel) { 
+      # use `BiocParallel` or `future`
+      if (use.BiocParallel) {
+        if (verbose) { message("Reading '.hdf5' files..") }
         pg <- BiocParallel::bplapply(X = unique(x = sp$fov), FUN = function(f, ...)
{ fname <- file.path(h5dir, paste0("feature_data_", f, ".hdf5")) if (!file.exists(fname)) { @@ -2908,7 +2974,7 @@ ReadVizgen <- function( on.exit(expr = hfile$close_all()) cells <- rownames(x = subset(x = sp, subset = fov == f)) # creating df for cell boundaries - df <- BiocParallel::bplapply(X = cells, FUN = function(x) { + df <- lapply(X = cells, FUN = function(x) { return(tryCatch(expr = { cc <- hfile[["featuredata"]][[x]][[zidx]][["p_0"]][["coordinates"]]$read() cc <- as.data.frame(x = t(x = cc)) @@ -2918,14 +2984,11 @@ ReadVizgen <- function( }, error = function(...) { return(NULL) })) - }, BPPARAM = BiocParallel::MulticoreParam(workers.MulticoreParam, tasks = 50L, - force.GC = FALSE, - progressbar = TRUE)) + }) ppoly() - return(do.call(what = "rbind", args = df)) - }, BPPARAM = BiocParallel::MulticoreParam(workers.MulticoreParam, tasks = 50L, - force.GC = FALSE, - progressbar = TRUE)) + #return(do.call(what = "rbind", args = df)) + return(data.table::rbindlist(df)) + }, BPPARAM = BPParam) } else { pg <- future.apply::future_lapply(X = unique(x = sp$fov), @@ -2973,25 +3036,31 @@ ReadVizgen <- function( amount = 0) # use parallel or future if (use.BiocParallel) { - message("Creating box coordinates..") + if (verbose) { message("Creating box coordinates..") } bx <- BiocParallel::bplapply(X = rownames(x = sp), FUN = function(cell) { row <- sp[cell, ] - df <- expand.grid(x = c(row$min_x, row$max_x), - y = c(row$min_y, row$max_y), cell = cell, KEEP.OUT.ATTRS = FALSE, - stringsAsFactors = FALSE) - df <- df[c(1, 3, 4, 2), , drop = FALSE] + # faster version for grid construction + df <- data.table::CJ(x = c(row$min_x, row$max_x), + y = c(row$min_y, row$max_y), + cell = cell) %>% + slice(c(1, 3, 4, 2)) + #df <- expand.grid(x = c(row$min_x, row$max_x), + # y = c(row$min_y, row$max_y), cell = cell, KEEP.OUT.ATTRS = FALSE, + # stringsAsFactors = FALSE) + #df <- df[c(1, 3, 4, 2), , drop = FALSE] pbox() return(df) - }, BPPARAM = BiocParallel::MulticoreParam(workers.MulticoreParam, tasks = 50L, - force.GC = FALSE, progressbar = TRUE) - ) + }, BPPARAM = BPParam) } else { bx <- future.apply::future_lapply(X = rownames(x = sp), FUN = function(cell) { row <- sp[cell, ] - df <- expand.grid(x = c(row$min_x, row$max_x), - y = c(row$min_y, row$max_y), cell = cell, KEEP.OUT.ATTRS = FALSE, - stringsAsFactors = FALSE) - df <- df[c(1, 3, 4, 2), , drop = FALSE] + df <- data.table::CJ(x = c(row$min_x, row$max_x), + y = c(row$min_y, row$max_y), cell = cell) %>% + slice(c(1, 3, 4, 2)) + #df <- expand.grid(x = c(row$min_x, row$max_x), + #y = c(row$min_y, row$max_y), cell = cell, KEEP.OUT.ATTRS = FALSE, + #stringsAsFactors = FALSE) + #df <- df[c(1, 3, 4, 2), , drop = FALSE] pbox() return(df) }) From 8f7153e69c9c558f1b2010819f7ea768dd9400b6 Mon Sep 17 00:00:00 2001 From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com> Date: Tue, 6 Jun 2023 11:31:43 +0200 Subject: [PATCH 13/24] fix for `LoadVizgen()` ..supports args: `mol.type`, `filter`, `z` added `verbose` for optional verbosity --- R/convenience.R | 92 +++++++++++++++++++++++++++++++------------------ 1 file changed, 59 insertions(+), 33 deletions(-) diff --git a/R/convenience.R b/R/convenience.R index 3e99f0dbd..fb6f33eea 100644 --- a/R/convenience.R +++ b/R/convenience.R @@ -129,8 +129,10 @@ LoadNanostring <- function(data.dir, fov, assay = 'Nanostring') { #' @return \code{LoadVizgen}: A \code{\link[SeuratObject]{Seurat}} object #' +#' @param fov Name to store FOV as +#' @param assay Name to store expression matrix as #' @param add.zIndex If to add 
\code{z} slice index to a cell
-#' @param update.object If to update final object, default to TRUE.
+#' @param update.object If to update final object, default to TRUE
 #' @param ... Arguments passed to \code{ReadVizgen}
 #'
 #' @importFrom SeuratObject Cells CreateCentroids CreateFOV
@@ -141,13 +143,27 @@ LoadNanostring <- function(data.dir, fov, assay = 'Nanostring') {
 #'
 #' @rdname ReadVizgen
 #'
-LoadVizgen <- function(data.dir, fov = "vz", assay = 'Vizgen',
-                       add.zIndex = TRUE, update.object = TRUE,
-                       ...) {
-  data <- ReadVizgen(data.dir = data.dir, ...)
+LoadVizgen <- function(
+  data.dir,
+  fov = 'vz',
+  assay = 'Vizgen',
+  mol.type = 'microns',
+  filter = '^Blank-',
+  z = 3L,
+  add.zIndex = TRUE,
+  update.object = TRUE,
+  verbose = TRUE,
+  ...)
+{
+  # reading data..
+  data <- ReadVizgen(data.dir = data.dir,
+                     mol.type = mol.type,
+                     filter = filter,
+                     z = z,
+                     verbose = verbose,
+                     ...)
 
-  #gc() %>% invisible()
-  message("Creating Seurat object..")
+  if (verbose) { message("Creating Seurat object..") }
   obj <- CreateSeuratObject(counts = data[["transcripts"]], assay = assay)
 
   # in case no segmentation is present, use boxes
@@ -157,46 +173,53 @@ LoadVizgen <- function(data.dir, fov = "vz", assay = 'Vizgen',
       cents <- CreateCentroids(data[["centroids"]])
       bound.boxes.data <- list(centroids = cents, 
                                boxes = bound.boxes)
      if (verbose) { 
        message("Creating FOVs..", "\n", 
                ">>> using box coordinates instead of segmentations")
      }
      coords <- CreateFOV(coords = bound.boxes.data, 
                          type = c("boxes", "centroids"), 
                          molecules = data[[mol.type]], 
                          assay = assay)
    } else { 
      # in case no segmentation & no boxes are present, use centroids only
      cents <- CreateCentroids(data[["centroids"]])
      if (verbose) { 
        message("Creating FOVs..", "\n", 
                ">>> using only centroids")
      }
      coords <- CreateFOV(coords = list(centroids = cents), 
                          type = c("centroids"), 
                          molecules = data[[mol.type]], 
                          assay = assay)
      coords <- subset(x = coords, 
                       cells = intersect(x = Cells(x = coords[["centroids"]]), 
                                         y = Cells(x = obj)))
    }
  } else if ("segmentations" %in% names(data)) {
    segs <- CreateSegmentation(data[["segmentations"]])
    cents <- CreateCentroids(data[["centroids"]])
    segmentations.data <- list(centroids = cents, segmentation = segs)
    if (verbose) { 
      message("Creating FOVs..", "\n", 
              ">>> using segmentations")
    }
    coords <- CreateFOV(coords = segmentations.data, 
                        type = c("segmentation", "centroids"), 
                        molecules = data[[mol.type]], 
                        assay = assay)
    # only consider the cells we have counts and a segmentation.
    # Cells which don't have a segmentation are probably found in other z slices.
coords <- subset(x = coords, 
                     cells = intersect(x = Cells(x = coords[["segmentation"]]), 
-                                       y = Cells(x = obj)))
+                                      y = Cells(x = obj)))
   }
 
   # add z-stack index for cells
   if (add.zIndex) { obj$z <- data$zIndex %>% pull(z) }
 
   # add metadata vars
-  message(">>> adding metadata infos")
+  if (verbose) { message(">>> adding metadata info") }
   if (c("metadata" %in% names(data))) {
     metadata <- match.arg(arg = "metadata", choices = names(data), several.ok = TRUE)
     meta.vars <- names(data[[metadata]])
@@ -196,16 +209,26 @@ LoadVizgen <- function(data.dir, fov = "vz", assay = 'Vizgen',
 
   # sanity on fov name
-  fov %<>% gsub("_|-", ".", .)
+  fov %<>% gsub("_|-", ".", .) 
 
-  message(">>> adding FOV")
+  if (verbose) { message(">>> adding FOV") }
   obj[[fov]] <- coords
 
   ## filter - keep cells with counts > 0
-  # small helper function to return metadata
+  # helper function to return metadata
   callmeta <- function (object = NULL) { return(object@meta.data) }
   nCount <- grep("nCount", callmeta(obj) %>% names, value = TRUE)
-  if (any(obj[[nCount]] > 0)) {
-    message(">>> filtering object - keep cells with counts > 0")
+  if (any(obj[[nCount]] == 0)) {
+    if (verbose) { message(">>> filtering object - keeping cells with counts > 0") }
     obj %<>% subset(subset = !!base::as.symbol(nCount) > 0)
-  }
+  } else { if (verbose) { message(">>> all counts are > 0") } }
   
   if (update.object) { 
-    message("Updating object..")
-    obj %<>% UpdateSeuratObject() }
+    if (verbose) { message("Updating object:")
+      obj %<>% UpdateSeuratObject()
+    } else { 
+      obj %<>% 
+        UpdateSeuratObject() %>% 
+        suppressMessages() } }
 
-  #message("Object is ready!")
+  if (verbose) { message("Object is ready!") }
   return(obj)
-  
-  gc() %>% invisible()
 }
 
 #' @return \code{LoadXenium}: A \code{\link[SeuratObject]{Seurat}} object
From 799323da1d351a2bdbd8ebbf1c17c91e79cb9a05 Mon Sep 17 00:00:00 2001
From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com>
Date: Wed, 26 Jul 2023 09:50:02 +0200
Subject: [PATCH 14/24] update `ReadVizgen()` ..resolve some small conflict
 for `Read10X_Image()`

---
 R/preprocessing.R | 51 ++++++++++++++++++++++++++++++++------------
 1 file changed, 37 insertions(+), 14 deletions(-)

diff --git a/R/preprocessing.R b/R/preprocessing.R
index 9bd1a8ef1..9b47aeb8c 100644
--- a/R/preprocessing.R
+++ b/R/preprocessing.R
@@ -519,10 +519,8 @@ Load10X_Spatial <- function(
   }
   object <- CreateSeuratObject(counts = data, assay = assay)
   if (is.null(x = image)) {
-    image <- Read10X_Image(
-      image.dir = file.path(data.dir, 'spatial'),
-      filter.matrix = filter.matrix
-    )
+    image <- Read10X_Image(image.dir = file.path(data.dir,"spatial"),
+                           filter.matrix = filter.matrix)
   } else {
     if (!inherits(x = image, what = "VisiumV1"))
       stop("Image must be an object of class 'VisiumV1'.")
   }
@@ -2462,7 +2462,7 @@ ReadVizgen <- function(
 
   # packages that need to be installed a priori
   pkgs <- c("data.table", "arrow", "sfarrow", 
-            "tidyverse", "furrr", "BiocParallel")
+            "tidyverse", "furrr", "BiocParallel", "Matrix")
   lapply(pkgs %>% length %>% seq, function(i) 
     { !requireNamespace(pkgs[i], quietly = TRUE) } ) %>% 
     unlist %>% 
@@ -2524,7 +2522,7 @@ ReadVizgen <- function(
   if (!is.null(x = metadata)) {
     metadata <- match.arg(
       arg = metadata,
-      choices = c("volume", "fov"),
+      choices = c('volume', 'fov'),
       several.ok = TRUE
     )
   }
# if no files are found in the current directory.. #..look for them in the sub directory - files <- c(transcripts = transcripts %||% + files <- c(transcripts = NULL %||% list.files(data.dir, pattern = "cell_by_gene", full.names = TRUE) %>% @@ -2577,9 +2576,15 @@ ReadVizgen <- function( pattern = "cell_by_gene", full.names = TRUE, recursive = TRUE) + } else { (.) }} %>% + { if (length(.) > 1) { + stop("There are > 1 `cell_by_gene` files", + "\n", "make sure only 1 file is read") + } else if (!length(.)) { + stop("No `cell_by_gene` file is available") } else { (.) }}, - spatial = spatial %||% + spatial = NULL %||% list.files(data.dir, pattern = "cell_metadata", full.names = TRUE) %>% @@ -2588,9 +2593,15 @@ ReadVizgen <- function( pattern = "cell_metadata", full.names = TRUE, recursive = TRUE) + } else { (.) }} %>% + { if (length(.) > 1) { + stop("There are > 1 `cell_metadata` files", + "\n", "make sure only 1 file is read") + } else if (!length(.)) { + stop("No `cell_metadata` file is available") } else { (.) }}, - molecules = molecules %||% + molecules = NULL %||% list.files(data.dir, pattern = "detected_transcripts", full.names = TRUE) %>% @@ -2599,6 +2610,12 @@ ReadVizgen <- function( pattern = "detected_transcripts", full.names = TRUE, recursive = TRUE) + } else { (.) }} %>% + { if(length(.) > 1) { + stop("There are > 1 `detected_transcripts` files", + "\n", "make sure only 1 file is read") + } else if (!length(.)) { + stop("No `detected_transcripts` file is available") } else { (.) }} ) @@ -2668,7 +2685,7 @@ ReadVizgen <- function( ) FALSE } else if (!dir.exists(paths = h5dir) && !use.parquet) { - warning("Cannot find cell boundary H5 files", immediate. = TRUE) + warning("Cannot find cell boundary H5 or `.parquet` file(s)", immediate. = TRUE) FALSE } else if (use.parquet) { # for non .hdf5 files if (length(parq)) { @@ -2678,11 +2695,12 @@ ReadVizgen <- function( else { TRUE } - if (isFALSE(x = poly)) { + if (isFALSE(x = poly) && isFALSE(use.parquet)) { type <- setdiff(x = type, y = 'segmentations') } } - spatials <- rep_len(x = files[['spatial']], length.out = length(x = type)) + spatials <- rep_len(x = files[['spatial']], + length.out = length(x = type)) names(x = spatials) <- type files <- c(files, spatials) files <- files[setdiff(x = names(x = files), y = 'spatial')] @@ -2701,6 +2719,8 @@ ReadVizgen <- function( class = 'sticky', amount = 0 ) + + # load molecules mx <- data.table::fread( file = files[['molecules']], sep = ',', @@ -2751,7 +2771,10 @@ ReadVizgen <- function( # match cells to filtered data from spatial (cell_metadata) filter(., rownames(.) %in% rownames(sp)) } else { (.) 
} } %>% # return filtered count data
-      t() %>% as.sparse()
+      # transpose data.frame without converting to dense matrix
+      #t() %>% 
+      as.sparse() %>% # convert to sparse matrix
+      Matrix::t() # transpose sparse matrix
     
     # filter cell metadata df
     # match filtered cell IDs from count matrix to cell metadata df
     if (!(names(sp) == "transcript_count") %>% any) {
       sp %<>% filter(rownames(.) %in% colnames(tx))
     }
+    
     ratio <- getOption(x = "Seurat.input.sparse_ratio", default = 0.4)
     if ((sum(tx == 0)/length(x = tx)) > ratio) {
       ptx(message = "Counts are converted to sparse matrix", 
           class = "sticky", amount = 0)
       #tx <- as.sparse(x = tx)
+    }
+    if (!is.na(x = filter)) {
+      ptx(message = paste("Filtering genes with pattern", 
+                          filter), class = "sticky", amount = 0)
+      tx <- tx[!grepl(pattern = filter, x = rownames(x = tx)), 
+               , drop = FALSE]
     }
     ptx(type = "finish")
     tx
@@ -2798,7 +2821,7 @@ ReadVizgen <- function(
           # for `cellpose_micron_space.parquet`
           grep(gsub("s", "", mol.type), ., value = TRUE)
         }
-      } %>%
+      } %>% 
         { if (is.empty(.)) {
           # only if single ".parquet" file present
          # eg, `cell_boundaries.parquet`
          parq %>% 
            grep(".parquet$", ., value = TRUE)
        } else { (.) }
        }
    }
    # Read .parquet file
    parq %<>% sfarrow::st_read_parquet(.) 
From 9ea4458e338f961fc7875aa6bba891d2f85ff894 Mon Sep 17 00:00:00 2001
From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com>
Date: Wed, 26 Jul 2023 09:54:37 +0200
Subject: [PATCH 15/24] update `LoadVizgen()` ..adding of molecules to the
 object is optional `add.molecules = TRUE`

---
 R/convenience.R | 65 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 43 insertions(+), 22 deletions(-)

diff --git a/R/convenience.R b/R/convenience.R
index fb6f33eea..5e9b0cf74 100644
--- a/R/convenience.R
+++ b/R/convenience.R
@@ -133,6 +133,8 @@ LoadNanostring <- function(data.dir, fov, assay = 'Nanostring') {
 #' @param assay Name to store expression matrix as
 #' @param add.zIndex If to add \code{z} slice index to a cell
 #' @param update.object If to update final object, default to TRUE
+#' @param add.molecules If to add \code{molecules} coordinates to FOV of the object,
+#' default to TRUE
 #' @param ... Arguments passed to \code{ReadVizgen}
 #'
 #' @importFrom SeuratObject Cells CreateCentroids CreateFOV
@@ -152,6 +154,7 @@ LoadVizgen <- function(
   z = 3L,
   add.zIndex = TRUE,
   update.object = TRUE,
+  add.molecules = TRUE,
   verbose = TRUE,
   ...)
 {
@@ -174,27 +177,40 @@ LoadVizgen <- function(
       bound.boxes.data <- list(centroids = cents, 
                                boxes = bound.boxes)
       if (verbose) { 
-        message("Creating FOVs..", "\n", 
+        message("Creating FOVs..", "\n", 
+                if (!add.molecules) { ">>> `molecules` coordinates will be skipped" },
+                "\n",
                ">>> using box coordinates instead of segmentations")
      }
-      coords <- CreateFOV(coords = bound.boxes.data, 
-                          type = c("boxes", "centroids"), 
-                          molecules = data[[mol.type]], 
-                          assay = assay)
+      coords <- 
+        CreateFOV(coords = bound.boxes.data, 
+                  type = c("boxes", "centroids"), 
+                  molecules = 
+                    if (add.molecules) { 
+                      data[[mol.type]] } else { NULL }, 
+                  assay = assay) %>% 
+        subset(x = .,
+               cells = intersect(x = Cells(x = .[["boxes"]]),
+                                 y = Cells(x = obj)))
    } else { 
      # in case no segmentation & no boxes are present, use centroids only
      cents <- CreateCentroids(data[["centroids"]])
      if (verbose) { 
-        message("Creating FOVs..", "\n", 
+        message("Creating FOVs..", "\n", 
+                if (!add.molecules) { ">>> `molecules` coordinates will be skipped" },
+                "\n",
                ">>> using only centroids")
      }
-      coords <- CreateFOV(coords = list(centroids = cents), 
-                          type = c("centroids"), 
-                          molecules = data[[mol.type]], 
-                          assay = assay)
-      coords <- subset(x = coords, 
-                       cells = intersect(x = Cells(x = coords[["centroids"]]), 
-                                         y = Cells(x = obj)))
+      coords <- 
+        CreateFOV(coords = list(centroids = cents), 
+                  type = c("centroids"), 
+                  molecules = 
+                    if (add.molecules) { 
+                      data[[mol.type]] } else { NULL }, 
+                  assay = assay) %>% 
+        subset(x = .,
+               cells = intersect(x = Cells(x = .[["centroids"]]),
+                                 y = Cells(x = obj)))
    }
  } else if ("segmentations" %in% names(data)) {
    segs <- CreateSegmentation(data[["segmentations"]])
    cents <- CreateCentroids(data[["centroids"]])
    segmentations.data <- list(centroids = cents, segmentation = segs)
    if (verbose) { 
      message("Creating FOVs..", "\n",
              if 
(!add.molecules) { ">>> `molecules` coordinates will be skipped" },
+              "\n",
+              ">>> using segmentations")
    }
-    coords <- CreateFOV(coords = segmentations.data, 
-                        type = c("segmentation", "centroids"), 
-                        molecules = data[[mol.type]], 
-                        assay = assay)
-    # only consider the cells we have counts and a segmentation.
-    # Cells which don't have a segmentation are probably found in other z slices.
-    coords <- subset(x = coords, 
-                     cells = intersect(x = Cells(x = coords[["segmentation"]]), 
-                                       y = Cells(x = obj)))
+    coords <- 
+      CreateFOV(coords = segmentations.data, 
+                type = c("segmentation", "centroids"), 
+                molecules = 
+                  if (add.molecules) { 
+                    data[[mol.type]] } else { NULL }, 
+                assay = assay) %>% 
+      # only consider the cells we have counts and a segmentation.
+      # Cells which don't have a segmentation are probably found in other z slices.
+      subset(x = .,
+             cells = intersect(x = Cells(x = .[["segmentation"]]),
+                               y = Cells(x = obj)))
  }
 
   # add z-stack index for cells
From 38fad2a9bc58981f7c7e25382a0f656ce2c8bfea Mon Sep 17 00:00:00 2001
From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com>
Date: Wed, 26 Jul 2023 09:57:33 +0200
Subject: [PATCH 16/24] resolving some conflicts in preprocessing.R

---
 R/preprocessing.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/preprocessing.R b/R/preprocessing.R
index 9b47aeb8c..b6644c838 100644
--- a/R/preprocessing.R
+++ b/R/preprocessing.R
@@ -521,7 +521,8 @@ Load10X_Spatial <- function(
   if (is.null(x = image)) {
     image <- Read10X_Image(image.dir = file.path(data.dir,"spatial"),
                            filter.matrix = filter.matrix)
-  } else {
+  }
+  else {
     if (!inherits(x = image, what = "VisiumV1"))
       stop("Image must be an object of class 'VisiumV1'.")
   }
From bf15b6e7a1fa25867f568193f8627efd8b48bc37 Mon Sep 17 00:00:00 2001
From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com>
Date: Wed, 26 Jul 2023 10:08:44 +0200
Subject: [PATCH 17/24] ..update preprocessing.R from `develop`

---
 R/preprocessing.R | 756 +++++++++++++++++++++++++++------------------
 1 file changed, 447 insertions(+), 309 deletions(-)

diff --git a/R/preprocessing.R b/R/preprocessing.R
index b6644c838..c53f20e43 100644
--- a/R/preprocessing.R
+++ b/R/preprocessing.R
@@ -56,11 +56,11 @@ globalVariables(
 #' CalculateBarcodeInflections(pbmc_small, group.column = 'groups')
 #'
 CalculateBarcodeInflections <- function(
-  object,
-  barcode.column = "nCount_RNA",
-  group.column = "orig.ident",
-  threshold.low = NULL,
-  threshold.high = NULL
+  object,
+  barcode.column = "nCount_RNA",
+  group.column = "orig.ident",
+  threshold.low = NULL,
+  threshold.high = NULL
 ) {
   ## Check that barcode.column exists in meta.data
   if (!(barcode.column %in% colnames(x = object[[]]))) {
@@ -201,15 +201,15 @@ CalculateBarcodeInflections <- function(
 #' }
 #'
 HTODemux <- function(
-  object,
-  assay = "HTO",
-  positive.quantile = 0.99,
-  init = NULL,
-  nstarts = 100,
-  kfunc = "clara",
-  nsamples = 100,
-  seed = 42,
-  verbose = TRUE
+  object,
+  assay = "HTO",
+  positive.quantile = 0.99,
+  init = NULL,
+  nstarts = 100,
+  kfunc = "clara",
+  nsamples = 100,
+  seed = 42,
+  verbose = TRUE
 ) {
   if (!is.null(x = seed)) {
     set.seed(seed = seed)
@@ -368,14 +368,14 @@ HTODemux <- function(
 #' pbmc_small <- GetResidual(object = pbmc_small, features = c('MS4A1', 'TCL1A'))
 #'
 GetResidual <- function(
-  object,
-  features,
-  assay = NULL,
-  umi.assay = NULL,
-  clip.range = NULL,
-  replace.value = FALSE,
-  na.rm = TRUE,
-  verbose = TRUE
+  object,
+  features,
+  assay = NULL,
+  umi.assay = NULL,
+  clip.range = NULL,
+  replace.value = 
FALSE, + na.rm = TRUE, + verbose = TRUE ) { assay <- assay %||% DefaultAssay(object = object) if (IsSCT(assay = object[[assay]])) { @@ -500,24 +500,39 @@ GetResidual <- function( #' } #' Load10X_Spatial <- function( - data.dir, - filename = 'filtered_feature_bc_matrix.h5', - assay = 'Spatial', - slice = 'slice1', - filter.matrix = TRUE, - to.upper = FALSE, - image = NULL, - ... + data.dir, + filename = 'filtered_feature_bc_matrix.h5', + assay = 'Spatial', + slice = 'slice1', + filter.matrix = TRUE, + to.upper = FALSE, + image = NULL, + ... ) { if (length(x = data.dir) > 1) { - warning("'Load10X_Spatial' accepts only one 'data.dir'", immediate. = TRUE) + warning("'Load10X_Spatial' accepts only one 'data.dir'", + immediate. = TRUE) data.dir <- data.dir[1] } - data <- Read10X_h5(filename = file.path(data.dir, filename), ...) + data <- Read10X_h5(filename = file.path(data.dir, filename), + ...) + if (to.upper) { - rownames(x = data) <- toupper(x = rownames(x = data)) + data <- imap(data, ~{ + rownames(.x) <- toupper(rownames(.x)) + .x + }) + } + if (is.list(data) & "Antibody Capture" %in% names(data)) { + matrix_gex <- data$`Gene Expression` + matrix_protein <- data$`Antibody Capture` + object <- CreateSeuratObject(counts = matrix_gex, assay = assay) + object_protein <- CreateAssayObject(counts = matrix_protein) + object[["Protein"]] <- object_protein + } + else { + object <- CreateSeuratObject(counts = data, assay = assay) } - object <- CreateSeuratObject(counts = data, assay = assay) if (is.null(x = image)) { image <- Read10X_Image(image.dir = file.path(data.dir,"spatial"), filter.matrix = filter.matrix) @@ -529,9 +544,47 @@ Load10X_Spatial <- function( image <- image[Cells(x = object)] DefaultAssay(object = image) <- assay object[[slice]] <- image + + # if using the meta-data available for probes add to @misc slot + file_path <- file.path(data.dir, filename) + infile <- hdf5r::H5File$new(filename = file_path, mode = 'r') + if("matrix/features/probe_region" %in% hdf5r::list.objects(infile)) { + probe.metadata <- Read10X_probe_metadata(data.dir, filename) + Misc(object = object[['Spatial']], slot = "probe_metadata") <- probe.metadata + } return(object) } +#' Read10x Probe Metadata +#' +#' This function reads the probe metadata from a 10x Genomics probe barcode matrix file in HDF5 format. +#' +#' @param data.dir The directory where the file is located. +#' @param filename The name of the file containing the raw probe barcode matrix in HDF5 format. The default filename is 'raw_probe_bc_matrix.h5'. +#' +#' @return Returns a data.frame containing the probe metadata. 
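A quick usage sketch for the new probe-metadata plumbing may help here. This is only an illustration: the `outs/` folder and the presence of `raw_probe_bc_matrix.h5` are hypothetical placeholders and not shipped with this patch; the `Read10X_probe_metadata()` call and the `@misc` retrieval via `Misc()` follow the code added above.

```r
library(Seurat)

# Hypothetical Space Ranger output folder; adjust to your dataset.
data.dir <- "outs"

# Read the probe annotations directly from the HDF5 matrix.
probe.meta <- Read10X_probe_metadata(
  data.dir = data.dir,
  filename = "raw_probe_bc_matrix.h5"
)
head(probe.meta)  # data.frame with probe.name and probe.region columns

# After Load10X_Spatial(), the same table is stashed in the assay's misc slot
# (the patch hardcodes the 'Spatial' assay name for this).
obj <- Load10X_Spatial(data.dir = data.dir)
Misc(obj[["Spatial"]], slot = "probe_metadata")
```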
+#' +#' @export +Read10X_probe_metadata <- function( + data.dir, + filename = 'raw_probe_bc_matrix.h5' +) { + if (!requireNamespace('hdf5r', quietly = TRUE)) { + stop("Please install hdf5r to read HDF5 files") + } + file.path = paste0(data.dir,"/", filename) + if (!file.exists(file.path)) { + stop("File not found") + } + infile <- hdf5r::H5File$new(filename = file.path, mode = 'r') + if("matrix/features/probe_region" %in% hdf5r::list.objects(infile)) { + probe.name <- infile[['matrix/features/name']][] + probe.region<- infile[['matrix/features/probe_region']][] + meta.data <- data.frame(probe.name, probe.region) + return(meta.data) + } +} + #' Load STARmap data #' #' @param data.dir location of data directory that contains the counts matrix, @@ -554,13 +607,13 @@ Load10X_Spatial <- function( #' @concept preprocessing #' LoadSTARmap <- function( - data.dir, - counts.file = "cell_barcode_count.csv", - gene.file = "genes.csv", - qhull.file = "qhulls.tsv", - centroid.file = "centroids.tsv", - assay = "Spatial", - image = "image" + data.dir, + counts.file = "cell_barcode_count.csv", + gene.file = "genes.csv", + qhull.file = "qhulls.tsv", + centroid.file = "centroids.tsv", + assay = "Spatial", + image = "image" ) { if (!dir.exists(paths = data.dir)) { stop("Cannot find directory ", data.dir, call. = FALSE) @@ -603,6 +656,91 @@ LoadSTARmap <- function( return(starmap) } +#' Load Curio Seeker data +#' +#' @param data.dir location of data directory that contains the counts matrix, +#' gene names, barcodes/beads, and barcodes/bead location files. +#' @param assay Name of assay to associate spatial data to +#' +#' @return A \code{\link{Seurat}} object +#' +#' @importFrom Matrix readMM +#' +#' @export +#' @concept preprocessing +#' +LoadCurioSeeker <- function(data.dir, assay = "Spatial") { + # check and find input files + if (length(x = data.dir) > 1) { + warning("'LoadCurioSeeker' accepts only one 'data.dir'", + immediate. = TRUE) + data.dir <- data.dir[1] + } + mtx.file <- list.files( + data.dir, + pattern = "*MoleculesPerMatchedBead.mtx", + full.names = TRUE) + if (length(x = mtx.file) > 1) { + warning("Multiple files matched the pattern '*MoleculesPerMatchedBead.mtx'", + immediate. = TRUE) + } else if (length(x = mtx.file) == 0) { + stop("No file matched the pattern '*MoleculesPerMatchedBead.mtx'", call. = FALSE) + } + mtx.file <- mtx.file[1] + barcodes.file <- list.files( + data.dir, + pattern = "*barcodes.tsv", + full.names = TRUE) + if (length(x = barcodes.file) > 1) { + warning("Multiple files matched the pattern '*barcodes.tsv'", + immediate. = TRUE) + } else if (length(x = barcodes.file) == 0) { + stop("No file matched the pattern '*barcodes.tsv'", call. = FALSE) + } + barcodes.file <- barcodes.file[1] + genes.file <- list.files( + data.dir, + pattern = "*genes.tsv", + full.names = TRUE) + if (length(x = genes.file) > 1) { + warning("Multiple files matched the pattern '*genes.tsv'", + immediate. = TRUE) + } else if (length(x = genes.file) == 0) { + stop("No file matched the pattern '*genes.tsv'", call. = FALSE) + } + genes.file <- genes.file[1] + coordinates.file <- list.files( + data.dir, + pattern = "*MatchedBeadLocation.csv", + full.names = TRUE) + if (length(x = coordinates.file) > 1) { + warning("Multiple files matched the pattern '*MatchedBeadLocation.csv'", + immediate. = TRUE) + } else if (length(x = coordinates.file) == 0) { + stop("No file matched the pattern '*MatchedBeadLocation.csv'", call. 
= FALSE) + } + coordinates.file <- coordinates.file[1] + + # load counts matrix and create seurat object + mtx <- readMM(mtx.file) + mtx <- as.sparse(mtx) + barcodes <- read.csv(barcodes.file, header = FALSE) + genes <- read.csv(genes.file, header = FALSE) + colnames(mtx) <- barcodes$V1 + rownames(mtx) <- genes$V1 + object <- CreateSeuratObject(counts = mtx, assay = assay) + + # load positions of each bead and store in a SlideSeq object in images slot + coords <- read.csv(coordinates.file) + colnames(coords) <- c("cell", "x", "y") + coords$y <- -coords$y + rownames(coords) <- coords$cell + coords$cell <- NULL + image <- new(Class = 'SlideSeq', assay = assay, coordinates = coords) + object[["Slice"]] <- image + return(object) +} + #' Normalize raw data #' #' Normalize count data per cell and transform to log scale @@ -666,13 +804,13 @@ LogNormalize <- function(data, scale.factor = 1e4, verbose = TRUE) { #' } #' MULTIseqDemux <- function( - object, - assay = "HTO", - quantile = 0.7, - autoThresh = FALSE, - maxiter = 5, - qrange = seq(from = 0.1, to = 0.9, by = 0.05), - verbose = TRUE + object, + assay = "HTO", + quantile = 0.7, + autoThresh = FALSE, + maxiter = 5, + qrange = seq(from = 0.1, to = 0.9, by = 0.05), + verbose = TRUE ) { assay <- assay %||% DefaultAssay(object = object) multi_data_norm <- t(x = GetAssayData( @@ -784,11 +922,11 @@ MULTIseqDemux <- function( #' } #' Read10X <- function( - data.dir, - gene.column = 2, - cell.column = 1, - unique.features = TRUE, - strip.suffix = FALSE + data.dir, + gene.column = 2, + cell.column = 1, + unique.features = TRUE, + strip.suffix = FALSE ) { full.data <- list() has_dt <- requireNamespace("data.table", quietly = TRUE) && requireNamespace("R.utils", quietly = TRUE) @@ -825,7 +963,7 @@ Read10X <- function( } else { cell.barcodes <- read.table(file = barcode.loc, header = FALSE, sep = '\t', row.names = NULL) } - + if (ncol(x = cell.barcodes) > 1) { cell.names <- cell.barcodes[, cell.column] } else { @@ -848,7 +986,7 @@ Read10X <- function( } else { colnames(x = data) <- paste0(names(x = data.dir)[i], "_", cell.names) } - + if (has_dt) { feature.names <- as.data.frame(data.table::fread(ifelse(test = pre_ver_3, yes = gene.loc, no = features.loc), header = FALSE)) } else { @@ -858,7 +996,7 @@ Read10X <- function( stringsAsFactors = FALSE ) } - + if (any(is.na(x = feature.names[, gene.column]))) { warning( 'Some features names are NA. 
Replacing NA names with ID from the opposite column requested', @@ -1105,10 +1243,10 @@ Read10X_Image <- function(image.dir, image.name = "tissue_lowres_image.png", fil #' @template note-reqdpkg #' ReadAkoya <- function( - filename, - type = c('inform', 'processor', 'qupath'), - filter = 'DAPI|Blank|Empty', - inform.quant = c('mean', 'total', 'min', 'max', 'std') + filename, + type = c('inform', 'processor', 'qupath'), + filter = 'DAPI|Blank|Empty', + inform.quant = c('mean', 'total', 'min', 'max', 'std') ) { if (!requireNamespace("data.table", quietly = TRUE)) { stop("Please install 'data.table' for this function") @@ -1426,18 +1564,18 @@ ReadAkoya <- function( #' } #' ReadMtx <- function( - mtx, - cells, - features, - cell.column = 1, - feature.column = 2, - cell.sep = "\t", - feature.sep = "\t", - skip.cell = 0, - skip.feature = 0, - mtx.transpose = FALSE, - unique.features = TRUE, - strip.suffix = FALSE + mtx, + cells, + features, + cell.column = 1, + feature.column = 2, + cell.sep = "\t", + feature.sep = "\t", + skip.cell = 0, + skip.feature = 0, + mtx.transpose = FALSE, + unique.features = TRUE, + strip.suffix = FALSE ) { all.files <- list( "expression matrix" = mtx, @@ -1536,7 +1674,7 @@ ReadMtx <- function( feature.column, ". Try specifiying a different column.", call. = FALSE - ) + ) } else { warning( "Some features names are NA in column ", @@ -1545,7 +1683,7 @@ ReadMtx <- function( replacement.column, ".", call. = FALSE - ) + ) } feature.names[na.features, feature.column] <- feature.names[na.features, replacement.column] } @@ -1569,7 +1707,7 @@ ReadMtx <- function( no = "" ), call. = FALSE - ) + ) } if (length(x = feature.names) != nrow(x = data)) { stop( @@ -1583,9 +1721,9 @@ ReadMtx <- function( no = "" ), call. = FALSE - ) + ) } - + colnames(x = data) <- cell.names rownames(x = data) <- feature.names data <- as.sparse(x = data) @@ -1669,24 +1807,24 @@ ReadMtx <- function( #' @template note-reqdpkg #' ReadNanostring <- function( - data.dir, - mtx.file = NULL, - metadata.file = NULL, - molecules.file = NULL, - segmentations.file = NULL, - type = 'centroids', - mol.type = 'pixels', - metadata = NULL, - mols.filter = NA_character_, - genes.filter = NA_character_, - fov.filter = NULL, - subset.counts.matrix = NULL, - cell.mols.only = TRUE + data.dir, + mtx.file = NULL, + metadata.file = NULL, + molecules.file = NULL, + segmentations.file = NULL, + type = 'centroids', + mol.type = 'pixels', + metadata = NULL, + mols.filter = NA_character_, + genes.filter = NA_character_, + fov.filter = NULL, + subset.counts.matrix = NULL, + cell.mols.only = TRUE ) { if (!requireNamespace("data.table", quietly = TRUE)) { stop("Please install 'data.table' for this function") } - + # Argument checking type <- match.arg( arg = type, @@ -1709,7 +1847,7 @@ ReadNanostring <- function( several.ok = TRUE ) } - + use.dir <- all(vapply( X = c(mtx.file, metadata.file, molecules.file), FUN = function(x) { @@ -1717,7 +1855,7 @@ ReadNanostring <- function( }, FUN.VALUE = logical(length = 1L) )) - + if (use.dir && !dir.exists(paths = data.dir)) { stop("Cannot find Nanostring directory ", data.dir) } @@ -1728,7 +1866,7 @@ ReadNanostring <- function( molecules.file = molecules.file %||% '[_a-zA-Z0-9]*_tx_file.csv', segmentations.file = segmentations.file %||% '[_a-zA-Z0-9]*-polygons.csv' ) - + files <- vapply( X = files, FUN = function(x) { @@ -1749,7 +1887,7 @@ ReadNanostring <- function( USE.NAMES = TRUE ) files[!file.exists(files)] <- NA_character_ - + if (all(is.na(x = files))) { stop("Cannot find Nanostring 
input files in ", data.dir) } @@ -1767,7 +1905,7 @@ ReadNanostring <- function( data.table = FALSE, verbose = FALSE ) - + # filter metadata file by FOVs if (!is.null(x = fov.filter)) { md <- md[md$fov %in% fov.filter,] @@ -1787,7 +1925,7 @@ ReadNanostring <- function( data.table = FALSE, verbose = FALSE ) - + # filter metadata file by FOVs if (!is.null(x = fov.filter)) { segs <- segs[segs$fov %in% fov.filter,] @@ -1807,17 +1945,17 @@ ReadNanostring <- function( sep = ',', verbose = FALSE ) - + # filter molecules file by FOVs if (!is.null(x = fov.filter)) { mx <- mx[mx$fov %in% fov.filter,] } - + # Molecules outside of a cell have a cell_ID of 0 if (cell.mols.only) { mx <- mx[mx$cell_ID != 0,] } - + if (!is.na(x = mols.filter)) { ppremol( message = paste("Filtering molecules with pattern", mols.filter), @@ -1833,7 +1971,7 @@ ReadNanostring <- function( files <- files[setdiff(x = names(x = files), y = 'molecules.file')] } files <- files[!is.na(x = files)] - + outs <- list("matrix"=NULL, "pixels"=NULL, "centroids"=NULL) if (!is.null(metadata)) { outs <- append(outs, list("metadata" = NULL)) @@ -1841,7 +1979,7 @@ ReadNanostring <- function( if ("segmentations" %in% type) { outs <- append(outs, list("segmentations" = NULL)) } - + for (otype in names(x = outs)) { outs[[otype]] <- switch( EXPR = otype, @@ -1868,8 +2006,8 @@ ReadNanostring <- function( } tx <- subset(tx, select = -c(fov, cell_ID)) } - - tx <- as.data.frame(t(x = as.matrix(x = tx[, -1, drop = FALSE]))) + + tx <- as.data.frame(t(x = as.matrix(x = tx))) if (!is.na(x = genes.filter)) { ptx( message = paste("Filtering genes with pattern", genes.filter), @@ -1881,7 +2019,7 @@ ReadNanostring <- function( # only keep cells with counts greater than 0 tx <- tx[, which(colSums(tx) != 0)] ratio <- getOption(x = 'Seurat.input.sparse_ratio', default = 0.4) - + if ((sum(tx == 0) / length(x = tx)) > ratio) { ptx( message = 'Converting counts to sparse matrix', @@ -1890,9 +2028,9 @@ ReadNanostring <- function( ) tx <- as.sparse(x = tx) } - + ptx(type = 'finish') - + tx }, 'centroids' = { @@ -2011,10 +2149,10 @@ ReadNanostring <- function( #' @concept preprocessing #' ReadXenium <- function( - data.dir, - outs = c("matrix", "microns"), - type = "centroids", - mols.qv.threshold = 20 + data.dir, + outs = c("matrix", "microns"), + type = "centroids", + mols.qv.threshold = 20 ) { # Argument checking type <- match.arg( @@ -2022,17 +2160,17 @@ ReadXenium <- function( choices = c("centroids", "segmentations"), several.ok = TRUE ) - + outs <- match.arg( arg = outs, choices = c("matrix", "microns"), several.ok = TRUE ) - + outs <- c(outs, type) - + has_dt <- requireNamespace("data.table", quietly = TRUE) && requireNamespace("R.utils", quietly = TRUE) - + data <- sapply(outs, function(otype) { switch( EXPR = otype, @@ -2071,7 +2209,7 @@ ReadXenium <- function( class = 'sticky', amount = 0 ) - + # load cell boundaries if (has_dt) { cell_boundaries_df <- as.data.frame(data.table::fread(file.path(data.dir, "cell_boundaries.csv.gz"))) @@ -2089,7 +2227,7 @@ ReadXenium <- function( class = 'sticky', amount = 0 ) - + # molecules if (has_dt) { tx_dt <- as.data.frame(data.table::fread(file.path(data.dir, "transcripts.csv.gz"))) @@ -2098,7 +2236,7 @@ ReadXenium <- function( transcripts <- read.csv(file.path(data.dir, "transcripts.csv.gz")) transcripts <- subset(transcripts, qv >= mols.qv.threshold) } - + df <- data.frame( x = transcripts$x_location, @@ -2216,11 +2354,11 @@ ReadSlideSeq <- function(coord.file, assay = 'Spatial') { #' } #' ReadVitessce <- function( - 
counts = NULL, - coords = NULL, - molecules = NULL, - type = c('segmentations', 'centroids'), - filter = NA_character_ + counts = NULL, + coords = NULL, + molecules = NULL, + type = c('segmentations', 'centroids'), + filter = NA_character_ ) { if (!requireNamespace('jsonlite', quietly = TRUE)) { stop("Please install 'jsonlite' for this function") @@ -3179,9 +3317,9 @@ RelativeCounts <- function(data, scale.factor = 1, verbose = TRUE) { #' @concept preprocessing #' RunMarkVario <- function( - spatial.location, - data, - ... + spatial.location, + data, + ... ) { pp <- ppp( x = spatial.location[, 1], @@ -3297,10 +3435,10 @@ RunMoransI <- function(data, pos, verbose = TRUE) { #' head(x = downsampled) #' SampleUMI <- function( - data, - max.umi = 1000, - upsample = FALSE, - verbose = FALSE + data, + max.umi = 1000, + upsample = FALSE, + verbose = FALSE ) { data <- as.sparse(x = data) if (length(x = max.umi) == 1) { @@ -3327,7 +3465,7 @@ SampleUMI <- function( #' Use regularized negative binomial regression to normalize UMI count data #' #' This function calls sctransform::vst. The sctransform package is available at -#' https://github.com/ChristophH/sctransform. +#' https://github.com/satijalab/sctransform. #' Use this function as an alternative to the NormalizeData, #' FindVariableFeatures, ScaleData workflow. Results are saved in a new assay #' (named SCT by default) with counts being (corrected) counts, data being log1p(counts), @@ -3385,24 +3523,24 @@ SampleUMI <- function( #' SCTransform(object = pbmc_small) #' SCTransform <- function( - object, - assay = 'RNA', - new.assay.name = 'SCT', - reference.SCT.model = NULL, - do.correct.umi = TRUE, - ncells = 5000, - residual.features = NULL, - variable.features.n = 3000, - variable.features.rv.th = 1.3, - vars.to.regress = NULL, - do.scale = FALSE, - do.center = TRUE, - clip.range = c(-sqrt(x = ncol(x = object[[assay]]) / 30), sqrt(x = ncol(x = object[[assay]]) / 30)), - conserve.memory = FALSE, - return.only.var.genes = TRUE, - seed.use = 1448145, - verbose = TRUE, - ... + object, + assay = 'RNA', + new.assay.name = 'SCT', + reference.SCT.model = NULL, + do.correct.umi = TRUE, + ncells = 5000, + residual.features = NULL, + variable.features.n = 3000, + variable.features.rv.th = 1.3, + vars.to.regress = NULL, + do.scale = FALSE, + do.center = TRUE, + clip.range = c(-sqrt(x = ncol(x = object[[assay]]) / 30), sqrt(x = ncol(x = object[[assay]]) / 30)), + conserve.memory = FALSE, + return.only.var.genes = TRUE, + seed.use = 1448145, + verbose = TRUE, + ... 
) { if (!is.null(x = seed.use)) { set.seed(seed = seed.use) @@ -3432,7 +3570,7 @@ SCTransform <- function( if (reference.SCT.model$model_str != 'y ~ log_umi') { stop('reference.SCT.model must be derived using default SCT regression formula, `y ~ log_umi`') } - + } # check for latent_var in meta data if ('latent_var' %in% names(x = vst.args)) { @@ -3471,7 +3609,7 @@ SCTransform <- function( vst.args[['n_cells']] <- min(ncells, ncol(x = umi)) residual.type <- vst.args[['residual_type']] %||% 'pearson' res.clip.range <- vst.args[['res_clip_range']] %||% c(-sqrt(x = ncol(x = umi)), sqrt(x = ncol(x = umi))) - # set sct normalization method + # set sct normalization method if (!is.null( reference.SCT.model)) { sct.method <- "reference.model" } else if (!is.null(x = residual.features)) { @@ -3481,7 +3619,7 @@ SCTransform <- function( } else { sct.method <- "default" } - + # set vst model vst.out <- switch( EXPR = sct.method, @@ -3538,7 +3676,7 @@ SCTransform <- function( vst.out <- do.call(what = 'vst', args = vst.args) vst.out }) - + feature.variance <- vst.out$gene_attr[,"residual_variance"] names(x = feature.variance) <- rownames(x = vst.out$gene_attr) if (verbose) { @@ -3550,7 +3688,7 @@ SCTransform <- function( } else { top.features <- names(x = feature.variance)[feature.variance >= variable.features.rv.th] } - + # get residuals vst.out <- switch( EXPR = sct.method, @@ -3760,16 +3898,16 @@ SubsetByBarcodeInflections <- function(object) { #' @export #' FindVariableFeatures.default <- function( - object, - selection.method = "vst", - loess.span = 0.3, - clip.max = 'auto', - mean.function = FastExpMean, - dispersion.function = FastLogVMR, - num.bin = 20, - binning.method = "equal_width", - verbose = TRUE, - ... + object, + selection.method = "vst", + loess.span = 0.3, + clip.max = 'auto', + mean.function = FastExpMean, + dispersion.function = FastLogVMR, + num.bin = 20, + binning.method = "equal_width", + verbose = TRUE, + ... ) { CheckDots(...) if (!inherits(x = object, 'Matrix')) { @@ -3860,19 +3998,19 @@ FindVariableFeatures.default <- function( #' @method FindVariableFeatures Assay #' FindVariableFeatures.Assay <- function( - object, - selection.method = "vst", - loess.span = 0.3, - clip.max = 'auto', - mean.function = FastExpMean, - dispersion.function = FastLogVMR, - num.bin = 20, - binning.method = "equal_width", - nfeatures = 2000, - mean.cutoff = c(0.1, 8), - dispersion.cutoff = c(1, Inf), - verbose = TRUE, - ... + object, + selection.method = "vst", + loess.span = 0.3, + clip.max = 'auto', + mean.function = FastExpMean, + dispersion.function = FastLogVMR, + num.bin = 20, + binning.method = "equal_width", + nfeatures = 2000, + mean.cutoff = c(0.1, 8), + dispersion.cutoff = c(1, Inf), + verbose = TRUE, + ... ) { if (length(x = mean.cutoff) != 2 || length(x = dispersion.cutoff) != 2) { stop("Both 'mean.cutoff' and 'dispersion.cutoff' must be two numbers") @@ -3939,9 +4077,9 @@ FindVariableFeatures.Assay <- function( #' @method FindVariableFeatures SCTAssay #' FindVariableFeatures.SCTAssay <- function( - object, - nfeatures = 2000, - ... + object, + nfeatures = 2000, + ... ) { if (length(x = slot(object = object, name = "SCTModel.list")) > 1) { stop("SCT assay is comprised of multiple SCT models. To change the variable features, please set manually with VariableFeatures<-", call. 
= FALSE) @@ -3961,20 +4099,20 @@ FindVariableFeatures.SCTAssay <- function( #' @method FindVariableFeatures Seurat #' FindVariableFeatures.Seurat <- function( - object, - assay = NULL, - selection.method = "vst", - loess.span = 0.3, - clip.max = 'auto', - mean.function = FastExpMean, - dispersion.function = FastLogVMR, - num.bin = 20, - binning.method = "equal_width", - nfeatures = 2000, - mean.cutoff = c(0.1, 8), - dispersion.cutoff = c(1, Inf), - verbose = TRUE, - ... + object, + assay = NULL, + selection.method = "vst", + loess.span = 0.3, + clip.max = 'auto', + mean.function = FastExpMean, + dispersion.function = FastLogVMR, + num.bin = 20, + binning.method = "equal_width", + nfeatures = 2000, + mean.cutoff = c(0.1, 8), + dispersion.cutoff = c(1, Inf), + verbose = TRUE, + ... ) { assay <- assay %||% DefaultAssay(object = object) assay.data <- GetAssay(object = object, assay = assay) @@ -4030,14 +4168,14 @@ FindVariableFeatures.Seurat <- function( #' #' FindSpatiallyVariableFeatures.default <- function( - object, - spatial.location, - selection.method = c('markvariogram', 'moransi'), - r.metric = 5, - x.cuts = NULL, - y.cuts = NULL, - verbose = TRUE, - ... + object, + spatial.location, + selection.method = c('markvariogram', 'moransi'), + r.metric = 5, + x.cuts = NULL, + y.cuts = NULL, + verbose = TRUE, + ... ) { # error check dimensions if (ncol(x = object) != nrow(x = spatial.location)) { @@ -4082,17 +4220,17 @@ FindSpatiallyVariableFeatures.default <- function( #' @export #' FindSpatiallyVariableFeatures.Assay <- function( - object, - slot = "scale.data", - spatial.location, - selection.method = c('markvariogram', 'moransi'), - features = NULL, - r.metric = 5, - x.cuts = NULL, - y.cuts = NULL, - nfeatures = nfeatures, - verbose = TRUE, - ... + object, + slot = "scale.data", + spatial.location, + selection.method = c('markvariogram', 'moransi'), + features = NULL, + r.metric = 5, + x.cuts = NULL, + y.cuts = NULL, + nfeatures = nfeatures, + verbose = TRUE, + ... ) { features <- features %||% rownames(x = object) if (selection.method == "markvariogram" && "markvariogram" %in% names(x = Misc(object = object))) { @@ -4174,18 +4312,18 @@ FindSpatiallyVariableFeatures.Assay <- function( #' @export #' FindSpatiallyVariableFeatures.Seurat <- function( - object, - assay = NULL, - slot = "scale.data", - features = NULL, - image = NULL, - selection.method = c('markvariogram', 'moransi'), - r.metric = 5, - x.cuts = NULL, - y.cuts = NULL, - nfeatures = 2000, - verbose = TRUE, - ... + object, + assay = NULL, + slot = "scale.data", + features = NULL, + image = NULL, + selection.method = c('markvariogram', 'moransi'), + r.metric = 5, + x.cuts = NULL, + y.cuts = NULL, + nfeatures = 2000, + verbose = TRUE, + ... ) { assay <- assay %||% DefaultAssay(object = object) features <- features %||% rownames(x = object[[assay]]) @@ -4233,13 +4371,13 @@ FindSpatiallyVariableFeatures.Seurat <- function( #' @export #' NormalizeData.default <- function( - object, - normalization.method = "LogNormalize", - scale.factor = 1e4, - margin = 1, - block.size = NULL, - verbose = TRUE, - ... + object, + normalization.method = "LogNormalize", + scale.factor = 1e4, + margin = 1, + block.size = NULL, + verbose = TRUE, + ... ) { CheckDots(...) if (is.null(x = normalization.method)) { @@ -4340,12 +4478,12 @@ NormalizeData.default <- function( #' @method NormalizeData Assay #' NormalizeData.Assay <- function( - object, - normalization.method = "LogNormalize", - scale.factor = 1e4, - margin = 1, - verbose = TRUE, - ... 
+ object, + normalization.method = "LogNormalize", + scale.factor = 1e4, + margin = 1, + verbose = TRUE, + ... ) { object <- SetAssayData( object = object, @@ -4377,13 +4515,13 @@ NormalizeData.Assay <- function( #' } #' NormalizeData.Seurat <- function( - object, - assay = NULL, - normalization.method = "LogNormalize", - scale.factor = 1e4, - margin = 1, - verbose = TRUE, - ... + object, + assay = NULL, + normalization.method = "LogNormalize", + scale.factor = 1e4, + margin = 1, + verbose = TRUE, + ... ) { assay <- assay %||% DefaultAssay(object = object) assay.data <- GetAssay(object = object, assay = assay) @@ -4433,20 +4571,20 @@ NormalizeData.Seurat <- function( #' @export #' ScaleData.default <- function( - object, - features = NULL, - vars.to.regress = NULL, - latent.data = NULL, - split.by = NULL, - model.use = 'linear', - use.umi = FALSE, - do.scale = TRUE, - do.center = TRUE, - scale.max = 10, - block.size = 1000, - min.cells.to.block = 3000, - verbose = TRUE, - ... + object, + features = NULL, + vars.to.regress = NULL, + latent.data = NULL, + split.by = NULL, + model.use = 'linear', + use.umi = FALSE, + do.scale = TRUE, + do.center = TRUE, + scale.max = 10, + block.size = 1000, + min.cells.to.block = 3000, + verbose = TRUE, + ... ) { CheckDots(...) features <- features %||% rownames(x = object) @@ -4674,20 +4812,20 @@ ScaleData.default <- function( #' @method ScaleData Assay #' ScaleData.Assay <- function( - object, - features = NULL, - vars.to.regress = NULL, - latent.data = NULL, - split.by = NULL, - model.use = 'linear', - use.umi = FALSE, - do.scale = TRUE, - do.center = TRUE, - scale.max = 10, - block.size = 1000, - min.cells.to.block = 3000, - verbose = TRUE, - ... + object, + features = NULL, + vars.to.regress = NULL, + latent.data = NULL, + split.by = NULL, + model.use = 'linear', + use.umi = FALSE, + do.scale = TRUE, + do.center = TRUE, + scale.max = 10, + block.size = 1000, + min.cells.to.block = 3000, + verbose = TRUE, + ... ) { use.umi <- ifelse(test = model.use != 'linear', yes = TRUE, no = use.umi) slot.use <- ifelse(test = use.umi, yes = 'counts', no = 'data') @@ -4732,20 +4870,20 @@ ScaleData.Assay <- function( #' @method ScaleData Seurat #' ScaleData.Seurat <- function( - object, - features = NULL, - assay = NULL, - vars.to.regress = NULL, - split.by = NULL, - model.use = 'linear', - use.umi = FALSE, - do.scale = TRUE, - do.center = TRUE, - scale.max = 10, - block.size = 1000, - min.cells.to.block = 3000, - verbose = TRUE, - ... + object, + features = NULL, + assay = NULL, + vars.to.regress = NULL, + split.by = NULL, + model.use = 'linear', + use.umi = FALSE, + do.scale = TRUE, + do.center = TRUE, + scale.max = 10, + block.size = 1000, + min.cells.to.block = 3000, + verbose = TRUE, + ... 
) { assay <- assay %||% DefaultAssay(object = object) assay.data <- GetAssay(object = object, assay = assay) @@ -5145,13 +5283,13 @@ FindThresh <- function(call.list) { #' @importFrom sctransform get_residuals # GetResidualSCTModel <- function( - object, - assay, - SCTModel, - new_features, - clip.range, - replace.value, - verbose + object, + assay, + SCTModel, + new_features, + clip.range, + replace.value, + verbose ) { clip.range <- clip.range %||% SCTResults(object = object[[assay]], slot = "clips", model = SCTModel)$sct model.features <- rownames(x = SCTResults(object = object[[assay]], slot = "feature.attributes", model = SCTModel)) @@ -5160,14 +5298,14 @@ GetResidualSCTModel <- function( sct.method <- SCTResults(object = object[[assay]], slot = "arguments", model = SCTModel)$sct.method %||% "default" scale.data.cells <- colnames(x = GetAssayData(object = object, assay = assay, slot = "scale.data")) if (length(x = setdiff(x = model.cells, y = scale.data.cells)) == 0) { - existing_features <- names(x = which(x = ! apply( - X = GetAssayData(object = object, assay = assay, slot = "scale.data")[, model.cells], - MARGIN = 1, - FUN = anyNA) - )) - } else { - existing_features <- character() - } + existing_features <- names(x = which(x = ! apply( + X = GetAssayData(object = object, assay = assay, slot = "scale.data")[, model.cells], + MARGIN = 1, + FUN = anyNA) + )) + } else { + existing_features <- character() + } if (replace.value) { features_to_compute <- new_features } else { @@ -5181,7 +5319,7 @@ GetResidualSCTModel <- function( } if (!umi.assay %in% Assays(object = object)) { warning("The umi assay (", umi.assay, ") is not present in the object. ", - "Cannot compute additional residuals.", call. = FALSE, immediate. = TRUE) + "Cannot compute additional residuals.", call. = FALSE, immediate. = TRUE) return(NULL) } diff_features <- setdiff(x = features_to_compute, y = model.features) @@ -5319,12 +5457,12 @@ NBResiduals <- function(fmla, regression.mat, gene, return.mode = FALSE) { #' @importFrom utils txtProgressBar setTxtProgressBar # RegressOutMatrix <- function( - data.expr, - latent.data = NULL, - features.regress = NULL, - model.use = NULL, - use.umi = FALSE, - verbose = TRUE + data.expr, + latent.data = NULL, + features.regress = NULL, + model.use = NULL, + use.umi = FALSE, + verbose = TRUE ) { # Do we bypass regression and simply return data.expr? bypass <- vapply( From f8461ca9161ea033b6c483118d3b0ec99029b39c Mon Sep 17 00:00:00 2001 From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com> Date: Fri, 18 Aug 2023 14:30:06 +0200 Subject: [PATCH 18/24] cleaning `ReadVizgen` few small edits for cleaner code. --- R/preprocessing.R | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/R/preprocessing.R b/R/preprocessing.R index c53f20e43..e5bc7d83b 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -2936,8 +2936,8 @@ ReadVizgen <- function( } ptx(type = "finish") tx - - }, centroids = { + }, + centroids = { pcents <- progressor() pcents(message = "Creating centroid coordinates", class = "sticky", amount = 0) @@ -2946,7 +2946,8 @@ ReadVizgen <- function( stringsAsFactors = FALSE) # use segmentations from ".parquet" - }, segmentations = { + }, + segmentations = { if (use.parquet) { if (length(parq) > 1) { # eg, if two files are present: @@ -3040,8 +3041,8 @@ ReadVizgen <- function( # check if any cells have > 1 segmentation boundary # TODO: (optionally) optimize sanity code using purrr? 
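Since the artifact check that continues below is easy to misread, here is a toy, self-contained illustration of the same idea. The `segs` list is a made-up stand-in for the per-cell polygon lists read from the boundaries file (each element a list of x/y coordinate matrices); it is not data from this patch.

```r
# Toy stand-in: artifacts show up as extra, small polygons per cell.
segs <- list(
  cell1 = list(cbind(x = c(0, 1, 1, 0), y = c(0, 0, 1, 1))),
  cell2 = list(
    cbind(x = c(0, 2, 2, 0), y = c(0, 0, 2, 2)),   # plausible boundary
    cbind(x = c(0, 0.1, 0.1), y = c(0, 0, 0.1))    # small artifact
  )
)
test.segs <- lengths(segs)             # boundaries per cell
segs.art.index <- which(test.segs > 1) # cells with > 1 polygon
for (i in segs.art.index) {
  # keep the boundary with the most vertices, as the code below does
  n.coords <- vapply(segs[[i]], nrow, integer(1))
  segs[[i]] <- segs[[i]][which.max(n.coords)]
}
lengths(segs)                          # every cell now has one boundary
```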
test.segs <- lapply(segs %>% seq, function(i) segs[[i]] %>% length) %>% unlist() - if (any(test.segs %>% unlist > 1)) { - segs.art.index <- which(test.segs %>% unlist > 1) + if (any(test.segs > 1)) { + segs.art.index <- which(test.segs > 1) if (verbose) { message("Found ", segs.art.index %>% length, c(" cells with > 1 polygon artifacts:", @@ -3192,7 +3193,8 @@ ReadVizgen <- function( } pg # final segmentaions out.. } - }, boxes = { + }, + boxes = { pbox <- progressor(steps = nrow(x = sp)) pbox(message = "Creating box coordinates", class = "sticky", amount = 0) @@ -3229,13 +3231,15 @@ ReadVizgen <- function( } pbox(type = "finish") do.call(what = "rbind", args = bx) - }, metadata = { + }, + metadata = { pmeta <- progressor() pmeta(message = "Loading metadata", class = "sticky", amount = 0) pmeta(type = "finish") sp[, metadata, drop = FALSE] - }, pixels = { + }, + pixels = { ppixels <- progressor() ppixels(message = "Creating pixel-level molecule coordinates", class = "sticky", amount = 0) @@ -3243,7 +3247,8 @@ ReadVizgen <- function( stringsAsFactors = FALSE) ppixels(type = "finish") df - }, microns = { + }, + microns = { pmicrons <- progressor() pmicrons(message = "Creating micron-level molecule coordinates", class = "sticky", amount = 0) From a7be25a7c209a7b958c673d648896bced133ebb1 Mon Sep 17 00:00:00 2001 From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com> Date: Tue, 22 Aug 2023 14:17:18 +0200 Subject: [PATCH 19/24] small bug fix in `ReadVizgen` --- R/preprocessing.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/preprocessing.R b/R/preprocessing.R index e5bc7d83b..3b4badf8d 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -3102,7 +3102,7 @@ ReadVizgen <- function( segs_list <- future.apply::future_lapply(segs %>% seq, function(i) { - segs[[i]] %>% + segs[[i]][[1]] %>% data.table::as.data.table(.) %>% mutate(cell = names(segs)[i]) } From 6352d56af8e839cfc5c1237e96e4a068f22f40ea Mon Sep 17 00:00:00 2001 From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com> Date: Mon, 28 Aug 2023 15:50:05 +0200 Subject: [PATCH 20/24] added `sf` & filter polygons -> `ReadVizgen()` ..inspired by [`SpatialFeatureExperiment`](https://github.com/pachterlab/SpatialFeatureExperiment) using`sf` package to work/filter segmentation geometries. Updates: - adding support to efficiently filter segmentation polygons (removing small artifacts etc..) 
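  To make the area threshold in the bullet above concrete, a minimal self-contained sketch with toy polygons follows (this illustrates the idea only; it is not the `.filter_polygons()` helper that the next bullet introduces):

```r
library(sf)

# One cell whose segmentation came back as two pieces: a plausible
# boundary and a small sliver artifact.
big  <- st_polygon(list(cbind(c(0, 4, 4, 0, 0), c(0, 0, 4, 4, 0))))
tiny <- st_polygon(list(cbind(c(0, 1, 0.5, 0), c(0, 0, 1, 0))))

pieces <- st_cast(st_combine(st_sfc(big, tiny)), "POLYGON")
as.numeric(st_area(pieces))   # 16 and 0.5 (squared coordinate units)

min.area <- 5                 # the default proposed in this patch
kept <- pieces[as.numeric(st_area(pieces)) > min.area]
kept                          # only the large boundary survives
```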
- added helper function `.filter_polygons()` (a bit modified version from `SpatialFeatureExperiment:::.filter_polygons`) for filtering given `min.area` - minimal polygon area as a threshold - allow for multiple segmentation polygons per cell (..still using single z-plane) --- R/preprocessing.R | 197 +++++++++++++--------------------------------- 1 file changed, 53 insertions(+), 144 deletions(-) diff --git a/R/preprocessing.R b/R/preprocessing.R index 3b4badf8d..34d291284 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -2535,8 +2535,8 @@ ReadVitessce <- function( #' @param workers.MulticoreParam Number of cores to use for \code{BiocParallel::bplapply} #' @param DTthreads.pct Optional, set percentage eg \code{50} of total threads to use for \code{data.table::fread}, #' if set to \code{NULL} will use default setting as in \code{data.table::getDTthreads(verbose = T)} -#' @param use.furrr When working with lists, use \code{furrr} with \code{future} parallelization; -#' if \conde{FALSE} standard \code{purrr} will be used +#' @param min.area Minimal polygon area to use as a threshold for filtering, if set to \code{NULL} +#' the maximal area (ie, the largest polygon) will be used instead. #' @param verbose If to print processing messages using \code{message}; default is to \code{FALSE} #' #' @return \code{ReadVizgen}: A list with some combination of the @@ -2561,11 +2561,9 @@ ReadVitessce <- function( #' #' @importFrom future.apply future_lapply #' @import dplyr -#' @import tibble -#' @import purrr -#' @import furrr +#' @importFrom purrr lmap +#' @import sf #' @import magrittr -#' @importFrom spatstat.geom is.empty #' @importFrom progressr progressor #' #' @export @@ -2593,15 +2591,15 @@ ReadVizgen <- function( use.BiocParallel = TRUE, workers.MulticoreParam = 12, DTthreads.pct = NULL, - use.furrr = TRUE, + min.area = 5, verbose = FALSE ) { # TODO: handle multiple segmentations per z-plane # NOTE: this is only needed when segmentations differ between z-planes - # packages that needs to be installed a priori + # packages that need to be installed a priori pkgs <- c("data.table", "arrow", "sfarrow", - "tidyverse", "furrr", "BiocParallel", "Matrix") + "tidyverse", "sf", "BiocParallel", "Matrix") lapply(pkgs %>% length %>% seq, function(i) { !requireNamespace(pkgs[i], quietly = TRUE) } ) %>% unlist %>% @@ -2688,7 +2686,7 @@ ReadVizgen <- function( list.files(data.dir, pattern = ".parquet$", full.names = TRUE) %>% - { if (is.empty(.)) { + { if (length(.) == 0) { # look in the sub directory (if nothing is found) list.files(data.dir, pattern = ".parquet$", @@ -2710,7 +2708,7 @@ ReadVizgen <- function( list.files(data.dir, pattern = "cell_by_gene", full.names = TRUE) %>% - { if (is.empty(.)) { + { if (length(.) == 0) { list.files(data.dir, pattern = "cell_by_gene", full.names = TRUE, @@ -2727,7 +2725,7 @@ ReadVizgen <- function( list.files(data.dir, pattern = "cell_metadata", full.names = TRUE) %>% - { if (is.empty(.)) { + { if (length(.) == 0) { list.files(data.dir, pattern = "cell_metadata", full.names = TRUE, @@ -2744,7 +2742,7 @@ ReadVizgen <- function( list.files(data.dir, pattern = "detected_transcripts", full.names = TRUE) %>% - { if (is.empty(.)) { + { if (length(.) 
== 0) { list.files(data.dir, pattern = "detected_transcripts", full.names = TRUE, @@ -2858,8 +2856,8 @@ ReadVizgen <- function( class = 'sticky', amount = 0 ) - - # load molecules + + # optionally - load molecules mx <- data.table::fread( file = files[['molecules']], sep = ',', @@ -2899,9 +2897,6 @@ ReadVizgen <- function( tx <- data.table::fread(file = files[[otype]], sep = ",", data.table = FALSE, verbose = FALSE) rownames(x = tx) <- as.character(x = tx[, 1]) - # avoid converting to dense matrix? - #tx <- t(x = as.matrix(x = tx[, -1, drop = FALSE])) - # keep cells with `transcript_count` > 0 if (verbose) { message(">>> filtering `cell_by_gene` - keep cells with counts > 0") } tx %<>% select(-1) %>% @@ -2910,8 +2905,6 @@ ReadVizgen <- function( # match cells to filtered data from spatial (cell_metadata) filter(., rownames(.) %in% rownames(sp)) } else { (.) } } %>% # return filtered count data - # transpose data.frame without converting to dense matrix - #t() %>% as.sparse() %>% # convert to sparse matrix Matrix::t() # transpose sparse matrix @@ -2936,6 +2929,7 @@ ReadVizgen <- function( } ptx(type = "finish") tx + }, centroids = { pcents <- progressor() @@ -2944,10 +2938,9 @@ ReadVizgen <- function( pcents(type = "finish") data.frame(x = sp$center_x, y = sp$center_y, cell = rownames(x = sp), stringsAsFactors = FALSE) - - # use segmentations from ".parquet" }, segmentations = { + # use segmentations from ".parquet" if (use.parquet) { if (length(parq) > 1) { # eg, if two files are present: @@ -2962,7 +2955,7 @@ ReadVizgen <- function( grep(gsub("s", "", mol.type), ., value = TRUE) } } %>% - { if (is.empty(.)) { + { if (length(.) == 0) { # only if single ".parquet" file present # eg, `cell_boundaries.parquet` parq %>% @@ -2970,131 +2963,42 @@ ReadVizgen <- function( } else { (.) } } } - # Read .parquet file - parq %<>% sfarrow::st_read_parquet(.) + # Read .parquet file ---- + parq %<>% sfarrow::st_read_parquet(.) 
%>% + # keep only selected z-plane + filter(., ZIndex == z) - # get all cell segmentations - segs <- filter(parq, ZIndex == z) %>% - pull(Geometry) %>% - # add cell ID - set_names(filter(parq, ZIndex == z) %>% - pull(EntityID) %>% as.character) + # Sanity checks on segmentation polygons + # Using `sf` for filtering polygons ---- + # keep multiple polygons pieces per single cell id + parq %<>% + .filter_polygons(., + min.area = min.area, + BPPARAM = + if (exists("BPParam")) + { BPParam } else { SerialParam() }, + verbose = verbose) - ## Sanity checks on segmentation polygons - part 1 ---- - test.segs <- lapply(segs %>% seq, - function(i) segs[[i]] %>% length) %>% unlist() - if (any(test.segs > 1)) { - segs.art.index <- which(test.segs > 1) - segs.empty.index <- which(test.segs < 1) - if (verbose) { message("Sanity checks on cell segmentation polygons:", "\n", - ">>> found ", segs.art.index %>% length, - " cells with > 1 (nested) polygon lists:", "\n", - ">>> flattening polygon lists", "\n", - if (c(segs.empty.index %>% length) > 0) { - paste0(">>> removing ", - segs.empty.index %>% length, - " empty polygon lists") } - ) } - - # step 1 - find polygon nested lists with > 1 length - # get original planned session if exists - if (is(future::plan(), "multisession")) { - orig.plan <- future::plan() - } - segs.1 <- segs %>% - purrr::keep(., c(purrr::map(., length) %>% unlist > 1)) %>% - # flatten each list - { - if (use.furrr) { - # set temporary workers - f.plan <- future::plan("multisession", workers = 4L, gc = TRUE) - #on.exit(f.plan %>% future::plan()) # exiting doesn't to work with furrr - # using furrr (faster purrr with future) - furrr::future_map(., purrr::flatten) - } else { - purrr::map(., purrr::flatten) - } - } %>% suppressPackageStartupMessages() - # set originally plannned session back - if (exists("orig.plan")) { - future::plan(orig.plan) - } else { - # or plan sequential - future::plan("sequential") - } - - # step 2 - apply multiple filtering - segs.2 <- - segs %>% - # remove empty elements - purrr::keep(., !c(purrr::map(., length) %>% unlist < 1)) %>% - # keep lists with length == 1 polygon information - purrr::keep(., c(purrr::map(., length) %>% unlist == 1)) %>% - # collapse to a list - purrr::flatten(.) - - # step 3 - combine segmentaion lists - segs <- c(segs.1, segs.2) - } - - ## Sanity checks on segmentation polygons - part 2 ---- - # check if any cells have > 1 segmentation boundary - # TODO: (optionally) optimize sanity code using purrr? - test.segs <- lapply(segs %>% seq, function(i) segs[[i]] %>% length) %>% unlist() - if (any(test.segs > 1)) { - segs.art.index <- which(test.segs > 1) - if (verbose) { - message("Found ", segs.art.index %>% length, - c(" cells with > 1 polygon artifacts:", - ">>> removing artifacts", - ">>> keeping cell boundary with maximum coords") %>% - paste0(., "\n")) - } - # usually artifacts have small boundaries/coords - # find cell boundaries with maximum coords - - # TODO: (optionally) - # - calculate geometric properties, eg circularity: - # - keep single polygon with high circularity values (likely a cell?) 
-
-          for (i in segs.art.index %>% seq) {
-            dims <- lapply(segs[[segs.art.index[i]]] %>% seq(),
-                           function(d) { dim(segs[[segs.art.index[i]]][[d]])[1] } )
-            # get & keep boundaries with maximum coords
-            maxs.segs <- which(unlist(dims) == max(unlist(dims)))
-            segs[[segs.art.index[i]]] <- segs[[segs.art.index[i]]][maxs.segs]
-          }
-        } else { if (verbose) { message("All cells have single polygon boundary (no artifacts)") } }
-
-        # some cells might have > 1 polygon boundary with identical lenght
-        #..in this case, keep only the 1st polygon boundary
-        test.segs <- lapply(segs %>% seq, function(i) segs[[i]] %>% length) %>% unlist()
-        if (any(test.segs > 1)) {
-          segs.art.index <- which(test.segs > 1)
-          if (verbose) { message("Additionally found ", segs.art.index %>% length,
-                                 c(" cells with > 1 polygons (identical length):",
-                                   ">>> only the 1st polygon boundary will be kept") %>%
-                                   paste0(., "\n")) }
-          for (i in segs.art.index %>% seq) {
-            # TODO: (optionally)
-            # - calculate geometric properties, eg circularity:
-            # - keep single polygon with high circularity values (likely a cell?)
-
-            segs[[segs.art.index[i]]] <- segs[[segs.art.index[i]]][1]
-          }
-        }
+        # Get filtered segmentation geometries/polygons
+        segs <-
+          parq %>%
+          st_geometry() %>%
+          # add cell ID
+          set_names(pull(parq, EntityID) %>% as.character)

-        ## Extract cell boundaries per cells
+        # Extract cell boundaries per cell id ----
         # TODO: (optionally) resample & make cell boundaries equidistant?
-        # TODO: (optionally) optimize with purrr::map to get df per list instance?
         if (use.BiocParallel) {
           gc() %>% invisible() # free up memory
           if (verbose) { message("Extracting cell segmentations - using `BiocParallel`") }
           segs_list <- BiocParallel::bplapply(segs %>% seq,
                                               function(i) {
-                                                segs[[i]][[1]] %>%
-                                                  data.table::as.data.table(.) %>%
+                                                # keep multiple polygons per cell
+                                                segs[[i]] %>%
+                                                  unique() %>% # remove any duplicates
+                                                  purrr::lmap(.f = data.table::as.data.table) %>%
+                                                  data.table::rbindlist() %>%
                                                   mutate(cell = names(segs)[i])
                                               }, BPPARAM = BPParam)
         } else {
           segs_list <- future.apply::future_lapply(segs %>% seq,
                                                    function(i) {
-                                                     segs[[i]][[1]] %>%
-                                                       data.table::as.data.table(.) %>%
+                                                     segs[[i]] %>%
+                                                       unique() %>% # remove any duplicates
+                                                       purrr::lmap(.f = data.table::as.data.table) %>%
+                                                       data.table::rbindlist() %>%
                                                        mutate(cell = names(segs)[i])
                                                    }
           )
         }
         segs <- data.table::rbindlist(segs_list) %>%
           data.table::setnames(c("x", "y", "cell"))
-        #names(segs)[1:2] <- c("x", "y")
-        if (verbose) { message("All cell segmentations are loaded..") }
-        segs
+        if (verbose) { message("All cell segmentations are loaded..") }
+        npg <- length(x = unique(x = segs$cell))
+        if (npg < nrow(x = sp)) {
+          warning(nrow(x = sp) - npg, " cells missing polygon information",
+                  immediate. = TRUE) }
+        segs # final segmentations out..
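Before the legacy `.hdf5` branch that follows, it may help to see what the flattening above produces: each cell's geometry becomes rows of a plain x/y/cell table that `CreateSegmentation()` can consume. A condensed sketch with toy geometries, using `st_coordinates()` for brevity rather than the `lmap()`/`rbindlist()` chain in the hunk above (the cell names are EntityID stand-ins):

```r
library(sf)
library(data.table)

segs <- st_sfc(
  st_polygon(list(cbind(c(0, 1, 1, 0, 0), c(0, 0, 1, 1, 0)))),
  st_polygon(list(cbind(c(2, 3, 3, 2, 2), c(2, 2, 3, 3, 2))))
)
names(segs) <- c("cell_1", "cell_2")   # EntityID stand-ins

segs_list <- lapply(seq_along(segs), function(i) {
  xy <- st_coordinates(segs[[i]])      # vertex matrix with X/Y columns
  data.table(x = xy[, "X"], y = xy[, "Y"], cell = names(segs)[i])
})
segs_df <- rbindlist(segs_list)
segs_df                                # x / y / cell, one row per vertex
```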
+ } else { # else use ".hdf5" files from ./cell_boundaries (older version) ppoly <- progressor(steps = length(x = unique(x = sp$fov))) ppoly(message = "Creating polygon coordinates", class = "sticky", From f43fed0f89b3c52dcbf6024e5b1e8e7e11996cb4 Mon Sep 17 00:00:00 2001 From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com> Date: Mon, 28 Aug 2023 15:50:37 +0200 Subject: [PATCH 21/24] ..updated `LoadVizgen()` --- R/convenience.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/convenience.R b/R/convenience.R index 5e9b0cf74..82222b409 100644 --- a/R/convenience.R +++ b/R/convenience.R @@ -155,6 +155,7 @@ LoadVizgen <- function( add.zIndex = TRUE, update.object = TRUE, add.molecules = TRUE, + min.area = 5, verbose, ...) { @@ -163,6 +164,7 @@ LoadVizgen <- function( mol.type = mol.type, filter = filter, z = z, + min.area = min.area, verbose = verbose, ...) @@ -275,7 +277,6 @@ LoadVizgen <- function( if (verbose) { message("Object is ready!") } return(obj) - } #' @return \code{LoadXenium}: A \code{\link[SeuratObject]{Seurat}} object From 69ba89d5bd84266dd47d48bd231cf6729a3e372b Mon Sep 17 00:00:00 2001 From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com> Date: Mon, 28 Aug 2023 16:14:08 +0200 Subject: [PATCH 22/24] adding `.filter_polygons()` --- R/preprocessing.R | 79 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/R/preprocessing.R b/R/preprocessing.R index 34d291284..8a1ad46ed 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -2484,6 +2484,85 @@ ReadVitessce <- function( return(outs) } +#' @import dplyr +#' @import sf +#' @import magrittr +#' Helper function to filter geometryy polygons using \code{sf} package +#' modified function from \code{SpatialFeatureExperiment:::.filter_polygons} +.filter_polygons <- + function(polys, # object of class \code{sf} and \code{data.frame} + min.area, # minimal polygon area to use as a threshold + BPPARAM = BiocParallel::SerialParam(), + verbose) { + # Sanity check on nested polygon lists + test.segs <- lapply(polys %>% st_geometry() %>% seq, function(i) { + polys %>% + st_geometry() %>% .[[i]] %>% length() }) %>% unlist() + if (any(test.segs > 1)) { + segs.art.index <- which(test.segs > 1) + if (verbose) { message("Sanity checks on cell segmentation polygons:", "\n", + ">>> ..found ", segs.art.index %>% length, + " cells with (nested) polygon lists", "\n", + ">>> ..applying filtering") } + } + # remove empty elements + polys.orig <- polys + polys %<>% filter(!st_is_empty(.)) + empty.inds <- which(!polys.orig$ID %in% polys$ID) + if (verbose && length(empty.inds)) { message(">>> ..removing ", + length(empty.inds), " empty polygons") } + + if (st_geometry_type(polys, by_geometry = FALSE) == "MULTIPOLYGON") { + polys_sep <- lapply(st_geometry(polys), function(x) { + st_cast(st_sfc(x), "POLYGON") + }) + areas <- lapply(polys_sep, st_area) + + if (!is.null(min.area)) { + which_keep <- lapply(areas, function(x) which(x > min.area)) + multi_inds <- which(lengths(which_keep) > 1L) + if (length(multi_inds)) { + if (verbose) { message("There are ", length(multi_inds), " cells with multiple", + " pieces in cell segmentation larger than min.area,", + " whose first 10 indices are: ", + paste(multi_inds %>% head(10), # print only first 10 indices + collapse = ", "), + ". 
The largest piece is kept.") }
+                which_keep[multi_inds] <- lapply(areas[multi_inds], which.max)
+            }
+            inds <- lengths(which_keep) > 0L
+            polys <- polys[inds,]
+            # using parallelization, else can take a while when `which_keep` length approaches 100K
+            which_keep <- unlist(which_keep[inds])
+            geo <- st_geometry(polys)
+            new_geo <-
+                BiocParallel::bplapply(seq_along(which_keep), function(i) {
+                    geo[[i]] <- st_cast(geo[i], "POLYGON")[[which_keep[i]]] %>%
+                        unique() %>% # remove any duplicates
+                        st_polygon()
+                }, BPPARAM = BPPARAM) |> st_sfc()
+            st_geometry(polys) <- new_geo
+        } else if (is.null(min.area)) {
+            # use only the maximal area, i.e. the largest polygon
+            if (verbose) { message(">>> ..keeping polygons with the largest area only") }
+            which_keep <-
+                lapply(areas, function(x) which.max(x)) %>% unlist()
+            geo <- st_geometry(polys)
+            new_geo <-
+                BiocParallel::bplapply(seq_along(which_keep), function(i) {
+                    geo[[i]] <- st_cast(geo[i], "POLYGON")[[which_keep[i]]] %>%
+                        unique() %>% # remove any duplicates
+                        st_polygon()
+                }, BPPARAM = BPPARAM) |> st_sfc()
+            st_geometry(polys) <- new_geo
+        }
+    } else {
+        inds <- st_area(st_geometry(polys)) > min.area
+        polys <- polys[inds,]
+    }
+    polys
+  }

From 038271fe97d2a4d184cfd540e6ed5ee22e9b8e21 Mon Sep 17 00:00:00 2001
From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com>
Date: Mon, 28 Aug 2023 16:35:29 +0200
Subject: [PATCH 23/24] optimized parallelization for `.filter_polygons()`

..optimized `BiocParallel` parallelization for `.filter_polygons()`, i.e.
using 2 workers fewer than the total number of provided workers
---
 R/preprocessing.R | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/R/preprocessing.R b/R/preprocessing.R
index 8a1ad46ed..ba87ffd07 100644
--- a/R/preprocessing.R
+++ b/R/preprocessing.R
@@ -3054,8 +3054,12 @@ ReadVizgen <- function(
         .filter_polygons(.,
                          min.area = min.area,
                          BPPARAM =
-                           if (exists("BPParam"))
-                           { BPParam } else { SerialParam() },
+                           if (BPParam$workers > 2) {
+                             # use 2 workers less than total workers
+                             BiocParallel::MulticoreParam(BPParam$workers - 2, tasks = 50L,
+                                                          force.GC = FALSE, progressbar = TRUE)
+                           } else if (!BPParam$workers > 2)
+                           { BPParam } else { SerialParam() },
                          verbose = verbose)

From 9db188a99af6881d449f05dabec7d9e20d581c48 Mon Sep 17 00:00:00 2001
From: alikhuseynov <52053807+alikhuseynov@users.noreply.github.com>
Date: Thu, 15 Feb 2024 12:03:12 +0100
Subject: [PATCH 24/24] rm space-only changes in `convenience.R`

---
 R/convenience.R | 94 ++++++++++++++++----------------
 1 file changed, 47 insertions(+), 47 deletions(-)

diff --git a/R/convenience.R b/R/convenience.R
index 82222b409..4a496e513 100644
--- a/R/convenience.R
+++ b/R/convenience.R
@@ -20,11 +20,11 @@ NULL
 #' @rdname ReadAkoya
 #'
 LoadAkoya <- function(
-    filename,
-    type = c('inform', 'processor', 'qupath'),
-    fov,
-    assay = 'Akoya',
-    ...
+  filename,
+  type = c('inform', 'processor', 'qupath'),
+  fov,
+  assay = 'Akoya',
+  ...
) { # read in matrix and centroids data <- ReadAkoya(filename = filename, type = type) @@ -114,8 +114,8 @@ LoadNanostring <- function(data.dir, fov, assay = 'Nanostring') { molecules = data$pixels, assay = assay ) - obj <- CreateSeuratObject(counts = data$matrix, assay = assay) - +obj <- CreateSeuratObject(counts = data$matrix, assay = assay) + # subset both object and coords based on the cells shared by both cells <- intersect( Cells(x = coords, boundary = "segmentation"), @@ -298,7 +298,7 @@ LoadXenium <- function(data.dir, fov = 'fov', assay = 'Xenium') { type = c("centroids", "segmentations"), ) - segmentations.data <- list( +segmentations.data <- list( "centroids" = CreateCentroids(data$centroids), "segmentation" = CreateSegmentation(data$segmentations) ) @@ -309,7 +309,7 @@ LoadXenium <- function(data.dir, fov = 'fov', assay = 'Xenium') { assay = assay ) - xenium.obj <- CreateSeuratObject(counts = data$matrix[["Gene Expression"]], assay = assay) +xenium.obj <- CreateSeuratObject(counts = data$matrix[["Gene Expression"]], assay = assay) if("Blank Codeword" %in% names(data$matrix)) xenium.obj[["BlankCodeword"]] <- CreateAssayObject(counts = data$matrix[["Blank Codeword"]]) else @@ -317,7 +317,7 @@ LoadXenium <- function(data.dir, fov = 'fov', assay = 'Xenium') { xenium.obj[["ControlCodeword"]] <- CreateAssayObject(counts = data$matrix[["Negative Control Codeword"]]) xenium.obj[["ControlProbe"]] <- CreateAssayObject(counts = data$matrix[["Negative Control Probe"]]) - xenium.obj[[fov]] <- coords +xenium.obj[[fov]] <- coords return(xenium.obj) } @@ -350,27 +350,27 @@ PCAPlot <- function(object, ...) { #' @export #' SpatialDimPlot <- function( - object, - group.by = NULL, - images = NULL, - cols = NULL, - crop = TRUE, - cells.highlight = NULL, - cols.highlight = c('#DE2D26', 'grey50'), - facet.highlight = FALSE, - label = FALSE, - label.size = 7, - label.color = 'white', - repel = FALSE, - ncol = NULL, - combine = TRUE, - pt.size.factor = 1.6, - alpha = c(1, 1), - image.alpha = 1, - stroke = 0.25, - label.box = TRUE, - interactive = FALSE, - information = NULL + object, + group.by = NULL, + images = NULL, + cols = NULL, + crop = TRUE, + cells.highlight = NULL, + cols.highlight = c('#DE2D26', 'grey50'), + facet.highlight = FALSE, + label = FALSE, + label.size = 7, + label.color = 'white', + repel = FALSE, + ncol = NULL, + combine = TRUE, + pt.size.factor = 1.6, + alpha = c(1, 1), + image.alpha = 1, + stroke = 0.25, + label.box = TRUE, + interactive = FALSE, + information = NULL ) { return(SpatialPlot( object = object, @@ -403,22 +403,22 @@ SpatialDimPlot <- function( #' @export #' SpatialFeaturePlot <- function( - object, - features, - images = NULL, - crop = TRUE, - slot = 'data', - keep.scale = "feature", - min.cutoff = NA, - max.cutoff = NA, - ncol = NULL, - combine = TRUE, - pt.size.factor = 1.6, - alpha = c(1, 1), - image.alpha = 1, - stroke = 0.25, - interactive = FALSE, - information = NULL + object, + features, + images = NULL, + crop = TRUE, + slot = 'data', + keep.scale = "feature", + min.cutoff = NA, + max.cutoff = NA, + ncol = NULL, + combine = TRUE, + pt.size.factor = 1.6, + alpha = c(1, 1), + image.alpha = 1, + stroke = 0.25, + interactive = FALSE, + information = NULL ) { return(SpatialPlot( object = object,