diff --git a/R/preprocessing.R b/R/preprocessing.R index 4f6765faa..324dcff65 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -3479,3 +3479,185 @@ RegressOutMatrix <- function( dimnames(x = data.resid) <- dimnames(x = data.expr) return(data.resid) } + + +#' Load in data from DRAGEN +#' +#' Enables easy loading of sparse data matrices provided by DRAGEN genomics. +#' +#' @param data.dir Directory containing the *.scRNA.matrix.mtx, *.scRNA.genes.tsv (or features.tsv), *.scRNA.barcodes.tsv, and *.scRNA.barcodeSummary.tsv +#' files provided by DRAGEN. A vector or named vector can be given in order to load +#' several data directories. If a named vector is given, the cell barcode names +#' will be prefixed with the name. +#' @param gene.column Specify which column of genes.tsv or features.tsv to use for gene names; default is 2 +#' @param cell.column Specify which column of barcodes.tsv to use for cell names; default is 1 +#' @param unique.features Make feature names unique (default TRUE) +#' @param strip.suffix Remove trailing "-1" if present in all cell barcodes. +#' +#' @return If features.csv indicates the data has multiple data types, a list +#' containing a sparse matrix of the data from each type will be returned. +#' Otherwise a sparse matrix containing the expression data will be returned. +#' +#' +#' @export +#' @concept preprocessing +#' +#' @examples +#' \dontrun{ +#' # For output from CellRanger < 3.0 +#' data_dir <- 'path/to/data/directory' +#' list.files(data_dir) # Should show *.scRNA.matrix.mtx, *.scRNA.genes.tsv, *.scRNA.barcodes.tsv, and *.scRNA.barcodeSummary.tsv +#' expression_matrix <- ReadDRAGEN(data.dir = data_dir) +#' seurat_object = CreateSeuratObject(counts = expression_matrix) +#' +#' # For output from CellRanger >= 3.0 with multiple data types +#' data_dir <- 'path/to/data/directory' +#' list.files(data_dir) # Should show barcodes.tsv.gz, features.tsv.gz, and matrix.mtx.gz +#' data <- ReadDRAGEN(data.dir = data_dir) +#' seurat_object = CreateSeuratObject(counts = data$`Gene Expression`) +#' seurat_object[['Protein']] = CreateAssayObject(counts = data$`Antibody Capture`) +#' } +#' +ReadDRAGEN <- function( + data.dir, + gene.column = 2, + cell.column = 1, + unique.features = TRUE, + strip.suffix = FALSE +) { + full.data <- list() + for (i in seq_along(along.with = data.dir)) { + run <- data.dir[i] + if (!dir.exists(paths = run)) { + stop("Directory provided does not exist") + } + barcode.loc <- file.path(run, list.files(path=run, pattern='*barcodes\\.tsv', recursive=FALSE)[1]) + gene.loc <- file.path(run, list.files(path=run, pattern='*genes\\.tsv', recursive=FALSE)[1]) + features.loc <- file.path(run, list.files(path=run, pattern='*features\\.tsv', recursive=FALSE)[1]) + matrix.loc <- file.path(run, list.files(path=run, pattern='*matrix\\.mtx', recursive=FALSE)[1]) + barcodeSummary.loc <- file.path(run, list.files(path=run, pattern='*barcodeSummary\\.tsv', recursive=FALSE)[1]) + + # Flag to indicate if this data is from CellRanger >= 3.0 + pre_ver_3 <- file.exists(gene.loc) + if (!file.exists(barcode.loc)) { + stop("Barcode file missing. Expecting ", basename(path = barcode.loc)) + } + if (!pre_ver_3 && !file.exists(features.loc) ) { + stop("Gene name or features file missing. Expecting ", basename(path = features.loc)) + } + if (!file.exists(matrix.loc)) { + stop("Expression matrix file missing. Expecting ", basename(path = matrix.loc)) + } + if (!file.exists(barcodeSummary.loc)) { + warning(paste0("Barcode summary file missing, DRAGEN barcode filtering will be skipped!. Expecting ", basename(path = barcodeSummary.loc))) + } + data <- Matrix::readMM(file = matrix.loc) + cell.barcodes <- read.table(file = barcode.loc, header = FALSE, sep = '\t', row.names = NULL) + if (ncol(x = cell.barcodes) > 1) { + cell.names <- cell.barcodes[, cell.column] + } else { + cell.names <- readLines(con = barcode.loc) + } + if (all(grepl(pattern = "\\-1$", x = cell.names)) & strip.suffix) { + cell.names <- as.vector(x = as.character(x = sapply( + X = cell.names, + FUN = ExtractField, + field = 1, + delim = "-" + ))) + } + if (is.null(x = names(x = data.dir))) { + if (length(x = data.dir) < 2) { + ##TRUE + colnames(x = data) <- cell.names + } else { + colnames(x = data) <- paste0(i, "_", cell.names) + } + } else { + colnames(x = data) <- paste0(names(x = data.dir)[i], "_", cell.names) + } + feature.names <- utils::read.delim( + file = ifelse(test = pre_ver_3, yes = gene.loc, no = features.loc), + header = FALSE, + stringsAsFactors = FALSE + ) + if (any(is.na(x = feature.names[, gene.column]))) { + warning( + 'Some features names are NA. Replacing NA names with ID from the opposite column requested', + call. = FALSE, + immediate. = TRUE + ) + na.features <- which(x = is.na(x = feature.names[, gene.column])) + replacement.column <- ifelse(test = gene.column == 2, yes = 1, no = 2) + feature.names[na.features, gene.column] <- feature.names[na.features, replacement.column] + } + if (unique.features) { + fcols = ncol(x = feature.names) + if (fcols < gene.column) { + stop(paste0("gene.column was set to ", gene.column, + " but feature.tsv.gz (or genes.tsv) only has ", fcols, " columns.", + " Try setting the gene.column argument to a value <= to ", fcols, ".")) + } + rownames(x = data) <- make.unique(names = feature.names[, gene.column]) + } + + # DRAGEN filter using barcodeSummary file + if(file.exists(barcodeSummary.loc)){ + print(barcodeSummary.loc) + barcodeSummary <- utils::read.delim( + file=barcodeSummary.loc, + header = TRUE, + stringsAsFactors = FALSE + ) + tryCatch({ + data <- data[, barcodeSummary[barcodeSummary$Filter=="PASS","Barcode"]] + print(dim(data)) + }, + interrupt = function(x){warning("Warning, could not filter DRAGEN dataset!")}, + error = function(x){warning("Warning, could not filter DRAGEN dataset!")}, + warning = function(x){warning("Warning, could not filter DRAGEN dataset!")} + ) + } + + + # In cell ranger 3.0, a third column specifying the type of data was added + # and we will return each type of data as a separate matrix + if (ncol(x = feature.names) > 2) { + data_types <- factor(x = feature.names$V3) + lvls <- levels(x = data_types) + if (length(x = lvls) > 1 && length(x = full.data) == 0) { + message("10X data contains more than one type and is being returned as a list containing matrices of each type.") + } + expr_name <- "Gene Expression" + if (expr_name %in% lvls) { # Return Gene Expression first + lvls <- c(expr_name, lvls[-which(x = lvls == expr_name)]) + } + data <- lapply( + X = lvls, + FUN = function(l) { + return(data[data_types == l, , drop = FALSE]) + } + ) + names(x = data) <- lvls + } else{ + data <- list(data) + } + full.data[[length(x = full.data) + 1]] <- data + } + # Combine all the data from different directories into one big matrix, note this + # assumes that all data directories essentially have the same features files + list_of_data <- list() + for (j in 1:length(x = full.data[[1]])) { + list_of_data[[j]] <- do.call(cbind, lapply(X = full.data, FUN = `[[`, j)) + # Fix for Issue #913 + list_of_data[[j]] <- as(object = list_of_data[[j]], Class = "dgCMatrix") + } + names(x = list_of_data) <- names(x = full.data[[1]]) + # If multiple features, will return a list, otherwise + # a matrix. + if (length(x = list_of_data) == 1) { + return(list_of_data[[1]]) + } else { + return(list_of_data) + } +}