Merge pull request #246 from immunomind/repload-fixes

Aleksandr Popov · web-flow · commit 2b6d707f998b · 2022-05-30T20:35:59.000+02:00
diff --git a/R/io-parsers.R b/R/io-parsers.R
@@ -1040,8 +1040,6 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
     .skip = 0, .sep = ",", # .add = c("chain", "raw_clonotype_id", "raw_consensus_id", "barcode", "contig_id")
     .add = c("chain", "barcode", "raw_clonotype_id", "contig_id", "c_gene")
   )
-  # setnames(df, "raw_clonotype_id", "RawClonotypeID")
-  # setnames(df, "raw_consensus_id", "RawConsensusID")
 
   # Process 10xGenomics filtered contigs files - count barcodes, merge consensus ids, clonotype ids and contig ids
   df <- df[order(df$chain), ]
@@ -1052,16 +1050,17 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
       lazy_dt() %>%
       group_by(barcode, raw_clonotype_id) %>%
       summarise(
-        CDR3.nt = paste0(CDR3.nt, collapse = IMMCOL_ADD$scsep),
-        CDR3.aa = paste0(CDR3.aa, collapse = IMMCOL_ADD$scsep),
-        V.name = paste0(V.name, collapse = IMMCOL_ADD$scsep),
-        J.name = paste0(J.name, collapse = IMMCOL_ADD$scsep),
-        D.name = paste0(D.name, collapse = IMMCOL_ADD$scsep),
-        chain = paste0(chain, collapse = IMMCOL_ADD$scsep),
-        # raw_clonotype_id = gsub("clonotype", "", paste0(raw_clonotype_id, collapse = IMMCOL_ADD$scsep)),
-        # raw_consensus_id = gsub("clonotype|consensus", "", paste0(raw_consensus_id, collapse = IMMCOL_ADD$scsep)),
-        contig_id = gsub("_contig_", "", paste0(contig_id, collapse = IMMCOL_ADD$scsep)),
-        c_gene = paste0(c_gene, collapse = IMMCOL_ADD$scsep)
+        CDR3.nt = paste0(get("CDR3.nt"), collapse = IMMCOL_ADD$scsep),
+        CDR3.aa = paste0(get("CDR3.aa"), collapse = IMMCOL_ADD$scsep),
+        V.name = paste0(get("V.name"), collapse = IMMCOL_ADD$scsep),
+        J.name = paste0(get("J.name"), collapse = IMMCOL_ADD$scsep),
+        D.name = paste0(get("D.name"), collapse = IMMCOL_ADD$scsep),
+        chain = paste0(get("chain"), collapse = IMMCOL_ADD$scsep),
+        contig_id = gsub(
+          "_contig_", "",
+          paste0(get("contig_id"), collapse = IMMCOL_ADD$scsep)
+        ),
+        c_gene = paste0(get("c_gene"), collapse = IMMCOL_ADD$scsep)
       ) %>%
       as.data.table()
   }
@@ -1070,16 +1069,17 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
     lazy_dt() %>%
     group_by(CDR3.nt, V.name, J.name) %>%
     summarise(
-      Clones = length(unique(barcode)),
-      CDR3.aa = first(CDR3.aa),
-      D.name = first(D.name),
-      chain = first(chain),
-      barcode = paste0(unique(barcode), collapse = IMMCOL_ADD$scsep),
-      raw_clonotype_id = gsub("clonotype|None", "", paste0(unique(raw_clonotype_id), collapse = IMMCOL_ADD$scsep)),
-      # raw_clonotype_id = gsub("clonotype", "", paste0(raw_clonotype_id, collapse = IMMCOL_ADD$scsep)),
-      # raw_consensus_id = gsub("clonotype|consensus", "", paste0(raw_consensus_id, collapse = IMMCOL_ADD$scsep)),
-      contig_id = paste0(contig_id, collapse = IMMCOL_ADD$scsep),
-      c_gene = first(c_gene)
+      Clones = length(unique(get("barcode"))),
+      CDR3.aa = first(get("CDR3.aa")),
+      D.name = first(get("D.name")),
+      chain = first(get("chain")),
+      barcode = paste0(unique(get("barcode")), collapse = IMMCOL_ADD$scsep),
+      raw_clonotype_id = gsub(
+        "clonotype|None", "",
+        paste0(unique(get("raw_clonotype_id")), collapse = IMMCOL_ADD$scsep)
+      ),
+      contig_id = paste0(get("contig_id"), collapse = IMMCOL_ADD$scsep),
+      c_gene = first(get("c_gene"))
     ) %>%
     as.data.table()
 
diff --git a/R/io.R b/R/io.R
@@ -51,10 +51,6 @@ if (getRversion() >= "2.15.1") {
 #' R data frames with only one type of chain and cell presented. The metadata file will have additional columns specifying
 #' cell and chain types for different samples.
 #'
-#' @param .format A character string specifying what format to use. Do NOT use it. See "Details" for more information on supported formats.
-#'
-#' Leave NA (which is default) if you want `immunarch` to detect formats automatically.
-#'
 #' @param .mode Either "single" for single chain data or "paired" for paired chain data.
 #'
 #' Currently "single" works for every format, and "paired" works only for 10X Genomics data.
@@ -75,10 +71,6 @@ if (getRversion() >= "2.15.1") {
 #'  immunoseq_3 \tab FALSE \tab 3 \tab A
 #' }
 #'
-#' \code{repLoad} has the ".format" argument that sets the format for input repertoire files.
-#' Immunarch detects the file format automatically, and the argument is left only for the compatability
-#' purposes. It will be soon removed. Do not pass it or your code will stop working!
-#'
 #' Currently, Immunarch supports the following formats:
 #'
 #' - "immunoseq" - ImmunoSEQ of any version. http://www.adaptivebiotech.com/immunoseq
@@ -143,27 +135,16 @@ if (getRversion() >= "2.15.1") {
 #' # > names(immdata)
 #' # [1] "data" "meta"
 #' @export repLoad
-repLoad <- function(.path, .format = NA, .mode = "paired", .coding = TRUE) {
-  if (!is.na(.format)) {
-    warning("Please don't provide the .format argument,
-            immunarch detects the format automatically.
-            The .format argument will soon be removed.")
-  }
-
+repLoad <- function(.path, .mode = "paired", .coding = TRUE) {
   exclude_extensions <- c(
     "so", "exe", "bam", "fasta", "fai", "fastq", "bed", "rds", "report", "vdjca"
   )
 
   # Process a repertoire file: detect format and load the data
   # Return: a named list with a repertoire data frame and its name
-  .read_repertoire <- function(.path, .format, .mode, .coding) {
+  .read_repertoire <- function(.path, .mode, .coding) {
     parse_res <- list()
-
-    # Detect format
-    cur_format <- .format
-    if (is.na(.format)) {
-      cur_format <- .detect_format(.path)
-    }
+    cur_format <- .detect_format(.path)
 
     # Parse the file
     if (is.na(cur_format)) {
@@ -221,7 +202,7 @@ repLoad <- function(.path, .format = NA, .mode = "paired", .coding = TRUE) {
   # just load all repertoire files.
   # Do NOT (!) create dummy metadata; return an empty data frame instead
   # Return: list with data, metadata and barcodes (if necessary)
-  .process_batch <- function(.files, .format, .mode, .coding) {
+  .process_batch <- function(.files, .mode, .coding) {
     parsed_batch <- list()
     metadata <- tibble()
 
@@ -252,7 +233,7 @@ repLoad <- function(.path, .format = NA, .mode = "paired", .coding = TRUE) {
         } else if (stringr::str_detect(.filepath, "barcode")) {
           # TODO: add the barcode processing subroutine to split by samples
         } else {
-          repertoire <- .read_repertoire(.filepath, .format, .mode, .coding)
+          repertoire <- .read_repertoire(.filepath, .mode, .coding)
           if (length(repertoire) != 0) {
             parsed_batch <- c(parsed_batch, repertoire)
           }
@@ -358,7 +339,7 @@ repLoad <- function(.path, .format = NA, .mode = "paired", .coding = TRUE) {
   for (batch_i in seq_along(batches)) {
     if (length(batches[[batch_i]])) {
       message('Processing "', names(batches)[batch_i], '" ...')
-      parsed_batches[[names(batches)[batch_i]]] <- .process_batch(batches[[batch_i]], .format, .mode, .coding)
+      parsed_batches[[names(batches)[batch_i]]] <- .process_batch(batches[[batch_i]], .mode, .coding)
     }
   }
 
diff --git a/man/repLoad.Rd b/man/repLoad.Rd