Skip to content

Commit 09b0012

Browse files
authored
Merge pull request #326 from immunomind/new-bcr-input-formats
Support for BCR columns in new formats
2 parents a114d3c + 5a6adc2 commit 09b0012

File tree

7 files changed

+80
-21
lines changed

7 files changed

+80
-21
lines changed

DESCRIPTION

+1-1
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,6 @@ Suggests:
8484
rmarkdown
8585
VignetteBuilder: knitr
8686
Encoding: UTF-8
87-
RoxygenNote: 7.2.1
87+
RoxygenNote: 7.2.2
8888
LazyData: true
8989
LazyDataCompression: xz

NAMESPACE

+3
Original file line numberDiff line numberDiff line change
@@ -147,10 +147,12 @@ importFrom(dplyr,group_map)
147147
importFrom(dplyr,left_join)
148148
importFrom(dplyr,mutate)
149149
importFrom(dplyr,n)
150+
importFrom(dplyr,one_of)
150151
importFrom(dplyr,pull)
151152
importFrom(dplyr,rename)
152153
importFrom(dplyr,rowwise)
153154
importFrom(dplyr,select)
155+
importFrom(dplyr,select_)
154156
importFrom(dplyr,select_if)
155157
importFrom(dplyr,summarise)
156158
importFrom(dplyr,tally)
@@ -290,6 +292,7 @@ importFrom(tibble,tibble)
290292
importFrom(tidyr,drop_na)
291293
importFrom(tidyr,unite)
292294
importFrom(tidyr,unnest)
295+
importFrom(tidyselect,all_of)
293296
importFrom(tidyselect,any_of)
294297
importFrom(tidyselect,starts_with)
295298
importFrom(utils,capture.output)

R/diversity.R

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ if (getRversion() >= "2.15.1") {
1414
#' @importFrom dplyr mutate group_by_at pull
1515
#' @importFrom stats qnorm
1616
#' @importFrom rlang sym
17+
#' @importFrom tidyselect all_of
1718
#'
1819
#' @description
1920
#' This is a utility function to estimate the diversity of species or objects in the given distribution.

R/io-parsers.R

+66-16
Original file line numberDiff line numberDiff line change
@@ -834,8 +834,6 @@ parse_tcr <- function(.filename, .mode) {
834834
}
835835

836836
parse_vdjtools <- function(.filename, .mode) {
837-
skip <- 0
838-
839837
# Check for different VDJtools outputs
840838
f <- file(.filename, "r")
841839
l <- readLines(f, 1)
@@ -964,19 +962,28 @@ parse_airr <- function(.filename, .mode) {
964962
.as_tsv() %>%
965963
airr::read_rearrangement()
966964

967-
df <- df %>%
968-
select(
969-
sequence, v_call, d_call, j_call, junction, junction_aa,
970-
contains("v_germline_end"), contains("d_germline_start"), contains("d_germline_end"),
971-
contains("j_germline_start"), contains("np1_length"), contains("np2_length"),
972-
contains("duplicate_count")
965+
df %<>%
966+
select_(
967+
"sequence", "v_call", "d_call", "j_call", "junction", "junction_aa",
968+
~contains("v_germline_end"), ~contains("d_germline_start"),
969+
~contains("d_germline_end"), ~contains("j_germline_start"),
970+
~contains("np1_length"), ~contains("np2_length"),
971+
~contains("duplicate_count"),
972+
"cdr1", "cdr2", "cdr1_aa", "cdr2_aa", "fwr1", "fwr2", "fwr3", "fwr4",
973+
"fwr1_aa", "fwr2_aa", "fwr3_aa", "fwr4_aa"
973974
)
974975

975976
namekey <- c(
976977
duplicate_count = IMMCOL$count, junction = IMMCOL$cdr3nt, junction_aa = IMMCOL$cdr3aa,
977978
v_call = IMMCOL$v, d_call = IMMCOL$d, j_call = IMMCOL$j, v_germline_end = IMMCOL$ve,
978979
d_germline_start = IMMCOL$ds, d_germline_end = IMMCOL$de, j_germline_start = IMMCOL$js,
979-
np1_length = "unidins", np2_length = IMMCOL$dnj, sequence = IMMCOL$seq
980+
np1_length = "unidins", np2_length = IMMCOL$dnj, sequence = IMMCOL$seq,
981+
cdr1 = IMMCOL_EXT$cdr1nt, cdr2 = IMMCOL_EXT$cdr2nt,
982+
cdr1_aa = IMMCOL_EXT$cdr1aa, cdr2_aa = IMMCOL_EXT$cdr2aa,
983+
fwr1 = IMMCOL_EXT$fr1nt, fwr2 = IMMCOL_EXT$fr2nt,
984+
fwr3 = IMMCOL_EXT$fr3nt, fwr4 = IMMCOL_EXT$fr4nt,
985+
fwr1_aa = IMMCOL_EXT$fr1aa, fwr2_aa = IMMCOL_EXT$fr2aa,
986+
fwr3_aa = IMMCOL_EXT$fr3aa, fwr4_aa = IMMCOL_EXT$fr4aa
980987
)
981988

982989
names(df) <- namekey[names(df)]
@@ -998,13 +1005,15 @@ parse_airr <- function(.filename, .mode) {
9981005
}
9991006
}
10001007

1001-
for (column in IMMCOL$order) {
1008+
order <- c(IMMCOL$order, IMMCOL_EXT$order[IMMCOL_EXT$order %in% namekey])
1009+
1010+
for (column in order) {
10021011
if (!(column %in% colnames(df))) {
10031012
df[column] <- NA
10041013
}
10051014
}
10061015

1007-
df <- df[IMMCOL$order]
1016+
df <- df[order]
10081017
total <- sum(df$Clones)
10091018
df[IMMCOL$prop] <- df[IMMCOL$count] / total
10101019
df[IMMCOL$seq] <- stringr::str_remove_all(df[[IMMCOL$seq]], "N")
@@ -1044,21 +1053,50 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
10441053
.vgenes = "v_gene", .jgenes = "j_gene", .dgenes = "d_gene",
10451054
.vend = NA, .jstart = NA, .dstart = NA, .dend = NA,
10461055
.vd.insertions = NA, .dj.insertions = NA, .total.insertions = NA,
1047-
.skip = 0, .sep = ",", # .add = c("chain", "raw_clonotype_id", "raw_consensus_id", "barcode", "contig_id")
1048-
.add = c("chain", "barcode", "raw_clonotype_id", "contig_id", "c_gene")
1056+
.skip = 0, .sep = ",",
1057+
.add = c(
1058+
"chain", "barcode", "raw_clonotype_id", "contig_id", "c_gene",
1059+
"cdr1_nt", "cdr1", "cdr2_nt", "cdr2",
1060+
"fwr1_nt", "fwr1", "fwr2_nt", "fwr2", "fwr3_nt", "fwr3", "fwr4_nt", "fwr4"
1061+
)
10491062
)
10501063

1064+
setnames(df, "cdr1_nt", IMMCOL_EXT$cdr1nt)
1065+
setnames(df, "cdr2_nt", IMMCOL_EXT$cdr2nt)
1066+
setnames(df, "cdr1", IMMCOL_EXT$cdr1aa)
1067+
setnames(df, "cdr2", IMMCOL_EXT$cdr2aa)
1068+
setnames(df, "fwr1_nt", IMMCOL_EXT$fr1nt)
1069+
setnames(df, "fwr2_nt", IMMCOL_EXT$fr2nt)
1070+
setnames(df, "fwr3_nt", IMMCOL_EXT$fr3nt)
1071+
setnames(df, "fwr4_nt", IMMCOL_EXT$fr4nt)
1072+
setnames(df, "fwr1", IMMCOL_EXT$fr1aa)
1073+
setnames(df, "fwr2", IMMCOL_EXT$fr2aa)
1074+
setnames(df, "fwr3", IMMCOL_EXT$fr3aa)
1075+
setnames(df, "fwr4", IMMCOL_EXT$fr4aa)
1076+
10511077
# Process 10xGenomics filtered contigs files - count barcodes, merge consensus ids, clonotype ids and contig ids
10521078
df <- df[order(df$chain), ]
10531079
setDT(df)
10541080

10551081
if (.mode == "paired") {
10561082
df %<>%
10571083
lazy_dt() %>%
1058-
group_by(barcode, raw_clonotype_id) %>%
1084+
group_by_colnames("barcode", "raw_clonotype_id") %>%
10591085
summarise(
1086+
CDR1.nt = paste0(get("CDR1.nt"), collapse = IMMCOL_ADD$scsep),
1087+
CDR1.aa = paste0(get("CDR1.aa"), collapse = IMMCOL_ADD$scsep),
1088+
CDR2.nt = paste0(get("CDR2.nt"), collapse = IMMCOL_ADD$scsep),
1089+
CDR2.aa = paste0(get("CDR2.aa"), collapse = IMMCOL_ADD$scsep),
10601090
CDR3.nt = paste0(get("CDR3.nt"), collapse = IMMCOL_ADD$scsep),
10611091
CDR3.aa = paste0(get("CDR3.aa"), collapse = IMMCOL_ADD$scsep),
1092+
FR1.nt = paste0(get("FR1.nt"), collapse = IMMCOL_ADD$scsep),
1093+
FR1.aa = paste0(get("FR1.aa"), collapse = IMMCOL_ADD$scsep),
1094+
FR2.nt = paste0(get("FR2.nt"), collapse = IMMCOL_ADD$scsep),
1095+
FR2.aa = paste0(get("FR2.aa"), collapse = IMMCOL_ADD$scsep),
1096+
FR3.nt = paste0(get("FR3.nt"), collapse = IMMCOL_ADD$scsep),
1097+
FR3.aa = paste0(get("FR3.aa"), collapse = IMMCOL_ADD$scsep),
1098+
FR4.nt = paste0(get("FR4.nt"), collapse = IMMCOL_ADD$scsep),
1099+
FR4.aa = paste0(get("FR4.aa"), collapse = IMMCOL_ADD$scsep),
10621100
V.name = paste0(get("V.name"), collapse = IMMCOL_ADD$scsep),
10631101
J.name = paste0(get("J.name"), collapse = IMMCOL_ADD$scsep),
10641102
D.name = paste0(get("D.name"), collapse = IMMCOL_ADD$scsep),
@@ -1079,7 +1117,7 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
10791117
V.name.sorted = sort_string(get("V.name"), IMMCOL_ADD$scsep),
10801118
J.name.sorted = sort_string(get("J.name"), IMMCOL_ADD$scsep)
10811119
) %>%
1082-
group_by(CDR3.nt.sorted, V.name.sorted, J.name.sorted) %>%
1120+
group_by_colnames("CDR3.nt.sorted", "V.name.sorted", "J.name.sorted") %>%
10831121
summarise(
10841122
Clones = length(unique(get("barcode"))),
10851123
CDR3.nt = first(get("CDR3.nt")),
@@ -1094,7 +1132,19 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
10941132
paste0(unique(get("raw_clonotype_id")), collapse = IMMCOL_ADD$scsep)
10951133
),
10961134
contig_id = paste0(get("contig_id"), collapse = IMMCOL_ADD$scsep),
1097-
c_gene = first(get("c_gene"))
1135+
c_gene = first(get("c_gene")),
1136+
CDR1.nt = first(get(IMMCOL_EXT$cdr1nt)),
1137+
CDR2.nt = first(get(IMMCOL_EXT$cdr2nt)),
1138+
CDR1.aa = first(get(IMMCOL_EXT$cdr1aa)),
1139+
CDR2.aa = first(get(IMMCOL_EXT$cdr2aa)),
1140+
FR1.nt = first(get(IMMCOL_EXT$fr1nt)),
1141+
FR2.nt = first(get(IMMCOL_EXT$fr2nt)),
1142+
FR3.nt = first(get(IMMCOL_EXT$fr3nt)),
1143+
FR4.nt = first(get(IMMCOL_EXT$fr4nt)),
1144+
FR1.aa = first(get(IMMCOL_EXT$fr1aa)),
1145+
FR2.aa = first(get(IMMCOL_EXT$fr2aa)),
1146+
FR3.aa = first(get(IMMCOL_EXT$fr3aa)),
1147+
FR4.aa = first(get(IMMCOL_EXT$fr4aa))
10981148
) %>%
10991149
as.data.table() %>%
11001150
subset(

R/io-utility.R

+3-3
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@
7676

7777

7878
.make_names <- function(.char) {
79-
if (is.na(.char[1])) {
79+
if (has_no_data(.char)) {
8080
NA
8181
} else {
8282
tolower(.char)
@@ -136,8 +136,8 @@
136136
.vend, .jstart, .dstart, .dend,
137137
.vd.insertions, .dj.insertions, .total.insertions
138138
))
139-
if (!is.na(.add[1])) {
140-
swlist <- c(swlist, rep(col_guess(), length(.add)))
139+
if (!has_no_data(.add)) {
140+
swlist <- c(swlist, rep(list(col_guess()), length(.add)))
141141
names(swlist)[tail(seq_along(swlist), length(.add))] <- .add
142142
}
143143

R/io.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ if (getRversion() >= "2.15.1") {
2020
#' @importFrom jsonlite read_json
2121
#' @importFrom stringr str_split str_detect str_replace_all str_trim
2222
#' @importFrom methods as
23-
#' @importFrom dplyr contains first
23+
#' @importFrom dplyr contains first select_ group_by_at one_of
2424
#' @importFrom utils read.table
2525
#' @importFrom data.table setDF
2626
#'

R/tools.R

+5
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,11 @@ has_no_data <- function(.data) {
494494
any(sapply(list(NA, NULL, NaN), identical, .data)) | all(is.na(.data))
495495
}
496496

497+
# variant of group_by that takes column names as strings
498+
group_by_colnames <- function(.data, ...) {
499+
group_by_at(.data, vars(one_of(...)))
500+
}
501+
497502
# apply function to .data if it's a single sample or to each sample if .data is a list of samples
498503
apply_to_sample_or_list <- function(.data, .function, .with_names = FALSE, .validate = TRUE, ...) {
499504
if (has_no_data(.data)) {

0 commit comments

Comments
 (0)