Skip to content

Commit ac2c840

Browse files
authored
Merge pull request #333 from immunomind/dev
Immunarch 0.9.0 release
2 parents bc90fed + 09b0012 commit ac2c840

16 files changed

+518
-66
lines changed

DESCRIPTION

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Package: immunarch
22
Type: Package
33
Title: Bioinformatics Analysis of T-Cell and B-Cell Immune Repertoires
4-
Version: 0.8.0
4+
Version: 0.9.0
55
Authors@R: c(
66
person("Vadim I.", "Nazarov", , "[email protected]", c("aut", "cre")),
77
person("Vasily O.", "Tsvetkov", , role = "aut"),
@@ -84,6 +84,6 @@ Suggests:
8484
rmarkdown
8585
VignetteBuilder: knitr
8686
Encoding: UTF-8
87-
RoxygenNote: 7.2.1
87+
RoxygenNote: 7.2.2
8888
LazyData: true
8989
LazyDataCompression: xz

NAMESPACE

+3-1
Original file line numberDiff line numberDiff line change
@@ -147,10 +147,12 @@ importFrom(dplyr,group_map)
147147
importFrom(dplyr,left_join)
148148
importFrom(dplyr,mutate)
149149
importFrom(dplyr,n)
150+
importFrom(dplyr,one_of)
150151
importFrom(dplyr,pull)
151152
importFrom(dplyr,rename)
152153
importFrom(dplyr,rowwise)
153154
importFrom(dplyr,select)
155+
importFrom(dplyr,select_)
154156
importFrom(dplyr,select_if)
155157
importFrom(dplyr,summarise)
156158
importFrom(dplyr,tally)
@@ -290,13 +292,13 @@ importFrom(tibble,tibble)
290292
importFrom(tidyr,drop_na)
291293
importFrom(tidyr,unite)
292294
importFrom(tidyr,unnest)
295+
importFrom(tidyselect,all_of)
293296
importFrom(tidyselect,any_of)
294297
importFrom(tidyselect,starts_with)
295298
importFrom(utils,capture.output)
296299
importFrom(utils,packageVersion)
297300
importFrom(utils,read.table)
298301
importFrom(utils,setTxtProgressBar)
299-
importFrom(utils,str)
300302
importFrom(utils,tail)
301303
importFrom(utils,txtProgressBar)
302304
importFrom(uuid,UUIDgenerate)

R/align_lineage.R

+1-2
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
#' @importFrom plyr dlply .
1010
#' @importFrom purrr map_dfr
1111
#' @importFrom rlist list.remove
12-
#' @importFrom utils str
1312
#' @importFrom ape as.DNAbin clustal
1413
#' @importFrom doParallel registerDoParallel stopImplicitCluster
1514
#' @importFrom parallel mclapply
@@ -117,7 +116,7 @@ align_single_df <- function(data,
117116
"Found dataframe without required column ",
118117
required_column,
119118
";\nexisting columns: ",
120-
utils::str(colnames(data))
119+
toString(colnames(data))
121120
)
122121
}
123122
}

R/diversity.R

+5-8
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ if (getRversion() >= "2.15.1") {
1414
#' @importFrom dplyr mutate group_by_at pull
1515
#' @importFrom stats qnorm
1616
#' @importFrom rlang sym
17+
#' @importFrom tidyselect all_of
1718
#'
1819
#' @description
1920
#' This is a utility function to estimate the diversity of species or objects in the given distribution.
@@ -188,15 +189,14 @@ repDiversity <- function(.data, .method = "chao1", .col = "aa", .max.q = 6, .min
188189
.data <- list(Sample = .data)
189190
}
190191

191-
192192
.col <- process_col_argument(.col) # ToDo: refactor this and the next branches
193193

194194
vec <- lapply(.data, function(x) {
195195
if (has_class(x, "data.table")) {
196196
x <- x %>% lazy_dt()
197197
}
198198
x %>%
199-
select(.col, IMMCOL$count) %>%
199+
select(all_of(.col), IMMCOL$count) %>%
200200
group_by_at(vars(.col)) %>%
201201
summarise(Div.count = sum(!!sym(IMMCOL$count))) %>%
202202
pull(Div.count)
@@ -350,20 +350,20 @@ rarefaction <- function(.data, .step = NA, .quantile = c(.025, .975),
350350
if (is.na(.step)) {
351351
.step <- min(sapply(.data, function(x) sum(as.numeric(x)))) %/% 50.
352352
}
353+
.step <- max(1, .step)
353354

354355
if (is.na(.extrapolation)) {
355356
.extrapolation <- max(sapply(.data, function(x) sum(as.numeric(x)))) * 20
356357
}
357358

358359
.alpha <- function(n, Xi, m) {
359-
k <- Xi
360360
return((1 - m / n)^Xi)
361361
}
362362

363363
if (.verbose) {
364364
pb <- set_pb(sum(sapply(seq_along(.data), function(i) {
365365
bc.vec <- .data[[i]]
366-
bc.sum <- sum(.data[[i]])
366+
bc.sum <- sum(bc.vec)
367367
sizes <- seq(.step, bc.sum, .step)
368368
if (sizes[length(sizes)] != bc.sum) {
369369
sizes <- c(sizes, bc.sum)
@@ -381,9 +381,7 @@ rarefaction <- function(.data, .step = NA, .quantile = c(.025, .975),
381381
}
382382
n <- sum(bc.vec)
383383
sizes <- seq(.step, n, .step)
384-
# if (sizes[length(sizes)] != n) {
385-
# sizes <- c(sizes, n)
386-
# }
384+
387385
counts <- table(bc.vec)
388386
muc.res <- t(sapply(sizes, function(sz) {
389387
freqs <- as.numeric(names(counts))
@@ -413,7 +411,6 @@ rarefaction <- function(.data, .step = NA, .quantile = c(.025, .975),
413411
}))
414412

415413
if (.extrapolation > 0) {
416-
# sizes <- seq(sum(.data[[i]]), .extrapolation + max(sapply(.data, function (x) sum(x))), .step)
417414
sizes <- seq(
418415
tail(seq(.step, sum(.data[[i]]), .step), 1) + .step,
419416
.extrapolation,

R/io-parsers.R

+88-22
Original file line numberDiff line numberDiff line change
@@ -399,10 +399,10 @@ parse_mitcr <- function(.filename, .mode) {
399399
)
400400
}
401401

402-
parse_mixcr <- function(.filename, .mode) {
402+
parse_mixcr <- function(.filename, .mode, .count = c("clonecount", "readcount")) {
403403
.filename <- .filename
404404
.id <- "cloneid"
405-
.count <- "clonecount"
405+
.count %<>% tolower()
406406
.sep <- "\t"
407407
.vd.insertions <- "VD.insertions"
408408
.dj.insertions <- "DJ.insertions"
@@ -677,16 +677,21 @@ parse_mixcr <- function(.filename, .mode) {
677677
df[[pos_extra_headers[["j3del"]]]] <- sapply(df[["refpoints"]], get_ref_point_position, 18)
678678
}
679679

680-
if (!(.count %in% table.colnames)) {
680+
if (!any(.count %in% table.colnames)) {
681681
warn_msg <- c(" [!] Warning: can't find a column with clonal counts. Setting all clonal counts to 1.")
682682
warn_msg <- c(warn_msg, "\n Did you apply repLoad to MiXCR file *_alignments.txt?")
683683
warn_msg <- c(warn_msg, " If so please consider moving all *.clonotypes.*.txt MiXCR files to")
684684
warn_msg <- c(warn_msg, " a separate folder and apply repLoad to the folder.")
685685
warn_msg <- c(warn_msg, "\n Note: The *_alignments.txt file IS NOT a repertoire file suitable for any analysis.")
686686
message(warn_msg)
687687

688+
.count <- .count[1]
688689
df[[.count]] <- 1
690+
} else if (length(.count) > 1) {
691+
# if multiple column name options specified for .count, keep only the first valid
692+
.count <- .count[.count %in% table.colnames][1]
689693
}
694+
690695
.freq <- "Proportion"
691696
df$Proportion <- df[[.count]] / sum(df[[.count]], na.rm = TRUE)
692697

@@ -829,8 +834,6 @@ parse_tcr <- function(.filename, .mode) {
829834
}
830835

831836
parse_vdjtools <- function(.filename, .mode) {
832-
skip <- 0
833-
834837
# Check for different VDJtools outputs
835838
f <- file(.filename, "r")
836839
l <- readLines(f, 1)
@@ -959,19 +962,28 @@ parse_airr <- function(.filename, .mode) {
959962
.as_tsv() %>%
960963
airr::read_rearrangement()
961964

962-
df <- df %>%
963-
select(
964-
sequence, v_call, d_call, j_call, junction, junction_aa,
965-
contains("v_germline_end"), contains("d_germline_start"), contains("d_germline_end"),
966-
contains("j_germline_start"), contains("np1_length"), contains("np2_length"),
967-
contains("duplicate_count")
965+
df %<>%
966+
select_(
967+
"sequence", "v_call", "d_call", "j_call", "junction", "junction_aa",
968+
~contains("v_germline_end"), ~contains("d_germline_start"),
969+
~contains("d_germline_end"), ~contains("j_germline_start"),
970+
~contains("np1_length"), ~contains("np2_length"),
971+
~contains("duplicate_count"),
972+
"cdr1", "cdr2", "cdr1_aa", "cdr2_aa", "fwr1", "fwr2", "fwr3", "fwr4",
973+
"fwr1_aa", "fwr2_aa", "fwr3_aa", "fwr4_aa"
968974
)
969975

970976
namekey <- c(
971977
duplicate_count = IMMCOL$count, junction = IMMCOL$cdr3nt, junction_aa = IMMCOL$cdr3aa,
972978
v_call = IMMCOL$v, d_call = IMMCOL$d, j_call = IMMCOL$j, v_germline_end = IMMCOL$ve,
973979
d_germline_start = IMMCOL$ds, d_germline_end = IMMCOL$de, j_germline_start = IMMCOL$js,
974-
np1_length = "unidins", np2_length = IMMCOL$dnj, sequence = IMMCOL$seq
980+
np1_length = "unidins", np2_length = IMMCOL$dnj, sequence = IMMCOL$seq,
981+
cdr1 = IMMCOL_EXT$cdr1nt, cdr2 = IMMCOL_EXT$cdr2nt,
982+
cdr1_aa = IMMCOL_EXT$cdr1aa, cdr2_aa = IMMCOL_EXT$cdr2aa,
983+
fwr1 = IMMCOL_EXT$fr1nt, fwr2 = IMMCOL_EXT$fr2nt,
984+
fwr3 = IMMCOL_EXT$fr3nt, fwr4 = IMMCOL_EXT$fr4nt,
985+
fwr1_aa = IMMCOL_EXT$fr1aa, fwr2_aa = IMMCOL_EXT$fr2aa,
986+
fwr3_aa = IMMCOL_EXT$fr3aa, fwr4_aa = IMMCOL_EXT$fr4aa
975987
)
976988

977989
names(df) <- namekey[names(df)]
@@ -993,13 +1005,15 @@ parse_airr <- function(.filename, .mode) {
9931005
}
9941006
}
9951007

996-
for (column in IMMCOL$order) {
1008+
order <- c(IMMCOL$order, IMMCOL_EXT$order[IMMCOL_EXT$order %in% namekey])
1009+
1010+
for (column in order) {
9971011
if (!(column %in% colnames(df))) {
9981012
df[column] <- NA
9991013
}
10001014
}
10011015

1002-
df <- df[IMMCOL$order]
1016+
df <- df[order]
10031017
total <- sum(df$Clones)
10041018
df[IMMCOL$prop] <- df[IMMCOL$count] / total
10051019
df[IMMCOL$seq] <- stringr::str_remove_all(df[[IMMCOL$seq]], "N")
@@ -1039,21 +1053,50 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
10391053
.vgenes = "v_gene", .jgenes = "j_gene", .dgenes = "d_gene",
10401054
.vend = NA, .jstart = NA, .dstart = NA, .dend = NA,
10411055
.vd.insertions = NA, .dj.insertions = NA, .total.insertions = NA,
1042-
.skip = 0, .sep = ",", # .add = c("chain", "raw_clonotype_id", "raw_consensus_id", "barcode", "contig_id")
1043-
.add = c("chain", "barcode", "raw_clonotype_id", "contig_id", "c_gene")
1056+
.skip = 0, .sep = ",",
1057+
.add = c(
1058+
"chain", "barcode", "raw_clonotype_id", "contig_id", "c_gene",
1059+
"cdr1_nt", "cdr1", "cdr2_nt", "cdr2",
1060+
"fwr1_nt", "fwr1", "fwr2_nt", "fwr2", "fwr3_nt", "fwr3", "fwr4_nt", "fwr4"
1061+
)
10441062
)
10451063

1064+
setnames(df, "cdr1_nt", IMMCOL_EXT$cdr1nt)
1065+
setnames(df, "cdr2_nt", IMMCOL_EXT$cdr2nt)
1066+
setnames(df, "cdr1", IMMCOL_EXT$cdr1aa)
1067+
setnames(df, "cdr2", IMMCOL_EXT$cdr2aa)
1068+
setnames(df, "fwr1_nt", IMMCOL_EXT$fr1nt)
1069+
setnames(df, "fwr2_nt", IMMCOL_EXT$fr2nt)
1070+
setnames(df, "fwr3_nt", IMMCOL_EXT$fr3nt)
1071+
setnames(df, "fwr4_nt", IMMCOL_EXT$fr4nt)
1072+
setnames(df, "fwr1", IMMCOL_EXT$fr1aa)
1073+
setnames(df, "fwr2", IMMCOL_EXT$fr2aa)
1074+
setnames(df, "fwr3", IMMCOL_EXT$fr3aa)
1075+
setnames(df, "fwr4", IMMCOL_EXT$fr4aa)
1076+
10461077
# Process 10xGenomics filtered contigs files - count barcodes, merge consensues ids, clonotype ids and contig ids
10471078
df <- df[order(df$chain), ]
10481079
setDT(df)
10491080

10501081
if (.mode == "paired") {
1051-
df <- df %>%
1082+
df %<>%
10521083
lazy_dt() %>%
1053-
group_by(barcode, raw_clonotype_id) %>%
1084+
group_by_colnames("barcode", "raw_clonotype_id") %>%
10541085
summarise(
1086+
CDR1.nt = paste0(get("CDR1.nt"), collapse = IMMCOL_ADD$scsep),
1087+
CDR1.aa = paste0(get("CDR1.aa"), collapse = IMMCOL_ADD$scsep),
1088+
CDR2.nt = paste0(get("CDR2.nt"), collapse = IMMCOL_ADD$scsep),
1089+
CDR2.aa = paste0(get("CDR2.aa"), collapse = IMMCOL_ADD$scsep),
10551090
CDR3.nt = paste0(get("CDR3.nt"), collapse = IMMCOL_ADD$scsep),
10561091
CDR3.aa = paste0(get("CDR3.aa"), collapse = IMMCOL_ADD$scsep),
1092+
FR1.nt = paste0(get("FR1.nt"), collapse = IMMCOL_ADD$scsep),
1093+
FR1.aa = paste0(get("FR1.aa"), collapse = IMMCOL_ADD$scsep),
1094+
FR2.nt = paste0(get("FR2.nt"), collapse = IMMCOL_ADD$scsep),
1095+
FR2.aa = paste0(get("FR2.aa"), collapse = IMMCOL_ADD$scsep),
1096+
FR3.nt = paste0(get("FR3.nt"), collapse = IMMCOL_ADD$scsep),
1097+
FR3.aa = paste0(get("FR3.aa"), collapse = IMMCOL_ADD$scsep),
1098+
FR4.nt = paste0(get("FR4.nt"), collapse = IMMCOL_ADD$scsep),
1099+
FR4.aa = paste0(get("FR4.aa"), collapse = IMMCOL_ADD$scsep),
10571100
V.name = paste0(get("V.name"), collapse = IMMCOL_ADD$scsep),
10581101
J.name = paste0(get("J.name"), collapse = IMMCOL_ADD$scsep),
10591102
D.name = paste0(get("D.name"), collapse = IMMCOL_ADD$scsep),
@@ -1067,23 +1110,46 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
10671110
as.data.table()
10681111
}
10691112

1070-
df <- df %>%
1113+
df %<>%
10711114
lazy_dt() %>%
1072-
group_by(CDR3.nt, V.name, J.name) %>%
1115+
mutate(
1116+
CDR3.nt.sorted = sort_string(get("CDR3.nt"), IMMCOL_ADD$scsep),
1117+
V.name.sorted = sort_string(get("V.name"), IMMCOL_ADD$scsep),
1118+
J.name.sorted = sort_string(get("J.name"), IMMCOL_ADD$scsep)
1119+
) %>%
1120+
group_by_colnames("CDR3.nt.sorted", "V.name.sorted", "J.name.sorted") %>%
10731121
summarise(
10741122
Clones = length(unique(get("barcode"))),
1123+
CDR3.nt = first(get("CDR3.nt")),
10751124
CDR3.aa = first(get("CDR3.aa")),
1125+
V.name = first(get("V.name")),
10761126
D.name = first(get("D.name")),
1127+
J.name = first(get("J.name")),
10771128
chain = first(get("chain")),
10781129
barcode = paste0(unique(get("barcode")), collapse = IMMCOL_ADD$scsep),
10791130
raw_clonotype_id = gsub(
10801131
"clonotype|None", "",
10811132
paste0(unique(get("raw_clonotype_id")), collapse = IMMCOL_ADD$scsep)
10821133
),
10831134
contig_id = paste0(get("contig_id"), collapse = IMMCOL_ADD$scsep),
1084-
c_gene = first(get("c_gene"))
1135+
c_gene = first(get("c_gene")),
1136+
CDR1.nt = first(get(IMMCOL_EXT$cdr1nt)),
1137+
CDR2.nt = first(get(IMMCOL_EXT$cdr2nt)),
1138+
CDR1.aa = first(get(IMMCOL_EXT$cdr1aa)),
1139+
CDR2.aa = first(get(IMMCOL_EXT$cdr2aa)),
1140+
FR1.nt = first(get(IMMCOL_EXT$fr1nt)),
1141+
FR2.nt = first(get(IMMCOL_EXT$fr2nt)),
1142+
FR3.nt = first(get(IMMCOL_EXT$fr3nt)),
1143+
FR4.nt = first(get(IMMCOL_EXT$fr4nt)),
1144+
FR1.aa = first(get(IMMCOL_EXT$fr1aa)),
1145+
FR2.aa = first(get(IMMCOL_EXT$fr2aa)),
1146+
FR3.aa = first(get(IMMCOL_EXT$fr3aa)),
1147+
FR4.aa = first(get(IMMCOL_EXT$fr4aa))
10851148
) %>%
1086-
as.data.table()
1149+
as.data.table() %>%
1150+
subset(
1151+
select = -c(get("CDR3.nt.sorted"), get("V.name.sorted"), get("J.name.sorted"))
1152+
)
10871153

10881154
df$V.end <- NA
10891155
df$J.start <- NA

R/io-utility.R

+3-3
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@
7676

7777

7878
.make_names <- function(.char) {
79-
if (is.na(.char[1])) {
79+
if (has_no_data(.char)) {
8080
NA
8181
} else {
8282
tolower(.char)
@@ -136,8 +136,8 @@
136136
.vend, .jstart, .dstart, .dend,
137137
.vd.insertions, .dj.insertions, .total.insertions
138138
))
139-
if (!is.na(.add[1])) {
140-
swlist <- c(swlist, rep(col_guess(), length(.add)))
139+
if (!has_no_data(.add)) {
140+
swlist <- c(swlist, rep(list(col_guess()), length(.add)))
141141
names(swlist)[tail(seq_along(swlist), length(.add))] <- .add
142142
}
143143

0 commit comments

Comments
 (0)