Skip to content

Commit bc90fed

Browse files
authored
Merge pull request #304 from immunomind/dev
Immunarch 0.8.0 release
2 parents 7a70786 + 138e7fe commit bc90fed

21 files changed

+500
-713
lines changed

DESCRIPTION

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Package: immunarch
22
Type: Package
33
Title: Bioinformatics Analysis of T-Cell and B-Cell Immune Repertoires
4-
Version: 0.7.0
4+
Version: 0.8.0
55
Authors@R: c(
66
person("Vadim I.", "Nazarov", , "[email protected]", c("aut", "cre")),
77
person("Vasily O.", "Tsvetkov", , role = "aut"),
@@ -84,6 +84,6 @@ Suggests:
8484
rmarkdown
8585
VignetteBuilder: knitr
8686
Encoding: UTF-8
87-
RoxygenNote: 7.2.0
87+
RoxygenNote: 7.2.1
8888
LazyData: true
8989
LazyDataCompression: xz

NAMESPACE

+2
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,7 @@ importFrom(purrr,imap)
208208
importFrom(purrr,map)
209209
importFrom(purrr,map2)
210210
importFrom(purrr,map2_chr)
211+
importFrom(purrr,map2_df)
211212
importFrom(purrr,map2_lgl)
212213
importFrom(purrr,map_chr)
213214
importFrom(purrr,map_df)
@@ -289,6 +290,7 @@ importFrom(tibble,tibble)
289290
importFrom(tidyr,drop_na)
290291
importFrom(tidyr,unite)
291292
importFrom(tidyr,unnest)
293+
importFrom(tidyselect,any_of)
292294
importFrom(tidyselect,starts_with)
293295
importFrom(utils,capture.output)
294296
importFrom(utils,packageVersion)

R/align_lineage.R

+60-131
Original file line numberDiff line numberDiff line change
@@ -23,16 +23,14 @@
2323
#'
2424
#' @usage
2525
#'
26-
#' repAlignLineage(.data,
27-
#' .min_lineage_sequences, .prepare_threads, .align_threads, .verbose_output, .nofail)
26+
#' repAlignLineage(.data, .min_lineage_sequences, .prepare_threads, .align_threads, .nofail)
2827
#'
2928
#' @param .data The data to be processed. Can be \link{data.frame}, \link{data.table}
3029
#' or a list of these objects.
3130
#'
3231
#' @param .min_lineage_sequences If number of sequences in the same clonal lineage and the same
3332
#' cluster (not including germline) is lower than this threshold, this group of sequences
34-
#' will not be aligned and will not be used in next steps of BCR pipeline
35-
#' (will be saved in output table only if .verbose_output parameter is set to TRUE).
33+
#' will be filtered out from the dataframe; so only large enough lineages will be included.
3634
#'
3735
#' @param .prepare_threads Number of threads to prepare results table.
3836
#' Please note that high number can cause heavy memory usage!
@@ -43,11 +41,6 @@
4341
#' must contain 'Cluster' column, which is added by seqCluster() function, and 'Germline.sequence'
4442
#' column, which is added by repGermline() function.
4543
#'
46-
#' @param .verbose_output If TRUE, all output dataframe columns will be included (see documentation about this
47-
#' function return), and unaligned clusters will be included in the output. Setting this to TRUE significantly
48-
#' increases memory usage. If FALSE, only aligned clusters and columns required for repClonalFamily() and
49-
#' repSomaticHypermutation() calculation will be included in the output.
50-
#'
5144
#' @param .nofail Will return NA instead of stopping if Clustal W is not installed.
5245
#' Used to avoid raising errors in examples on computers where Clustal W is not installed.
5346
#'
@@ -57,21 +50,13 @@
5750
#' The dataframe has these columns:
5851
#' * Cluster: cluster name
5952
#' * Germline: germline sequence
60-
#' * V.germline.nt: germline V gene sequence
61-
#' * J.germline.nt: germline J gene sequence
62-
#' * CDR3.germline.length: length of CDR3 in germline
63-
#' * Aligned (included if .verbose_output=TRUE): FALSE if this group of sequences was not aligned with lineage
64-
#' (.min_lineage_sequences is below the threshold); TRUE if it was aligned
65-
#' * Alignment: DNAbin object with alignment or DNAbin object with unaligned sequences (if Aligned=FALSE)
66-
#' * V.length: shortest length of V gene part outside of CDR3 region in this
67-
#' group of sequences; longer V genes (including germline) are trimmed to this length before alignment
68-
#' * J.length: shortest length of J gene part outside of CDR3 region in this
69-
#' group of sequences; longer J genes (including germline) are trimmed to this length before alignment
53+
#' * Alignment: DNAbin object with alignment
7054
#' * Sequences: nested dataframe containing all sequences for this combination
7155
#' of cluster and germline; it has columns
72-
#' Sequence, Clone.ID, Clones, CDR1.nt, CDR2.nt, CDR3.nt, FR1.nt, FR2.nt, FR3.nt, FR4.nt
73-
#' and, if .verbose_output=TRUE, also V.end, J.start, CDR3.start, CDR3.end;
74-
#' all values taken from the input dataframe
56+
#' * Sequence, CDR1.nt, CDR2.nt, CDR3.nt, FR1.nt, FR2.nt, FR3.nt, FR4.nt, V.allele, J.allele,
57+
#' V.aa, J.aa: all values taken from the input dataframe
58+
#' * Clone.ID: taken from the input dataframe, or created (filled with row numbers) if missing
59+
#' * Clones: taken from the input dataframe, or created (filled with '1' values) if missing
7560
#'
7661
#' @examples
7762
#'
@@ -87,36 +72,45 @@ repAlignLineage <- function(.data,
8772
.min_lineage_sequences = 3,
8873
.prepare_threads = 2,
8974
.align_threads = 4,
90-
.verbose_output = FALSE,
9175
.nofail = FALSE) {
92-
if (!require_system_package("clustalw", error_message = paste0(
76+
if (!require_system_package(c("clustalw", "clustalw2"), error_message = paste0(
9377
"repAlignLineage requires Clustal W app to be installed!\n",
9478
"Please download it from here: http://www.clustal.org/download/current/\n",
9579
"or install it with your system package manager (such as apt or dnf)."
9680
), .nofail)) {
9781
return(get_empty_object_with_class("step_failure_ignored"))
9882
}
83+
if (.min_lineage_sequences < 2) {
84+
warning(
85+
".min_lineage_sequences is set to less than 2; ",
86+
"results will not be valid to build trees with repClonalLineage()!"
87+
)
88+
}
9989

100-
doParallel::registerDoParallel(cores = .prepare_threads)
90+
parallel_prepare <- .prepare_threads > 1
91+
if (parallel_prepare) {
92+
doParallel::registerDoParallel(cores = .prepare_threads)
93+
}
10194
.data %<>%
10295
apply_to_sample_or_list(
10396
align_single_df,
10497
.min_lineage_sequences = .min_lineage_sequences,
105-
.parallel_prepare = .prepare_threads > 1,
106-
.align_threads = .align_threads,
107-
.verbose_output = .verbose_output
98+
.parallel_prepare = parallel_prepare,
99+
.align_threads = .align_threads
108100
)
109-
doParallel::stopImplicitCluster()
101+
if (parallel_prepare) {
102+
doParallel::stopImplicitCluster()
103+
}
110104
return(.data)
111105
}
112106

113107
align_single_df <- function(data,
114108
.min_lineage_sequences,
115109
.parallel_prepare,
116-
.align_threads,
117-
.verbose_output) {
110+
.align_threads) {
118111
for (required_column in c(
119-
"Cluster", "Germline.sequence", "V.germline.nt", "J.germline.nt", "CDR3.germline.length"
112+
"Cluster", "Germline.sequence", "V.allele", "J.allele",
113+
"FR1.nt", "CDR1.nt", "FR2.nt", "CDR2.nt", "FR3.nt", "CDR3.nt", "FR4.nt", "V.aa", "J.aa"
120114
)) {
121115
if (!(required_column %in% colnames(data))) {
122116
stop(
@@ -129,11 +123,11 @@ align_single_df <- function(data,
129123
}
130124

131125
results <- data %>%
126+
fill_missing_columns() %>%
132127
plyr::dlply(
133128
.variables = .(get("Cluster"), get("Germline.sequence")),
134129
.fun = prepare_results_row,
135130
.min_lineage_sequences = .min_lineage_sequences,
136-
.verbose_output = .verbose_output,
137131
.parallel = .parallel_prepare
138132
) %>%
139133
`[`(!is.na(.)) %>%
@@ -143,134 +137,69 @@ align_single_df <- function(data,
143137
stop("There are no lineages containing at least ", .min_lineage_sequences, " sequences!")
144138
}
145139

146-
# only required columns are passed to alignment function to reduce consumed memory
147-
if (.verbose_output) {
148-
alignments <- lapply(results, "[", c("Aligned", "Alignment"))
149-
} else {
150-
alignments <- lapply(results, "[", "Alignment")
151-
}
152-
alignments %<>% parallel::mclapply(
153-
align_sequences,
154-
.verbose_output = .verbose_output,
155-
mc.preschedule = TRUE,
156-
mc.cores = .align_threads
157-
)
140+
# only the Alignment column is passed to the alignment function to reduce memory consumption
141+
alignments <- lapply(results, "[", "Alignment") %>%
142+
par_or_normal_lapply(mc.preschedule = TRUE, mc.cores = .align_threads, function(df_row) {
143+
df_row[["Alignment"]] %<>% ape::clustal()
144+
})
158145

159146
return(convert_results_to_df(results, alignments))
160147
}
161148

149+
# fill Clone.ID and Clones columns if they are missing
150+
fill_missing_columns <- function(data) {
151+
if (!("Clone.ID" %in% colnames(data))) {
152+
data[["Clone.ID"]] <- seq.int(nrow(data))
153+
}
154+
if (!("Clones" %in% colnames(data))) {
155+
data[["Clones"]] <- as.integer(1)
156+
}
157+
return(data)
158+
}
159+
162160
# this function accepts dataframe subset containing rows only for current lineage
163161
# and returns named list containing 1 row for results dataframe
164-
prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose_output) {
165-
cluster_name <- lineage_subset[[1, "Cluster"]]
166-
germline_seq <- lineage_subset[[1, "Germline.sequence"]]
167-
germline_v <- lineage_subset[[1, "V.germline.nt"]]
168-
germline_j <- lineage_subset[[1, "J.germline.nt"]]
169-
germline_cdr3_len <- lineage_subset[[1, "CDR3.germline.length"]]
170-
aligned <- nrow(lineage_subset) >= .min_lineage_sequences
171-
172-
if (!aligned & !.verbose_output) {
162+
prepare_results_row <- function(lineage_subset, .min_lineage_sequences) {
163+
if (nrow(lineage_subset) < .min_lineage_sequences) {
164+
# NA rows will be filtered out
173165
return(NA)
174166
}
175167

176-
lineage_subset[["V.lengths"]] <- v_len_outside_cdr3(
177-
lineage_subset[["V.end"]], lineage_subset[["CDR3.start"]]
178-
)
179-
lineage_subset[["J.lengths"]] <- j_len_outside_cdr3(
180-
lineage_subset[["Sequence"]], lineage_subset[["J.start"]], lineage_subset[["CDR3.end"]]
181-
)
168+
cluster_name <- lineage_subset[[1, "Cluster"]]
169+
germline_seq <- lineage_subset[[1, "Germline.sequence"]]
182170

183171
sequences_columns <- c(
184-
"Sequence", "Clone.ID", "Clones",
185-
"CDR1.nt", "CDR2.nt", "CDR3.nt", "FR1.nt", "FR2.nt", "FR3.nt", "FR4.nt"
172+
"Sequence", "Clone.ID", "Clones", "V.allele", "J.allele",
173+
"CDR1.nt", "CDR2.nt", "CDR3.nt", "FR1.nt", "FR2.nt", "FR3.nt", "FR4.nt", "V.aa", "J.aa"
186174
)
187-
if (.verbose_output) {
188-
sequences_columns %<>% c("V.end", "J.start", "CDR3.start", "CDR3.end")
189-
}
175+
190176
sequences <- lineage_subset[sequences_columns]
191177
sequences[["Clone.ID"]] %<>% as.integer()
192178
sequences[["Clones"]] %<>% as.integer()
193179

194-
germline_v_len <- str_length(germline_v)
195-
germline_j_len <- str_length(germline_j)
196-
v_min_len <- min(lineage_subset[["V.lengths"]], germline_v_len)
197-
j_min_len <- min(lineage_subset[["J.lengths"]], germline_j_len)
198-
199-
germline_trimmed <- trim_seq(germline_seq, germline_v_len, v_min_len, germline_j_len, j_min_len)
200-
clonotypes_trimmed <- trim_seq(
201-
lineage_subset[["Sequence"]],
202-
lineage_subset[["V.lengths"]],
203-
v_min_len,
204-
lineage_subset[["J.lengths"]],
205-
j_min_len
206-
)
207-
208180
clonotypes_names <- sapply(lineage_subset[["Clone.ID"]], function(id) {
209181
paste0("ID_", id)
210182
})
211-
all_sequences_list <- c(list(germline_trimmed), as.list(clonotypes_trimmed))
183+
all_sequences_list <- c(list(germline_seq), as.list(lineage_subset[["Sequence"]]))
212184
names(all_sequences_list) <- c("Germline", clonotypes_names)
213185
alignment <- convert_seq_list_to_dnabin(all_sequences_list)
214186

215-
if (.verbose_output) {
216-
return(list(
217-
Cluster = cluster_name,
218-
Germline = germline_seq,
219-
V.germline.nt = germline_v,
220-
J.germline.nt = germline_j,
221-
CDR3.germline.length = germline_cdr3_len,
222-
Aligned = aligned,
223-
Alignment = alignment,
224-
V.length = v_min_len,
225-
J.length = j_min_len,
226-
Sequences = sequences
227-
))
228-
} else {
229-
return(list(
230-
Cluster = cluster_name,
231-
Germline = germline_seq,
232-
V.germline.nt = germline_v,
233-
J.germline.nt = germline_j,
234-
CDR3.germline.length = germline_cdr3_len,
235-
Alignment = alignment,
236-
V.length = v_min_len,
237-
J.length = j_min_len,
238-
Sequences = sequences
239-
))
240-
}
187+
return(list(
188+
Cluster = cluster_name,
189+
Germline = germline_seq,
190+
Alignment = alignment,
191+
Sequences = sequences
192+
))
241193
}
242194

243-
# trim V/J tails in sequence to the specified lengths v_min, j_min
244-
trim_seq <- function(seq, v_len, v_min, j_len, j_min) {
245-
str_sub(seq, v_len - v_min + 1, -(j_len - j_min + 1))
246-
}
247-
248-
convert_results_to_df <- function(nested_results_list, nested_alignments_list) {
249-
alignments <- nested_alignments_list %>%
250-
lapply(magrittr::extract2, "Alignment") %>%
251-
tibble(Alignment = .)
195+
convert_results_to_df <- function(nested_results_list, alignments_list) {
196+
alignments <- tibble(Alignment = alignments_list)
252197
sequences <- nested_results_list %>%
253198
lapply(magrittr::extract2, "Sequences") %>%
254199
tibble(Sequences = .)
255200
df <- nested_results_list %>%
256201
lapply(rlist::list.remove, c("Alignment", "Sequences")) %>%
257202
purrr::map_dfr(~.) %>%
258203
cbind(alignments, sequences)
259-
# fix column types after dataframe rebuilding
260-
for (column in c("CDR3.germline.length", "V.length", "J.length")) {
261-
df[[column]] %<>% as.integer()
262-
}
263204
return(df)
264205
}
265-
266-
align_sequences <- function(df_row, .verbose_output) {
267-
if (.verbose_output) {
268-
aligned <- df_row[["Aligned"]]
269-
} else {
270-
aligned <- TRUE
271-
}
272-
if (aligned) {
273-
df_row[["Alignment"]] %<>% ape::clustal()
274-
}
275-
return(df_row)
276-
}

R/clustering.R

-1
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,6 @@ immunr_hclust <- function(.data, .k = 2, .k.max = nrow(.data) - 1, .method = "co
7373
}
7474

7575
immunr_kmeans <- function(.data, .k = 2, .k.max = as.integer(sqrt(nrow(.data))) + 1, .method = c("silhouette", "gap_stat")) {
76-
# res = list(kmeans = add_class(kmeans(as.dist(.data), .k), "immunr_kmeans"),
7776
res <- list(
7877
kmeans = add_class(kmeans(.data, .k), "immunr_kmeans"),
7978
nbclust = add_class(fviz_nbclust(.data, kmeans, k.max = .k.max, .method[1]), "immunr_nbclust"),

0 commit comments

Comments
 (0)