Skip to content

Commit 7a70786

Browse files
author
Aleksandr Popov
authored
Merge pull request #276 from immunomind/dev
Immunarch 0.7.0 release
2 parents 2fdb2d0 + afbd98d commit 7a70786

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+1791
-378
lines changed

DESCRIPTION

+5-4
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
Package: immunarch
22
Type: Package
33
Title: Bioinformatics Analysis of T-Cell and B-Cell Immune Repertoires
4-
Version: 0.6.9
4+
Version: 0.7.0
55
Authors@R: c(
66
person("Vadim I.", "Nazarov", , "[email protected]", c("aut", "cre")),
77
person("Vasily O.", "Tsvetkov", , role = "aut"),
88
person("Eugene", "Rumynskiy", , role = "aut"),
99
person("Aleksandr A.", "Popov", , role = "aut"),
1010
person("Ivan", "Balashov", , role = "aut"),
11-
person("Maria", "Volobueva", , role = "aut"),
11+
person("Maria", "Samokhina", , role = "aut"),
1212
person("Anna", "Lorenc", , role = "ctb"),
1313
person("Daniel J.", "Moore", , role = "ctb"),
1414
person("Victor", "Greiff", , role = "ctb"),
@@ -18,7 +18,7 @@ Contact: [email protected]
1818
Description: A comprehensive framework for bioinformatics exploratory analysis of bulk and single-cell
1919
T-cell receptor and antibody repertoires. It provides seamless data loading, analysis and
2020
visualisation for AIRR (Adaptive Immune Receptor Repertoire) data, both bulk immunosequencing (RepSeq)
21-
and single-cell sequencing (scRNAseq). It implements most of the widely used AIRR analysis methods,
21+
and single-cell sequencing (scRNAseq). Immunarch implements most of the widely used AIRR analysis methods,
2222
such as: clonality analysis, estimation of repertoire similarities in distribution of clonotypes
2323
and gene segments, repertoire diversity analysis, annotation of clonotypes using external immune receptor
2424
databases and clonotype tracking in vaccination and cancer studies. A successor to our
@@ -65,7 +65,8 @@ Imports:
6565
glue,
6666
phangorn,
6767
uuid,
68-
stringi
68+
stringi,
69+
ggraph
6970
Depends:
7071
R (>= 4.0.0),
7172
ggplot2 (>= 3.1.0),

NAMESPACE

+19-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# Generated by roxygen2: do not edit by hand
22

3+
S3method(vis,clonal_family)
4+
S3method(vis,clonal_family_tree)
35
S3method(vis,immunr_chao1)
46
S3method(vis,immunr_clonal_prop)
57
S3method(vis,immunr_dbscan)
@@ -35,6 +37,7 @@ S3method(vis,immunr_spectr)
3537
S3method(vis,immunr_spectr_nogene)
3638
S3method(vis,immunr_top_prop)
3739
S3method(vis,immunr_tsne)
40+
S3method(vis,step_failure_ignored)
3841
export(apply_asymm)
3942
export(apply_symm)
4043
export(bunch_translate)
@@ -87,6 +90,7 @@ export(repOverlap)
8790
export(repOverlapAnalysis)
8891
export(repSample)
8992
export(repSave)
93+
export(repSomaticHypermutation)
9094
export(select_barcodes)
9195
export(select_clusters)
9296
export(seqCluster)
@@ -111,7 +115,7 @@ importFrom(Rcpp,sourceCpp)
111115
importFrom(UpSetR,fromExpression)
112116
importFrom(UpSetR,upset)
113117
importFrom(ape,as.DNAbin)
114-
importFrom(ape,muscle)
118+
importFrom(ape,clustal)
115119
importFrom(ape,read.tree)
116120
importFrom(circlize,chordDiagram)
117121
importFrom(data.table,":=")
@@ -125,6 +129,7 @@ importFrom(data.table,setDT)
125129
importFrom(data.table,setcolorder)
126130
importFrom(data.table,setnames)
127131
importFrom(doParallel,registerDoParallel)
132+
importFrom(doParallel,stopImplicitCluster)
128133
importFrom(dplyr,arrange)
129134
importFrom(dplyr,as_tibble)
130135
importFrom(dplyr,collect)
@@ -166,6 +171,10 @@ importFrom(ggpubr,ggscatter)
166171
importFrom(ggpubr,rotate_x_text)
167172
importFrom(ggpubr,stat_compare_means)
168173
importFrom(ggpubr,theme_pubr)
174+
importFrom(ggraph,geom_edge_diagonal)
175+
importFrom(ggraph,geom_node_point)
176+
importFrom(ggraph,ggraph)
177+
importFrom(ggraph,theme_graph)
169178
importFrom(ggseqlogo,geom_logo)
170179
importFrom(ggseqlogo,theme_logo)
171180
importFrom(glue,glue)
@@ -181,13 +190,18 @@ importFrom(magrittr,"%>%")
181190
importFrom(magrittr,extract2)
182191
importFrom(magrittr,set_attr)
183192
importFrom(methods,as)
193+
importFrom(parallel,clusterExport)
184194
importFrom(parallel,detectCores)
195+
importFrom(parallel,makeCluster)
185196
importFrom(parallel,mclapply)
197+
importFrom(parallel,parApply)
198+
importFrom(parallel,stopCluster)
186199
importFrom(patchwork,plot_annotation)
187200
importFrom(patchwork,wrap_plots)
188201
importFrom(phangorn,write.phyDat)
189202
importFrom(pheatmap,pheatmap)
190203
importFrom(plyr,.)
204+
importFrom(plyr,adply)
191205
importFrom(plyr,dlply)
192206
importFrom(plyr,mapvalues)
193207
importFrom(purrr,imap)
@@ -256,8 +270,10 @@ importFrom(stringdist,stringdistmatrix)
256270
importFrom(stringi,stri_replace_all_fixed)
257271
importFrom(stringr,boundary)
258272
importFrom(stringr,fixed)
273+
importFrom(stringr,str_c)
259274
importFrom(stringr,str_count)
260275
importFrom(stringr,str_detect)
276+
importFrom(stringr,str_extract)
261277
importFrom(stringr,str_extract_all)
262278
importFrom(stringr,str_length)
263279
importFrom(stringr,str_match)
@@ -270,7 +286,9 @@ importFrom(stringr,str_sub)
270286
importFrom(stringr,str_trim)
271287
importFrom(tibble,rownames_to_column)
272288
importFrom(tibble,tibble)
289+
importFrom(tidyr,drop_na)
273290
importFrom(tidyr,unite)
291+
importFrom(tidyr,unnest)
274292
importFrom(tidyselect,starts_with)
275293
importFrom(utils,capture.output)
276294
importFrom(utils,packageVersion)

R/align_lineage.R

+74-49
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,4 @@
1-
#' This function aligns all sequences (incliding germline) that belong to one clonal lineage and one cluster.
2-
#' After clustering and building the clonal lineage and germline, the next step is to analyze the degree of mutation
3-
#' and maturity of each clonal lineage. This allows for finding high mature cells and cells with a large
4-
#' number of offspring. The phylogenetic analysis will find mutations that increase the affinity of BCR.
5-
#' Making alignment of the sequence is the first step towards sequence analysis including BCR.
1+
#' Aligns all sequences including germline within each clonal lineage within each cluster
62
#'
73
#' @concept align_lineage
84
#'
@@ -14,11 +10,16 @@
1410
#' @importFrom purrr map_dfr
1511
#' @importFrom rlist list.remove
1612
#' @importFrom utils str
17-
#' @importFrom ape as.DNAbin muscle
18-
#' @importFrom doParallel registerDoParallel
13+
#' @importFrom ape as.DNAbin clustal
14+
#' @importFrom doParallel registerDoParallel stopImplicitCluster
1915
#' @importFrom parallel mclapply
2016

21-
#' @description Aligns all sequences incliding germline within each clonal lineage within each cluster
17+
#' @description This function aligns all sequences (including germline) that belong to one clonal
18+
#' lineage and one cluster. After clustering and building the clonal lineage and germline, the next
19+
#' step is to analyze the degree of mutation and maturity of each clonal lineage. This allows for
20+
#' finding highly mature cells and cells with a large number of offspring. The phylogenetic analysis
21+
#' will find mutations that increase the affinity of BCR. Making alignment of the sequence
22+
#' is the first step towards sequence analysis including BCR.
2223
#'
2324
#' @usage
2425
#'
@@ -39,13 +40,13 @@
3940
#' @param .align_threads Number of threads for lineage alignment.
4041
#'
4142
#' It must have columns in the immunarch compatible format \link{immunarch_data_format}, and also
42-
#' must contain 'Cluster' column, which is added by seqCluster() function, and 'Sequence.germline'
43+
#' must contain 'Cluster' column, which is added by seqCluster() function, and 'Germline.sequence'
4344
#' column, which is added by repGermline() function.
4445
#'
4546
#' @param .verbose_output If TRUE, all output dataframe columns will be included (see documentation about this
4647
#' function return), and unaligned clusters will be included in the output. Setting this to TRUE significantly
47-
#' increases memory usage. If FALSE, only aligned clusters and columns required for repClonalFamily() calculation
48-
#' will be included in the output.
48+
#' increases memory usage. If FALSE, only aligned clusters and columns required for repClonalFamily() and
49+
#' repSomaticHypermutation() calculation will be included in the output.
4950
#'
5051
#' @param .nofail Will return an empty placeholder object of class 'step_failure_ignored' instead of stopping if Clustal W is not installed.
5152
#' Used to avoid raising errors in examples on computers where Clustal W is not installed.
@@ -56,16 +57,21 @@
5657
#' The dataframe has these columns:
5758
#' * Cluster: cluster name
5859
#' * Germline: germline sequence
60+
#' * V.germline.nt: germline V gene sequence
61+
#' * J.germline.nt: germline J gene sequence
62+
#' * CDR3.germline.length: length of CDR3 in germline
5963
#' * Aligned (included if .verbose_output=TRUE): FALSE if this group of sequences was not aligned with lineage
6064
#' (it contains fewer sequences than .min_lineage_sequences); TRUE if it was aligned
6165
#' * Alignment: DNAbin object with alignment or DNAbin object with unaligned sequences (if Aligned=FALSE)
62-
#' * V.length (included if .verbose_output=TRUE): shortest length of V gene part outside of CDR3 region in this
66+
#' * V.length: shortest length of V gene part outside of CDR3 region in this
6367
#' group of sequences; longer V genes (including germline) are trimmed to this length before alignment
64-
#' * J.length (included if .verbose_output=TRUE): shortest length of J gene part outside of CDR3 region in this
68+
#' * J.length: shortest length of J gene part outside of CDR3 region in this
6569
#' group of sequences; longer J genes (including germline) are trimmed to this length before alignment
66-
#' * Sequences (included if .verbose_output=TRUE): nested dataframe containing all sequences for this combination
70+
#' * Sequences: nested dataframe containing all sequences for this combination
6771
#' of cluster and germline; it has columns
68-
#' Sequence, V.end, J.start, CDR3.start, CDR3.end; all values taken from the input dataframe
72+
#' Sequence, Clone.ID, Clones, CDR1.nt, CDR2.nt, CDR3.nt, FR1.nt, FR2.nt, FR3.nt, FR4.nt
73+
#' and, if .verbose_output=TRUE, also V.end, J.start, CDR3.start, CDR3.end;
74+
#' all values taken from the input dataframe
6975
#'
7076
#' @examples
7177
#'
@@ -74,7 +80,7 @@
7480
#'
7581
#' bcr_data %>%
7682
#' seqCluster(seqDist(bcr_data), .fixed_threshold = 3) %>%
77-
#' repGermline() %>%
83+
#' repGermline(.threads = 1) %>%
7884
#' repAlignLineage(.min_lineage_sequences = 2, .align_threads = 2, .nofail = TRUE)
7985
#' @export repAlignLineage
8086
repAlignLineage <- function(.data,
@@ -88,22 +94,30 @@ repAlignLineage <- function(.data,
8894
"Please download it from here: http://www.clustal.org/download/current/\n",
8995
"or install it with your system package manager (such as apt or dnf)."
9096
), .nofail)) {
91-
return(NA)
97+
return(get_empty_object_with_class("step_failure_ignored"))
9298
}
9399

94100
doParallel::registerDoParallel(cores = .prepare_threads)
95101
.data %<>%
96102
apply_to_sample_or_list(
97103
align_single_df,
98104
.min_lineage_sequences = .min_lineage_sequences,
105+
.parallel_prepare = .prepare_threads > 1,
99106
.align_threads = .align_threads,
100107
.verbose_output = .verbose_output
101108
)
109+
doParallel::stopImplicitCluster()
102110
return(.data)
103111
}
104112

105-
align_single_df <- function(data, .min_lineage_sequences, .align_threads, .verbose_output) {
106-
for (required_column in c("Cluster", "Germline.sequence")) {
113+
align_single_df <- function(data,
114+
.min_lineage_sequences,
115+
.parallel_prepare,
116+
.align_threads,
117+
.verbose_output) {
118+
for (required_column in c(
119+
"Cluster", "Germline.sequence", "V.germline.nt", "J.germline.nt", "CDR3.germline.length"
120+
)) {
107121
if (!(required_column %in% colnames(data))) {
108122
stop(
109123
"Found dataframe without required column ",
@@ -120,7 +134,7 @@ align_single_df <- function(data, .min_lineage_sequences, .align_threads, .verbo
120134
.fun = prepare_results_row,
121135
.min_lineage_sequences = .min_lineage_sequences,
122136
.verbose_output = .verbose_output,
123-
.parallel = TRUE
137+
.parallel = .parallel_prepare
124138
) %>%
125139
`[`(!is.na(.)) %>%
126140
unname()
@@ -142,14 +156,17 @@ align_single_df <- function(data, .min_lineage_sequences, .align_threads, .verbo
142156
mc.cores = .align_threads
143157
)
144158

145-
return(convert_results_to_df(results, alignments, .verbose_output))
159+
return(convert_results_to_df(results, alignments))
146160
}
147161

148162
# this function accepts dataframe subset containing rows only for current lineage
149163
# and returns named list containing 1 row for results dataframe
150164
prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose_output) {
151165
cluster_name <- lineage_subset[[1, "Cluster"]]
152166
germline_seq <- lineage_subset[[1, "Germline.sequence"]]
167+
germline_v <- lineage_subset[[1, "V.germline.nt"]]
168+
germline_j <- lineage_subset[[1, "J.germline.nt"]]
169+
germline_cdr3_len <- lineage_subset[[1, "CDR3.germline.length"]]
153170
aligned <- nrow(lineage_subset) >= .min_lineage_sequences
154171

155172
if (!aligned & !.verbose_output) {
@@ -163,13 +180,19 @@ prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose
163180
lineage_subset[["Sequence"]], lineage_subset[["J.start"]], lineage_subset[["CDR3.end"]]
164181
)
165182

183+
sequences_columns <- c(
184+
"Sequence", "Clone.ID", "Clones",
185+
"CDR1.nt", "CDR2.nt", "CDR3.nt", "FR1.nt", "FR2.nt", "FR3.nt", "FR4.nt"
186+
)
166187
if (.verbose_output) {
167-
sequences <- lineage_subset[c("Sequence", "V.end", "J.start", "CDR3.start", "CDR3.end")]
188+
sequences_columns %<>% c("V.end", "J.start", "CDR3.start", "CDR3.end")
168189
}
190+
sequences <- lineage_subset[sequences_columns]
191+
sequences[["Clone.ID"]] %<>% as.integer()
192+
sequences[["Clones"]] %<>% as.integer()
169193

170-
germline_parts <- strsplit(germline_seq, "N")[[1]]
171-
germline_v_len <- stringr::str_length(germline_parts[1])
172-
germline_j_len <- stringr::str_length(tail(germline_parts, 1))
194+
germline_v_len <- str_length(germline_v)
195+
germline_j_len <- str_length(germline_j)
173196
v_min_len <- min(lineage_subset[["V.lengths"]], germline_v_len)
174197
j_min_len <- min(lineage_subset[["J.lengths"]], germline_j_len)
175198

@@ -181,12 +204,21 @@ prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose
181204
lineage_subset[["J.lengths"]],
182205
j_min_len
183206
)
184-
alignment <- convert_to_dnabin(germline_trimmed, clonotypes_trimmed)
207+
208+
clonotypes_names <- sapply(lineage_subset[["Clone.ID"]], function(id) {
209+
paste0("ID_", id)
210+
})
211+
all_sequences_list <- c(list(germline_trimmed), as.list(clonotypes_trimmed))
212+
names(all_sequences_list) <- c("Germline", clonotypes_names)
213+
alignment <- convert_seq_list_to_dnabin(all_sequences_list)
185214

186215
if (.verbose_output) {
187216
return(list(
188217
Cluster = cluster_name,
189218
Germline = germline_seq,
219+
V.germline.nt = germline_v,
220+
J.germline.nt = germline_j,
221+
CDR3.germline.length = germline_cdr3_len,
190222
Aligned = aligned,
191223
Alignment = alignment,
192224
V.length = v_min_len,
@@ -197,43 +229,36 @@ prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose
197229
return(list(
198230
Cluster = cluster_name,
199231
Germline = germline_seq,
200-
Alignment = alignment
232+
V.germline.nt = germline_v,
233+
J.germline.nt = germline_j,
234+
CDR3.germline.length = germline_cdr3_len,
235+
Alignment = alignment,
236+
V.length = v_min_len,
237+
J.length = j_min_len,
238+
Sequences = sequences
201239
))
202240
}
203241
}
204242

205-
convert_to_dnabin <- function(germline_seq, clonotypes) {
206-
all_sequences_list <- c(list(germline = germline_seq), as.list(clonotypes))
207-
dnabin <- all_sequences_list %>%
208-
lapply(
209-
function(sequence) {
210-
sequence %>%
211-
stringr::str_extract_all(stringr::boundary("character")) %>%
212-
unlist()
213-
}
214-
) %>%
215-
ape::as.DNAbin()
216-
return(dnabin)
217-
}
218-
219243
# trim V/J tails in sequence to the specified lengths v_min, j_min
220244
trim_seq <- function(seq, v_len, v_min, j_len, j_min) {
221-
stringr::str_sub(seq, v_len - v_min + 1, -(j_len - j_min + 1))
245+
str_sub(seq, v_len - v_min + 1, -(j_len - j_min + 1))
222246
}
223247

224-
convert_results_to_df <- function(nested_results_list, nested_alignments_list, .verbose_output) {
248+
convert_results_to_df <- function(nested_results_list, nested_alignments_list) {
225249
alignments <- nested_alignments_list %>%
226250
lapply(magrittr::extract2, "Alignment") %>%
227251
tibble(Alignment = .)
252+
sequences <- nested_results_list %>%
253+
lapply(magrittr::extract2, "Sequences") %>%
254+
tibble(Sequences = .)
228255
df <- nested_results_list %>%
229256
lapply(rlist::list.remove, c("Alignment", "Sequences")) %>%
230257
purrr::map_dfr(~.) %>%
231-
cbind(alignments)
232-
if (.verbose_output) {
233-
sequences <- nested_results_list %>%
234-
lapply(magrittr::extract2, "Sequences") %>%
235-
tibble(Sequences = .)
236-
df %<>% cbind(sequences)
258+
cbind(alignments, sequences)
259+
# fix column types after dataframe rebuilding
260+
for (column in c("CDR3.germline.length", "V.length", "J.length")) {
261+
df[[column]] %<>% as.integer()
237262
}
238263
return(df)
239264
}

R/distance.R

+2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#' Function for computing distance for sequences
22
#'
3+
#' @concept distance
4+
#'
35
#' @importFrom stringdist stringdistmatrix
46
#' @importFrom purrr map pmap map2
57
#' @importFrom magrittr %>% %<>% set_attr

0 commit comments

Comments
 (0)