1
- # ' This function aligns all sequences (incliding germline) that belong to one clonal lineage and one cluster.
2
- # ' After clustering and building the clonal lineage and germline, the next step is to analyze the degree of mutation
3
- # ' and maturity of each clonal lineage. This allows for finding high mature cells and cells with a large
4
- # ' number of offspring. The phylogenetic analysis will find mutations that increase the affinity of BCR.
5
- # ' Making alignment of the sequence is the first step towards sequence analysis including BCR.
1
+ # ' Aligns all sequences incliding germline within each clonal lineage within each cluster
6
2
# '
7
3
# ' @concept align_lineage
8
4
# '
14
10
# ' @importFrom purrr map_dfr
15
11
# ' @importFrom rlist list.remove
16
12
# ' @importFrom utils str
17
- # ' @importFrom ape as.DNAbin muscle
18
- # ' @importFrom doParallel registerDoParallel
13
+ # ' @importFrom ape as.DNAbin clustal
14
+ # ' @importFrom doParallel registerDoParallel stopImplicitCluster
19
15
# ' @importFrom parallel mclapply
20
16
21
- # ' @description Aligns all sequences incliding germline within each clonal lineage within each cluster
17
+ # ' @description This function aligns all sequences (incliding germline) that belong to one clonal
18
+ # ' lineage and one cluster. After clustering and building the clonal lineage and germline, the next
19
+ # ' step is to analyze the degree of mutation and maturity of each clonal lineage. This allows for
20
+ # ' finding high mature cells and cells with a large number of offspring. The phylogenetic analysis
21
+ # ' will find mutations that increase the affinity of BCR. Making alignment of the sequence
22
+ # ' is the first step towards sequence analysis including BCR.
22
23
# '
23
24
# ' @usage
24
25
# '
39
40
# ' @param .align_threads Number of threads for lineage alignment.
40
41
# '
41
42
# ' It must have columns in the immunarch compatible format \link{immunarch_data_format}, and also
42
- # ' must contain 'Cluster' column, which is added by seqCluster() function, and 'Sequence.germline '
43
+ # ' must contain 'Cluster' column, which is added by seqCluster() function, and 'Germline.sequence '
43
44
# ' column, which is added by repGermline() function.
44
45
# '
45
46
# ' @param .verbose_output If TRUE, all output dataframe columns will be included (see documentation about this
46
47
# ' function return), and unaligned clusters will be included in the output. Setting this to TRUE significantly
47
- # ' increases memory usage. If FALSE, only aligned clusters and columns required for repClonalFamily() calculation
48
- # ' will be included in the output.
48
+ # ' increases memory usage. If FALSE, only aligned clusters and columns required for repClonalFamily() and
49
+ # ' repSomaticHypermutation() calculation will be included in the output.
49
50
# '
50
51
# ' @param .nofail Will return NA instead of stopping if Clustal W is not installed.
51
52
# ' Used to avoid raising errors in examples on computers where Clustal W is not installed.
56
57
# ' The dataframe has these columns:
57
58
# ' * Cluster: cluster name
58
59
# ' * Germline: germline sequence
60
+ # ' * V.germline.nt: germline V gene sequence
61
+ # ' * J.germline.nt: germline J gene sequence
62
+ # ' * CDR3.germline.length: length of CDR3 in germline
59
63
# ' * Aligned (included if .verbose_output=TRUE): FALSE if this group of sequences was not aligned with lineage
60
64
# ' (.min_lineage_sequences is below the threshold); TRUE if it was aligned
61
65
# ' * Alignment: DNAbin object with alignment or DNAbin object with unaligned sequences (if Aligned=FALSE)
62
- # ' * V.length (included if .verbose_output=TRUE) : shortest length of V gene part outside of CDR3 region in this
66
+ # ' * V.length: shortest length of V gene part outside of CDR3 region in this
63
67
# ' group of sequences; longer V genes (including germline) are trimmed to this length before alignment
64
- # ' * J.length (included if .verbose_output=TRUE) : shortest length of J gene part outside of CDR3 region in this
68
+ # ' * J.length: shortest length of J gene part outside of CDR3 region in this
65
69
# ' group of sequences; longer J genes (including germline) are trimmed to this length before alignment
66
- # ' * Sequences (included if .verbose_output=TRUE) : nested dataframe containing all sequences for this combination
70
+ # ' * Sequences: nested dataframe containing all sequences for this combination
67
71
# ' of cluster and germline; it has columns
68
- # ' Sequence, V.end, J.start, CDR3.start, CDR3.end; all values taken from the input dataframe
72
+ # ' Sequence, Clone.ID, Clones, CDR1.nt, CDR2.nt, CDR3.nt, FR1.nt, FR2.nt, FR3.nt, FR4.nt
73
+ # ' and, if .verbose_output=TRUE, also V.end, J.start, CDR3.start, CDR3.end;
74
+ # ' all values taken from the input dataframe
69
75
# '
70
76
# ' @examples
71
77
# '
74
80
# '
75
81
# ' bcr_data %>%
76
82
# ' seqCluster(seqDist(bcr_data), .fixed_threshold = 3) %>%
77
- # ' repGermline() %>%
83
+ # ' repGermline(.threads = 1 ) %>%
78
84
# ' repAlignLineage(.min_lineage_sequences = 2, .align_threads = 2, .nofail = TRUE)
79
85
# ' @export repAlignLineage
80
86
repAlignLineage <- function (.data ,
@@ -88,22 +94,30 @@ repAlignLineage <- function(.data,
88
94
" Please download it from here: http://www.clustal.org/download/current/\n " ,
89
95
" or install it with your system package manager (such as apt or dnf)."
90
96
), .nofail )) {
91
- return (NA )
97
+ return (get_empty_object_with_class( " step_failure_ignored " ) )
92
98
}
93
99
94
100
doParallel :: registerDoParallel(cores = .prepare_threads )
95
101
.data %<> %
96
102
apply_to_sample_or_list(
97
103
align_single_df ,
98
104
.min_lineage_sequences = .min_lineage_sequences ,
105
+ .parallel_prepare = .prepare_threads > 1 ,
99
106
.align_threads = .align_threads ,
100
107
.verbose_output = .verbose_output
101
108
)
109
+ doParallel :: stopImplicitCluster()
102
110
return (.data )
103
111
}
104
112
105
- align_single_df <- function (data , .min_lineage_sequences , .align_threads , .verbose_output ) {
106
- for (required_column in c(" Cluster" , " Germline.sequence" )) {
113
+ align_single_df <- function (data ,
114
+ .min_lineage_sequences ,
115
+ .parallel_prepare ,
116
+ .align_threads ,
117
+ .verbose_output ) {
118
+ for (required_column in c(
119
+ " Cluster" , " Germline.sequence" , " V.germline.nt" , " J.germline.nt" , " CDR3.germline.length"
120
+ )) {
107
121
if (! (required_column %in% colnames(data ))) {
108
122
stop(
109
123
" Found dataframe without required column " ,
@@ -120,7 +134,7 @@ align_single_df <- function(data, .min_lineage_sequences, .align_threads, .verbo
120
134
.fun = prepare_results_row ,
121
135
.min_lineage_sequences = .min_lineage_sequences ,
122
136
.verbose_output = .verbose_output ,
123
- .parallel = TRUE
137
+ .parallel = .parallel_prepare
124
138
) %> %
125
139
`[`(! is.na(. )) %> %
126
140
unname()
@@ -142,14 +156,17 @@ align_single_df <- function(data, .min_lineage_sequences, .align_threads, .verbo
142
156
mc.cores = .align_threads
143
157
)
144
158
145
- return (convert_results_to_df(results , alignments , .verbose_output ))
159
+ return (convert_results_to_df(results , alignments ))
146
160
}
147
161
148
162
# this function accepts dataframe subset containing rows only for current lineage
149
163
# and returns named list containing 1 row for results dataframe
150
164
prepare_results_row <- function (lineage_subset , .min_lineage_sequences , .verbose_output ) {
151
165
cluster_name <- lineage_subset [[1 , " Cluster" ]]
152
166
germline_seq <- lineage_subset [[1 , " Germline.sequence" ]]
167
+ germline_v <- lineage_subset [[1 , " V.germline.nt" ]]
168
+ germline_j <- lineage_subset [[1 , " J.germline.nt" ]]
169
+ germline_cdr3_len <- lineage_subset [[1 , " CDR3.germline.length" ]]
153
170
aligned <- nrow(lineage_subset ) > = .min_lineage_sequences
154
171
155
172
if (! aligned & ! .verbose_output ) {
@@ -163,13 +180,19 @@ prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose
163
180
lineage_subset [[" Sequence" ]], lineage_subset [[" J.start" ]], lineage_subset [[" CDR3.end" ]]
164
181
)
165
182
183
+ sequences_columns <- c(
184
+ " Sequence" , " Clone.ID" , " Clones" ,
185
+ " CDR1.nt" , " CDR2.nt" , " CDR3.nt" , " FR1.nt" , " FR2.nt" , " FR3.nt" , " FR4.nt"
186
+ )
166
187
if (.verbose_output ) {
167
- sequences <- lineage_subset [ c(" Sequence " , " V.end" , " J.start" , " CDR3.start" , " CDR3.end" )]
188
+ sequences_columns % <> % c(" V.end" , " J.start" , " CDR3.start" , " CDR3.end" )
168
189
}
190
+ sequences <- lineage_subset [sequences_columns ]
191
+ sequences [[" Clone.ID" ]] %<> % as.integer()
192
+ sequences [[" Clones" ]] %<> % as.integer()
169
193
170
- germline_parts <- strsplit(germline_seq , " N" )[[1 ]]
171
- germline_v_len <- stringr :: str_length(germline_parts [1 ])
172
- germline_j_len <- stringr :: str_length(tail(germline_parts , 1 ))
194
+ germline_v_len <- str_length(germline_v )
195
+ germline_j_len <- str_length(germline_j )
173
196
v_min_len <- min(lineage_subset [[" V.lengths" ]], germline_v_len )
174
197
j_min_len <- min(lineage_subset [[" J.lengths" ]], germline_j_len )
175
198
@@ -181,12 +204,21 @@ prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose
181
204
lineage_subset [[" J.lengths" ]],
182
205
j_min_len
183
206
)
184
- alignment <- convert_to_dnabin(germline_trimmed , clonotypes_trimmed )
207
+
208
+ clonotypes_names <- sapply(lineage_subset [[" Clone.ID" ]], function (id ) {
209
+ paste0(" ID_" , id )
210
+ })
211
+ all_sequences_list <- c(list (germline_trimmed ), as.list(clonotypes_trimmed ))
212
+ names(all_sequences_list ) <- c(" Germline" , clonotypes_names )
213
+ alignment <- convert_seq_list_to_dnabin(all_sequences_list )
185
214
186
215
if (.verbose_output ) {
187
216
return (list (
188
217
Cluster = cluster_name ,
189
218
Germline = germline_seq ,
219
+ V.germline.nt = germline_v ,
220
+ J.germline.nt = germline_j ,
221
+ CDR3.germline.length = germline_cdr3_len ,
190
222
Aligned = aligned ,
191
223
Alignment = alignment ,
192
224
V.length = v_min_len ,
@@ -197,43 +229,36 @@ prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose
197
229
return (list (
198
230
Cluster = cluster_name ,
199
231
Germline = germline_seq ,
200
- Alignment = alignment
232
+ V.germline.nt = germline_v ,
233
+ J.germline.nt = germline_j ,
234
+ CDR3.germline.length = germline_cdr3_len ,
235
+ Alignment = alignment ,
236
+ V.length = v_min_len ,
237
+ J.length = j_min_len ,
238
+ Sequences = sequences
201
239
))
202
240
}
203
241
}
204
242
205
- convert_to_dnabin <- function (germline_seq , clonotypes ) {
206
- all_sequences_list <- c(list (germline = germline_seq ), as.list(clonotypes ))
207
- dnabin <- all_sequences_list %> %
208
- lapply(
209
- function (sequence ) {
210
- sequence %> %
211
- stringr :: str_extract_all(stringr :: boundary(" character" )) %> %
212
- unlist()
213
- }
214
- ) %> %
215
- ape :: as.DNAbin()
216
- return (dnabin )
217
- }
218
-
219
243
# trim V/J tails in sequence to the specified lenghts v_min, j_min
220
244
trim_seq <- function (seq , v_len , v_min , j_len , j_min ) {
221
- stringr :: str_sub(seq , v_len - v_min + 1 , - (j_len - j_min + 1 ))
245
+ str_sub(seq , v_len - v_min + 1 , - (j_len - j_min + 1 ))
222
246
}
223
247
224
- convert_results_to_df <- function (nested_results_list , nested_alignments_list , .verbose_output ) {
248
+ convert_results_to_df <- function (nested_results_list , nested_alignments_list ) {
225
249
alignments <- nested_alignments_list %> %
226
250
lapply(magrittr :: extract2 , " Alignment" ) %> %
227
251
tibble(Alignment = . )
252
+ sequences <- nested_results_list %> %
253
+ lapply(magrittr :: extract2 , " Sequences" ) %> %
254
+ tibble(Sequences = . )
228
255
df <- nested_results_list %> %
229
256
lapply(rlist :: list.remove , c(" Alignment" , " Sequences" )) %> %
230
257
purrr :: map_dfr(~ . ) %> %
231
- cbind(alignments )
232
- if (.verbose_output ) {
233
- sequences <- nested_results_list %> %
234
- lapply(magrittr :: extract2 , " Sequences" ) %> %
235
- tibble(Sequences = . )
236
- df %<> % cbind(sequences )
258
+ cbind(alignments , sequences )
259
+ # fix column types after dataframe rebuilding
260
+ for (column in c(" CDR3.germline.length" , " V.length" , " J.length" )) {
261
+ df [[column ]] %<> % as.integer()
237
262
}
238
263
return (df )
239
264
}
0 commit comments