23
23
# '
24
24
# ' @usage
25
25
# '
26
- # ' repAlignLineage(.data,
27
- # ' .min_lineage_sequences, .prepare_threads, .align_threads, .verbose_output, .nofail)
26
+ # ' repAlignLineage(.data, .min_lineage_sequences, .prepare_threads, .align_threads, .nofail)
28
27
# '
29
28
# ' @param .data The data to be processed. Can be \link{data.frame}, \link{data.table}
30
29
# ' or a list of these objects.
31
30
# '
32
31
# ' @param .min_lineage_sequences If number of sequences in the same clonal lineage and the same
33
32
# ' cluster (not including germline) is lower than this threshold, this group of sequences
34
- # ' will not be aligned and will not be used in next steps of BCR pipeline
35
- # ' (will be saved in output table only if .verbose_output parameter is set to TRUE).
33
+ # ' will be filtered out from the dataframe; so only large enough lineages will be included.
36
34
# '
37
35
# ' @param .prepare_threads Number of threads to prepare results table.
38
36
# ' Please note that high number can cause heavy memory usage!
43
41
# ' must contain 'Cluster' column, which is added by seqCluster() function, and 'Germline.sequence'
44
42
# ' column, which is added by repGermline() function.
45
43
# '
46
- # ' @param .verbose_output If TRUE, all output dataframe columns will be included (see documentation about this
47
- # ' function return), and unaligned clusters will be included in the output. Setting this to TRUE significantly
48
- # ' increases memory usage. If FALSE, only aligned clusters and columns required for repClonalFamily() and
49
- # ' repSomaticHypermutation() calculation will be included in the output.
50
- # '
51
44
# ' @param .nofail Will return NA instead of stopping if Clustal W is not installed.
52
45
# ' Used to avoid raising errors in examples on computers where Clustal W is not installed.
53
46
# '
57
50
# ' The dataframe has these columns:
58
51
# ' * Cluster: cluster name
59
52
# ' * Germline: germline sequence
60
- # ' * V.germline.nt: germline V gene sequence
61
- # ' * J.germline.nt: germline J gene sequence
62
- # ' * CDR3.germline.length: length of CDR3 in germline
63
- # ' * Aligned (included if .verbose_output=TRUE): FALSE if this group of sequences was not aligned with lineage
64
- # ' (.min_lineage_sequences is below the threshold); TRUE if it was aligned
65
- # ' * Alignment: DNAbin object with alignment or DNAbin object with unaligned sequences (if Aligned=FALSE)
66
- # ' * V.length: shortest length of V gene part outside of CDR3 region in this
67
- # ' group of sequences; longer V genes (including germline) are trimmed to this length before alignment
68
- # ' * J.length: shortest length of J gene part outside of CDR3 region in this
69
- # ' group of sequences; longer J genes (including germline) are trimmed to this length before alignment
53
+ # ' * Alignment: DNAbin object with alignment
70
54
# ' * Sequences: nested dataframe containing all sequences for this combination
71
55
# ' of cluster and germline; it has columns
72
- # ' Sequence, Clone.ID, Clones, CDR1.nt, CDR2.nt, CDR3.nt, FR1.nt, FR2.nt, FR3.nt, FR4.nt
73
- # ' and, if .verbose_output=TRUE, also V.end, J.start, CDR3.start, CDR3.end;
74
- # ' all values taken from the input dataframe
56
+ # ' * Sequence, CDR1.nt, CDR2.nt, CDR3.nt, FR1.nt, FR2.nt, FR3.nt, FR4.nt, V.allele, J.allele,
57
+ # ' V.aa, J.aa: all values taken from the input dataframe
58
+ # ' * Clone.ID: taken from the input dataframe, or created (filled with row numbers) if missing
59
+ # ' * Clones: taken from the input dataframe, or created (filled with '1' values) if missing
75
60
# '
76
61
# ' @examples
77
62
# '
@@ -87,36 +72,45 @@ repAlignLineage <- function(.data,
87
72
.min_lineage_sequences = 3 ,
88
73
.prepare_threads = 2 ,
89
74
.align_threads = 4 ,
90
- .verbose_output = FALSE ,
91
75
.nofail = FALSE ) {
92
- if (! require_system_package(" clustalw" , error_message = paste0(
76
+ if (! require_system_package(c( " clustalw" , " clustalw2 " ) , error_message = paste0(
93
77
" repAlignLineage requires Clustal W app to be installed!\n " ,
94
78
" Please download it from here: http://www.clustal.org/download/current/\n " ,
95
79
" or install it with your system package manager (such as apt or dnf)."
96
80
), .nofail )) {
97
81
return (get_empty_object_with_class(" step_failure_ignored" ))
98
82
}
83
+ if (.min_lineage_sequences < 2 ) {
84
+ warning(
85
+ " .min_lineage_sequences is set to less than 2; " ,
86
+ " results will not be valid to build trees with repClonalLineage()!"
87
+ )
88
+ }
99
89
100
- doParallel :: registerDoParallel(cores = .prepare_threads )
90
+ parallel_prepare <- .prepare_threads > 1
91
+ if (parallel_prepare ) {
92
+ doParallel :: registerDoParallel(cores = .prepare_threads )
93
+ }
101
94
.data %<> %
102
95
apply_to_sample_or_list(
103
96
align_single_df ,
104
97
.min_lineage_sequences = .min_lineage_sequences ,
105
- .parallel_prepare = .prepare_threads > 1 ,
106
- .align_threads = .align_threads ,
107
- .verbose_output = .verbose_output
98
+ .parallel_prepare = parallel_prepare ,
99
+ .align_threads = .align_threads
108
100
)
109
- doParallel :: stopImplicitCluster()
101
+ if (parallel_prepare ) {
102
+ doParallel :: stopImplicitCluster()
103
+ }
110
104
return (.data )
111
105
}
112
106
113
107
align_single_df <- function (data ,
114
108
.min_lineage_sequences ,
115
109
.parallel_prepare ,
116
- .align_threads ,
117
- .verbose_output ) {
110
+ .align_threads ) {
118
111
for (required_column in c(
119
- " Cluster" , " Germline.sequence" , " V.germline.nt" , " J.germline.nt" , " CDR3.germline.length"
112
+ " Cluster" , " Germline.sequence" , " V.allele" , " J.allele" ,
113
+ " FR1.nt" , " CDR1.nt" , " FR2.nt" , " CDR2.nt" , " FR3.nt" , " CDR3.nt" , " FR4.nt" , " V.aa" , " J.aa"
120
114
)) {
121
115
if (! (required_column %in% colnames(data ))) {
122
116
stop(
@@ -129,11 +123,11 @@ align_single_df <- function(data,
129
123
}
130
124
131
125
results <- data %> %
126
+ fill_missing_columns() %> %
132
127
plyr :: dlply(
133
128
.variables = .(get(" Cluster" ), get(" Germline.sequence" )),
134
129
.fun = prepare_results_row ,
135
130
.min_lineage_sequences = .min_lineage_sequences ,
136
- .verbose_output = .verbose_output ,
137
131
.parallel = .parallel_prepare
138
132
) %> %
139
133
`[`(! is.na(. )) %> %
@@ -143,134 +137,69 @@ align_single_df <- function(data,
143
137
stop(" There are no lineages containing at least " , .min_lineage_sequences , " sequences!" )
144
138
}
145
139
146
- # only required columns are passed to alignment function to reduce consumed memory
147
- if (.verbose_output ) {
148
- alignments <- lapply(results , " [" , c(" Aligned" , " Alignment" ))
149
- } else {
150
- alignments <- lapply(results , " [" , " Alignment" )
151
- }
152
- alignments %<> % parallel :: mclapply(
153
- align_sequences ,
154
- .verbose_output = .verbose_output ,
155
- mc.preschedule = TRUE ,
156
- mc.cores = .align_threads
157
- )
140
+ # only Alignment column are passed to alignment function to reduce consumed memory
141
+ alignments <- lapply(results , " [" , " Alignment" ) %> %
142
+ par_or_normal_lapply(mc.preschedule = TRUE , mc.cores = .align_threads , function (df_row ) {
143
+ df_row [[" Alignment" ]] %<> % ape :: clustal()
144
+ })
158
145
159
146
return (convert_results_to_df(results , alignments ))
160
147
}
161
148
149
+ # fill Clone.ID and Clones columns if they are missing
150
+ fill_missing_columns <- function (data ) {
151
+ if (! (" Clone.ID" %in% colnames(data ))) {
152
+ data [[" Clone.ID" ]] <- seq.int(nrow(data ))
153
+ }
154
+ if (! (" Clones" %in% colnames(data ))) {
155
+ data [[" Clones" ]] <- as.integer(1 )
156
+ }
157
+ return (data )
158
+ }
159
+
162
160
# this function accepts dataframe subset containing rows only for current lineage
163
161
# and returns named list containing 1 row for results dataframe
164
- prepare_results_row <- function (lineage_subset , .min_lineage_sequences , .verbose_output ) {
165
- cluster_name <- lineage_subset [[1 , " Cluster" ]]
166
- germline_seq <- lineage_subset [[1 , " Germline.sequence" ]]
167
- germline_v <- lineage_subset [[1 , " V.germline.nt" ]]
168
- germline_j <- lineage_subset [[1 , " J.germline.nt" ]]
169
- germline_cdr3_len <- lineage_subset [[1 , " CDR3.germline.length" ]]
170
- aligned <- nrow(lineage_subset ) > = .min_lineage_sequences
171
-
172
- if (! aligned & ! .verbose_output ) {
162
+ prepare_results_row <- function (lineage_subset , .min_lineage_sequences ) {
163
+ if (nrow(lineage_subset ) < .min_lineage_sequences ) {
164
+ # NA rows will be filtered out
173
165
return (NA )
174
166
}
175
167
176
- lineage_subset [[" V.lengths" ]] <- v_len_outside_cdr3(
177
- lineage_subset [[" V.end" ]], lineage_subset [[" CDR3.start" ]]
178
- )
179
- lineage_subset [[" J.lengths" ]] <- j_len_outside_cdr3(
180
- lineage_subset [[" Sequence" ]], lineage_subset [[" J.start" ]], lineage_subset [[" CDR3.end" ]]
181
- )
168
+ cluster_name <- lineage_subset [[1 , " Cluster" ]]
169
+ germline_seq <- lineage_subset [[1 , " Germline.sequence" ]]
182
170
183
171
sequences_columns <- c(
184
- " Sequence" , " Clone.ID" , " Clones" ,
185
- " CDR1.nt" , " CDR2.nt" , " CDR3.nt" , " FR1.nt" , " FR2.nt" , " FR3.nt" , " FR4.nt"
172
+ " Sequence" , " Clone.ID" , " Clones" , " V.allele " , " J.allele " ,
173
+ " CDR1.nt" , " CDR2.nt" , " CDR3.nt" , " FR1.nt" , " FR2.nt" , " FR3.nt" , " FR4.nt" , " V.aa " , " J.aa "
186
174
)
187
- if (.verbose_output ) {
188
- sequences_columns %<> % c(" V.end" , " J.start" , " CDR3.start" , " CDR3.end" )
189
- }
175
+
190
176
sequences <- lineage_subset [sequences_columns ]
191
177
sequences [[" Clone.ID" ]] %<> % as.integer()
192
178
sequences [[" Clones" ]] %<> % as.integer()
193
179
194
- germline_v_len <- str_length(germline_v )
195
- germline_j_len <- str_length(germline_j )
196
- v_min_len <- min(lineage_subset [[" V.lengths" ]], germline_v_len )
197
- j_min_len <- min(lineage_subset [[" J.lengths" ]], germline_j_len )
198
-
199
- germline_trimmed <- trim_seq(germline_seq , germline_v_len , v_min_len , germline_j_len , j_min_len )
200
- clonotypes_trimmed <- trim_seq(
201
- lineage_subset [[" Sequence" ]],
202
- lineage_subset [[" V.lengths" ]],
203
- v_min_len ,
204
- lineage_subset [[" J.lengths" ]],
205
- j_min_len
206
- )
207
-
208
180
clonotypes_names <- sapply(lineage_subset [[" Clone.ID" ]], function (id ) {
209
181
paste0(" ID_" , id )
210
182
})
211
- all_sequences_list <- c(list (germline_trimmed ), as.list(clonotypes_trimmed ))
183
+ all_sequences_list <- c(list (germline_seq ), as.list(lineage_subset [[ " Sequence " ]] ))
212
184
names(all_sequences_list ) <- c(" Germline" , clonotypes_names )
213
185
alignment <- convert_seq_list_to_dnabin(all_sequences_list )
214
186
215
- if (.verbose_output ) {
216
- return (list (
217
- Cluster = cluster_name ,
218
- Germline = germline_seq ,
219
- V.germline.nt = germline_v ,
220
- J.germline.nt = germline_j ,
221
- CDR3.germline.length = germline_cdr3_len ,
222
- Aligned = aligned ,
223
- Alignment = alignment ,
224
- V.length = v_min_len ,
225
- J.length = j_min_len ,
226
- Sequences = sequences
227
- ))
228
- } else {
229
- return (list (
230
- Cluster = cluster_name ,
231
- Germline = germline_seq ,
232
- V.germline.nt = germline_v ,
233
- J.germline.nt = germline_j ,
234
- CDR3.germline.length = germline_cdr3_len ,
235
- Alignment = alignment ,
236
- V.length = v_min_len ,
237
- J.length = j_min_len ,
238
- Sequences = sequences
239
- ))
240
- }
187
+ return (list (
188
+ Cluster = cluster_name ,
189
+ Germline = germline_seq ,
190
+ Alignment = alignment ,
191
+ Sequences = sequences
192
+ ))
241
193
}
242
194
243
- # trim V/J tails in sequence to the specified lenghts v_min, j_min
244
- trim_seq <- function (seq , v_len , v_min , j_len , j_min ) {
245
- str_sub(seq , v_len - v_min + 1 , - (j_len - j_min + 1 ))
246
- }
247
-
248
- convert_results_to_df <- function (nested_results_list , nested_alignments_list ) {
249
- alignments <- nested_alignments_list %> %
250
- lapply(magrittr :: extract2 , " Alignment" ) %> %
251
- tibble(Alignment = . )
195
+ convert_results_to_df <- function (nested_results_list , alignments_list ) {
196
+ alignments <- tibble(Alignment = alignments_list )
252
197
sequences <- nested_results_list %> %
253
198
lapply(magrittr :: extract2 , " Sequences" ) %> %
254
199
tibble(Sequences = . )
255
200
df <- nested_results_list %> %
256
201
lapply(rlist :: list.remove , c(" Alignment" , " Sequences" )) %> %
257
202
purrr :: map_dfr(~ . ) %> %
258
203
cbind(alignments , sequences )
259
- # fix column types after dataframe rebuilding
260
- for (column in c(" CDR3.germline.length" , " V.length" , " J.length" )) {
261
- df [[column ]] %<> % as.integer()
262
- }
263
204
return (df )
264
205
}
265
-
266
- align_sequences <- function (df_row , .verbose_output ) {
267
- if (.verbose_output ) {
268
- aligned <- df_row [[" Aligned" ]]
269
- } else {
270
- aligned <- TRUE
271
- }
272
- if (aligned ) {
273
- df_row [[" Alignment" ]] %<> % ape :: clustal()
274
- }
275
- return (df_row )
276
- }
0 commit comments