forked from Bioconductor/Biostrings
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMultipleAlignment-class.Rd
More file actions
402 lines (361 loc) · 14.4 KB
/
MultipleAlignment-class.Rd
File metadata and controls
402 lines (361 loc) · 14.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
\name{MultipleAlignment-class}
\docType{class}
% Classes:
\alias{class:MultipleAlignment}
\alias{MultipleAlignment-class}
\alias{MultipleAlignment}
\alias{class:DNAMultipleAlignment}
\alias{DNAMultipleAlignment-class}
\alias{DNAMultipleAlignment}
\alias{class:RNAMultipleAlignment}
\alias{RNAMultipleAlignment-class}
\alias{RNAMultipleAlignment}
\alias{class:AAMultipleAlignment}
\alias{AAMultipleAlignment-class}
\alias{AAMultipleAlignment}
% Accessor-like methods:
\alias{unmasked,MultipleAlignment-method}
\alias{rownames,MultipleAlignment-method}
\alias{rownames<-,MultipleAlignment-method}
\alias{rowmask}
\alias{rowmask,MultipleAlignment-method}
\alias{rowmask<-}
\alias{rowmask<-,MultipleAlignment,NULL-method}
\alias{rowmask<-,MultipleAlignment,ANY-method}
\alias{colmask}
\alias{colmask,MultipleAlignment-method}
\alias{colmask<-}
\alias{colmask<-,MultipleAlignment,NULL-method}
\alias{colmask<-,MultipleAlignment,ANY-method}
\alias{maskMotif,MultipleAlignment,ANY-method}
\alias{maskGaps}
\alias{maskGaps,MultipleAlignment-method}
\alias{nrow,MultipleAlignment-method}
\alias{ncol,MultipleAlignment-method}
\alias{dim,MultipleAlignment-method}
\alias{maskednrow}
\alias{maskednrow,MultipleAlignment-method}
\alias{maskedncol}
\alias{maskedncol,MultipleAlignment-method}
\alias{maskeddim}
\alias{maskeddim,MultipleAlignment-method}
\alias{maskedratio,MultipleAlignment-method}
\alias{nchar,MultipleAlignment-method}
\alias{seqtype,MultipleAlignment-method}
% Read functions:
\alias{readDNAMultipleAlignment}
\alias{readRNAMultipleAlignment}
\alias{readAAMultipleAlignment}
% Write functions:
\alias{write.phylip}
% Coercion:
\alias{coerce,MultipleAlignment,DNAStringSet-method}
\alias{coerce,MultipleAlignment,RNAStringSet-method}
\alias{coerce,MultipleAlignment,AAStringSet-method}
\alias{coerce,MultipleAlignment,BStringSet-method}
\alias{coerce,character,DNAMultipleAlignment-method}
\alias{coerce,character,RNAMultipleAlignment-method}
\alias{coerce,character,AAMultipleAlignment-method}
\alias{as.character,MultipleAlignment-method}
\alias{as.matrix,MultipleAlignment-method}
% Utilities:
\alias{consensusMatrix,MultipleAlignment-method}
\alias{consensusString,MultipleAlignment-method}
\alias{consensusString,DNAMultipleAlignment-method}
\alias{consensusString,RNAMultipleAlignment-method}
\alias{consensusString,AAMultipleAlignment-method}
\alias{consensusViews}
\alias{consensusViews,MultipleAlignment-method}
\alias{consensusViews,DNAMultipleAlignment-method}
\alias{consensusViews,RNAMultipleAlignment-method}
\alias{consensusViews,AAMultipleAlignment-method}
\alias{alphabetFrequency,MultipleAlignment-method}
% show style methods:
\alias{show,MultipleAlignment-method}
\alias{detail,MultipleAlignment-method}
\title{MultipleAlignment objects}
\description{
The MultipleAlignment class is a container for storing multiple sequence
alignments.
}
\usage{
## Constructors:
DNAMultipleAlignment(x=character(), start=NA, end=NA, width=NA,
use.names=TRUE, rowmask=NULL, colmask=NULL)
RNAMultipleAlignment(x=character(), start=NA, end=NA, width=NA,
use.names=TRUE, rowmask=NULL, colmask=NULL)
AAMultipleAlignment(x=character(), start=NA, end=NA, width=NA,
use.names=TRUE, rowmask=NULL, colmask=NULL)
## Read functions:
readDNAMultipleAlignment(filepath, format)
readRNAMultipleAlignment(filepath, format)
readAAMultipleAlignment(filepath, format)
## Write functions:
write.phylip(x, filepath)
## ... and more (see below)
}
\arguments{
\item{x}{
Either a character vector (with no NAs), or an \link{XString},
\link{XStringSet} or \link{XStringViews} object containing
strings with the same number of characters. If writing out a Phylip
file, then \code{x} would be a \link{MultipleAlignment} object.
}
\item{start,end,width}{
Either \code{NA}, a single integer, or an integer vector of the same
length as \code{x} specifying how \code{x} should be "narrowed"
(see \code{?\link[IRanges]{narrow}} in the \pkg{IRanges} package for
the details).
}
\item{use.names}{
\code{TRUE} or \code{FALSE}. Should names be preserved?
}
\item{filepath}{
A character vector (of arbitrary length when reading, of length 1
when writing) containing the paths to the files to read or write.
Note that special values like \code{""} or \code{"|cmd"} (typically
supported by other I/O functions in R) are not supported here. Also
\code{filepath} cannot be a connection.
}
\item{format}{
Either \code{"fasta"} (the default), \code{"stockholm"}, \code{"phylip"}, or
\code{"clustal"}.
}
\item{rowmask}{
a NormalIRanges object that will set masking for rows
}
\item{colmask}{
a NormalIRanges object that will set masking for columns
}
}
\details{
The MultipleAlignment class is designed to hold and represent multiple
sequence alignments. The rows and columns within an alignment can be
masked for ad hoc analyses.
}
\section{Accessor methods}{
In the code snippets below, \code{x} is a MultipleAlignment object.
\describe{
\item{\code{unmasked(x)}:}{
The underlying \link{XStringSet} object containing the multiple
sequence alignment.
}
\item{\code{rownames(x)}:}{
\code{NULL} or a character vector of the same length as \code{x}
containing a short user-provided description or comment for each
sequence in \code{x}.
}
\item{\code{rowmask(x)}, \code{rowmask(x, append, invert) <- value}:}{
Gets and sets the \link{NormalIRanges} object representing the
masked rows in \code{x}. The \code{append} argument takes
\code{union}, \code{replace} or \code{intersect} to indicate how
to combine the new \code{value} with \code{rowmask(x)}. The
\code{invert} argument takes a logical argument to indicate
whether or not to invert the new mask. The \code{value} argument
can be of any class that is coercible to a \link{NormalIRanges}
via the \code{as} function.
}
\item{\code{colmask(x)}, \code{colmask(x, append, invert) <- value}:}{
Gets and sets the \link{NormalIRanges} object representing the
masked columns in \code{x}. The \code{append} argument takes
\code{union}, \code{replace} or \code{intersect} to indicate how
to combine the new \code{value} with \code{colmask(x)}. The
\code{invert} argument takes a logical argument to indicate
whether or not to invert the new mask. The \code{value} argument
can be of any class that is coercible to a \link{NormalIRanges}
via the \code{as} function.
}
\item{\code{maskMotif(x, motif, min.block.width=1, ...)}:}{
Returns a MultipleAlignment object with a modified column mask
based upon motifs found in the consensus string where the consensus
string keeps all the columns but drops the masked rows.
\describe{
\item{motif}{The motif to mask.}
\item{min.block.width}{The minimum width of the blocks to mask.}
\item{...}{Additional arguments for \code{matchPattern}.}
}
}
\item{\code{maskGaps(x, min.fraction, min.block.width)}:}{
Returns a MultipleAlignment object with a modified column mask
based upon gaps in the columns. In particular, this mask is defined
by \code{min.block.width} or more consecutive columns that have
\code{min.fraction} or more of their non-masked rows containing
gap codes.
\describe{
\item{min.fraction}{A value in \code{[0, 1]} that indicates
the minimum fraction needed to call a gap in the consensus string
(default is \code{0.5}).}
\item{min.block.width}{A positive integer that indicates the
minimum number of consecutive gaps to mask, as defined by
\code{min.fraction} (default is \code{4}).}
}
}
\item{\code{nrow(x)}:}{
Returns the number of sequences aligned in \code{x}.
}
\item{\code{ncol(x)}:}{
Returns the number of characters for each alignment in \code{x}.
}
\item{\code{dim(x)}:}{
Equivalent to \code{c(nrow(x), ncol(x))}.
}
\item{\code{maskednrow(x)}:}{
Returns the number of masked aligned sequences in \code{x}.
}
\item{\code{maskedncol(x)}:}{
Returns the number of masked aligned characters in \code{x}.
}
\item{\code{maskeddim(x)}:}{
Equivalent to \code{c(maskednrow(x), maskedncol(x))}.
}
\item{\code{maskedratio(x)}:}{
Equivalent to \code{maskeddim(x) / dim(x)}.
}
\item{\code{nchar(x)}:}{
Returns the number of unmasked aligned characters in \code{x},
i.e. \code{ncol(x) - maskedncol(x)}.
}
\item{\code{alphabet(x)}:}{
Equivalent to \code{alphabet(unmasked(x))}.
}
}
}
\section{Coercion}{
In the code snippets below, \code{x} and \code{from} are MultipleAlignment
objects.
\describe{
\item{\code{as(from, "DNAStringSet")}, \code{as(from, "RNAStringSet")},
\code{as(from, "AAStringSet")}, \code{as(from, "BStringSet")}:}{
Creates an instance of the specified \link{XStringSet} object subtype
that contains the unmasked regions of the multiple sequence alignment
in \code{from}.
}
\item{\code{as.character(x, use.names)}:}{
Convert \code{x} to a character vector containing the unmasked
regions of the multiple sequence alignment. \code{use.names}
controls whether or not \code{rownames(x)} should be used to set
the names of the returned vector (default is \code{TRUE}).
}
\item{\code{as.matrix(x, use.names)}:}{
Returns a character matrix containing the "exploded" representation
of the unmasked regions of the multiple sequence alignment.
\code{use.names} controls whether or not \code{rownames(x)} should
be used to set the row names of the returned matrix (default is
\code{TRUE}).
}
}
}
\section{Utilities}{
In the code snippets below, \code{x} is a MultipleAlignment object.
\describe{
\item{\code{consensusMatrix(x, as.prob, baseOnly)}:}{
Creates an integer matrix containing the column frequencies of
the underlying alphabet with masked columns being represented
with \code{NA} values. If \code{as.prob} is \code{TRUE}, then
probabilities are reported, otherwise counts are reported (the
default). If \code{baseOnly} is \code{TRUE}, then the non-base
letters are collapsed into an \code{"other"} category.
}
\item{\code{consensusString(x, ...)}:}{
Creates a consensus string for \code{x} with the symbol \code{"#"}
representing a masked column. See \code{\link{consensusString}}
for details on the arguments.
}
\item{\code{consensusViews(x, ...)}:}{
Similar to the \code{consensusString} method. It returns an
\link{XStringViews} object on the consensus string containing subsequence
contigs of non-masked columns. Unlike the \code{consensusString}
method, the masked columns in the underlying string contain a
consensus value rather than the \code{"#"} symbol.
}
\item{\code{alphabetFrequency(x, as.prob, collapse)}:}{
Creates an integer matrix containing the row frequencies of
the underlying alphabet. If \code{as.prob} is \code{TRUE}, then
probabilities are reported, otherwise counts are reported (the
default). If \code{collapse} is \code{TRUE}, then returns the
overall frequency instead of the frequency by row.
}
\item{\code{detail(x, invertColMask, hideMaskedCols)}:}{ Allows for a full
pager driven display of the object so that masked cols and rows
can be removed and the entire sequence can be visually
inspected. If \code{hideMaskedCols} is set to its default value
of \code{TRUE} then the output will hide all the masked
columns. Otherwise, all columns will be displayed
along with a row to indicate the masking status. If
\code{invertColMask} is \code{TRUE} then any displayed mask will
be flipped so as to represent things in a way consistent with
Phylip style files instead of the mask that is actually stored in
the \code{MultipleAlignment} object. Please notice that
\code{invertColMask} will be ignored if \code{hideMaskedCols} is
set to its default value of \code{TRUE} since in that case it will
not make sense to show any masking information in the output.
Masked rows are always hidden in the output.
}
}
}
\section{Display}{
The letters in a DNAMultipleAlignment or RNAMultipleAlignment object
are colored when displayed by the \code{show()} method. Set global
option \code{Biostrings.coloring} to \code{FALSE} to turn off this coloring.
}
\author{P. Aboyoun and M. Carlson}
\seealso{
\link{XStringSet-class},
\link{MaskedXString-class}
}
\examples{
## create an object from file
origMAlign <-
readDNAMultipleAlignment(filepath =
system.file("extdata",
"msx2_mRNA.aln",
package="Biostrings"),
format="clustal")
## list the names of the sequences in the alignment
rownames(origMAlign)
## rename the sequences to be the underlying species for MSX2
rownames(origMAlign) <- c("Human","Chimp","Cow","Mouse","Rat",
"Dog","Chicken","Salmon")
origMAlign
## See a detailed pager view
if (interactive()) {
detail(origMAlign)
}
## operations to mask rows
## For columns, just use colmask() and do the same kinds of operations
rowMasked <- origMAlign
rowmask(rowMasked) <- IRanges(start=1,end=3)
rowMasked
## remove row masks
rowmask(rowMasked) <- NULL
rowMasked
## "select" rows of interest
rowmask(rowMasked, invert=TRUE) <- IRanges(start=4,end=7)
rowMasked
## or mask the rows that intersect with masked rows
rowmask(rowMasked, append="intersect") <- IRanges(start=1,end=5)
rowMasked
## TATA-masked
tataMasked <- maskMotif(origMAlign, "TATA")
colmask(tataMasked)
## automatically mask columns based on consecutive gaps
autoMasked <- maskGaps(origMAlign, min.fraction=0.5, min.block.width=4)
colmask(autoMasked)
autoMasked
## calculate frequencies
alphabetFrequency(autoMasked)
consensusMatrix(autoMasked, baseOnly=TRUE)[, 84:90]
## get consensus values
consensusString(autoMasked)
consensusViews(autoMasked)
## cluster the masked alignments
library(pwalign)
sdist <- pwalign::stringDist(as(autoMasked,"DNAStringSet"), method="hamming")
clust <- hclust(sdist, method = "single")
plot(clust)
fourgroups <- cutree(clust, 4)
fourgroups
## write out the alignment object (with current masks) to Phylip format
write.phylip(x = autoMasked, filepath = tempfile("foo.txt",tempdir()))
}
\keyword{methods}
\keyword{classes}