26
26
package com .fulcrumgenomics .umi
27
27
28
28
import com .fulcrumgenomics .bam .api .SamRecord
29
+ import com .fulcrumgenomics .util .Sequences
29
30
30
31
object Umis {
31
32
@@ -38,17 +39,30 @@ object Umis {
38
39
*
39
40
* @param rec the record to modify
40
41
* @param removeUmi true to remove the UMI from the read name, otherwise only copy the UMI to the tag
41
- * @param delimiter the delimiter of fields within the read name
42
+ * @param fieldDelimiter the delimiter of fields within the read name
43
+ * @param umiDelimiter the delimiter between sequences in the UMI string
44
+ * @param rcPrefix the prefix of a UMI that indicates it is reverse-complemented
45
+ * @param normalizeRcUmis whether to normalize reverse-complemented UMIs
42
46
* @return the modified record
43
47
*/
44
def copyUmiFromReadName(rec: SamRecord,
                        removeUmi: Boolean = false,
                        fieldDelimiter: Char = ':',
                        umiDelimiter: Char = '+',
                        rcPrefix: Option[String] = None,
                        normalizeRcUmis: Boolean = false): SamRecord = {
  // Pull the UMI out of the last read-name field; non-strict mode is always used here.
  val extracted = extractUmisFromReadName(
    name            = rec.name,
    fieldDelimiter  = fieldDelimiter,
    strict          = false,
    umiDelimiter    = umiDelimiter,
    rcPrefix        = rcPrefix,
    normalizeRcUmis = normalizeRcUmis
  )

  // Fail if nothing usable was extracted, otherwise store the UMI in the UMI-bases tag.
  require(extracted.nonEmpty, f"No valid UMI found in: ${rec.name}")
  extracted.foreach { bases => rec(ConsensusTags.UmiBases) = bases }

  // Optionally drop the trailing UMI field (and its delimiter) from the read name.
  if (removeUmi) {
    val readName = rec.name
    rec.name = readName.substring(0, readName.lastIndexOf(fieldDelimiter))
  }

  rec
}
@@ -59,29 +73,43 @@ object Umis {
59
73
*
60
74
* See https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm
61
75
* The UMI field is optional, so read names may or may not contain it. Illumina also specifies that the UMI
62
- * field may contain multiple UMIs, in which case they will delimit them with `+ ` characters. Pluses will be
63
- * translated to hyphens before returning.
76
+ * field may contain multiple UMIs, in which case they will delimit them with `umiDelimiter ` characters, which
77
+ * will be translated to hyphens before returning.
64
78
*
65
79
* If `strict` is true the name _must_ contain either 7 or 8 `fieldDelimiter`-separated segments,
66
80
* with the UMI being the last in the case of 8 and `None` in the case of 7.
67
81
*
68
82
* If `strict` is false the last segment is returned so long as it appears to be a valid UMI.
69
83
*/
70
- def extractUmisFromReadName (name : String , delimiter : Char = ':' , strict : Boolean , prefix : Option [String ] = None ): Option [String ] = {
84
+ def extractUmisFromReadName (name : String ,
85
+ fieldDelimiter : Char = ':' ,
86
+ strict : Boolean ,
87
+ umiDelimiter : Char = '+' ,
88
+ rcPrefix : Option [String ] = None ,
89
+ normalizeRcUmis : Boolean = false ): Option [String ] = {
71
90
// If strict, check that the read name actually has eight parts, which is expected
72
91
val rawUmi = if (strict) {
73
- val colons = name.count(_ == delimiter )
92
+ val colons = name.count(_ == fieldDelimiter )
74
93
if (colons == 6 ) None
75
- else if (colons == 7 ) Some (name.substring(name.lastIndexOf(delimiter ) + 1 , name.length))
94
+ else if (colons == 7 ) Some (name.substring(name.lastIndexOf(fieldDelimiter ) + 1 , name.length))
76
95
else throw new IllegalArgumentException (s " Trying to extract UMI from read with ${colons + 1 } parts (7-8 expected): ${name}" )
77
96
} else {
78
- val idx = name.lastIndexOf(delimiter )
79
- require(idx != - 1 , s " Read did not have multiple ' ${delimiter }'-separated fields: ${name}" )
97
+ val idx = name.lastIndexOf(fieldDelimiter )
98
+ require(idx != - 1 , s " Read did not have multiple ' ${fieldDelimiter }'-separated fields: ${name}" )
80
99
Some (name.substring(idx + 1 , name.length))
81
100
}
82
101
83
- val umiSeq = rawUmi.map(seq => (if (prefix.isEmpty) seq else seq.stripPrefix(prefix.get)))
84
- val umi = umiSeq.map(raw => (if (raw.indexOf('+' ) > 0 ) raw.replace('+' , '-' ) else raw).toUpperCase)
102
+ var umi = rawUmi.map(raw => rcPrefix match {
103
+ case Some (prefix) if raw.indexOf(prefix) >= 0 && normalizeRcUmis =>
104
+ raw.split(umiDelimiter).map(seq =>
105
+ (if (seq.startsWith(prefix)) Sequences .revcomp(seq.stripPrefix(prefix)) else seq).toUpperCase
106
+ ).mkString(" -" )
107
+ case Some (prefix) if raw.indexOf(prefix) >= 0 =>
108
+ raw.replace(prefix, " " ).replace(umiDelimiter, '-' ).toUpperCase
109
+ case _ if raw.indexOf(umiDelimiter) > 0 => raw.replace(umiDelimiter, '-' ).toUpperCase
110
+ case _ => raw.toUpperCase
111
+ })
112
+
85
113
val valid = umi.forall(u => u.forall(isValidUmiCharacter))
86
114
87
115
if (strict && ! valid) throw new IllegalArgumentException (s " Invalid UMI ' ${umi.get}' extracted from name ' ${name}" )
0 commit comments