Skip to content

Commit 5461911

Browse files
committed
introduce umiDelimiter, rcPrefix, and normalizeRcUmis options; make changes from PR comments
1 parent c575dfc commit 5461911

File tree

4 files changed

+86
-31
lines changed

4 files changed

+86
-31
lines changed

src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala

+23-10
Original file line numberDiff line numberDiff line change
@@ -36,22 +36,30 @@ import com.fulcrumgenomics.util.{Io, ProgressLogger}
3636
"""
3737
|Copies the UMI at the end of the BAM's read name to the RX tag.
3838
|
39-
|The read name is split on `:` characters with the last field is assumed to be the UMI sequence. The UMI
39+
|The read name is split on `:` characters with the last field assumed to be the UMI sequence. The UMI
4040
|will be copied to the `RX` tag as per the SAM specification. If any read does not have a UMI composed of
4141
|valid bases (ACGTN), the program will report the error and fail.
4242
|
43-
|If a read name contains multiple UMIs they may be delimited by either hyphens (`-`) or pluses (`+`). The
44-
|resulting UMI in the `RX` tag will always be hyphen delimited.
43+
|If a read name contains multiple UMIs they may be delimited (typically by a hyphen (`-`) or plus (`+`)).
44+
|The `--umi-delimiter` option specifies the delimiter on which to split. The resulting UMI in the `RX` tag
45+
|will always be hyphen delimited.
46+
|
47+
|Some tools (e.g. BCL Convert) may reverse-complement UMIs on R2 and add a prefix to indicate that the sequence
48+
|has been reverse-complemented. The `--rc-prefix` option specifies the prefix character(s) and causes them to
49+
|be removed. Additionally, if the `--normalize-rc-umis` flag is specified, any reverse-complemented UMIs will
50+
|be normalized (i.e., reverse-complemented back to be in the forward orientation).
4551
|
4652
|To obtain behavior similar to `umi_tools`' `--umi-separator=":r"`, specify the delimiter and
47-
|prefix separately, i.e. `--umi-delimiter=":"` and `--umi-prefix="r"`.
53+
|prefix separately, i.e. `--field-delimiter=":"` and `--rc-prefix="r"`.
4854
""")
4955
class CopyUmiFromReadName
50-
( @arg(flag='i', doc="The input BAM file") input: PathToBam,
51-
@arg(flag='o', doc="The output BAM file") output: PathToBam,
52-
@arg(doc="Remove the UMI from the read name") removeUmi: Boolean = false,
53-
@arg(doc="Delimiter between the read name and UMI.") umiDelimiter: Char = ':',
54-
@arg(doc="Any characters preceding the UMI sequence in the read name.") umiPrefix: Option[String] = None,
56+
( @arg(flag='i', doc="The input BAM file.") input: PathToBam,
57+
@arg(flag='o', doc="The output BAM file.") output: PathToBam,
58+
@arg(doc="Remove the UMI from the read name.") removeUmi: Boolean = false,
59+
@arg(doc="Delimiter between the read name and UMI.") fieldDelimiter: Char = ':',
60+
@arg(doc="Delimiter between UMI sequences.") umiDelimiter: Char = '+',
61+
@arg(doc="The prefix to a UMI sequence that indicates it is reverse-complemented.") rcPrefix: Option[String] = None,
62+
@arg(doc="Whether to reverse-complement UMI sequences with the '--rc-prefix'.") normalizeRcUmis: Boolean = false,
5563
) extends FgBioTool with LazyLogging {
5664

5765
Io.assertReadable(input)
@@ -63,7 +71,12 @@ class CopyUmiFromReadName
6371
val progress = new ProgressLogger(logger)
6472
source.foreach { rec =>
6573
progress.record(rec)
66-
writer += Umis.copyUmiFromReadName(rec=rec, removeUmi=removeUmi, delimiter=umiDelimiter, prefix=umiPrefix)
74+
writer += Umis.copyUmiFromReadName(rec=rec,
75+
removeUmi=removeUmi,
76+
fieldDelimiter=fieldDelimiter,
77+
umiDelimiter=umiDelimiter,
78+
rcPrefix=rcPrefix,
79+
normalizeRcUmis=normalizeRcUmis)
6780
}
6881
progress.logLast()
6982
source.safelyClose()

src/main/scala/com/fulcrumgenomics/umi/Umis.scala

+41-13
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
package com.fulcrumgenomics.umi
2727

2828
import com.fulcrumgenomics.bam.api.SamRecord
29+
import com.fulcrumgenomics.util.Sequences
2930

3031
object Umis {
3132

@@ -38,17 +39,30 @@ object Umis {
3839
*
3940
* @param rec the record to modify
4041
* @param removeUmi true to remove the UMI from the read name, otherwise only copy the UMI to the tag
41-
* @param delimiter the delimiter of fields within the read name
42+
* @param fieldDelimiter the delimiter of fields within the read name
43+
* @param umiDelimiter the delimiter between sequences in the UMI string
44+
* @param rcPrefix the prefix of a UMI that indicates it is reverse-complemented
45+
* @param normalizeRcUmis whether to normalize reverse-complemented UMIs
4246
* @return the modified record
4347
*/
44-
def copyUmiFromReadName(rec: SamRecord, removeUmi: Boolean = false, delimiter: Char = ':', prefix: Option[String] = None): SamRecord = {
48+
def copyUmiFromReadName(rec: SamRecord,
49+
removeUmi: Boolean = false,
50+
fieldDelimiter: Char = ':',
51+
umiDelimiter: Char = '+',
52+
rcPrefix: Option[String] = None,
53+
normalizeRcUmis: Boolean = false): SamRecord = {
4554
// Extract and set the UMI
46-
val umi = extractUmisFromReadName(rec.name, delimiter, strict=false, prefix=prefix)
55+
val umi = extractUmisFromReadName(rec.name,
56+
fieldDelimiter,
57+
strict=false,
58+
umiDelimiter,
59+
rcPrefix=rcPrefix,
60+
normalizeRcUmis=normalizeRcUmis)
4761
require(umi.nonEmpty, f"No valid UMI found in: ${rec.name}")
4862
umi.foreach(u => rec(ConsensusTags.UmiBases) = u)
4963

5064
// Remove the UMI from the read name if requested
51-
if (removeUmi) rec.name = rec.name.substring(0, rec.name.lastIndexOf(delimiter))
65+
if (removeUmi) rec.name = rec.name.substring(0, rec.name.lastIndexOf(fieldDelimiter))
5266

5367
rec
5468
}
@@ -59,29 +73,43 @@ object Umis {
5973
*
6074
* See https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm
6175
* The UMI field is optional, so read names may or may not contain it. Illumina also specifies that the UMI
62-
* field may contain multiple UMIs, in which case they will delimit them with `+` characters. Pluses will be
63-
* translated to hyphens before returning.
76+
* field may contain multiple UMIs, in which case they are delimited by the `umiDelimiter` character (`+` by
77+
* default), which will be translated to hyphens before returning.
6478
*
6579
* If `strict` is true the name _must_ contain either 7 or 8 `fieldDelimiter`-separated segments,
6680
* with the UMI being the last in the case of 8 and `None` in the case of 7.
6781
*
6882
* If `strict` is false the last segment is returned so long as it appears to be a valid UMI.
6983
*/
70-
def extractUmisFromReadName(name: String, delimiter: Char = ':', strict: Boolean, prefix: Option[String] = None): Option[String] = {
84+
def extractUmisFromReadName(name: String,
85+
fieldDelimiter: Char = ':',
86+
strict: Boolean,
87+
umiDelimiter: Char = '+',
88+
rcPrefix: Option[String] = None,
89+
normalizeRcUmis: Boolean = false): Option[String] = {
7190
// If strict, check that the read name actually has eight parts, which is expected
7291
val rawUmi = if (strict) {
73-
val colons = name.count(_ == delimiter)
92+
val colons = name.count(_ == fieldDelimiter)
7493
if (colons == 6) None
75-
else if (colons == 7) Some(name.substring(name.lastIndexOf(delimiter) + 1, name.length))
94+
else if (colons == 7) Some(name.substring(name.lastIndexOf(fieldDelimiter) + 1, name.length))
7695
else throw new IllegalArgumentException(s"Trying to extract UMI from read with ${colons + 1} parts (7-8 expected): ${name}")
7796
} else {
78-
val idx = name.lastIndexOf(delimiter)
79-
require(idx != -1, s"Read did not have multiple '${delimiter}'-separated fields: ${name}")
97+
val idx = name.lastIndexOf(fieldDelimiter)
98+
require(idx != -1, s"Read did not have multiple '${fieldDelimiter}'-separated fields: ${name}")
8099
Some(name.substring(idx + 1, name.length))
81100
}
82101

83-
val umiSeq = rawUmi.map(seq => (if (prefix.isEmpty) seq else seq.stripPrefix(prefix.get)))
84-
val umi = umiSeq.map(raw => (if (raw.indexOf('+') > 0) raw.replace('+', '-') else raw).toUpperCase)
102+
var umi = rawUmi.map(raw => rcPrefix match {
103+
case Some(prefix) if raw.indexOf(prefix) >= 0 && normalizeRcUmis =>
104+
raw.split(umiDelimiter).map(seq =>
105+
(if (seq.startsWith(prefix)) Sequences.revcomp(seq.stripPrefix(prefix)) else seq).toUpperCase
106+
).mkString("-")
107+
case Some(prefix) if raw.indexOf(prefix) >= 0 =>
108+
raw.replace(prefix, "").replace(umiDelimiter, '-').toUpperCase
109+
case _ if raw.indexOf(umiDelimiter) > 0 => raw.replace(umiDelimiter, '-').toUpperCase
110+
case _ => raw.toUpperCase
111+
})
112+
85113
val valid = umi.forall(u => u.forall(isValidUmiCharacter))
86114

87115
if (strict && !valid) throw new IllegalArgumentException(s"Invalid UMI '${umi.get}' extracted from name '${name}")

src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala

+19-5
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,21 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues {
3333
private case class Result(name: String, umi: String)
3434

3535
/** Runs CopyUmiFromReadName using the given read names returning the output read names and UMIs. */
36-
private def run(names: Iterable[String], removeUmi: Boolean, umiPrefix: Option[String] = None): IndexedSeq[Result] = {
36+
private def run(names: Iterable[String],
37+
removeUmi: Boolean,
38+
rcPrefix: Option[String] = None,
39+
normalizeRcUmis: Boolean = false): IndexedSeq[Result] = {
3740
// build the reads
3841
val builder = new SamBuilder()
3942
names.foreach { name => builder.addFrag(name=name, unmapped=true) }
4043

4144
// run the tool
4245
val out = makeTempFile("test.", ".bam")
43-
val tool = new CopyUmiFromReadName(input=builder.toTempFile(), output=out, removeUmi=removeUmi, umiPrefix=umiPrefix)
46+
val tool = new CopyUmiFromReadName(input=builder.toTempFile(),
47+
output=out,
48+
removeUmi=removeUmi,
49+
rcPrefix=rcPrefix,
50+
normalizeRcUmis=normalizeRcUmis)
4451
executeFgbioTool(tool)
4552

4653
// slurp the results
@@ -70,10 +77,17 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues {
7077
results.map(_.umi) should contain theSameElementsInOrderAs Seq("AAAA", "CCCC", "GGGG", "AAAA-CCCC")
7178
}
7279

73-
it should "remove any additional separator characters preceding the UMI" in {
74-
val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA-CCCC")
75-
val results = run(names=names, removeUmi=true, umiPrefix=Some("r"))
80+
it should "remove a reverse-complement prefix to the UMI" in {
81+
val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA+CCCC")
82+
val results = run(names=names, removeUmi=true, rcPrefix=Some("r"), normalizeRcUmis = false)
7683
results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah")
7784
results.map(_.umi) should contain theSameElementsInOrderAs Seq("AAAA", "CCCC", "GGGG", "AAAA-CCCC")
7885
}
86+
87+
it should "remove a reverse-complement prefix to the UMI and reverse-complement the UMI when '--normalize-rc-umis'" in {
88+
val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA+CCCC")
89+
val results = run(names=names, removeUmi=true, rcPrefix=Some("r"), normalizeRcUmis = true)
90+
results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah")
91+
results.map(_.umi) should contain theSameElementsInOrderAs Seq("TTTT", "GGGG", "CCCC", "TTTT-CCCC")
92+
}
7993
}

src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala

+3-3
Original file line numberDiff line numberDiff line change
@@ -92,9 +92,9 @@ class UmisTest extends UnitSpec with OptionValues {
9292
}
9393

9494
it should "split on a different name delimiter if specified" in {
95-
copyUmiFromReadName(rec=rec("UMI-A"), delimiter='-').nameAndUmi shouldBe ("UMI-A", "A")
96-
copyUmiFromReadName(rec=rec("UMI-C-A"), delimiter='-').nameAndUmi shouldBe ("UMI-C-A", "A")
97-
copyUmiFromReadName(rec=rec("UMI-C-ACC+GGT"), delimiter='-').nameAndUmi shouldBe ("UMI-C-ACC+GGT", "ACC-GGT")
95+
copyUmiFromReadName(rec=rec("UMI-A"), fieldDelimiter='-').nameAndUmi shouldBe ("UMI-A", "A")
96+
copyUmiFromReadName(rec=rec("UMI-C-A"), fieldDelimiter='-').nameAndUmi shouldBe ("UMI-C-A", "A")
97+
copyUmiFromReadName(rec=rec("UMI-C-ACC+GGT"), fieldDelimiter='-').nameAndUmi shouldBe ("UMI-C-ACC+GGT", "ACC-GGT")
9898
}
9999

100100
it should "change the UMI delimiter from + to -" in {

0 commit comments

Comments
 (0)