Skip to content

Commit a316211

Browse files
committed
feat: FastqToBam can extract UMI(s) from the comment in the read name
1 parent ba0788e commit a316211

File tree

4 files changed

+122
-9
lines changed

4 files changed

+122
-9
lines changed

src/main/scala/com/fulcrumgenomics/fastq/FastqToBam.scala

+15-4
Original file line numberDiff line numberDiff line change
@@ -71,12 +71,15 @@ import htsjdk.samtools.{ReservedTagConstants, SAMFileHeader, SAMReadGroupRecord}
7171
|For more information on read structures see the
7272
|[Read Structure Wiki Page](https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures)
7373
|
74-
|UMIs may be extracted from the read sequences, the read names, or both. If `--extract-umis-from-read-names` is
74+
|UMIs may be extracted from the read sequences, the read names (or comment), or both. If `--extract-umis-from-read-names` is
7575
|specified, any UMIs present in the read names are extracted; read names are expected to be `:`-separated with
7676
|any UMIs present in the 8th field. If this option is specified, the `--umi-qual-tag` option may not be used as
7777
|qualities are not available for UMIs in the read name. If UMI segments are present in the read structures those
7878
|will also be extracted. If UMIs are present in both, the final UMIs are constructed by first taking the UMIs
79-
|from the read names, then adding a hyphen, then the UMIs extracted from the reads.
79+
|from the read names, then adding a hyphen, then the UMIs extracted from the reads. If `--extract-umis-from-read-comment` is
80+
|specified, any UMIs present in the read name comments are extracted; the read name comment is the text _after_
81+
|the first white space in the read name (like a FASTA). If the comment is `:`-separated, then the UMI will be
82+
|extracted from the last field, otherwise the full comment will be used.
8083
|
8184
|The same number of input files and read structures must be provided, with one exception: if supplying exactly
8285
|1 or 2 fastq files, both of which are solely template reads, no read structures need be provided.
@@ -93,7 +96,10 @@ class FastqToBam
9396
@arg(flag='u', doc="Tag in which to store molecular barcodes/UMIs.") val umiTag: String = ConsensusTags.UmiBases,
9497
@arg(flag='q', doc="Tag in which to store molecular barcode/UMI qualities.") val umiQualTag: Option[String] = None,
9598
@arg(flag='Q', doc="Store the sample barcode qualities in the QT Tag.") val storeSampleBarcodeQualities: Boolean = false,
96-
@arg(flag='n', doc="Extract UMI(s) from read names and prepend to UMIs from reads.") val extractUmisFromReadNames: Boolean = false,
99+
@arg(flag='n', doc="Extract UMI(s) from read names and prepend to UMIs from reads.", mutex=Array("extractUmisFromReadComment"))
100+
val extractUmisFromReadNames: Boolean = false,
101+
@arg(flag='c', doc="Extract UMI(s) from read name comment and prepend to UMIs from reads.", mutex=Array("extractUmisFromReadNames"))
102+
val extractUmisFromReadComment: Boolean = false,
97103
@arg( doc="Read group ID to use in the file header.") val readGroupId: String = "A",
98104
@arg( doc="The name of the sequenced sample.") val sample: String,
99105
@arg( doc="The name/ID of the sequenced library.") val library: String,
@@ -117,6 +123,7 @@ class FastqToBam
117123
validate(input.length == actualReadStructures.length, "input and read-structure must be supplied the same number of times.")
118124
validate(1 to 2 contains actualReadStructures.flatMap(_.templateSegments).size, "read structures must contain 1-2 template reads total.")
119125
validate(!extractUmisFromReadNames || umiQualTag.isEmpty, "Cannot extract UMI qualities when also extracting UMI from read names.")
126+
validate(!extractUmisFromReadComment || umiQualTag.isEmpty, "Cannot extract UMI qualities when also extracting UMI from read description.")
120127

121128
override def execute(): Unit = {
122129
val encoding = qualityEncoding
@@ -166,7 +173,11 @@ class FastqToBam
166173
val templates = subs.iterator.filter(_.kind == Template).toList
167174

168175
// If requested, pull out the UMI(s) from the read name
169-
val umiFromReadName = if (extractUmisFromReadNames) Umis.extractUmisFromReadName(fqs.head.name, strict=true) else None
176+
val umiFromReadName = {
177+
if (extractUmisFromReadNames) Umis.extractUmisFromReadName(fqs.head.name, strict=true)
178+
else if (extractUmisFromReadComment) fqs.head.comment.flatMap(comment => Umis.extractUmisFromReadComment(comment, strict=true))
179+
else None
180+
}
170181

171182
templates.zipWithIndex.map { case (read, index) =>
172183
// If the template read had no bases, we'll substitute in a single N @ Q2 below to keep htsjdk happy

src/main/scala/com/fulcrumgenomics/umi/Umis.scala

+38-2
Original file line numberDiff line numberDiff line change
@@ -55,15 +55,15 @@ object Umis {
5555

5656
/**
5757
* Extracts the UMI from an Illumina fastq style read name. Illumina documents their FASTQ read names as:
58-
* @<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index>
58+
* `@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index>``
5959
*
6060
* See https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm
6161
* The UMI field is optional, so read names may or may not contain it. Illumina also specifies that the UMI
6262
* field may contain multiple UMIs, in which case they will delimit them with `+` characters. Pluses will be
6363
* translated to hyphens before returning.
6464
*
6565
* If `strict` is true the name _must_ contain either 7 or 8 colon-separated segments,
66-
with the UMI being the last in the case of 8 and `None` in the case of 7.
66+
* with the UMI being the last in the case of 8 and `None` in the case of 7.
6767
*
6868
* If `strict` is false the last segment is returned so long as it appears to be a valid UMI.
6969
*/
@@ -88,6 +88,42 @@ object Umis {
8888
else umi
8989
}
9090

91+
/**
92+
* Extracts the UMI from an Illumina fastq style read name. Illumina documents their FASTQ read names as:
93+
* `@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index>``
94+
*
95+
* The index in the description may be replaced with a UMI field.
96+
*
97+
* See https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm
98+
* The UMI field is optional, so read names may or may not contain it. Illumina also specifies that the UMI
99+
* field may contain multiple UMIs, in which case they will delimit them with `+` characters. Pluses will be
100+
* translated to hyphens before returning.
101+
*
102+
* If `strict` is true the comment _must_ contain either 4 colon-separated segments,
103+
* with the UMI being the last in the case of 4.
104+
*
105+
* If `strict` is false the last segment is returned so long as it appears to be a valid UMI.
106+
*/
107+
def extractUmisFromReadComment(comment: String, delimiter: Char = ':', strict: Boolean): Option[String] = {
108+
// If strict, check that the read name actually has eight parts, which is expected
109+
val rawUmi = if (strict) {
110+
val colons = comment.count(_ == delimiter)
111+
if (colons == 3) Some(comment.substring(comment.lastIndexOf(delimiter) + 1, comment.length))
112+
else throw new IllegalArgumentException(s"Trying to extract UMI from read comment with ${colons + 1} parts (4 expected): ${comment}")
113+
} else {
114+
val idx = comment.lastIndexOf(delimiter)
115+
if (idx == -1) Some(comment)
116+
else Some(comment.substring(idx + 1, comment.length))
117+
}
118+
119+
val umi = rawUmi.map(raw => (if (raw.indexOf('+') > 0) raw.replace('+', '-') else raw).toUpperCase)
120+
val valid = umi.forall(u => u.forall(isValidUmiCharacter))
121+
122+
if (strict && !valid) throw new IllegalArgumentException(s"Invalid UMI '${umi.get}' extracted from name '${comment}")
123+
else if (!valid) None
124+
else umi
125+
}
126+
91127
@inline private def isValidUmiCharacter(ch: Char): Boolean = {
92128
ch == 'A' || ch == 'C' || ch == 'G' || ch == 'T' || ch == 'N' || ch == '-'
93129
}

src/test/scala/com/fulcrumgenomics/fastq/FastqToBamTest.scala

+31
Original file line numberDiff line numberDiff line change
@@ -383,6 +383,37 @@ class FastqToBamTest extends UnitSpec {
383383
recs(1).apply[String]("RX") shouldBe "TTGA-TAAT-TA-AA"
384384
}
385385

386+
387+
it should "extract UMIs from the comment in read names when requested" in {
388+
val r1 = fq(
389+
FastqRecord("q1:2:3:4:5:6:7:8", "AAAAAAAAAA", "==========", comment=Some("0:1:2:ACGT")),
390+
FastqRecord("q2:2:3:4:5:6:7:8", "TAAAAAAAAA", "==========", comment=Some("0:1:2:TTGA")),
391+
FastqRecord("q3:2:3:4:5:6:7", "TAAAAAAAAA", "=========="),
392+
)
393+
val bam = makeTempFile("fastqToBamTest.", ".bam")
394+
new FastqToBam(input=Seq(r1), output=bam, sample="s", library="l", extractUmisFromReadComment=true).execute()
395+
val recs = readBamRecs(bam)
396+
recs should have size 3
397+
recs(0).apply[String]("RX") shouldBe "ACGT"
398+
recs(1).apply[String]("RX") shouldBe "TTGA"
399+
recs(2).get[String]("RX") shouldBe None
400+
}
401+
402+
it should "extract UMIs from the comment in read names and read sequences" in {
403+
val r1 = fq(
404+
FastqRecord("q1:2:3:4:5:6:7:8", "GGNCCGAAAAAAA", "=============", comment=Some("0:1:2:ACGT+CGTA")),
405+
FastqRecord("q2:2:3:4:5:6:7:8", "TANAACAAAAAAA", "=============", comment=Some("0:1:2:TTGA+TAAT")),
406+
)
407+
val rs = ReadStructure("2M1S2M+T")
408+
val bam = makeTempFile("fastqToBamTest.", ".bam")
409+
new FastqToBam(input=Seq(r1), readStructures=Seq(rs), output=bam, sample="s", library="l", extractUmisFromReadComment=true).execute()
410+
val recs = readBamRecs(bam)
411+
recs should have size 2
412+
recs(0).apply[String]("RX") shouldBe "ACGT-CGTA-GG-CC"
413+
recs(1).apply[String]("RX") shouldBe "TTGA-TAAT-TA-AA"
414+
}
415+
416+
386417
it should "fail when read names don't match up" in {
387418
val r1 = fq(FastqRecord("q1", "AAAAAAAAAA", "=========="))
388419
val r2 = fq(FastqRecord("x1", "CCCCCCCCCC", "##########"))

src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala

+38-3
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ class UmisTest extends UnitSpec with OptionValues {
4343
Umis.extractUmisFromReadName("1:2:3:4:5:6:7:ACGTACGT", strict=true).value shouldBe "ACGTACGT"
4444
}
4545

46-
it should "return None if the read name has only 7 parts in strict mode" in {
46+
it should "return None if the read has only 7 parts in strict mode" in {
4747
Umis.extractUmisFromReadName("1:2:3:4:5:6:7", strict=true) shouldBe None
4848
}
4949

@@ -78,6 +78,41 @@ class UmisTest extends UnitSpec with OptionValues {
7878
Umis.extractUmisFromReadName("1:2:3:4:5:6:7:8", strict=false) shouldBe None
7979
}
8080

81+
"Umis.extractUmisFromReadComment" should "extract a UMI from a well-formed read name" in {
82+
Umis.extractUmisFromReadComment("0:1:2:ACGTACGT", strict=true).value shouldBe "ACGTACGT"
83+
}
84+
85+
it should "throw an exception in strict mode if the read has too many or too few segments" in {
86+
an[Exception] shouldBe thrownBy { Umis.extractUmisFromReadComment("0:1:ACGTACGT", strict=true) }
87+
an[Exception] shouldBe thrownBy { Umis.extractUmisFromReadComment("0:1:2:3:ACGTACGT", strict=true) }
88+
}
89+
90+
it should "translate pluses to hyphens when multiple UMIs are present" in {
91+
Umis.extractUmisFromReadComment("0:1:2:ACGTACGT+TTGCGGCT", strict=true).value shouldBe "ACGTACGT-TTGCGGCT"
92+
}
93+
94+
it should "extract a UMI from the last segment in non-strict mode, if it looks like a UMI" in {
95+
Umis.extractUmisFromReadComment("ACGT", strict=false).value shouldBe "ACGT"
96+
Umis.extractUmisFromReadComment("1:ACGT", strict=false).value shouldBe "ACGT"
97+
Umis.extractUmisFromReadComment("1:2:ACGT", strict=false).value shouldBe "ACGT"
98+
Umis.extractUmisFromReadComment("1:2:3:ACGT", strict=false).value shouldBe "ACGT"
99+
Umis.extractUmisFromReadComment("1:2:3:4:ACGT", strict=false).value shouldBe "ACGT"
100+
Umis.extractUmisFromReadComment("1:2:3:4:5:ACGT", strict=false).value shouldBe "ACGT"
101+
Umis.extractUmisFromReadComment("1:2:3:4:5:6:ACGT", strict=false).value shouldBe "ACGT"
102+
Umis.extractUmisFromReadComment("1:2:3:4:5:6:7:ACGT", strict=false).value shouldBe "ACGT"
103+
Umis.extractUmisFromReadComment("1:2:3:4:5:6:7:8:ACGT", strict=false).value shouldBe "ACGT"
104+
Umis.extractUmisFromReadComment("1:2:3:4:5:6:7:8:9:ACGT", strict=false).value shouldBe "ACGT"
105+
}
106+
107+
it should "return None in non-strict mode if the last segment doesn't look like a UMI" in {
108+
Umis.extractUmisFromReadComment("1:2", strict=false) shouldBe None
109+
Umis.extractUmisFromReadComment("1:2:3", strict=false) shouldBe None
110+
Umis.extractUmisFromReadComment("1:2:3:4", strict=false) shouldBe None
111+
Umis.extractUmisFromReadComment("1:2:3:4:5", strict=false) shouldBe None
112+
Umis.extractUmisFromReadComment("1:2:3:4:5:6", strict=false) shouldBe None
113+
Umis.extractUmisFromReadComment("1:2:3:4:5:6:7", strict=false) shouldBe None
114+
Umis.extractUmisFromReadComment("1:2:3:4:5:6:7:8", strict=false) shouldBe None
115+
}
81116

82117
"Umis.copyUmiFromReadName" should "copy the UMI from the read name" in {
83118
copyUmiFromReadName(rec=rec("UMI:A")).nameAndUmi shouldBe ("UMI:A", "A")
@@ -91,7 +126,7 @@ class UmisTest extends UnitSpec with OptionValues {
91126
copyUmiFromReadName(rec=rec("UMI:C:ACC+GGT"), removeUmi=true).nameAndUmi shouldBe ("UMI:C", "ACC-GGT")
92127
}
93128

94-
it should "split on a different name delimiter if specified" in {
129+
it should "split on a different delimiter if specified" in {
95130
copyUmiFromReadName(rec=rec("UMI-A"), delimiter='-').nameAndUmi shouldBe ("UMI-A", "A")
96131
copyUmiFromReadName(rec=rec("UMI-C-A"), delimiter='-').nameAndUmi shouldBe ("UMI-C-A", "A")
97132
copyUmiFromReadName(rec=rec("UMI-C-ACC+GGT"), delimiter='-').nameAndUmi shouldBe ("UMI-C-ACC+GGT", "ACC-GGT")
@@ -101,7 +136,7 @@ class UmisTest extends UnitSpec with OptionValues {
101136
copyUmiFromReadName(rec=rec("UMI:C:ACC+GGT")).nameAndUmi shouldBe ("UMI:C:ACC+GGT", "ACC-GGT")
102137
}
103138

104-
it should "fail if the read name has only one field" in {
139+
it should "fail if the read has only one field" in {
105140
an[Exception] should be thrownBy copyUmiFromReadName(rec=rec("NAME"))
106141
an[Exception] should be thrownBy copyUmiFromReadName(rec=rec("1-2"))
107142
}

0 commit comments

Comments
 (0)