feat: FastqToBam can extract UMI(s) from the comment in the read name

nh13 · nh13 · commit a31621119c50 · 2024-05-20T09:53:03.000-07:00
diff --git a/src/main/scala/com/fulcrumgenomics/fastq/FastqToBam.scala b/src/main/scala/com/fulcrumgenomics/fastq/FastqToBam.scala
@@ -71,12 +71,15 @@ import htsjdk.samtools.{ReservedTagConstants, SAMFileHeader, SAMReadGroupRecord}
     |For more information on read structures see the
     |[Read Structure Wiki Page](https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures)
     |
-    |UMIs may be extracted from the read sequences, the read names, or both.  If `--extract-umis-from-read-names` is
+    |UMIs may be extracted from the read sequences, the read names (or comment), or both.  If `--extract-umis-from-read-names` is
     |specified, any UMIs present in the read names are extracted; read names are expected to be `:`-separated with
     |any UMIs present in the 8th field.  If this option is specified, the `--umi-qual-tag` option may not be used as
     |qualities are not available for UMIs in the read name. If UMI segments are present in the read structures those
     |will also be extracted.  If UMIs are present in both, the final UMIs are constructed by first taking the UMIs
-    |from the read names, then adding a hyphen, then the UMIs extracted from the reads.
+    |from the read names, then adding a hyphen, then the UMIs extracted from the reads.  If `--extract-umis-from-read-comment` is
+    |specified, any UMIs present in the read name comments are extracted; the read name comment is the text _after_
+    |the first white space in the read name (like a FASTA).  If the comment is `:`-separated, then the UMI will be
+    |extracted from the last field, otherwise the full comment will be used.
     |
     |The same number of input files and read structures must be provided, with one exception: if supplying exactly
     |1 or 2 fastq files, both of which are solely template reads, no read structures need be provided.
@@ -93,7 +96,10 @@ class FastqToBam
   @arg(flag='u', doc="Tag in which to store molecular barcodes/UMIs.")                         val umiTag: String = ConsensusTags.UmiBases,
   @arg(flag='q', doc="Tag in which to store molecular barcode/UMI qualities.")                 val umiQualTag: Option[String] = None,
   @arg(flag='Q', doc="Store the sample barcode qualities in the QT Tag.")                      val storeSampleBarcodeQualities: Boolean = false,
-  @arg(flag='n', doc="Extract UMI(s) from read names and prepend to UMIs from reads.")         val extractUmisFromReadNames: Boolean = false,
+  @arg(flag='n', doc="Extract UMI(s) from read names and prepend to UMIs from reads.", mutex=Array("extractUmisFromReadComment"))
+  val extractUmisFromReadNames: Boolean = false,
+  @arg(flag='c', doc="Extract UMI(s) from read name comment and prepend to UMIs from reads.", mutex=Array("extractUmisFromReadNames"))
+  val extractUmisFromReadComment: Boolean = false,
   @arg(          doc="Read group ID to use in the file header.")                               val readGroupId: String = "A",
   @arg(          doc="The name of the sequenced sample.")                                      val sample: String,
   @arg(          doc="The name/ID of the sequenced library.")                                  val library: String,
@@ -117,6 +123,7 @@ class FastqToBam
   validate(input.length == actualReadStructures.length, "input and read-structure must be supplied the same number of times.")
   validate(1 to 2 contains actualReadStructures.flatMap(_.templateSegments).size, "read structures must contain 1-2 template reads total.")
   validate(!extractUmisFromReadNames || umiQualTag.isEmpty, "Cannot extract UMI qualities when also extracting UMI from read names.")
+  validate(!extractUmisFromReadComment || umiQualTag.isEmpty, "Cannot extract UMI qualities when also extracting UMI from read description.")
 
   override def execute(): Unit = {
     val encoding = qualityEncoding
@@ -166,7 +173,11 @@ class FastqToBam
       val templates     = subs.iterator.filter(_.kind == Template).toList
 
       // If requested, pull out the UMI(s) from the read name
-      val umiFromReadName = if (extractUmisFromReadNames) Umis.extractUmisFromReadName(fqs.head.name, strict=true) else None
+      val umiFromReadName = {
+        if (extractUmisFromReadNames) Umis.extractUmisFromReadName(fqs.head.name, strict=true)
+        else if (extractUmisFromReadComment) fqs.head.comment.flatMap(comment => Umis.extractUmisFromReadComment(comment, strict=true))
+        else None
+      }
 
       templates.zipWithIndex.map { case (read, index) =>
         // If the template read had no bases, we'll substitute in a single N @ Q2 below to keep htsjdk happy
diff --git a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala
@@ -55,15 +55,15 @@ object Umis {
 
   /**
     * Extracts the UMI from an Illumina fastq style read name.  Illumina documents their FASTQ read names as:
-    *   @<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index>
+    *   `@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index>``
     *
     *  See https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm
     *  The UMI field is optional, so read names may or may not contain it.  Illumina also specifies that the UMI
     *  field may contain multiple UMIs, in which case they will delimit them with `+` characters.  Pluses will be
     *  translated to hyphens before returning.
     *
     *  If `strict` is true the name _must_ contain either 7 or 8 colon-separated segments, 
-    with the UMI being the last in the case of 8 and `None` in the case of 7.
+    *  with the UMI being the last in the case of 8 and `None` in the case of 7.
     * 
     * If `strict` is false the last segment is returned so long as it appears to be a valid UMI.
     */
@@ -88,6 +88,42 @@ object Umis {
     else umi
   }
 
+  /**
+    * Extracts the UMI from an Illumina fastq style read name.  Illumina documents their FASTQ read names as:
+    *   `@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index>``
+    *
+    *   The index in the description may be replaced with a UMI field.
+    *
+    *  See https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm
+    *  The UMI field is optional, so read names may or may not contain it.  Illumina also specifies that the UMI
+    *  field may contain multiple UMIs, in which case they will delimit them with `+` characters.  Pluses will be
+    *  translated to hyphens before returning.
+    *
+    *  If `strict` is true the comment _must_ contain either 4 colon-separated segments,
+    *  with the UMI being the last in the case of 4.
+    *
+    * If `strict` is false the last segment is returned so long as it appears to be a valid UMI.
+    */
+  def extractUmisFromReadComment(comment: String, delimiter: Char = ':', strict: Boolean): Option[String] = {
+    // If strict, check that the read name actually has eight parts, which is expected
+    val rawUmi = if (strict) {
+      val colons = comment.count(_ == delimiter)
+      if (colons == 3) Some(comment.substring(comment.lastIndexOf(delimiter) + 1, comment.length))
+      else throw new IllegalArgumentException(s"Trying to extract UMI from read comment with ${colons + 1} parts (4 expected): ${comment}")
+    } else {
+      val idx = comment.lastIndexOf(delimiter)
+      if (idx == -1) Some(comment)
+      else Some(comment.substring(idx + 1, comment.length))
+    }
+
+    val umi   = rawUmi.map(raw => (if (raw.indexOf('+') > 0) raw.replace('+', '-') else raw).toUpperCase)
+    val valid = umi.forall(u => u.forall(isValidUmiCharacter))
+
+    if (strict && !valid) throw new IllegalArgumentException(s"Invalid UMI '${umi.get}' extracted from name '${comment}")
+    else if (!valid) None
+    else umi
+  }
+
   @inline private def isValidUmiCharacter(ch: Char): Boolean = {
     ch == 'A' || ch == 'C' || ch == 'G' || ch == 'T' || ch == 'N' || ch == '-'
   }
diff --git a/src/test/scala/com/fulcrumgenomics/fastq/FastqToBamTest.scala b/src/test/scala/com/fulcrumgenomics/fastq/FastqToBamTest.scala
@@ -383,6 +383,37 @@ class FastqToBamTest extends UnitSpec {
     recs(1).apply[String]("RX") shouldBe "TTGA-TAAT-TA-AA"
   }
 
+
+  it should "extract UMIs from the comment in read names when requested" in {
+    val r1 = fq(
+      FastqRecord("q1:2:3:4:5:6:7:8", "AAAAAAAAAA", "==========", comment=Some("0:1:2:ACGT")),
+      FastqRecord("q2:2:3:4:5:6:7:8", "TAAAAAAAAA", "==========", comment=Some("0:1:2:TTGA")),
+      FastqRecord("q3:2:3:4:5:6:7",      "TAAAAAAAAA", "=========="),
+    )
+    val bam = makeTempFile("fastqToBamTest.", ".bam")
+    new FastqToBam(input=Seq(r1), output=bam, sample="s", library="l", extractUmisFromReadComment=true).execute()
+    val recs = readBamRecs(bam)
+    recs should have size 3
+    recs(0).apply[String]("RX") shouldBe "ACGT"
+    recs(1).apply[String]("RX") shouldBe "TTGA"
+    recs(2).get[String]("RX")   shouldBe None
+  }
+
+  it should "extract UMIs from the comment in read names and read sequences" in {
+    val r1 = fq(
+      FastqRecord("q1:2:3:4:5:6:7:8", "GGNCCGAAAAAAA", "=============", comment=Some("0:1:2:ACGT+CGTA")),
+      FastqRecord("q2:2:3:4:5:6:7:8", "TANAACAAAAAAA", "=============", comment=Some("0:1:2:TTGA+TAAT")),
+    )
+    val rs  = ReadStructure("2M1S2M+T")
+    val bam = makeTempFile("fastqToBamTest.", ".bam")
+    new FastqToBam(input=Seq(r1), readStructures=Seq(rs), output=bam, sample="s", library="l", extractUmisFromReadComment=true).execute()
+    val recs = readBamRecs(bam)
+    recs should have size 2
+    recs(0).apply[String]("RX") shouldBe "ACGT-CGTA-GG-CC"
+    recs(1).apply[String]("RX") shouldBe "TTGA-TAAT-TA-AA"
+  }
+
+
   it should "fail when read names don't match up" in {
     val r1 = fq(FastqRecord("q1", "AAAAAAAAAA", "=========="))
     val r2 = fq(FastqRecord("x1", "CCCCCCCCCC", "##########"))
diff --git a/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala b/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala
@@ -43,7 +43,7 @@ class UmisTest extends UnitSpec with OptionValues {
     Umis.extractUmisFromReadName("1:2:3:4:5:6:7:ACGTACGT", strict=true).value shouldBe "ACGTACGT"
   }
 
-  it should "return None if the read name has only 7 parts in strict mode" in {
+  it should "return None if the read has only 7 parts in strict mode" in {
     Umis.extractUmisFromReadName("1:2:3:4:5:6:7", strict=true) shouldBe None
   }
 
@@ -78,6 +78,41 @@ class UmisTest extends UnitSpec with OptionValues {
     Umis.extractUmisFromReadName("1:2:3:4:5:6:7:8", strict=false) shouldBe None
   }
 
+  "Umis.extractUmisFromReadComment" should "extract a UMI from a well-formed read name" in {
+    Umis.extractUmisFromReadComment("0:1:2:ACGTACGT", strict=true).value shouldBe "ACGTACGT"
+  }
+
+  it should "throw an exception in strict mode if the read has too many or too few segments" in {
+    an[Exception] shouldBe thrownBy { Umis.extractUmisFromReadComment("0:1:ACGTACGT", strict=true) }
+    an[Exception] shouldBe thrownBy { Umis.extractUmisFromReadComment("0:1:2:3:ACGTACGT", strict=true) }
+  }
+
+  it should "translate pluses to hyphens when multiple UMIs are present" in {
+    Umis.extractUmisFromReadComment("0:1:2:ACGTACGT+TTGCGGCT", strict=true).value shouldBe "ACGTACGT-TTGCGGCT"
+  }
+
+  it should "extract a UMI from the last segment in non-strict mode, if it looks like a UMI" in {
+    Umis.extractUmisFromReadComment("ACGT", strict=false).value shouldBe "ACGT"
+    Umis.extractUmisFromReadComment("1:ACGT", strict=false).value shouldBe "ACGT"
+    Umis.extractUmisFromReadComment("1:2:ACGT", strict=false).value shouldBe "ACGT"
+    Umis.extractUmisFromReadComment("1:2:3:ACGT", strict=false).value shouldBe "ACGT"
+    Umis.extractUmisFromReadComment("1:2:3:4:ACGT", strict=false).value shouldBe "ACGT"
+    Umis.extractUmisFromReadComment("1:2:3:4:5:ACGT", strict=false).value shouldBe "ACGT"
+    Umis.extractUmisFromReadComment("1:2:3:4:5:6:ACGT", strict=false).value shouldBe "ACGT"
+    Umis.extractUmisFromReadComment("1:2:3:4:5:6:7:ACGT", strict=false).value shouldBe "ACGT"
+    Umis.extractUmisFromReadComment("1:2:3:4:5:6:7:8:ACGT", strict=false).value shouldBe "ACGT"
+    Umis.extractUmisFromReadComment("1:2:3:4:5:6:7:8:9:ACGT", strict=false).value shouldBe "ACGT"
+  }
+
+  it should "return None in non-strict mode if the last segment doesn't look like a UMI" in {
+    Umis.extractUmisFromReadComment("1:2", strict=false) shouldBe None
+    Umis.extractUmisFromReadComment("1:2:3", strict=false) shouldBe None
+    Umis.extractUmisFromReadComment("1:2:3:4", strict=false) shouldBe None
+    Umis.extractUmisFromReadComment("1:2:3:4:5", strict=false) shouldBe None
+    Umis.extractUmisFromReadComment("1:2:3:4:5:6", strict=false) shouldBe None
+    Umis.extractUmisFromReadComment("1:2:3:4:5:6:7", strict=false) shouldBe None
+    Umis.extractUmisFromReadComment("1:2:3:4:5:6:7:8", strict=false) shouldBe None
+  }
 
   "Umis.copyUmiFromReadName" should "copy the UMI from the read name" in {
     copyUmiFromReadName(rec=rec("UMI:A")).nameAndUmi shouldBe ("UMI:A", "A")
@@ -91,7 +126,7 @@ class UmisTest extends UnitSpec with OptionValues {
     copyUmiFromReadName(rec=rec("UMI:C:ACC+GGT"), removeUmi=true).nameAndUmi shouldBe ("UMI:C", "ACC-GGT")
   }
   
-  it should "split on a different name delimiter if specified" in {
+  it should "split on a different delimiter if specified" in {
     copyUmiFromReadName(rec=rec("UMI-A"), delimiter='-').nameAndUmi shouldBe ("UMI-A", "A")
     copyUmiFromReadName(rec=rec("UMI-C-A"), delimiter='-').nameAndUmi shouldBe ("UMI-C-A", "A")
     copyUmiFromReadName(rec=rec("UMI-C-ACC+GGT"), delimiter='-').nameAndUmi shouldBe ("UMI-C-ACC+GGT", "ACC-GGT")
@@ -101,7 +136,7 @@ class UmisTest extends UnitSpec with OptionValues {
     copyUmiFromReadName(rec=rec("UMI:C:ACC+GGT")).nameAndUmi shouldBe ("UMI:C:ACC+GGT", "ACC-GGT")
   }
 
-  it should "fail if the read name has only one field" in {
+  it should "fail if the read has only one field" in {
     an[Exception] should be thrownBy copyUmiFromReadName(rec=rec("NAME"))
     an[Exception] should be thrownBy copyUmiFromReadName(rec=rec("1-2"))
   }