Rework extract_umi() and copy_umi():

emmcauley · emmcauley · commit 3f7f5f59aee5 · 2024-06-03T13:49:30.000-04:00
* add `strict` param to copy_umi_from_read_name() to raise an exception on an invalid UMI
* add logic to extract_umis_from_read_name() for strict=True to count colon-delimited segments in read_name
* add additional test coverage
diff --git a/fgpyo/sam/__init__.py b/fgpyo/sam/__init__.py
@@ -191,13 +191,15 @@
 """The classes that should be treated as file-like classes"""
 
 SAM_UMI_DELIMITER: str = "-"
-"""Multiple UMI delimiter, which SAM specification recommends should be a hyphen"""
+"""Multiple UMI delimiter, which SAM specification recommends should be a hyphen;
+see specification here: https://samtools.github.io/hts-specs/SAMtags.pdf"""
 
 VALID_UMI_CHARACTERS: Set[str] = set("ACGTN")
-"""Illumina's restricted UMI characters."""
+"""Illumina's restricted UMI characters;
+https://support.illumina.com/help/BaseSpace_Sequence_Hub_OLH_009008_2/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm."""
 
 ILLUMINA_UMI_DELIMITER: str = "+"
-"""Multiple UMIs are delimited with a plus-sign in Illumina FASTQs."""
+"""Multiple UMIs are delimited with a plus-sign in Illumina FASTQs; see docs above."""
 
 ILLUMINA_READ_NAME_DELIMITER: str = ":"
 """Illumina read names are delimited with a colon."""
@@ -956,20 +958,25 @@ def extract_umis_from_read_name(
     strict: bool = False,
 ) -> Optional[str]:
     """Extract UMI(s) from a read name.
+
     The UMI is expected to be the final component of the read name, delimited by the
     `read_name_delimiter`. Multiple UMIs may be present, delimited by the `umi_delimiter`. This
     delimiter will be replaced by the SAM-standard `-`.
+
     Args:
         read_name: The read name to extract the UMI from.
         read_name_delimiter: The delimiter separating the components of the read name.
         umi_delimiter: The delimiter separating multiple UMIs.
         strict: If `strict` is true, the read name must contain either 7 or 8 colon-separated
-        segments. The UMI is assumed to be the last one in the case of 8 segments and `None`
-        in the case of 7 segments. If `strict` is false, the last segment is returned so long
-        as it appears to be a valid UMI.
+          segments. The UMI is assumed to be the last one in the case of 8 segments and `None`
+          in the case of 7 segments. `strict` requires the UMI to be valid and consistent with
+          Illumina's allowed UMI characters. If `strict` is false, the last segment is returned
+          so long as it appears to be a valid UMI.
+
     Returns:
         The UMI extracted from the read name, or None if no UMI was found. Multiple UMIs are
         returned in a single string, separated by a hyphen (`-`).
+
     Raises:
         ValueError: If the read name does not end with a valid UMI.
     """
@@ -991,19 +998,28 @@ def extract_umis_from_read_name(
 
     invalid_umis = [umi for umi in umis if not _is_valid_umi(umi)]
     if len(invalid_umis) > 0:
-        raise ValueError(
-            f"Invalid UMIs found in read name: {read_name}",
-            f"  (Invalid UMIs: {', '.join(invalid_umis)})",
-        )
-    return SAM_UMI_DELIMITER.join(umis)
+        if strict:
+            raise ValueError(
+                f"Invalid UMIs found in read name: {read_name}",
+                f"  (Invalid UMIs: {', '.join(invalid_umis)})",
+            )
+        else:
+            return None
+
+    else:
+        return SAM_UMI_DELIMITER.join(umis)
 
 
-def copy_umi_from_read_name(rec: AlignedSegment, remove_umi: bool = False) -> None:
+def copy_umi_from_read_name(
+    rec: AlignedSegment, strict: bool = False, remove_umi: bool = False
+) -> None:
     """
-    Copy a UMI from an alignment's read name to its `RX` SAM tag.
+    Copy a UMI from an alignment's read name to its `RX` SAM tag. The UMI will not be copied to
+    the `RX` tag if it is invalid.
 
     Args:
         rec: The alignment record to update.
+        strict: If True, raise a `ValueError` when no valid UMI can be extracted.
         remove_umi: If True, the UMI will be removed from the read name after copying.
 
     Returns:
@@ -1015,17 +1031,19 @@ def copy_umi_from_read_name(rec: AlignedSegment, remove_umi: bool = False) -> No
     """
 
     umi = extract_umis_from_read_name(
-        read_name=rec.query_name, umi_delimiter=ILLUMINA_READ_NAME_DELIMITER
+        read_name=rec.query_name, strict=strict, umi_delimiter=ILLUMINA_READ_NAME_DELIMITER
     )
-    if not _is_valid_umi(umi):
-        raise ValueError(
-            f"Invalid UMI(s) found in read name: {rec.query_name}",
-        )
-    else:
-        rec.set_tag(tag="RX", value=umi, value_type="Z")
-        if remove_umi:
-            last_index = rec.query_name.rfind(ILLUMINA_READ_NAME_DELIMITER)
-            rec.query_name = rec.query_name[:last_index] if last_index != -1 else rec.query_name
+    if umi is None:
+        if strict:
+            raise ValueError(f"Invalid UMI {umi} extracted from {rec.query_name}")
+        else:
+            return
+
+    rec.set_tag(tag="RX", value=umi)
+
+    if remove_umi:
+        last_index = rec.query_name.rfind(ILLUMINA_READ_NAME_DELIMITER)
+        rec.query_name = rec.query_name[:last_index] if last_index != -1 else rec.query_name
 
 
 def _is_valid_umi(umi: str) -> bool:
diff --git a/fgpyo/sam/tests/test_umi_methods.py b/fgpyo/sam/tests/test_umi_methods.py
@@ -50,9 +50,21 @@ def test_extract_umi_from_read_name(read_name: str, umi: str) -> None:
     ],
 )
 def test_extract_umi_from_read_name_raises(read_name: str) -> None:
-    """Test that we raise an error when the read name includes an invalid UMI."""
+    """Test that we raise an error when the read name includes an invalid UMI
+    and strict=True."""
     with pytest.raises(ValueError):
-        extract_umis_from_read_name(read_name)
+        extract_umis_from_read_name(read_name=read_name, strict=True)
+
+
+def test_extract_umi_from_read_name_strict_False() -> None:
+    """Test that strict=False yields None for an invalid UMI, while a valid UMI (here with
+    a leading `r` that the expected value omits) is still returned hyphen-delimited."""
+    assert extract_umis_from_read_name(read_name="abc:def:ghi:ArCGT", strict=False) is None
+    assert extract_umis_from_read_name(read_name="abc:def:ghi:ACGTr", strict=False) is None
+    assert (
+        extract_umis_from_read_name(read_name="abc:def:ghi:rACGT+CAGA", strict=False)
+        == "ACGT-CAGA"
+    )
 
 
 @pytest.mark.parametrize(
@@ -81,17 +93,25 @@ def test_strict_extract_umi_from_read_name_raises(read_name: str) -> None:
         extract_umis_from_read_name(read_name, strict=True)
 
 
-def test_copy_umi_from_read_name() -> None:
+@pytest.mark.parametrize("strict", [True, False])
+@pytest.mark.parametrize("remove_umi", [True, False])
+def test_copy_valid_umi_from_read_name(remove_umi: bool, strict: bool) -> None:
+    """Test that a valid UMI is copied to RX, and stripped from the name only if remove_umi."""
     builder = SamBuilder()
-    read = builder.add_single(name="read_name:GATTACA")
-    copy_umi_from_read_name(read, remove_umi=False)
-    assert read.query_name == "read_name:GATTACA"
+    read = builder.add_single(name="abc:def:ghi:jfk:lmn:opq:rst:GATTACA")
+    copy_umi_from_read_name(read, strict=strict, remove_umi=remove_umi)
     assert read.get_tag("RX") == "GATTACA"
+    if remove_umi:
+        assert read.query_name == "abc:def:ghi:jfk:lmn:opq:rst"
+    else:
+        assert read.query_name == "abc:def:ghi:jfk:lmn:opq:rst:GATTACA"
 
 
-def test_copy_remove_umi_from_read_name() -> None:
+def test_copy_invalid_umi_from_read_name() -> None:
+    """Test that strict mode raises ValueError on an invalid UMI and leaves RX unset."""
     builder = SamBuilder()
-    read = builder.add_single(name="read_name:GATTACA")
-    copy_umi_from_read_name(read, remove_umi=True)
-    assert read.query_name == "read_name"
-    assert read.get_tag("RX") == "GATTACA"
+    read = builder.add_single(name="abc:def:ghi:jfk:lmn:opq:rst:uvw+xyz")
+    assert _is_valid_umi(read.query_name) is False
+    with pytest.raises(ValueError):
+        copy_umi_from_read_name(read, strict=True, remove_umi=True)
+    assert read.has_tag("RX") is False