Skip to content

Commit 3f7f5f5

Browse files
committed
Rework extract_umi() and copy_umi():
* add strict param to copy_umi() to throw exception with invalid UMI * add logic to extract_umi() for strict=True to count colons in read_name * add additional testing coverage
1 parent a381664 commit 3f7f5f5

File tree

2 files changed

+72
-34
lines changed

2 files changed

+72
-34
lines changed

fgpyo/sam/__init__.py

+41-23
Original file line numberDiff line numberDiff line change
@@ -191,13 +191,15 @@
191191
"""The classes that should be treated as file-like classes"""
192192

193193
SAM_UMI_DELIMITER: str = "-"
194-
"""Multiple UMI delimiter, which SAM specification recommends should be a hyphen"""
194+
"""Multiple UMI delimiter, which SAM specification recommends should be a hyphen;
195+
see specification here: https://samtools.github.io/hts-specs/SAMtags.pdf"""
195196

196197
VALID_UMI_CHARACTERS: Set[str] = set("ACGTN")
197-
"""Illumina's restricted UMI characters."""
198+
"""Illumina's restricted UMI characters;
199+
https://support.illumina.com/help/BaseSpace_Sequence_Hub_OLH_009008_2/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm."""
198200

199201
ILLUMINA_UMI_DELIMITER: str = "+"
200-
"""Multiple UMIs are delimited with a plus-sign in Illumina FASTQs."""
202+
"""Multiple UMIs are delimited with a plus-sign in Illumina FASTQs; see docs above."""
201203

202204
ILLUMINA_READ_NAME_DELIMITER: str = ":"
203205
"""Illumina read names are delimited with a colon."""
@@ -956,20 +958,25 @@ def extract_umis_from_read_name(
956958
strict: bool = False,
957959
) -> Optional[str]:
958960
"""Extract UMI(s) from a read name.
961+
959962
The UMI is expected to be the final component of the read name, delimited by the
960963
`read_name_delimiter`. Multiple UMIs may be present, delimited by the `umi_delimiter`. This
961964
delimiter will be replaced by the SAM-standard `-`.
965+
962966
Args:
963967
read_name: The read name to extract the UMI from.
964968
read_name_delimiter: The delimiter separating the components of the read name.
965969
umi_delimiter: The delimiter separating multiple UMIs.
966970
strict: If `strict` is true, the read name must contain either 7 or 8 colon-separated
967-
segments. The UMI is assumed to be the last one in the case of 8 segments and `None`
968-
in the case of 7 segments. If `strict` is false, the last segment is returned so long
969-
as it appears to be a valid UMI.
971+
segments. The UMI is assumed to be the last one in the case of 8 segments and `None`
972+
in the case of 7 segments. `strict` requires the UMI to be valid and consistent with
973+
Illumina's allowed UMI characters. If `strict` is false, the last segment is returned
974+
so long as it appears to be a valid UMI.
975+
970976
Returns:
971977
The UMI extracted from the read name, or None if no UMI was found. Multiple UMIs are
972978
returned in a single string, separated by a hyphen (`-`).
979+
973980
Raises:
974981
ValueError: If the read name does not end with a valid UMI.
975982
"""
@@ -991,19 +998,28 @@ def extract_umis_from_read_name(
991998

992999
invalid_umis = [umi for umi in umis if not _is_valid_umi(umi)]
9931000
if len(invalid_umis) > 0:
994-
raise ValueError(
995-
f"Invalid UMIs found in read name: {read_name}",
996-
f" (Invalid UMIs: {', '.join(invalid_umis)})",
997-
)
998-
return SAM_UMI_DELIMITER.join(umis)
1001+
if strict:
1002+
raise ValueError(
1003+
f"Invalid UMIs found in read name: {read_name}",
1004+
f" (Invalid UMIs: {', '.join(invalid_umis)})",
1005+
)
1006+
else:
1007+
return None
1008+
1009+
else:
1010+
return SAM_UMI_DELIMITER.join(umis)
9991011

10001012

1001-
def copy_umi_from_read_name(rec: AlignedSegment, remove_umi: bool = False) -> None:
1013+
def copy_umi_from_read_name(
1014+
rec: AlignedSegment, strict: bool = False, remove_umi: bool = False
1015+
) -> None:
10021016
"""
1003-
Copy a UMI from an alignment's read name to its `RX` SAM tag.
1017+
Copy a UMI from an alignment's read name to its `RX` SAM tag. UMI will not be copied to RX
1018+
tag if invalid.
10041019
10051020
Args:
10061021
rec: The alignment record to update.
1022+
strict: If True and UMI invalid, will throw an exception
10071023
remove_umi: If True, the UMI will be removed from the read name after copying.
10081024
10091025
Returns:
@@ -1015,17 +1031,19 @@ def copy_umi_from_read_name(rec: AlignedSegment, remove_umi: bool = False) -> No
10151031
"""
10161032

10171033
umi = extract_umis_from_read_name(
1018-
read_name=rec.query_name, umi_delimiter=ILLUMINA_READ_NAME_DELIMITER
1034+
read_name=rec.query_name, strict=strict, umi_delimiter=ILLUMINA_READ_NAME_DELIMITER
10191035
)
1020-
if not _is_valid_umi(umi):
1021-
raise ValueError(
1022-
f"Invalid UMI(s) found in read name: {rec.query_name}",
1023-
)
1024-
else:
1025-
rec.set_tag(tag="RX", value=umi, value_type="Z")
1026-
if remove_umi:
1027-
last_index = rec.query_name.rfind(ILLUMINA_READ_NAME_DELIMITER)
1028-
rec.query_name = rec.query_name[:last_index] if last_index != -1 else rec.query_name
1036+
if umi is None:
1037+
if strict:
1038+
raise ValueError(f"Invalid UMI {umi} extracted from {rec.query_name}")
1039+
else:
1040+
return
1041+
1042+
rec.set_tag(tag="RX", value=umi)
1043+
1044+
if remove_umi:
1045+
last_index = rec.query_name.rfind(ILLUMINA_READ_NAME_DELIMITER)
1046+
rec.query_name = rec.query_name[:last_index] if last_index != -1 else rec.query_name
10291047

10301048

10311049
def _is_valid_umi(umi: str) -> bool:

fgpyo/sam/tests/test_umi_methods.py

+31-11
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,21 @@ def test_extract_umi_from_read_name(read_name: str, umi: str) -> None:
5050
],
5151
)
5252
def test_extract_umi_from_read_name_raises(read_name: str) -> None:
53-
"""Test that we raise an error when the read name includes an invalid UMI."""
53+
"""Test that we raise an error when the read name includes an invalid UMI
54+
and strict=True."""
5455
with pytest.raises(ValueError):
55-
extract_umis_from_read_name(read_name)
56+
extract_umis_from_read_name(read_name=read_name, strict=True)
57+
58+
59+
def test_extract_umi_from_read_name_strict_False() -> None:
60+
"""Test that we return None when an invalid UMI is encountered
61+
and strict=False (but still return a valid UMI)."""
62+
assert extract_umis_from_read_name(read_name="abc:def:ghi:ArCGT", strict=False) is None
63+
assert extract_umis_from_read_name(read_name="abc:def:ghi:ACGTr", strict=False) is None
64+
assert (
65+
extract_umis_from_read_name(read_name="abc:def:ghi:rACGT+CAGA", strict=False)
66+
== "ACGT-CAGA"
67+
)
5668

5769

5870
@pytest.mark.parametrize(
@@ -81,17 +93,25 @@ def test_strict_extract_umi_from_read_name_raises(read_name: str) -> None:
8193
extract_umis_from_read_name(read_name, strict=True)
8294

8395

84-
def test_copy_umi_from_read_name() -> None:
96+
@pytest.mark.parametrize("remove_umi, strict", [[True, False], [True, False]])
97+
def test_copy_valid_umi_from_read_name(remove_umi: bool, strict: bool) -> None:
98+
"""Test that we populate the RX field with a valid UMI if remove_umi and strict
99+
are both True; otherwise do not remove UMI from read.query_name"""
85100
builder = SamBuilder()
86-
read = builder.add_single(name="read_name:GATTACA")
87-
copy_umi_from_read_name(read, remove_umi=False)
88-
assert read.query_name == "read_name:GATTACA"
101+
read = builder.add_single(name="abc:def:ghi:jfk:lmn:opq:rst:GATTACA")
102+
copy_umi_from_read_name(read, strict=strict, remove_umi=remove_umi)
89103
assert read.get_tag("RX") == "GATTACA"
104+
if remove_umi:
105+
assert read.query_name == "abc:def:ghi:jfk:lmn:opq:rst"
106+
else:
107+
assert read.query_name == "abc:def:ghi:jfk:lmn:opq:rst:GATTACA"
90108

91109

92-
def test_copy_remove_umi_from_read_name() -> None:
110+
def test_copy_invalid_umi_from_read_name() -> None:
111+
"""Test that we do not set the RX tag if we encounter an invalid UMI"""
93112
builder = SamBuilder()
94-
read = builder.add_single(name="read_name:GATTACA")
95-
copy_umi_from_read_name(read, remove_umi=True)
96-
assert read.query_name == "read_name"
97-
assert read.get_tag("RX") == "GATTACA"
113+
read = builder.add_single(name="abc:def:ghi:jfk:lmn:opq:rst:uvw+xyz")
114+
assert _is_valid_umi(read.query_name) is False
115+
with pytest.raises(ValueError):
116+
copy_umi_from_read_name(read, strict=True, remove_umi=True)
117+
assert read.has_tag("RX") is False

0 commit comments

Comments
 (0)