191
191
"""The classes that should be treated as file-like classes"""
192
192
193
193
SAM_UMI_DELIMITER : str = "-"
194
- """Multiple UMI delimiter, which SAM specification recommends should be a hyphen"""
194
+ """Multiple UMI delimiter, which SAM specification recommends should be a hyphen;
195
+ see specification here: https://samtools.github.io/hts-specs/SAMtags.pdf"""
195
196
196
197
VALID_UMI_CHARACTERS : Set [str ] = set ("ACGTN" )
197
- """Illumina's restricted UMI characters."""
198
+ """Illumina's restricted UMI characters;
199
+ https://support.illumina.com/help/BaseSpace_Sequence_Hub_OLH_009008_2/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm."""
198
200
199
201
ILLUMINA_UMI_DELIMITER : str = "+"
200
- """Multiple UMIs are delimited with a plus-sign in Illumina FASTQs."""
202
+ """Multiple UMIs are delimited with a plus-sign in Illumina FASTQs; see docs above."""
201
203
202
204
ILLUMINA_READ_NAME_DELIMITER : str = ":"
203
205
"""Illumina read names are delimited with a colon."""
@@ -956,20 +958,25 @@ def extract_umis_from_read_name(
956
958
strict : bool = False ,
957
959
) -> Optional [str ]:
958
960
"""Extract UMI(s) from a read name.
961
+
959
962
The UMI is expected to be the final component of the read name, delimited by the
960
963
`read_name_delimiter`. Multiple UMIs may be present, delimited by the `umi_delimiter`. This
961
964
delimiter will be replaced by the SAM-standard `-`.
965
+
962
966
Args:
963
967
read_name: The read name to extract the UMI from.
964
968
read_name_delimiter: The delimiter separating the components of the read name.
965
969
umi_delimiter: The delimiter separating multiple UMIs.
966
970
strict: If `strict` is true, the read name must contain either 7 or 8 colon-separated
967
- segments. The UMI is assumed to be the last one in the case of 8 segments and `None`
968
- in the case of 7 segments. If `strict` is false, the last segment is returned so long
969
- as it appears to be a valid UMI.
971
+ segments. The UMI is assumed to be the last one in the case of 8 segments and `None`
972
+ in the case of 7 segments. `strict` requires the UMI to be valid and consistent with
973
+ Illumina's allowed UMI characters. If `strict` is false, the last segment is returned
974
+ so long as it appears to be a valid UMI.
975
+
970
976
Returns:
971
977
The UMI extracted from the read name, or None if no UMI was found. Multiple UMIs are
972
978
returned in a single string, separated by a hyphen (`-`).
979
+
973
980
Raises:
974
981
ValueError: If the read name does not end with a valid UMI.
975
982
"""
@@ -991,19 +998,28 @@ def extract_umis_from_read_name(
991
998
992
999
invalid_umis = [umi for umi in umis if not _is_valid_umi (umi )]
993
1000
if len (invalid_umis ) > 0 :
994
- raise ValueError (
995
- f"Invalid UMIs found in read name: { read_name } " ,
996
- f" (Invalid UMIs: { ', ' .join (invalid_umis )} )" ,
997
- )
998
- return SAM_UMI_DELIMITER .join (umis )
1001
+ if strict :
1002
+ raise ValueError (
1003
+ f"Invalid UMIs found in read name: { read_name } " ,
1004
+ f" (Invalid UMIs: { ', ' .join (invalid_umis )} )" ,
1005
+ )
1006
+ else :
1007
+ return None
1008
+
1009
+ else :
1010
+ return SAM_UMI_DELIMITER .join (umis )
999
1011
1000
1012
1001
- def copy_umi_from_read_name (rec : AlignedSegment , remove_umi : bool = False ) -> None :
1013
+ def copy_umi_from_read_name (
1014
+ rec : AlignedSegment , strict : bool = False , remove_umi : bool = False
1015
+ ) -> None :
1002
1016
"""
1003
- Copy a UMI from an alignment's read name to its `RX` SAM tag.
1017
+ Copy a UMI from an alignment's read name to its `RX` SAM tag. The UMI will not be copied to
1018
+ the `RX` tag if it is invalid.
1004
1019
1005
1020
Args:
1006
1021
rec: The alignment record to update.
1022
+ strict: If True and the UMI is invalid, a ValueError is raised.
1007
1023
remove_umi: If True, the UMI will be removed from the read name after copying.
1008
1024
1009
1025
Returns:
@@ -1015,17 +1031,19 @@ def copy_umi_from_read_name(rec: AlignedSegment, remove_umi: bool = False) -> No
1015
1031
"""
1016
1032
1017
1033
umi = extract_umis_from_read_name (
1018
- read_name = rec .query_name , umi_delimiter = ILLUMINA_READ_NAME_DELIMITER
1034
+ read_name = rec .query_name , strict = strict , umi_delimiter = ILLUMINA_READ_NAME_DELIMITER
1019
1035
)
1020
- if not _is_valid_umi (umi ):
1021
- raise ValueError (
1022
- f"Invalid UMI(s) found in read name: { rec .query_name } " ,
1023
- )
1024
- else :
1025
- rec .set_tag (tag = "RX" , value = umi , value_type = "Z" )
1026
- if remove_umi :
1027
- last_index = rec .query_name .rfind (ILLUMINA_READ_NAME_DELIMITER )
1028
- rec .query_name = rec .query_name [:last_index ] if last_index != - 1 else rec .query_name
1036
+ if umi is None :
1037
+ if strict :
1038
+ raise ValueError (f"Invalid UMI { umi } extracted from { rec .query_name } " )
1039
+ else :
1040
+ return
1041
+
1042
+ rec .set_tag (tag = "RX" , value = umi )
1043
+
1044
+ if remove_umi :
1045
+ last_index = rec .query_name .rfind (ILLUMINA_READ_NAME_DELIMITER )
1046
+ rec .query_name = rec .query_name [:last_index ] if last_index != - 1 else rec .query_name
1029
1047
1030
1048
1031
1049
def _is_valid_umi (umi : str ) -> bool :
0 commit comments