Merge pull request #876 from uclahs-cds/czhu-fix-split-fasta

Chenghao Zhu · web-flow · commit 305ffbbed3d7 · 2024-06-23T14:54:28.000+08:00
Fix splitFasta: NovelORF peptides from coding transcripts not recognized correctly
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,10 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 ## [Unreleased]
 
+## [1.4.2] - 2024-06-23
+
+- Fixed `splitFasta` so that NovelORF peptides from coding transcripts are recognized correctly.
+
 ## [1.4.1] - 2024-05-26
 
 - Fixed `VariantPeptidePool` that old versions of `SeqUtils.molecular_weight` don't handle `SeqRecord` objects. #874
diff --git a/moPepGen/__init__.py b/moPepGen/__init__.py
@@ -8,7 +8,7 @@
 from . import constant
 
 
-__version__ = '1.4.1'
+__version__ = '1.4.2'
 
 ## Error messages
 ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron'
diff --git a/moPepGen/aa/VariantPeptideLabel.py b/moPepGen/aa/VariantPeptideLabel.py
@@ -203,7 +203,7 @@ def from_variant_peptide(peptide:AminoAcidSeqRecord,
             info = VariantPeptideInfo(str(variant_id), gene_ids, var_ids, variant_id.index)
 
             if check_source:
-                if tx_id not in coding_tx:
+                if variant_id.orf_id is not None:
                     info.sources.add(constant.SOURCE_NOVEL_ORF, group_map=group_map)
 
                 for gene_id, _ids in var_ids.items():
diff --git a/test/unit/test_peptide_pool_splitter.py b/test/unit/test_peptide_pool_splitter.py
@@ -283,7 +283,7 @@ def test_from_variant_peptide_noncoding(self):
         infos = VariantPeptideInfo.from_variant_peptide(peptide, tx2gene, coding_tx, label_map)
         self.assertIn('NovelORF', infos[0].sources)
 
-        peptide = create_aa_record('KHIRJ','ENST0004|1')
+        peptide = create_aa_record('KHIRJ','ENST0004|ORF1|1')
         infos = VariantPeptideInfo.from_variant_peptide(peptide, tx2gene, coding_tx, label_map)
         self.assertIn('NovelORF', infos[0].sources)
 
@@ -543,7 +543,7 @@ def test_split_database_source_comb_order(self):
         peptides_data = [
             [
                 'SSSSSSSR',
-                'CIRC-ENST0002-E1-E2|1 ENST0005|SE-2100|1'
+                'CIRC-ENST0002-E1-E2|1 ENST0005|SE-2100|ORF2|1'
             ]
         ]
         peptides = VariantPeptidePool({create_aa_record(*x) for x in peptides_data})
@@ -576,7 +576,7 @@ def test_split_database_source_comb_order_case2(self):
         peptides_data = [
             [
                 'SSSSFSSR',
-                'CIRC-ENST0002-E1-E2|1 ENST0005|SE-2100|W2F-5|1'
+                'CIRC-ENST0002-E1-E2|1 ENST0005|SE-2100|W2F-5|ORF-2|1'
             ]
         ]
         peptides = VariantPeptidePool({create_aa_record(*x) for x in peptides_data})
diff --git a/test/unit/test_peptide_pool_summarizer.py b/test/unit/test_peptide_pool_summarizer.py
@@ -50,7 +50,7 @@ def test_summarize_fasta_source_comb_order(self):
         peptides_data = [
             [
                 'SSSSSSSR',
-                'CIRC-ENST0002-E1-E2|1 ENST0005|SE-2100|1'
+                'CIRC-ENST0002-E1-E2|1 ENST0005|SE-2100|ORF2|1'
             ]
         ]
         peptides = VariantPeptidePool({create_aa_record(*x) for x in peptides_data})

Original file line number	Diff line number	Diff line change
`@@ -283,7 +283,7 @@ def test_from_variant_peptide_noncoding(self):`
`283`	`283`	`infos = VariantPeptideInfo.from_variant_peptide(peptide, tx2gene, coding_tx, label_map)`
`284`	`284`	`self.assertIn('NovelORF', infos[0].sources)`
`285`	`285`
`286`		`- peptide = create_aa_record('KHIRJ','ENST0004\|1')`
	`286`	`+ peptide = create_aa_record('KHIRJ','ENST0004\|ORF1\|1')`
`287`	`287`	`infos = VariantPeptideInfo.from_variant_peptide(peptide, tx2gene, coding_tx, label_map)`
`288`	`288`	`self.assertIn('NovelORF', infos[0].sources)`
`289`	`289`
`@@ -543,7 +543,7 @@ def test_split_database_source_comb_order(self):`
`543`	`543`	`peptides_data = [`
`544`	`544`	`[`
`545`	`545`	`'SSSSSSSR',`
`546`		`- 'CIRC-ENST0002-E1-E2\|1 ENST0005\|SE-2100\|1'`
	`546`	`+ 'CIRC-ENST0002-E1-E2\|1 ENST0005\|SE-2100\|ORF2\|1'`
`547`	`547`	`]`
`548`	`548`	`]`
`549`	`549`	`peptides = VariantPeptidePool({create_aa_record(*x) for x in peptides_data})`
`@@ -576,7 +576,7 @@ def test_split_database_source_comb_order_case2(self):`
`576`	`576`	`peptides_data = [`
`577`	`577`	`[`
`578`	`578`	`'SSSSFSSR',`
`579`		`- 'CIRC-ENST0002-E1-E2\|1 ENST0005\|SE-2100\|W2F-5\|1'`
	`579`	`+ 'CIRC-ENST0002-E1-E2\|1 ENST0005\|SE-2100\|W2F-5\|ORF-2\|1'`
`580`	`580`	`]`
`581`	`581`	`]`
`582`	`582`	`peptides = VariantPeptidePool({create_aa_record(*x) for x in peptides_data})`
Original file line number	Diff line number	Diff line change
`@@ -50,7 +50,7 @@ def test_summarize_fasta_source_comb_order(self):`
`50`	`50`	`peptides_data = [`
`51`	`51`	`[`
`52`	`52`	`'SSSSSSSR',`
`53`		`- 'CIRC-ENST0002-E1-E2\|1 ENST0005\|SE-2100\|1'`
	`53`	`+ 'CIRC-ENST0002-E1-E2\|1 ENST0005\|SE-2100\|ORF2\|1'`
`54`	`54`	`]`
`55`	`55`	`]`
`56`	`56`	`peptides = VariantPeptidePool({create_aa_record(*x) for x in peptides_data})`