Skip to content

Commit 90e31ea

Browse files
author
Chenghao (Trevor) Zhu
authored
Merge pull request #891 from uclahs-cds/czhu-fix-call-variant
Fix callVariant
2 parents f65a298 + 1ecfe9b commit 90e31ea

19 files changed

Lines changed: 2363 additions & 17 deletions

.pylintrc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,8 @@ disable=raw-checker-failed,
7272
superfluous-parens,
7373
unnecessary-lambda-assignment,
7474
unnecessary-dunder-call,
75-
unspecified-encoding
75+
unspecified-encoding,
76+
redefined-builtin
7677

7778
# Enable the message, report, category or checker with the given id(s). You can
7879
# either give multiple identifier separated by comma (,) or put this option

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,14 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
1010

1111
## [Unreleased]
1212

13+
## [1.4.4] - 2025-02-11
14+
15+
### Fixed
16+
17+
- Fixed issue reported in #889 where variant bubbles alignment failed due to incorrect handling of in-bridge and out-bridge nodes with the same subgraph ID. The fix ensures that only in-bridge and out-bridge nodes connected to any of the nodes in the members of the variant bubble are considered.
18+
19+
- Fixed issue that `callVariant` fails on transcripts with SEC very close to the start codon.
20+
1321
## [1.4.3] - 2025-01-18
1422

1523
### Fixed

moPepGen/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from . import constant
99

1010

11-
__version__ = '1.4.3'
11+
__version__ = '1.4.4'
1212

1313
## Error messages
1414
ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron'

moPepGen/svgraph/TVGNode.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import copy
55
from collections import deque
66
import math
7+
import uuid
78
from Bio.Seq import Seq
89
from moPepGen.dna import DNASeqRecordWithCoordinates
910
from moPepGen import seqvar, aa
@@ -29,7 +30,7 @@ def __init__(self, seq:DNASeqRecordWithCoordinates,
2930
variants:List[seqvar.VariantRecordWithCoordinate]=None,
3031
branch:bool=False, orf:List[int]=None, reading_frame_index:int=None,
3132
subgraph_id:str=None, global_variant:seqvar.VariantRecord=None,
32-
level:int=0, was_bridge:bool=False):
33+
level:int=0, was_bridge:bool=False, id:str=None):
3334
""" Constructor for TVGNode.
3435
3536
Args:
@@ -48,6 +49,7 @@ def __init__(self, seq:DNASeqRecordWithCoordinates,
4849
self.global_variant = global_variant
4950
self.level = level
5051
self.was_bridge = was_bridge
52+
self.id = id or str(uuid.uuid4())
5153

5254
def create_node(self, seq:DNASeqRecordWithCoordinates,
5355
variants:List[seqvar.VariantRecordWithCoordinate]=None,

moPepGen/svgraph/ThreeFrameTVG.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1227,6 +1227,16 @@ def find_bridge_nodes_between(self, start:TVGNode, end:TVGNode, members:Set[TVGN
12271227
subgraph_out:Set[TVGNode] = set()
12281228
visited = set()
12291229
queue:Deque[TVGNode] = deque([e.out_node for e in start.out_edges])
1230+
1231+
# This `end_nodes` is to set the boundray of the current variant bubble.
1232+
# Any "inbrige" and "outbrange" nodes to and from any node outside of
1233+
# the current variant bubble should not be considered.
1234+
end_nodes = {}
1235+
for n in members:
1236+
for out_node in n.get_out_nodes():
1237+
if out_node not in members:
1238+
end_nodes[out_node.id] = out_node
1239+
12301240
while queue:
12311241
cur = queue.pop()
12321242
len_before = len(visited)
@@ -1273,6 +1283,8 @@ def find_bridge_nodes_between(self, start:TVGNode, end:TVGNode, members:Set[TVGN
12731283
and cur.get_first_subgraph_id() != cur.get_last_subgraph_id():
12741284
subgraph_out.add(cur)
12751285
continue
1286+
if cur.id in end_nodes:
1287+
continue
12761288
elif cur is not end:
12771289
# This is when an altSplice indel/sub carries additional frameshift
12781290
# variant, and the indel frameshift is very closed to the end of

moPepGen/svgraph/VariantPeptideDict.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -465,8 +465,7 @@ def translational_modification(self, seq:Seq, metadata:VariantPeptideMetadata,
465465
cur_metadata.label = label
466466
cur_metadata.has_variants = bool(cur_variants)
467467

468-
if is_valid:
469-
cur_metadata_2 = copy.copy(cur_metadata)
468+
if is_valid or is_valid_start:
470469
cur_nodes = []
471470
cut_offset = sec.location.start
472471
for node in nodes:
@@ -481,15 +480,18 @@ def translational_modification(self, seq:Seq, metadata:VariantPeptideMetadata,
481480
else:
482481
cut_offset = max(0, cut_offset - len(node.seq.seq))
483482
continue
484-
cur_metadata_2.segments = self.create_peptide_segments(cur_nodes)
485-
yield seq_mod, cur_metadata_2
486-
487-
if is_valid_start:
488-
cur_seq=seq_mod[1:]
489-
cur_nodes[0] = cur_nodes[0].copy()
490-
cur_nodes[0].truncate_left(1)
491-
cur_metadata.segments = self.create_peptide_segments(cur_nodes)
492-
yield cur_seq, cur_metadata
483+
if is_valid:
484+
cur_metadata_2 = copy.copy(cur_metadata)
485+
cur_metadata_2.segments = self.create_peptide_segments(cur_nodes)
486+
yield seq_mod, cur_metadata_2
487+
488+
if is_valid_start:
489+
cur_metadata_2 = copy.copy(cur_metadata)
490+
cur_seq=seq_mod[1:]
491+
cur_nodes[0] = cur_nodes[0].copy()
492+
cur_nodes[0].truncate_left(1)
493+
cur_metadata_2.segments = self.create_peptide_segments(cur_nodes)
494+
yield cur_seq, cur_metadata_2
493495

494496
@staticmethod
495497
def any_overlaps_and_all_missing_variant(nodes:Iterable[PVGNode], variant:VariantRecord):

moPepGen/util/validate_variant_calling.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ def parse_args(subparsers:argparse._SubParsersAction):
5353
nargs='*',
5454
metavar='<values>'
5555
)
56+
common.add_args_debug_level(parser)
5657
common.add_args_reference(parser, proteome=True, index=False)
5758
parser.set_defaults(func=main)
5859
common.print_help_if_missing_args(parser)
@@ -120,12 +121,13 @@ def call_variant(gvf_files:Path, ref_dir:Path, output_fasta:Path, graph_output_d
120121
args.min_mw = 500.
121122
args.min_length = 7
122123
args.max_length = 25
123-
args.max_variants_per_node = 9
124-
args.additional_variants_per_misc = 2
124+
args.max_variants_per_node = (7,)
125+
args.additional_variants_per_misc = (2,)
125126
args.min_nodes_to_collapse = 30
126127
args.naa_to_collapse = 5
127128
args.noncanonical_transcripts = False
128129
args.threads = 1
130+
args.timeout_seconds = 900
129131
call_variant_peptide(args=args)
130132

131133
def call_brute_force(gvf_files:Path, ref_dir:Path, output_path:str, force:bool,

test/files/fuzz/54/AltSplice.gvf

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
##fileformat=VCFv4.2
2+
##mopepgen_version=1.4.3
3+
##parser=parseRMATS
4+
##reference_index=/hot/project/process/MissingPeptides/MISP-000132-MissingPeptidesPanCanP1/ref/GRCh38-EBI-GENCODE45/moPepGen/1.4.1
5+
##genome_fasta=
6+
##annotation_gtf=
7+
##source=AltSplice
8+
##CHROM=<Description="Gene ID">
9+
##INFO=<ID=TRANSCRIPT_ID,Number=1,Type=String,Description="Transcript ID">
10+
##INFO=<ID=GENE_SYMBOL,Number=1,Type=String,Description="Gene Symbol">
11+
##INFO=<ID=GENOMIC_POSITION,Number=1,Type=String,Description="Genomic Position">
12+
##INFO=<ID=START,Number=1,Type=Integer,Description="Start Position">
13+
##INFO=<ID=END,Number=1,Type=Integer,Description="End Position">
14+
##INFO=<ID=DONOR_START,Number=1,Type=Integer,Description="Donor Start Position">
15+
##INFO=<ID=DONOR_END,Number=1,Type=Integer,Description="Donor End Position">
16+
##INFO=<ID=COORDINATE,Number=1,Type=String,Description="Coordinate for Insertion or Substitution">
17+
#CHROM POS ID REF ALT QUAL FILTER INFO
18+
ENSG00000105976.16 52054 A5SS_52054-52497-52576 G <INS> . . TRANSCRIPT_ID=ENST00000495962.1;DONOR_GENE_ID=ENSG00000105976.16;DONOR_START=52498;DONOR_END=52576;GENE_SYMBOL=MET;GENOMIC_POSITION=chr7:116724249:116724250
19+
ENSG00000105976.16 52577 A5SS_52054-52497-52580 G <DEL> . . TRANSCRIPT_ID=ENST00000495962.1;START=52577;END=52580;GENE_SYMBOL=MET;GENOMIC_POSITION=chr7:116724772:116724775
20+
ENSG00000105976.16 67755 A5SS_59664-67754-67818 G <DEL> . . TRANSCRIPT_ID=ENST00000495962.1;START=67755;END=67818;GENE_SYMBOL=MET;GENOMIC_POSITION=chr7:116739950:116740013
21+
ENSG00000105976.16 68657 A5SS_67889-68656-68740 A <DEL> . . TRANSCRIPT_ID=ENST00000495962.1;START=68657;END=68740;GENE_SYMBOL=MET;GENOMIC_POSITION=chr7:116740852:116740935

test/files/fuzz/54/annotation.gtf

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
chr1 . gene 1 126182 . + . gene_id ENSG00000105976.16; gene_type protein_coding; gene_name MET;
2+
chr1 . transcript 51951 83219 . + . gene_id ENSG00000105976.16; transcript_id ENST00000495962.1; gene_type protein_coding; gene_name MET;
3+
chr1 . exon 51951 52054 . + . gene_id ENSG00000105976.16; transcript_id ENST00000495962.1; gene_type protein_coding; gene_name MET;
4+
chr1 . exon 52577 52652 . + . gene_id ENSG00000105976.16; transcript_id ENST00000495962.1; gene_type protein_coding; gene_name MET;
5+
chr1 . exon 59473 59664 . + . gene_id ENSG00000105976.16; transcript_id ENST00000495962.1; gene_type protein_coding; gene_name MET;
6+
chr1 . exon 67755 67889 . + . gene_id ENSG00000105976.16; transcript_id ENST00000495962.1; gene_type protein_coding; gene_name MET;
7+
chr1 . exon 68657 68830 . + . gene_id ENSG00000105976.16; transcript_id ENST00000495962.1; gene_type protein_coding; gene_name MET;
8+
chr1 . exon 83160 83219 . + . gene_id ENSG00000105976.16; transcript_id ENST00000495962.1; gene_type protein_coding; gene_name MET;

test/files/fuzz/54/brute_force.txt

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
ATHFLSLGRSVAGATTNVCDR
2+
ATHFLSLGRSVAGATTNVCDRR
3+
ATHWLSLGRSVAGATTNVCDR
4+
ATHWLSLGRSVAGATTNVCDRR
5+
CGFCHDK
6+
CGFCHDKCVR
7+
CGWCHDK
8+
CGWCHDKCVR
9+
FMQCLQK
10+
GDLTIANLGTSEGRFMQCLQK
11+
KCGFCHDK
12+
KCGFCHDKCVR
13+
KCGWCHDK
14+
KCGWCHDKCVR
15+
MATHFLSLGRSVAGATTNVCDR
16+
MATHFLSLGRSVAGATTNVCDRR
17+
MATHWLSLGRSVAGATTNVCDR
18+
MATHWLSLGRSVAGATTNVCDRR
19+
SVAGATTNVCDR
20+
SVAGATTNVCDRR
21+
SVAGATTNVCDRRNA

0 commit comments

Comments
 (0)