22# -*- coding:utf-8 -*-
33u"""
44Created by ygidtu@gmail.com at 2020.05.07
5-
65This scripts contains the class handle the reference file
6+
7+ Modified by AD 2025/01/12 to include only specified transcripts, otherwise their exon coordinates still occupy the alignments.
8+
79"""
810import glob
911import gzip
@@ -349,16 +351,15 @@ def index_gtf(cls, input_gtf):
349351
350352 return output_gtf
351353
352- def __load_gtf__ (self ):
353-
354+ def __load_gtf__ (self , transcripts_to_show : list [str ]| None = None ):
354355 u"""
355356 Load transcripts inside of region from gtf file
356357 :param region: target region
357358 :return: list of Transcript
358- """
359+ """
360+ # AD - passing transcripts_to_show
359361 transcripts = {}
360362 exons = {}
361-
362363 for rec in Reader .read_gtf (self .path , self .region ):
363364 start = max (rec .start , self .region .start )
364365 end = min (rec .end , self .region .end )
@@ -369,6 +370,13 @@ def __load_gtf__(self):
369370 break
370371
371372 if re .search (r"(rna|transcript|cds)" , rec .feature , re .I ):
373+
374+ if transcripts_to_show :
375+ _name = rec .transcript_name if "transcript_name" in rec .attributes else rec .transcript_id
376+ if _name not in transcripts_to_show :
377+ logger .info (f"Skipping transcript { _name } " )
378+ continue
379+
372380 if rec .transcript_id not in transcripts .keys ():
373381 transcripts [rec .transcript_id ] = Transcript (
374382 chromosome = rec .contig ,
@@ -380,7 +388,8 @@ def __load_gtf__(self):
380388 gene = rec .gene_name if "gene_name" in rec .attributes else "" ,
381389 transcript = rec .transcript_name if "transcript_name" in rec .attributes else "" ,
382390 exons = []
383- )
391+ )
392+
384393 elif re .search (r"(exon)" , rec .feature , re .I ):
385394 if rec .transcript_id not in exons .keys ():
386395 exons [rec .transcript_id ] = []
@@ -581,9 +590,16 @@ def load(self,
581590 assert isinstance (region , GenomicLoci ), "region should be a GenomicLoci object"
582591 if transcripts is None :
583592 transcripts = []
593+ elif isinstance (transcripts , str ):
594+ # AD when no transcripts are specified, transcripts is an empty string, not None
595+ if transcripts == "" :
596+ transcripts = []
597+ else :
598+ transcripts = transcripts .split ("," )
584599 self .region = region
600+ # AD - pass specified transcripts early
585601 if self .category == "gtf" :
586- self .__load_gtf__ ()
602+ self .__load_gtf__ (transcripts )
587603 if self .add_local_domain :
588604 self .__load_local_domain__ (region )
589605
0 commit comments