BIMSBbioinfo · J-Moravec · Apr 23, 2025 · Jun 24, 2025
diff --git a/R/readAnnotate.R b/R/readAnnotate.R
@@ -27,11 +27,9 @@ bed12ToExons<-function(ref){
                        as.integer(unlist(strsplit(as.character(ref$V11),"," ))))
     rep.ref = ref[rep(1:nrow(ref),ref[,10]),] # replicate rows occurs as many as its exon number
 
+    exon.id = Map(f = function(x, y){ if(x == "+") 1:y else y:1 }, ref[, 6], ref[, 10])
+    exon.id = unlist(exon.id)
 
-    exon.id = unlist( mapply( function(x,y)
-                              if(x=="+"){return(1:y)}
-                              else{return(y:1)}, 
-                              ref[,6],ref[,10]))
     rep.ref$V5=exon.id
 
     rep.ref$V3 = rep.ref$V2+b.start.size[,1]+b.start.size[,2] # calculate exon start and ends
@@ -69,10 +67,8 @@ bed12ToIntrons<-function(ref){
     # replicate rows occurs as many as its exon number
     rep.ref = ref[rep(1:nrow(ref),ref[,10]),] 
 
-    exon.id = unlist( mapply( function(x,y) 
-                                if(x=="+"){return(1:y)}
-                                else{return(y:1)} ,
-                                ref[,6],ref[,10]  ) )
+    exon.id = Map(f = function(x, y){ if(x == "+") 1:y else y:1 }, ref[, 6], ref[, 10])
+    exon.id = unlist(exon.id)
     rep.ref$V5 = exon.id
 
     # calculate exon start and ends

diff --git a/R/readData.R b/R/readData.R
@@ -474,45 +474,51 @@ setGeneric("readTranscriptFeatures",
 setMethod("readTranscriptFeatures", 
           signature(location = "character"),
           function(location,remove.unusual,up.flank ,down.flank ,unique.prom){
-            
-            # readBed6
+
+            # readBed12
             message('Reading the table...\r')
-            bed=readTableFast(location,header=FALSE,skip="auto")                    
+            bed = readTableFast(location, header = FALSE, skip = "auto")
+
+            if(ncol(bed) < 12){
+                stop("readTranscriptFeatures require 12 column bed file, only ",
+                     ncol(bed), " columns found.", call. = FALSE)
+            }
+
             if(remove.unusual)
               bed=bed[grep("_", as.character(bed[,1]),invert=TRUE),]
-            
+
             # introns
             message('Calculating intron coordinates...\r')
             introns    = convertBed2Introns(bed)
             # exons
             message('Calculating exon coordinates...\r')
             exons    = convertBed2Exons(bed)
-            
+
             # get the locations of TSSes
             message('Calculating TSS coordinates...\r')
             tss=bed
             #  + strand
             tss[tss$V6=="+",3] = tss[tss$V6=="+",2]
             #  - strand
             tss[tss$V6=="-",2]=tss[tss$V6=="-",3]
-            
+
             tssg = GRanges(seqnames=as.character(tss$V1),
                            ranges=IRanges(start=tss$V2, end=tss$V3),
                            strand=as.character(tss$V6),
                            score=rep(0,nrow(tss)),
                            name=tss$V4)
-            
+
             message('Calculating promoter coordinates...\r')
             # get the locations of promoters
             # + strand
             bed[bed$V6=="+",3]=bed[bed$V6=="+",2]+down.flank
             bed[bed$V6=="+",2]=bed[bed$V6=="+",2]-up.flank
-            
+
             #  - strand
             bed[bed$V6=="-",2]=bed[bed$V6=="-",3]-down.flank
             bed[bed$V6=="-",3]=bed[bed$V6=="-",3]+up.flank
-            
-            
+
+
             if(! unique.prom){
               prom.df = (bed[,c(1,2,3,4,6)])
               prom = GRanges(seqnames=as.character(prom.df$V1),
@@ -528,7 +534,7 @@ setMethod("readTranscriptFeatures",
                              score=rep(0,nrow(prom.df)),
                              name=rep(".",nrow(prom.df)) )
             }
-            
+
             message('Outputting the final GRangesList...\r\n')
             GRangesList(exons=exons,introns=introns,promoters=prom,TSSes=tssg)
           })

diff --git a/inst/unitTests/bed12.bed b/inst/unitTests/bed12.bed
@@ -0,0 +1,2 @@
+ch1	10000	20000	f1	0	+	10000	20000	0	3	1000,1000,1000,	2000,4000,6000,
+ch1	20000	30000	f2	0	-	20000	30000	0	3	1000,1000,1000,	2000,4000,6000,
diff --git a/inst/unitTests/bed6.bed b/inst/unitTests/bed6.bed
@@ -0,0 +1,2 @@
+ch1	10000	20000	f1	0	+
+ch1	20000	30000	f2	0	-
diff --git a/inst/unitTests/test_annotateWithGeneParts.R b/inst/unitTests/test_annotateWithGeneParts.R
@@ -0,0 +1,53 @@
+test_annotateWithGeneParts = function(){
+    # bed with two features:
+    # 10 000 - 20 000 (exons: 12 000 - 13 000, 14 000 - 15 000, 16 000 - 17 000)
+    bed12 = system.file("unitTests/bed12.bed", package = "genomation")
+    features = suppressMessages(readTranscriptFeatures(bed12))
+
+    # 4 features with respect to above bed12:
+    # intergenic
+    # promoter
+    # intronic
+    # exonic
+    target = GRanges(
+        rep('ch1', 4),
+        IRanges(c(8001, 10001, 13001, 14001), c(8500, 10500, 13500, 14500))
+    )
+    result = annotateWithGeneParts(target, features)
+
+    members_expected = matrix(
+        c(0, 0, 0,
+          1, 0, 0,
+          0, 0, 1,
+          0, 1, 0),
+        ncol = 3, nrow = 4, byrow = TRUE,
+        dimnames = list(NULL, c("prom", "exon", "intron"))
+    )
+    checkIdentical(members_expected, result@members)
+
+
+    # distance is calculated (for both positive strand):
+    # start/end - TSS - 1
+    # if > 0, then +2 (last base of codon)
+    # smaller of abs(dist_start), abs(dist_end)
+    dist_to_tss_expected = data.frame(
+        target.row = c(1L, 2L, 3L, 4L),
+        dist.to.feature = c(-1501, 0, 3002, 4002),
+        feature.name = rep("f1", 4),
+        feature.strand = factor("+", levels = c("+", "-", "*")),
+        row.names = make.unique(rep("1", 4))
+        )
+
+    checkIdentical(dist_to_tss_expected, [email protected])
+
+    df = as.data.frame(result)
+    df_expected = data.frame(
+        "prom" = c(0, 1, 0, 0),
+        "exon" = c(0, 0, 0, 1),
+        "intron" = c(0, 0, 1, 0),
+        "dist.to.feature" = c(-1501, 0, 3002, 4002),
+        "feature.name" = rep("f1", 4),
+        row.names = 1:4
+        )
+    checkIdentical(df_expected, df)
+}
diff --git a/inst/unitTests/test_readTranscriptFeatures.R b/inst/unitTests/test_readTranscriptFeatures.R
@@ -0,0 +1,58 @@
+test_readTranscriptFeatures = function(){
+    bed6 = system.file("unitTests/bed6.bed", package = "genomation")
+    checkException(suppressMessages(readTranscriptFeatures(bed6)))
+
+    bed12 = system.file("unitTests/bed12.bed", package = "genomation")
+    features = suppressMessages(readTranscriptFeatures(bed12))
+
+    checkTrue(inherits(features, "GenomicRangesList"))
+    checkIdentical(c("exons", "introns", "promoters", "TSSes"), names(features))
+
+    exons = as.data.frame(features$exons)
+    exons_expected = data.frame(
+        seqnames = factor("ch1"),
+        start = c(12001L, 14001L, 16001L, 22001L, 24001L, 26001L),
+        end = c(13000L, 15000L, 17000L, 23000L, 25000L, 27000L),
+        width = 1000L,
+        strand = factor(c("+", "+", "+", "-", "-", "-"), levels = c("+", "-", "*")),
+        score = as.numeric(c(1:3, 3:1)),
+        name = c(rep("f1", 3), rep("f2", 3))
+    )
+    checkIdentical(exons_expected, exons)
+
+    introns = as.data.frame(features$introns)
+    introns_expected = data.frame(
+        seqnames = factor("ch1"),
+        start = c(13001L, 15001L, 23001L, 25001L),
+        end = c(14000L, 16000L, 24000L, 26000L),
+        width = 1000L,
+        strand = factor(c("+", "+", "-", "-"), levels = c("+", "-", "*")),
+        score = c(1, 2, 2, 1),
+        name = c("f1", "f1", "f2", "f2")
+    )
+    checkIdentical(introns_expected, introns)
+
+    promoters = as.data.frame(features$promoters)
+    promoters_expected = data.frame(
+        seqnames = factor("ch1"),
+        start = c(9000L, 29000L),
+        end = c(11000L, 31000L),
+        width = 2001L,
+        strand = factor(c("+", "-"), levels = c("+", "-", "*")),
+        score = 0,
+        name = "."
+    )
+    checkIdentical(promoters_expected, promoters)
+
+    TSSes = as.data.frame(features$TSSes)
+    TSSes_expected = data.frame(
+        seqnames = factor("ch1"),
+        start = c(10000L, 30000L),
+        end = c(10000L, 30000L),
+        width = 1L,
+        strand = factor(c("+", "-"), levels = c("+", "-", "*")),
+        score = 0,
+        name = c("f1", "f2")
+    )
+    checkIdentical(TSSes_expected, TSSes)
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		ch1 10000 20000 f1 0 + 10000 20000 0 3 1000,1000,1000, 2000,4000,6000,
		ch1 20000 30000 f2 0 - 20000 30000 0 3 1000,1000,1000, 2000,4000,6000,