3
3
from __future__ import print_function
4
4
5
5
import os .path as op
6
+ import pandas as pd
6
7
7
8
from mirtop .mirna import fasta , mapper
8
9
from mirtop .mirna .realign import read_id
@@ -25,69 +26,69 @@ def convert_gff_counts(args):
25
26
UID Read miRNA Variant iso_5p iso_3p iso_add3p iso_snp Sample1 Sample2 ... Sample N
26
27
"""
27
28
sep = "\t "
28
- variant_header = sep . join ( ['iso_5p' , 'iso_3p' ,
29
- 'iso_add3p' , 'iso_snp' ])
29
+ variant_header = ['iso_5p' , 'iso_3p' ,
30
+ 'iso_add3p' , 'iso_snp' ]
30
31
if args .add_extra :
31
32
precursors = fasta .read_precursor (args .hairpin , args .sps )
32
33
matures = mapper .read_gtf_to_precursor (args .gtf )
33
- variant_header = sep .join ([variant_header ,
34
- 'iso_5p_nt' , 'iso_3p_nt' ,
35
- 'iso_add3p_nt' , 'iso_snp_nt' ])
34
+ variant_header = variant_header + ['iso_5p_nt' , 'iso_3p_nt' , 'iso_add3p_nt' , 'iso_snp_nt' ]
36
35
37
36
logger .info ("INFO Reading GFF file %s" , args .gff )
38
37
logger .info ("INFO Writing TSV file to directory %s" , args .out )
39
38
40
39
gff_file = open (args .gff , 'r' )
41
40
out_file = op .join (args .out , "%s.tsv" % op .splitext (op .basename (args .gff ))[0 ])
41
+ all_lines = []
42
42
missing_parent = 0
43
43
missing_mirna = 0
44
44
unvalid_uid = 0
45
- with open (out_file , 'w' ) as outh :
46
-
47
- for samples_line in gff_file :
48
- if samples_line .startswith ("## COLDATA:" ):
49
- samples = sep .join (samples_line .strip ().split ("COLDATA:" )[1 ].strip ().split ("," ))
50
- header = sep .join (['UID' , 'Read' , 'miRNA' , 'Variant' ,
51
- variant_header , samples ])
52
- print (header , file = outh )
53
- break
54
-
55
- for mirna_line in gff_file :
56
- gff = feature (mirna_line )
57
- attr = gff .attributes
58
- UID = attr ["UID" ]
59
- Read = attr ["Read" ]
60
- mirna = attr ["Name" ]
61
- parent = attr ["Parent" ]
62
- variant = attr ["Variant" ]
63
- try :
64
- read_id (UID )
65
- except KeyError :
66
- unvalid_uid += 1
45
+ #with open(out_file, 'w') as outh:
46
+
47
+ for samples_line in gff_file :
48
+ if samples_line .startswith ("## COLDATA:" ):
49
+ samples = [sep .join (samples_line .strip ().split ("COLDATA:" )[1 ].strip ().split ("," ))]
50
+ #header = sep.join(['UID', 'Read', 'miRNA', 'Variant',
51
+ # variant_header, samples])
52
+ #print(header, file=outh)
53
+ break
54
+
55
+ for mirna_line in gff_file :
56
+ gff = feature (mirna_line )
57
+ attr = gff .attributes
58
+ UID = attr ["UID" ]
59
+ Read = attr ["Read" ]
60
+ mirna = attr ["Name" ]
61
+ parent = attr ["Parent" ]
62
+ variant = attr ["Variant" ]
63
+ try :
64
+ read_id (UID )
65
+ except KeyError :
66
+ unvalid_uid += 1
67
+ continue
68
+
69
+ expression = [sep .join (attr ["Expression" ].strip ().split ("," ))]
70
+ cols_variants = _expand (variant )
71
+ logger .debug ("COUNTS::Read:%s" % Read )
72
+ logger .debug ("COUNTS::EXTRA:%s" % variant )
73
+ if args .add_extra :
74
+ if parent not in precursors :
75
+ missing_parent += 1
67
76
continue
68
-
69
- expression = sep .join (attr ["Expression" ].strip ().split ("," ))
70
- cols_variants = sep .join (_expand (variant ))
71
- logger .debug ("COUNTS::Read:%s" % Read )
72
- logger .debug ("COUNTS::EXTRA:%s" % variant )
73
- if args .add_extra :
74
- if parent not in precursors :
75
- missing_parent += 1
76
- continue
77
- if mirna not in matures [parent ]:
78
- missing_mirna += 1
79
- continue
80
- extra = variant_with_nt (mirna_line , precursors , matures )
81
- if extra == "Invalid" :
82
- continue
83
- logger .debug ("COUNTS::EXTRA:%s" % extra )
84
- cols_variants = sep .join ([cols_variants ] + _expand (extra , True ))
85
- summary = sep .join ([UID , Read , mirna , variant ,
86
- cols_variants , expression ])
87
- logger .debug (summary )
88
- print (summary , file = outh )
89
-
90
- gff_file .close ()
77
+ if mirna not in matures [parent ]:
78
+ missing_mirna += 1
79
+ continue
80
+ extra = variant_with_nt (mirna_line , precursors , matures )
81
+ if extra == "Invalid" :
82
+ continue
83
+ logger .debug ("COUNTS::EXTRA:%s" % extra )
84
+ cols_variants = cols_variants + _expand (extra , True )
85
+ summary = [UID , Read , mirna , variant ] + cols_variants + expression
86
+ logger .debug (summary )
87
+ all_lines .append (summary )
88
+ #import pdb; pdb.set_trace()
89
+ df = pd .DataFrame (all_lines , columns = ['UID' , 'Read' , 'miRNA' , 'Variant' ] + variant_header + samples )
90
+ df = df .drop_duplicates ()
91
+ df .to_csv (out_file , sep = "\t " , index = False )
91
92
logger .info ("Missing Parents in hairpin file: %s" % missing_parent )
92
93
logger .info ("Missing MiRNAs in GFF file: %s" % missing_mirna )
93
94
logger .info ("Non valid UID: %s" % unvalid_uid )
0 commit comments