Skip to content

Commit d835c16

Browse files
committed
change to make rfe work with all INFO fields
1 parent 20f1aa1 commit d835c16

File tree

1 file changed

+63
-8
lines changed

1 file changed

+63
-8
lines changed

VCF/VCFfilter/MLclassifier.py

+63-8
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
'''
66
import pandas as pd
77
import numpy as np
8-
import os
98
import pdb
109
import pickle
1110
import gc
@@ -68,13 +67,20 @@ def __process_df(self, tp_annotations, fp_annotations):
6867

6968
if DF_TP_columns.equals((DF_FP_columns)) is False:
7069
raise Exception("Indices in the passed dataframes are not equal")
71-
72-
# create 2 dataframes from tsv files skipping the 2 first columns,
73-
# as it is assumed that the 1st is 'chr' and 2nd is 'pos'
74-
DF_TP = pd.read_csv(tp_annotations, sep="\t", na_values=['.'],
75-
usecols=[i for i in range(2, len(DF_TP_columns))])
76-
DF_FP = pd.read_csv(fp_annotations, sep="\t", na_values=['.'],
77-
usecols=[i for i in range(2, len(DF_FP_columns))])
70+
71+
DF_TP = None
72+
DF_FP = None
73+
if DF_TP_columns[2] == '[3](null)' or DF_FP_columns[2] == '[3](null)':
74+
# all INFO columns are in dataframe. Parse the DF with different function
75+
DF_TP = self.__process_dfINFO(tp_annotations)
76+
DF_FP = self.__process_dfINFO(fp_annotations)
77+
else:
78+
# create 2 dataframes from tsv files skipping the 2 first columns,
79+
# as it is assumed that the 1st is 'chr' and 2nd is 'pos'
80+
DF_TP = pd.read_csv(tp_annotations, sep="\t", na_values=['.'],
81+
usecols=[i for i in range(2, len(DF_TP_columns))])
82+
DF_FP = pd.read_csv(fp_annotations, sep="\t", na_values=['.'],
83+
usecols=[i for i in range(2, len(DF_FP_columns))])
7884

7985
#assign outcome=1 if TP and 0 if FP
8086
DF_TP = DF_TP.assign(is_valid=1)
@@ -108,6 +114,55 @@ def __process_df(self, tp_annotations, fp_annotations):
108114
aDF_std.insert(loc=0, column='is_valid', value=DF_tr['is_valid'].values)
109115

110116
return aDF_std
117+
118+
def __get_ids(self, x):
    """
    Get the INFO id shared by the 'ID=value' (or bare FLAG) entries
    of a column

    Parameters
    ----------
    x : iterable
        Entries of one column. Each entry is either a string of the
        form 'ID=value' / 'ID', or a missing value (None or NaN)

    Returns
    -------
    str
        The id found in the column. If the entries carry different
        ids, the most frequent one is returned (ties are resolved in
        favour of the id seen first), so the result is deterministic

    Raises
    ------
    IndexError
        If the column does not contain any string entry
    """
    from collections import Counter

    # missing entries can be None or NaN ('.' is read as NaN through
    # na_values), so anything that is not a string is skipped
    ids = [i.partition('=')[0] for i in x if isinstance(i, str)]
    if not ids:
        raise IndexError("No INFO id found: all the entries are missing")
    return Counter(ids).most_common(1)[0][0]
128+
129+
def __get_values(self, x):
    """
    Get the value of each 'ID=value' (or bare FLAG) entry of a column

    Parameters
    ----------
    x : iterable
        Entries of one column. Each entry is either a string of the
        form 'ID=value' / 'ID', or a missing value (None or NaN)

    Returns
    -------
    values : list
        One element per entry of x, in the same order:
        0 if the entry is missing, 1 if the entry is a FLAG (no '='),
        otherwise the text after the first '=' (kept as a string)
    """
    values = []
    for i in x:
        # missing entries can be None or NaN ('.' is read as NaN
        # through na_values); both are encoded as 0
        if not isinstance(i, str):
            values.append(0)
            continue
        # split on the first '=' only, so a value that itself contains
        # '=' is not truncated
        _, sep, value = i.partition('=')
        # no '=' means that the entry is of FLAG type
        values.append(value if sep else 1)
    return values
142+
143+
def __process_dfINFO(self, annotations):
    """
    Function to parse the annotations file when these are obtained
    by using bcftools query -f '%INFO', i.e. all INFO fields are fetched

    The third column of the file is expected to hold the INFO string
    of each variant ('ID=value' and FLAG entries separated by ';').
    Each entry is assigned to a column by its ID and not by its
    position in the string, because different variants can carry
    different sets of INFO fields

    Parameters
    ----------
    annotations : str
        Path to file with variant annotations obtained using
        'bcftools query'

    Returns
    -------
    new_DF : dataframe
             One row per variant and one column per INFO id, with the
             columns in the order in which the ids are first seen.
             Cells contain the text after the first '=' (kept as a
             string), 1 for a FLAG that is present and 0 when the
             variant does not have that id
    """
    DF = pd.read_csv(annotations, sep="\t", na_values=['.'])

    # the first 2 columns are assumed to be 'chr' and 'pos', so the
    # INFO string is taken from the 3rd one
    records = []
    for info in DF.iloc[:, 2]:
        fields = {}
        # '.' is read as NaN: such a variant has no annotations at all
        if isinstance(info, str):
            for elm in info.split(";"):
                if not elm:
                    continue
                key, sep, value = elm.partition("=")
                # no '=' means that the entry is of FLAG type
                fields[key] = value if sep else 1
        records.append(fields)

    # dict is used as an ordered set to keep the first-seen order
    ids = {}
    for fields in records:
        for key in fields:
            ids.setdefault(key, None)

    # ids that are absent in a variant are explicitly set to 0
    new_DF = pd.DataFrame({key: [fields.get(key, 0) for fields in records]
                           for key in ids}, index=DF.index)
    return new_DF
111166

112167
def train(self, tp_annotations, fp_annotations, outprefix, test_size=0.25):
113168
"""

0 commit comments

Comments
 (0)