|
5 | 5 | '''
|
6 | 6 | import pandas as pd
|
7 | 7 | import numpy as np
|
8 |
| -import os |
9 | 8 | import pdb
|
10 | 9 | import pickle
|
11 | 10 | import gc
|
@@ -68,13 +67,20 @@ def __process_df(self, tp_annotations, fp_annotations):
|
68 | 67 |
|
69 | 68 | if DF_TP_columns.equals((DF_FP_columns)) is False:
|
70 | 69 | raise Exception("Indices in the passed dataframes are not equal")
|
71 |
| - |
72 |
| - # create 2 dataframes from tsv files skipping the 2 first columns, |
73 |
| - # as it is assumed that the 1st is 'chr' and 2nd is 'pos' |
74 |
| - DF_TP = pd.read_csv(tp_annotations, sep="\t", na_values=['.'], |
75 |
| - usecols=[i for i in range(2, len(DF_TP_columns))]) |
76 |
| - DF_FP = pd.read_csv(fp_annotations, sep="\t", na_values=['.'], |
77 |
| - usecols=[i for i in range(2, len(DF_FP_columns))]) |
| 70 | + |
| 71 | + DF_TP = None |
| 72 | + DF_FP = None |
| 73 | + if DF_TP_columns[2] == '[3](null)' or DF_FP_columns[2] == '[3](null)': |
| 74 | + # all INFO columns are in dataframe. Parse the DF with different function |
| 75 | + DF_TP = self.__process_dfINFO(tp_annotations) |
| 76 | + DF_FP = self.__process_dfINFO(fp_annotations) |
| 77 | + else: |
| 78 | + # create 2 dataframes from tsv files skipping the 2 first columns, |
| 79 | + # as it is assumed that the 1st is 'chr' and 2nd is 'pos' |
| 80 | + DF_TP = pd.read_csv(tp_annotations, sep="\t", na_values=['.'], |
| 81 | + usecols=[i for i in range(2, len(DF_TP_columns))]) |
| 82 | + DF_FP = pd.read_csv(fp_annotations, sep="\t", na_values=['.'], |
| 83 | + usecols=[i for i in range(2, len(DF_FP_columns))]) |
78 | 84 |
|
79 | 85 | #assign outcome=1 if TP and 0 if FP
|
80 | 86 | DF_TP = DF_TP.assign(is_valid=1)
|
@@ -108,6 +114,55 @@ def __process_df(self, tp_annotations, fp_annotations):
|
108 | 114 | aDF_std.insert(loc=0, column='is_valid', value=DF_tr['is_valid'].values)
|
109 | 115 |
|
110 | 116 | return aDF_std
|
| 117 | + |
| 118 | + def __get_ids(self, x): |
| 119 | + ids = [] |
| 120 | + for i in x: |
| 121 | + # sometimes, the value is None |
| 122 | + if i is None: |
| 123 | + continue |
| 124 | + elms = i.split('=') |
| 125 | + ids.append(elms[0]) |
| 126 | + new_ids = list(set(ids)) |
| 127 | + return new_ids[0] |
| 128 | + |
| 129 | + def __get_values(self, x): |
| 130 | + values = [] |
| 131 | + for i in x: |
| 132 | + if i is None: |
| 133 | + values.append(0) |
| 134 | + continue |
| 135 | + elms = i.split('=') |
| 136 | + if len(elms)==1: |
| 137 | + # value is of FLAG type |
| 138 | + values.append(1) |
| 139 | + else: |
| 140 | + values.append(elms[1]) |
| 141 | + return values |
| 142 | + |
| 143 | + def __process_dfINFO(self, annotations): |
| 144 | + """ |
| 145 | + Function to parse the annotations file when these are obtained |
| 146 | + by using bcftools query -f '%INFO', i.e. all INFO fields are fetched |
| 147 | +
|
| 148 | + Parameters |
| 149 | + ---------- |
| 150 | + annotations : str |
| 151 | + Path to file with variant annotations obtained using |
| 152 | + 'bcftools query' |
| 153 | +
|
| 154 | + Returns |
| 155 | + ------- |
| 156 | + new_DF : dataframe |
| 157 | + """ |
| 158 | + DF_columns = pd.read_csv(annotations, sep="\t", na_values=['.'], nrows=1).columns |
| 159 | + DF = pd.read_csv(annotations, sep="\t", na_values=['.'],usecols=[i for i in range(2, len(DF_columns))]) |
| 160 | + DF.rename(columns={"[3](null)":"INFO"},inplace=True) |
| 161 | + DF = DF.INFO.str.split(";",expand=True,) |
| 162 | + ids=DF.apply(self.__get_ids) |
| 163 | + DF.columns=ids |
| 164 | + new_DF=DF.apply(self.__get_values) |
| 165 | + return new_DF |
111 | 166 |
|
112 | 167 | def train(self, tp_annotations, fp_annotations, outprefix, test_size=0.25):
|
113 | 168 | """
|
|
0 commit comments