|
1 | | -import argparse |
2 | | -def str2bool(v): |
3 | | - if isinstance(v, bool): |
4 | | - return v |
5 | | - if v.lower() in ('yes', 'true', 't', 'y', '1'): |
6 | | - return True |
7 | | - elif v.lower() in ('no', 'false', 'f', 'n', '0'): |
8 | | - return False |
9 | | - else: |
10 | | - raise argparse.ArgumentTypeError('Boolean value expected.') |
11 | | - |
12 | | - |
13 | | -from .config import * |
14 | 1 | import pandas as pd |
15 | 2 | import numpy as np |
16 | 3 | import scipy |
17 | 4 | import sparse |
18 | 5 | from collections import defaultdict |
19 | | - |
20 | | -from joblib import Parallel, delayed, parallel_backend |
21 | 6 | from tqdm import tqdm |
22 | 7 |
|
23 | 8 | from sklearn.feature_selection import VarianceThreshold |
24 | 9 | import sklearn |
25 | 10 | from collections import defaultdict |
26 | 11 |
|
# Import shared constants (column names such as ID_col, t_col, var_col, and
# other project settings) whether this module is used as part of the package
# (relative import) or run as a standalone script (absolute import).
# NOTE: catch only ImportError — a bare `except:` would also swallow
# KeyboardInterrupt/SystemExit and hide genuine errors raised inside config.
try:
    from .config import *
except ImportError:
    from config import *
27 | 17 | def print_header(*content, char='='): |
28 | 18 | print() |
29 | 19 | print(char * 80) |
@@ -95,11 +85,11 @@ def get_unique_variables(df): |
95 | 85 | return sorted(df[var_col].unique()) |
96 | 86 |
|
97 | 87 | def get_frequent_numeric_variables(df_time_series, variables, threshold, args): |
98 | | - data_path = args.data_path |
| 88 | + output_dir = args.output_dir |
99 | 89 | df_population = args.df_population |
100 | 90 | T, dt = args.T, args.dt |
101 | 91 |
|
102 | | - df_types = pd.read_csv(data_path + 'value_types.csv').set_index(var_col)['value_type'] |
| 92 | + df_types = pd.read_csv(output_dir + 'value_types.csv').set_index(var_col)['value_type'] |
103 | 93 | numeric_vars = [col for col in variables if df_types[col] == 'Numeric'] |
104 | 94 | df_num_counts = calculate_variable_counts(df_time_series, df_population)[numeric_vars] #gets the count of each variable for each patient. |
105 | 95 | variables_num_freq = df_num_counts.columns[df_num_counts.mean() >= threshold * np.floor(T/dt)] |
@@ -136,23 +126,41 @@ def select_dtype(df, dtype, dtypes=None): |
136 | 126 | assert False |
137 | 127 | return |
138 | 128 |
|
def compute_bin_edges(x, q):
    """Compute quantile bin edges for one variable column.

    Args:
        x: pd.Series of raw recorded values; may mix numeric values and strings.
        q: int, number of quantile bins requested.

    Returns:
        (x.name, bin_edges) where bin_edges is a list of unique percentile
        edges (<= q+1 of them, duplicates collapsed), or None when the column
        has at most two distinct numeric values — such columns are dummified
        directly and need no binning.
    """
    # Coerce parseable entries to float; strings stay as-is and are masked out.
    z = x.copy().apply(make_float)
    m = z.apply(np.isreal)  # mask of entries that are actually numeric

    bin_edges = None
    # Hoist the (potentially expensive) distinct-value count; edges are only
    # needed when there are more than two distinct numeric values.
    n_unique = z.loc[m].dropna().nunique()
    if n_unique > 2:
        percentiles = np.linspace(0, 100, q + 1)
        bin_edges = list(np.unique(
            np.nanpercentile(z.loc[m].astype(float).values, percentiles)))
    return (x.name, bin_edges)
def smart_qcut_dummify_parallel(first_arg):
    """Adapter for pool/parallel map: unpack one packed argument tuple
    (x, bin_edges[, use_ordinal_encoding]) into smart_qcut_dummify."""
    packed_args = first_arg
    return smart_qcut_dummify(*packed_args)
def smart_qcut_dummify(x, bin_edges, use_ordinal_encoding=False):
    """Discretize a mixed numeric/string column and one-hot encode it.

    Args:
        x: pd.Series of raw values; may mix numbers and strings.
        bin_edges: precomputed edges from compute_bin_edges (used only when
            the column has more than two distinct numeric values).
        use_ordinal_encoding: if True, emit cumulative ">= edge" indicator
            columns instead of one column per bin.

    Returns:
        pd.DataFrame of 0/1 indicator columns prefixed with the series name.
    """
    # Coerce parseable entries to float; mask m marks the numeric ones.
    z = x.copy().apply(make_float)
    m = z.apply(np.isreal)

    # Count distinct numeric values once (the original computed this twice).
    n_unique = z.loc[m].dropna().nunique()
    if n_unique <= 2:
        # 0, 1, or 2 distinct numeric values: no binning needed, dummify the
        # raw values directly (this also covers all-string columns).
        out = pd.get_dummies(x, prefix=x.name)
    elif use_ordinal_encoding:
        # Cumulative encoding: one indicator per lower edge, 1 iff value >= edge.
        col_names = ['{}>={}'.format(z.name, bin_edge) for bin_edge in bin_edges[:-1]]
        out = pd.DataFrame(0, z.index, col_names)
        for col_name, bin_edge in zip(col_names, bin_edges[:-1]):
            out.loc[m, col_name] = (z.loc[m] >= bin_edge).astype(int)
        # Non-numeric (string) values still get plain dummy columns.
        out = pd.concat([out, pd.get_dummies(z.where(~m, np.nan), prefix=z.name)], axis=1)
    else:
        # Standard binning: assign each numeric value to its interval, then dummify.
        z.loc[m] = pd.cut(z.loc[m].to_numpy(), bin_edges, duplicates='drop', include_lowest=True)
        out = pd.get_dummies(z, prefix=z.name)
    return out
@@ -202,13 +210,13 @@ def pivot_event_table(df): |
202 | 210 | # Handle cases where the same variable is recorded multiple times with the same timestamp |
203 | 211 | # Adjust the timestamps by epsilon so that all timestamps are unique |
204 | 212 | eps = 1e-6 |
205 | | - m_dups = df.duplicated([ID_col, t_col, var_col], keep=False) |
| 213 | + m_dups = df.duplicated([t_col, var_col], keep=False) |
206 | 214 | df_dups = df[m_dups].copy() |
207 | 215 | for v, df_v in df_dups.groupby(var_col): |
208 | 216 | df_dups.loc[df_v.index, t_col] += eps * np.arange(len(df_v)) |
209 | 217 |
|
210 | 218 | df = pd.concat([df[~m_dups], df_dups]) |
211 | | - assert not df.duplicated([ID_col, t_col, var_col], keep=False).any() |
| 219 | + assert not df.duplicated([t_col, var_col], keep=False).any() |
212 | 220 |
|
213 | 221 | return pd.pivot_table(df, val_col, t_col, var_col, 'first') |
214 | 222 |
|
|
0 commit comments