Skip to content

Commit 2a4ada9

Browse files
committed
v0.2.0
1 parent 42f93a0 commit 2a4ada9

36 files changed

+12051
-1028
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
data/*
2+
**output**/
23
.ipynb_checkpoints
34
*.png
45

Dockerfile

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
FROM python:3.8
2+
WORKDIR /workdir
3+
4+
COPY requirements.txt .
5+
RUN pip install -r requirements.txt
6+
7+
COPY FIDDLE/ ./FIDDLE/

FIDDLE/config.yaml renamed to FIDDLE/config-default.yaml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,17 @@ column_names:
55
var_name: variable_name
66
var_value: variable_value
77

8-
use_ordinal_encoding: no
8+
parallel: yes
9+
n_jobs: 72
10+
batch_size: 100
11+
912
hierarchical_sep: ":"
1013
hierarchical_levels: [0, 1, 2]
1114

15+
discretize: yes
16+
use_ordinal_encoding: no
17+
discretization: ~
18+
1219
value_types:
1320
# enter the feature type that you would like to override in the following format:
1421
FIRST_WARDID: Categorical

FIDDLE/config.py

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,27 @@
11
import os, yaml
2-
with open(os.path.join(os.path.dirname(__file__), 'config.yaml')) as f:
3-
config = yaml.full_load(f)
2+
import copy
43

5-
ID_col = config['column_names']['ID']
6-
var_col = config['column_names']['var_name']
7-
val_col = config['column_names']['var_value']
8-
t_col = config['column_names']['t']
9-
hierarchical_sep = config['hierarchical_sep']
10-
hierarchical_levels = config['hierarchical_levels']
4+
with open(os.path.join(os.path.dirname(__file__), 'config-default.yaml')) as f:
5+
config_default = yaml.safe_load(f)
116

12-
use_ordinal_encoding = config['use_ordinal_encoding']
13-
value_type_override = config['value_types']
7+
def load_config(fname):
8+
config = copy.deepcopy(config_default)
9+
if fname:
10+
config_custom = yaml.safe_load(open(fname, 'r'))
11+
for k, v in config_custom.items():
12+
config[k] = v
13+
return config
1414

15-
parallel = True
16-
n_jobs = 72
15+
16+
# Column names of the input tables. They are taken from the `column_names`
# section of the default configuration; any entry that is missing falls back
# to the built-in name given as the second argument of `.get`.
# `or {}` also covers a `column_names:` key that is present but left empty
# in the YAML file (which parses to None).
_column_names = config_default.get('column_names') or {}

ID_col = _column_names.get('ID', 'ID')
t_col = _column_names.get('t', 't')
var_col = _column_names.get('var_name', 'variable_name')
val_col = _column_names.get('var_value', 'variable_value')

FIDDLE/helpers.py

Lines changed: 38 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,19 @@
1-
import argparse
2-
def str2bool(v):
3-
if isinstance(v, bool):
4-
return v
5-
if v.lower() in ('yes', 'true', 't', 'y', '1'):
6-
return True
7-
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
8-
return False
9-
else:
10-
raise argparse.ArgumentTypeError('Boolean value expected.')
11-
12-
13-
from .config import *
141
import pandas as pd
152
import numpy as np
163
import scipy
174
import sparse
185
from collections import defaultdict
19-
20-
from joblib import Parallel, delayed, parallel_backend
216
from tqdm import tqdm
227

238
from sklearn.feature_selection import VarianceThreshold
249
import sklearn
2510
from collections import defaultdict
2611

12+
try:
13+
from .config import *
14+
except:
15+
from config import *
16+
2717
def print_header(*content, char='='):
2818
print()
2919
print(char * 80)
@@ -95,11 +85,11 @@ def get_unique_variables(df):
9585
return sorted(df[var_col].unique())
9686

9787
def get_frequent_numeric_variables(df_time_series, variables, threshold, args):
98-
data_path = args.data_path
88+
output_dir = args.output_dir
9989
df_population = args.df_population
10090
T, dt = args.T, args.dt
10191

102-
df_types = pd.read_csv(data_path + 'value_types.csv').set_index(var_col)['value_type']
92+
df_types = pd.read_csv(output_dir + 'value_types.csv').set_index(var_col)['value_type']
10393
numeric_vars = [col for col in variables if df_types[col] == 'Numeric']
10494
df_num_counts = calculate_variable_counts(df_time_series, df_population)[numeric_vars] #gets the count of each variable for each patient.
10595
variables_num_freq = df_num_counts.columns[df_num_counts.mean() >= threshold * np.floor(T/dt)]
@@ -136,23 +126,41 @@ def select_dtype(df, dtype, dtypes=None):
136126
assert False
137127
return
138128

139-
def smart_qcut_dummify(x, q, use_ordinal_encoding=False):
129+
130+
def compute_bin_edges(x, q):
    """Compute percentile-based bin edges for the numeric entries of a series.

    Parameters
    ----------
    x : pandas.Series
        Values of one variable; each entry is first passed through
        ``make_float`` (presumably leaves non-numeric strings as they are;
        confirm against its definition).
    q : int
        Number of percentile intervals; ``q + 1`` evenly spaced percentiles
        between 0 and 100 are evaluated.

    Returns
    -------
    tuple
        ``(x.name, bin_edges)`` where ``bin_edges`` is a list of the unique
        percentile values, or ``None`` when the entries selected by
        ``np.isreal`` have at most two distinct non-null values.
    """
    # ignore strings when computing the percentiles
    values = x.copy().apply(make_float)
    is_real = values.apply(np.isreal)
    numeric = values.loc[is_real]

    edges = None
    # One or two distinct numeric values need no binning, so edges stay None.
    if numeric.dropna().nunique() > 2:
        percentiles = np.linspace(0, 100, q+1)
        edges = list(np.unique(np.nanpercentile(numeric.astype(float).values, percentiles)))
    return (x.name, edges)
142+
143+
def smart_qcut_dummify_parallel(first_arg):
144+
return smart_qcut_dummify(*first_arg)
145+
146+
def smart_qcut_dummify(x, bin_edges, use_ordinal_encoding=False):
    """Turn one variable's values into indicator columns using given bin edges.

    Parameters
    ----------
    x : pandas.Series
        Values of one variable; may mix numeric and string entries.
    bin_edges : sequence
        Bin edges used when the variable has more than two distinct numeric
        values; not consulted otherwise.
    use_ordinal_encoding : bool
        If True, emit one ``'<name>>=<edge>'`` column per lower edge (1 when
        the value is >= that edge) plus dummies for the non-numeric entries;
        if False, bin the numeric entries with ``pd.cut`` and one-hot encode.

    Returns
    -------
    pandas.DataFrame
        The indicator columns, indexed like ``x``.
    """
    # ignore strings when binning: only entries selected by np.isreal are cut
    values = x.copy().apply(make_float)
    is_real = values.apply(np.isreal)
    n_distinct = values.loc[is_real].dropna().nunique()

    # Zero, one or two distinct numeric values: plain one-hot encoding of the raw series.
    if n_distinct <= 2:
        return pd.get_dummies(x, prefix=x.name)

    if not use_ordinal_encoding:
        # Replace numeric entries by their interval, then one-hot encode everything.
        values.loc[is_real] = pd.cut(values.loc[is_real].to_numpy(), bin_edges, duplicates='drop', include_lowest=True)
        return pd.get_dummies(values, prefix=values.name)

    # Ordinal encoding: one cumulative ">= edge" indicator per lower bin edge.
    lower_edges = bin_edges[:-1]
    col_names = ['{}>={}'.format(values.name, edge) for edge in lower_edges]
    out = pd.DataFrame(0, values.index, col_names)
    for col, edge in zip(col_names, lower_edges):
        out.loc[is_real, col] = (values.loc[is_real] >= edge).astype(int)
    # Entries not selected by the mask keep ordinary dummy columns alongside.
    leftovers = pd.get_dummies(values.where(~is_real, np.nan), prefix=values.name)
    return pd.concat([out, leftovers], axis=1)
@@ -202,13 +210,13 @@ def pivot_event_table(df):
202210
# Handle cases where the same variable is recorded multiple times with the same timestamp
203211
# Adjust the timestamps by epsilon so that all timestamps are unique
204212
eps = 1e-6
205-
m_dups = df.duplicated([ID_col, t_col, var_col], keep=False)
213+
m_dups = df.duplicated([t_col, var_col], keep=False)
206214
df_dups = df[m_dups].copy()
207215
for v, df_v in df_dups.groupby(var_col):
208216
df_dups.loc[df_v.index, t_col] += eps * np.arange(len(df_v))
209217

210218
df = pd.concat([df[~m_dups], df_dups])
211-
assert not df.duplicated([ID_col, t_col, var_col], keep=False).any()
219+
assert not df.duplicated([t_col, var_col], keep=False).any()
212220

213221
return pd.pivot_table(df, val_col, t_col, var_col, 'first')
214222

0 commit comments

Comments
 (0)