|
1 | | -import argparse |
2 | | -def str2bool(v): |
3 | | - if isinstance(v, bool): |
4 | | - return v |
5 | | - if v.lower() in ('yes', 'true', 't', 'y', '1'): |
6 | | - return True |
7 | | - elif v.lower() in ('no', 'false', 'f', 'n', '0'): |
8 | | - return False |
9 | | - else: |
10 | | - raise argparse.ArgumentTypeError('Boolean value expected.') |
11 | | - |
12 | | - |
13 | | -from .config import * |
14 | 1 | import pandas as pd |
15 | 2 | import numpy as np |
16 | 3 | import scipy |
17 | 4 | import sparse |
18 | 5 | from collections import defaultdict |
19 | | - |
20 | | -from joblib import Parallel, delayed, parallel_backend |
21 | 6 | from tqdm import tqdm |
22 | 7 |
|
23 | 8 | from sklearn.feature_selection import VarianceThreshold |
24 | 9 | import sklearn |
25 | 10 | from collections import defaultdict |
26 | 11 |
|
# Import shared constants (column names such as ID_col, t_col, var_col, and
# other project settings) whether this module is used as part of the package
# (relative import) or run as a standalone script (absolute import).
# NOTE: catch only ImportError — a bare `except:` would also swallow
# KeyboardInterrupt/SystemExit and hide genuine errors raised inside config.
try:
    from .config import *
except ImportError:
    from config import *
27 | 17 | def print_header(*content, char='='): |
28 | 18 | print() |
29 | 19 | print(char * 80) |
@@ -95,11 +85,11 @@ def get_unique_variables(df): |
95 | 85 | return sorted(df[var_col].unique()) |
96 | 86 |
|
97 | 87 | def get_frequent_numeric_variables(df_time_series, variables, threshold, args): |
98 | | - data_path = args.data_path |
| 88 | + output_dir = args.output_dir |
99 | 89 | df_population = args.df_population |
100 | 90 | T, dt = args.T, args.dt |
101 | 91 |
|
102 | | - df_types = pd.read_csv(data_path + 'value_types.csv').set_index(var_col)['value_type'] |
| 92 | + df_types = pd.read_csv(output_dir + 'value_types.csv').set_index(var_col)['value_type'] |
103 | 93 | numeric_vars = [col for col in variables if df_types[col] == 'Numeric'] |
104 | 94 | df_num_counts = calculate_variable_counts(df_time_series, df_population)[numeric_vars] #gets the count of each variable for each patient. |
105 | 95 | variables_num_freq = df_num_counts.columns[df_num_counts.mean() >= threshold * np.floor(T/dt)] |
@@ -136,23 +126,41 @@ def select_dtype(df, dtype, dtypes=None): |
136 | 126 | assert False |
137 | 127 | return |
138 | 128 |
|
def compute_bin_edges(x, q):
    """Compute quantile bin edges for one variable column.

    Args:
        x: pd.Series of raw recorded values; may mix numeric values and strings.
        q: int, number of quantile bins requested.

    Returns:
        (x.name, bin_edges) where bin_edges is a list of unique percentile
        edges (<= q+1 of them, duplicates collapsed), or None when the column
        has at most two distinct numeric values — such columns are dummified
        directly and need no binning.
    """
    # Coerce parseable entries to float; strings stay as-is and are masked out.
    z = x.copy().apply(make_float)
    m = z.apply(np.isreal)  # mask of entries that are actually numeric

    bin_edges = None
    # Hoist the (potentially expensive) distinct-value count; edges are only
    # needed when there are more than two distinct numeric values.
    n_unique = z.loc[m].dropna().nunique()
    if n_unique > 2:
        percentiles = np.linspace(0, 100, q + 1)
        bin_edges = list(np.unique(
            np.nanpercentile(z.loc[m].astype(float).values, percentiles)))
    return (x.name, bin_edges)
def smart_qcut_dummify_parallel(first_arg):
    """Adapter for pool/parallel map: unpack one packed argument tuple
    (x, bin_edges[, use_ordinal_encoding]) into smart_qcut_dummify."""
    packed_args = first_arg
    return smart_qcut_dummify(*packed_args)
def smart_qcut_dummify(x, bin_edges, use_ordinal_encoding=False):
    """Discretize a mixed numeric/string column and one-hot encode it.

    Args:
        x: pd.Series of raw values; may mix numbers and strings.
        bin_edges: precomputed edges from compute_bin_edges (used only when
            the column has more than two distinct numeric values).
        use_ordinal_encoding: if True, emit cumulative ">= edge" indicator
            columns instead of one column per bin.

    Returns:
        pd.DataFrame of 0/1 indicator columns prefixed with the series name.
    """
    # Coerce parseable entries to float; mask m marks the numeric ones.
    z = x.copy().apply(make_float)
    m = z.apply(np.isreal)

    # Count distinct numeric values once (the original computed this twice).
    n_unique = z.loc[m].dropna().nunique()
    if n_unique <= 2:
        # 0, 1, or 2 distinct numeric values: no binning needed, dummify the
        # raw values directly (this also covers all-string columns).
        out = pd.get_dummies(x, prefix=x.name)
    elif use_ordinal_encoding:
        # Cumulative encoding: one indicator per lower edge, 1 iff value >= edge.
        col_names = ['{}>={}'.format(z.name, bin_edge) for bin_edge in bin_edges[:-1]]
        out = pd.DataFrame(0, z.index, col_names)
        for col_name, bin_edge in zip(col_names, bin_edges[:-1]):
            out.loc[m, col_name] = (z.loc[m] >= bin_edge).astype(int)
        # Non-numeric (string) values still get plain dummy columns.
        out = pd.concat([out, pd.get_dummies(z.where(~m, np.nan), prefix=z.name)], axis=1)
    else:
        # Standard binning: assign each numeric value to its interval, then dummify.
        z.loc[m] = pd.cut(z.loc[m].to_numpy(), bin_edges, duplicates='drop', include_lowest=True)
        out = pd.get_dummies(z, prefix=z.name)
    return out
@@ -202,13 +210,13 @@ def pivot_event_table(df): |
202 | 210 | # Handle cases where the same variable is recorded multiple times with the same timestamp |
203 | 211 | # Adjust the timestamps by epsilon so that all timestamps are unique |
204 | 212 | eps = 1e-6 |
205 | | - m_dups = df.duplicated([ID_col, t_col, var_col], keep=False) |
| 213 | + m_dups = df.duplicated([t_col, var_col], keep=False) |
206 | 214 | df_dups = df[m_dups].copy() |
207 | 215 | for v, df_v in df_dups.groupby(var_col): |
208 | 216 | df_dups.loc[df_v.index, t_col] += eps * np.arange(len(df_v)) |
209 | 217 |
|
210 | 218 | df = pd.concat([df[~m_dups], df_dups]) |
211 | | - assert not df.duplicated([ID_col, t_col, var_col], keep=False).any() |
| 219 | + assert not df.duplicated([t_col, var_col], keep=False).any() |
212 | 220 |
|
213 | 221 | return pd.pivot_table(df, val_col, t_col, var_col, 'first') |
214 | 222 |
|
|
0 commit comments