Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions dice_ml/data_interfaces/base_data_interface.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""Module containing base class for data interfaces for dice-ml."""

from abc import ABC, abstractmethod
import numpy as np
import pandas as pd


class _BaseData(ABC):
Expand All @@ -27,6 +29,52 @@ def set_continuous_feature_indexes(self, query_instance):
self.continuous_feature_indexes = [query_instance.columns.get_loc(name) for name in
self.continuous_feature_names]

def from_dummies(self, data, prefix_sep='_'):
    """Gets the original data from dummy encoded data with k levels.

    For every feature in ``self.categorical_feature_names`` the expected
    one-hot column names are derived from the unique values of that feature
    in ``self.data_df`` (``<feature><prefix_sep><value>``). The columns of
    ``data`` matching those names are collapsed into a single categorical
    column named after the feature, whose value for each row is the level of
    the column holding the row-wise maximum.

    :param data: one-hot-encoded pandas DataFrame.
    :param prefix_sep: separator placed between the feature name and the level
        in the one-hot column names.
    :return: a copy of ``data`` in which the matched one-hot columns are replaced
        by one categorical column per feature; ``data`` itself is not modified.
    """
    out = data.copy()
    for feat in self.categorical_feature_names:
        prefix = feat + prefix_sep
        # first, derive column names in the one-hot-encoded data from the original data
        # by joining the feature name and each of its unique values, e.g. education_school
        expected_cols = {prefix + str(val) for val in self.data_df[feat].unique()}
        # keep the order in which the matching columns appear in the encoded data
        match_cols = [c for c in data.columns if c in expected_cols]

        # then, recreate original data by removing the prefixes - based on the GitHub issue comment:
        # https://github.com/pandas-dev/pandas/issues/8745#issuecomment-417861271
        # Only the leading prefix is stripped: a level that itself contains the
        # prefix text (e.g. "job_seeker" for feature "job") must stay intact.
        labels = np.array([c[len(prefix):] for c in match_cols])
        out[feat] = pd.Categorical(labels[np.argmax(data[match_cols].values, axis=1)])
        out.drop(match_cols, axis=1, inplace=True)
    return out

def one_hot_encode_data(self, data):
    """Returns ``data`` with its categorical features expanded into dummy columns.

    The columns listed in ``self.categorical_feature_names`` are encoded and
    every level is kept (no level is dropped).
    """
    categorical_columns = self.categorical_feature_names
    encoded = pd.get_dummies(data, columns=categorical_columns, drop_first=False)
    return encoded

def get_decoded_data(self, data, encoding='one-hot'):
    """Gets the original data from encoded data.

    :param data: encoded data. For 'one-hot' encoding it must be a pandas
        DataFrame or a numpy array; for 'label' encoding it is passed to the
        ``pd.DataFrame`` constructor.
    :param encoding: either 'one-hot' or 'label'.
    :return: ``data`` unchanged if it is empty; for 'one-hot', the result of
        ``self.from_dummies``; for 'label', a DataFrame whose columns are
        ``self.feature_names``.
    :raises ValueError: if ``data`` has an unsupported type for 'one-hot'
        encoding, or if ``encoding`` is not one of the supported values.
    """
    if len(data) == 0:
        return data

    index = list(range(len(data)))
    if encoding == 'one-hot':
        if isinstance(data, pd.DataFrame):
            return self.from_dummies(data)
        if isinstance(data, np.ndarray):
            # arrays carry no column names, so attach the one-hot column names first
            data = pd.DataFrame(data=data, index=index,
                                columns=self.ohe_encoded_feature_names)
            return self.from_dummies(data)
        raise ValueError("data should be a pandas dataframe or a numpy array")

    if encoding == 'label':
        return pd.DataFrame(data=data, index=index,
                            columns=self.feature_names)

    # fail loudly instead of silently returning None for an unknown encoding
    raise ValueError("encoding should be 'one-hot' or 'label', got %r" % (encoding,))

@abstractmethod
def __init__(self, params):
"""The init method needs to be implemented by the inheriting classes."""
Expand Down
36 changes: 0 additions & 36 deletions dice_ml/data_interfaces/private_data_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,6 @@ def _validate_and_set_mad(self, params):
else:
self.mad = {}

def one_hot_encode_data(self, data):
    """Expands the categorical features of ``data`` into one dummy column per level.

    The features to encode are taken from ``self.categorical_feature_names``;
    no level is dropped.
    """
    dummy_options = {
        'drop_first': False,
        'columns': self.categorical_feature_names,
    }
    return pd.get_dummies(data, **dummy_options)

def normalize_data(self, df, encoding='one-hot'):
"""Normalizes continuous features to make them fall in the range [0,1]."""
result = df.copy()
Expand Down Expand Up @@ -254,17 +250,6 @@ def from_label(self, data):
out[column] = self.labelencoder[self.feature_names[column]].inverse_transform([round(out[column])])[0]
return out

def from_dummies(self, data, prefix_sep='_'):
    """Gets the original data from dummy encoded data with k levels.

    For every feature in ``self.categorical_feature_names``, the columns of
    ``data`` whose name starts with ``<feature><prefix_sep>`` are collapsed
    into a single categorical column named after the feature. The value for
    each row is the level of the column holding the row-wise maximum.

    :param data: one-hot-encoded pandas DataFrame with string column names.
    :param prefix_sep: separator placed between the feature name and the level
        in the one-hot column names.
    :return: a copy of ``data`` in which the one-hot columns are replaced by one
        categorical column per feature; ``data`` itself is not modified.
    """
    out = data.copy()
    for feature_name in self.categorical_feature_names:
        prefix = feature_name + prefix_sep
        # Match on the leading prefix only: a substring test would let feature
        # "type" also capture the columns of a feature named "subtype".
        cols = [c for c in data.columns if c.startswith(prefix)]
        # Strip just the leading prefix so levels containing the prefix text survive.
        labs = np.array([c[len(prefix):] for c in cols])
        out[feature_name] = pd.Categorical(
            labs[np.argmax(data[cols].values, axis=1)])
        out.drop(cols, axis=1, inplace=True)
    return out

def get_decimal_precisions(self):
""""Gets the precision of continuous features in the data."""
precisions = [0]*len(self.continuous_feature_names)
Expand All @@ -276,27 +261,6 @@ def get_decimal_precisions(self):
precisions[ix] = self.type_and_precision[feature_name][1]
return precisions

def get_decoded_data(self, data, encoding='one-hot'):
    """Gets the original data from encoded data.

    :param data: encoded data. For 'one-hot' encoding it must be a pandas
        DataFrame or a numpy array; for 'label' encoding it is passed to the
        ``pd.DataFrame`` constructor.
    :param encoding: either 'one-hot' or 'label'.
    :return: ``data`` unchanged if it is empty; for 'one-hot', the result of
        ``self.from_dummies``; for 'label', a DataFrame whose columns are
        ``self.feature_names``.
    :raises ValueError: if ``data`` has an unsupported type for 'one-hot'
        encoding, or if ``encoding`` is not one of the supported values.
    """
    if len(data) == 0:
        return data

    index = list(range(len(data)))
    if encoding == 'one-hot':
        if isinstance(data, pd.DataFrame):
            return self.from_dummies(data)
        if isinstance(data, np.ndarray):
            # arrays carry no column names, so attach the one-hot column names first
            data = pd.DataFrame(data=data, index=index,
                                columns=self.ohe_encoded_feature_names)
            return self.from_dummies(data)
        raise ValueError("data should be a pandas dataframe or a numpy array")

    if encoding == 'label':
        return pd.DataFrame(data=data, index=index,
                            columns=self.feature_names)

    # fail loudly instead of silently returning None for an unknown encoding
    raise ValueError("encoding should be 'one-hot' or 'label', got %r" % (encoding,))

def prepare_df_for_ohe_encoding(self):
"""Create base dataframe to do OHE for a single instance or a set of instances"""
levels = []
Expand Down
46 changes: 0 additions & 46 deletions dice_ml/data_interfaces/public_data_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,10 +204,6 @@ def get_data_type(self, col):
else:
raise ValueError("Unknown data type of feature %s: must be int or float" % col)

def one_hot_encode_data(self, data):
    """Builds the one-hot-encoded version of ``data``.

    Each feature named in ``self.categorical_feature_names`` is replaced by
    one dummy column per level, with all levels retained.
    """
    features_to_encode = self.categorical_feature_names
    return pd.get_dummies(
        data,
        columns=features_to_encode,
        drop_first=False,
    )

def normalize_data(self, df):
"""Normalizes continuous features to make them fall in the range [0,1]."""
result = df.copy()
Expand Down Expand Up @@ -436,27 +432,6 @@ def from_label(self, data):
out[c] = self.labelencoder[self.feature_names[c]].inverse_transform([round(out[c])])[0]
return out

def from_dummies(self, data, prefix_sep='_'):
    """Gets the original data from dummy encoded data with k levels.

    For every feature in ``self.categorical_feature_names`` the expected
    one-hot column names are derived from the unique values of that feature
    in ``self.data_df`` (``<feature><prefix_sep><value>``). The columns of
    ``data`` matching those names are collapsed into a single categorical
    column named after the feature, whose value for each row is the level of
    the column holding the row-wise maximum.

    :param data: one-hot-encoded pandas DataFrame.
    :param prefix_sep: separator placed between the feature name and the level
        in the one-hot column names.
    :return: a copy of ``data`` in which the matched one-hot columns are replaced
        by one categorical column per feature; ``data`` itself is not modified.
    """
    out = data.copy()
    for feat in self.categorical_feature_names:
        prefix = feat + prefix_sep
        # first, derive column names in the one-hot-encoded data from the original data
        # by joining the feature name and each of its unique values, e.g. education_school
        expected_cols = {prefix + str(val) for val in self.data_df[feat].unique()}
        # keep the order in which the matching columns appear in the encoded data
        match_cols = [c for c in data.columns if c in expected_cols]

        # then, recreate original data by removing the prefixes - based on the GitHub issue comment:
        # https://github.com/pandas-dev/pandas/issues/8745#issuecomment-417861271
        # Only the leading prefix is stripped: a level that itself contains the
        # prefix text (e.g. "job_seeker" for feature "job") must stay intact.
        labels = np.array([c[len(prefix):] for c in match_cols])
        out[feat] = pd.Categorical(labels[np.argmax(data[match_cols].values, axis=1)])
        out.drop(match_cols, axis=1, inplace=True)
    return out

def get_decimal_precisions(self, output_type="list"):
""""Gets the precision of continuous features in the data."""
# if the precision of a continuous feature is not given, we use the maximum precision of the modes to capture the
Expand All @@ -481,27 +456,6 @@ def get_decimal_precisions(self, output_type="list"):
elif output_type == "dict":
return precisions_dict

def get_decoded_data(self, data, encoding='one-hot'):
    """Gets the original data from encoded data.

    :param data: encoded data. For 'one-hot' encoding it must be a pandas
        DataFrame or a numpy array; for 'label' encoding it is passed to the
        ``pd.DataFrame`` constructor.
    :param encoding: either 'one-hot' or 'label'.
    :return: ``data`` unchanged if it is empty; for 'one-hot', the result of
        ``self.from_dummies``; for 'label', a DataFrame whose columns are
        ``self.feature_names``.
    :raises ValueError: if ``data`` has an unsupported type for 'one-hot'
        encoding, or if ``encoding`` is not one of the supported values.
    """
    if len(data) == 0:
        return data

    index = list(range(len(data)))
    if encoding == 'one-hot':
        if isinstance(data, pd.DataFrame):
            return self.from_dummies(data)
        if isinstance(data, np.ndarray):
            # arrays carry no column names, so attach the one-hot column names first
            data = pd.DataFrame(data=data, index=index,
                                columns=self.ohe_encoded_feature_names)
            return self.from_dummies(data)
        raise ValueError("data should be a pandas dataframe or a numpy array")

    if encoding == 'label':
        return pd.DataFrame(data=data, index=index,
                            columns=self.feature_names)

    # fail loudly instead of silently returning None for an unknown encoding
    raise ValueError("encoding should be 'one-hot' or 'label', got %r" % (encoding,))

def prepare_df_for_ohe_encoding(self):
"""Create base dataframe to do OHE for a single instance or a set of instances"""
levels = []
Expand Down