-
Notifications
You must be signed in to change notification settings - Fork 217
Open
Description
When I generate counterfactual explanations and then look at the factual stored in the returned explanation structure, I observe small differences from the factual I passed in. Namely, the value in column "BP" changes slightly (76.67 becomes 76.669998). A minimal working example (MWE) follows:
import os
import random
from urllib.request import urlretrieve
import dice_ml
from lightgbm import LGBMRegressor
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
def diabetes_df():
    """Download the diabetes data set and return a reproducible 200-row sample.

    The tab-separated file is fetched to a local temporary path, loaded with
    pandas, and its ``SEX`` column is converted to a categorical column with
    string labels.

    Returns:
        A ``pandas.DataFrame`` with 200 rows drawn with ``random_state=1``.
    """
    # Safety measure for macOS, see
    # https://docs.python.org/3/library/urllib.request.html#module-urllib.request
    os.environ["no_proxy"] = "*"

    source_url = "https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt"
    local_path, _headers = urlretrieve(source_url)

    table = pd.read_csv(local_path, sep="\t")
    # Cast to str before category so the category labels are strings.
    table = table.astype({"SEX": str})
    table = table.astype({"SEX": "category"})
    return table.sample(200, random_state=1)
def data_and_model(df, numerical, categorical, target_column):
    """Fit a preprocessing + LightGBM regression pipeline on ``df``.

    Args:
        df: DataFrame holding the feature columns and ``target_column``.
        numerical: Names of the columns to pass through ``StandardScaler``.
        categorical: Names of the columns to pass through ``OneHotEncoder``
            (categories unseen during fitting are ignored).
        target_column: Name of the column to predict.

    Returns:
        A tuple ``(X, y, model)``: ``df`` without the target column, the
        target column itself, and the fitted pipeline.
    """
    np.random.seed(1)

    features = df.drop(target_column, axis=1)
    target = df[target_column]

    scale_numeric = Pipeline(steps=[("scaler", StandardScaler())])
    encode_categorical = Pipeline(
        steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
    )
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", scale_numeric, numerical),
            ("cat", encode_categorical, categorical),
        ]
    )

    pipeline = Pipeline(
        steps=[("preprocessor", preprocessor), ("regressor", LGBMRegressor())]
    )
    fitted = pipeline.fit(features, target)
    return features, target, fitted
# Data set: the sampled diabetes data, with every feature except SEX treated as numerical
df = diabetes_df()
numerical = ["AGE", "BMI", "BP", "S1", "S2", "S3", "S4", "S5", "S6"]
categorical = ["SEX"]
x_train, y_train, model = data_and_model(df, numerical, categorical, "Y")
# The factual to explain: the first row of the training features
factuals = x_train[0:1]
# Seed both Python's and NumPy's global RNGs before calling DiCE
# (presumably so the genetic search is repeatable -- confirm against DiCE internals)
seed = 5
random.seed(seed)
np.random.seed(seed)
# Ask for counterfactual explanations
# DiCE receives features and target together in a single dataframe
df_for_dice = pd.concat([x_train, y_train], axis=1)
dice_data = dice_ml.Data(dataframe=df_for_dice, continuous_features=numerical, outcome_name="Y")
dice_model = dice_ml.Model(model=model, backend="sklearn", model_type="regressor")
dice_explainer = dice_ml.Dice(dice_data, dice_model, method="genetic")
# AGE and SEX are left out of this list; only the remaining features are offered for change
features_to_vary = ["BMI", "BP", "S1", "S2", "S3", "S4", "S5", "S6"]
explanations = dice_explainer.generate_counterfactuals(
factuals,
total_CFs=5,
desired_range=[60, 90],
features_to_vary=features_to_vary,
posthoc_sparsity_algorithm="binary",
)
# Print the factual as stored in the returned explanation, then the factual that was passed in,
# so the two can be compared
print(explanations.cf_examples_list[0].test_instance_df)
print(factuals)
And here is the output:
AGE SEX BMI BP S1 S2 S3 S4 S5 S6 Y
0 60 1 23.4 76.669998 247 148.0 65.0 3.8 5.1358 77 93.585579
AGE SEX BMI BP S1 S2 S3 S4 S5 S6
246 60 1 23.4 76.67 247 148.0 65.0 3.8 5.1358 77
Metadata
Metadata
Assignees
Labels
No labels