Skip to content

Factual presented in explanation is different from original factual #433

@fabiensatalia

Description

@fabiensatalia

When generating counterfactual explanations and then inspecting the factual stored in the returned explanation structure, I observe small differences. Namely, the value in column "BP" changes slightly (it looks like a float32 round-trip of the original value). MWE follows:

import os
import random
from urllib.request import urlretrieve

import dice_ml
from lightgbm import LGBMRegressor
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


def diabetes_df():
    """Download the NCSU diabetes dataset and return a fixed 200-row sample.

    Returns:
        pd.DataFrame: 200 rows sampled with random_state=1; the "SEX"
        column is cast to str and then to a pandas categorical dtype.
    """
    data_url = "https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt"
    # safety measure for MacOS, see
    # https://docs.python.org/3/library/urllib.request.html#module-urllib.request
    os.environ["no_proxy"] = "*"
    local_path, _ = urlretrieve(data_url)
    frame = pd.read_csv(local_path, sep="\t")
    frame = frame.astype({"SEX": str}).astype({"SEX": "category"})
    return frame.sample(200, random_state=1)


def data_and_model(df, numerical, categorical, target_column):
    """Fit a preprocessing + LGBMRegressor pipeline on *df*.

    Args:
        df: Input frame containing features and the target column.
        numerical: Column names to standard-scale.
        categorical: Column names to one-hot encode (unknowns ignored).
        target_column: Name of the regression target.

    Returns:
        Tuple (X, y, model) where X is df without the target column,
        y is the target series, and model is the fitted Pipeline.
    """
    np.random.seed(1)
    # Scale numeric columns; one-hot encode categoricals.
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=[("scaler", StandardScaler())]), numerical),
            ("cat", Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]), categorical),
        ]
    )
    #
    y = df[target_column]
    X = df.drop(target_column, axis=1)
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("regressor", LGBMRegressor())])
    # Pipeline.fit returns the fitted pipeline itself.
    return X, y, pipeline.fit(X, y)


# Data set: 200-row diabetes sample; "SEX" is the only categorical feature.
df = diabetes_df()
numerical = ["AGE", "BMI", "BP", "S1", "S2", "S3", "S4", "S5", "S6"]
categorical = ["SEX"]
x_train, y_train, model = data_and_model(df, numerical, categorical, "Y")
# Take the first training row as the factual instance to explain.
factuals = x_train[0:1]

# Seed both RNGs so counterfactual generation is reproducible.
seed = 5
random.seed(seed)
np.random.seed(seed)

# Ask for counterfactual explanations
# dice_ml.Data expects the outcome column to be present in the frame.
df_for_dice = pd.concat([x_train, y_train], axis=1)
dice_data = dice_ml.Data(dataframe=df_for_dice, continuous_features=numerical, outcome_name="Y")
dice_model = dice_ml.Model(model=model, backend="sklearn", model_type="regressor")
dice_explainer = dice_ml.Dice(dice_data, dice_model, method="genetic")
# AGE and SEX are held fixed; all other features may vary.
features_to_vary = ["BMI", "BP", "S1", "S2", "S3", "S4", "S5", "S6"]
explanations = dice_explainer.generate_counterfactuals(
    factuals,
    total_CFs=5,
    desired_range=[60, 90],
    features_to_vary=features_to_vary,
    posthoc_sparsity_algorithm="binary",
)
# Compare the factual DiCE stored against the one we passed in:
# the BP value differs (76.669998 vs 76.67), which is the reported bug.
print(explanations.cf_examples_list[0].test_instance_df)
print(factuals)

And here is the output:

   AGE SEX   BMI         BP   S1     S2    S3   S4      S5  S6          Y
0   60   1  23.4  76.669998  247  148.0  65.0  3.8  5.1358  77  93.585579
     AGE SEX   BMI     BP   S1     S2    S3   S4      S5  S6
246   60   1  23.4  76.67  247  148.0  65.0  3.8  5.1358  77

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions