
Custom CV #10


Open · wants to merge 14 commits into base: dev
1 change: 1 addition & 0 deletions .gitignore
@@ -50,4 +50,5 @@ MANIFEST
.venv*/

.DS_Store
.vscode/settings.json
devstuff.py
144 changes: 110 additions & 34 deletions src/ppscore/calculation.py
@@ -1,8 +1,10 @@
from sklearn import tree
from sklearn import preprocessing
from sklearn.dummy import DummyRegressor, DummyClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, f1_score

from sklearn.metrics import f1_score
from sklearn.compose import make_column_transformer, ColumnTransformer
import numpy as np
import pandas as pd
from pandas.api.types import (
is_numeric_dtype,
@@ -15,17 +17,21 @@
)

# If the number is 4, it is possible to detect patterns when the same observation occurs at least 4 times. Increasing the limit also increases the minimum number of observations. This matters because below this limit sklearn throws an error, which we catch and turn into a score of 0.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

CV_ITERATIONS = 4

RANDOM_SEED = 587136


def _calculate_model_cv_score_(df, target, feature, task, **kwargs):
def _calculate_model_cv_score_(df, target, feature, task, cv, **kwargs):
"Calculates the mean model score based on cross-validation"
# Sources about the used methods:
# https://scikit-learn.org/stable/modules/tree.html
# https://scikit-learn.org/stable/modules/cross_validation.html
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html

metric = task["metric_key"]
model = task["model"]
# shuffle the rows - this is important for cross-validation
@@ -36,6 +42,9 @@ def _calculate_model_cv_score_(df, target, feature, task, **kwargs):
df = df.sample(frac=1, random_state=RANDOM_SEED, replace=False)
Owner: In case we use the default CV, I think we still need this.

Author: Yeah, I think I meant to bring it back and forgot.

Owner: Alright, no worries :)


# preprocess target
# TODO: this has a risk of leaking information if a target encoder were to be used, or misleading a
# TODO: tree if some classes are not observed in some training folds
# TODO: we need to make pre-processing a part of CV pipeline
if task["type"] == "classification":
label_encoder = preprocessing.LabelEncoder()
df[target] = label_encoder.fit_transform(df[target])
@@ -44,26 +53,46 @@ def _calculate_model_cv_score_(df, target, feature, task, **kwargs):
target_series = df[target]

# preprocess feature
if _dtype_represents_categories(df[feature]):
one_hot_encoder = preprocessing.OneHotEncoder()
array = df[feature].__array__()
sparse_matrix = one_hot_encoder.fit_transform(array.reshape(-1, 1))
feature_input = sparse_matrix
else:
# reshaping needed because there is only 1 feature
feature_input = df[feature].values.reshape(-1, 1)
preprocess = None
if df[feature].dtype == object:
# Dealing with a categorical feature here:
preprocess = ColumnTransformer(
transformers=[("ct", OneHotEncoder(), [feature])]
)

# reshaping needed because there is only 1 feature: coerce to DataFrame
feature_df = df[feature].to_frame()

if preprocess is None:
pipeline_model = model
else:
pipeline_model = make_pipeline(preprocess, model)

# # IMPORTANT: changes on master TODO: decide what to do with them
# if _dtype_represents_categories(df[feature]):
# one_hot_encoder = preprocessing.OneHotEncoder()
# array = df[feature].__array__()
# sparse_matrix = one_hot_encoder.fit_transform(array.reshape(-1, 1))
# feature_input = sparse_matrix
# else:
# # reshaping needed because there is only 1 feature
# feature_input = df[feature].values.reshape(-1, 1)

# Pull the groups out if passed
groups = kwargs.get("groups", None)

# Run cross-validation with the CV specified
# Cross-validation is StratifiedKFold for classification, KFold for regression
Owner: It would be great to keep the annotation somewhere when no explicit CV object is passed but just the number of folds.

Author: It is already in the score function. I also added it to the docstring.

# CV on one core (n_jobs=1; default) has shown to be fastest
scores = cross_val_score(
model, feature_input, target_series, cv=CV_ITERATIONS, scoring=metric
pipeline_model, feature_df, target_series, cv=cv, scoring=metric, groups=groups
)

return scores.mean()
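
As an aside, a minimal self-contained sketch of the pattern the new code adopts (hypothetical column names, not the PR's exact code): the encoder lives inside the estimator, so it is re-fitted on each training fold, and `groups` is forwarded so group-aware splitters work.

```python
# Sketch only: pipeline-inside-CV with a group-aware splitter.
# Column names ("color", "session", "y") are hypothetical.
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

df = pd.DataFrame({
    "color": np.random.choice(["red", "green", "blue"], 300),
    "session": np.random.randint(0, 30, 300),
})
df["y"] = (df["color"] == "red").astype(int)

# Because the encoder is part of the estimator, it is re-fitted on every
# training fold; category information cannot leak across folds.
preprocess = ColumnTransformer([("ct", OneHotEncoder(handle_unknown="ignore"), ["color"])])
pipeline_model = make_pipeline(preprocess, DecisionTreeClassifier())

scores = cross_val_score(
    pipeline_model,
    df[["color"]],
    df["y"],
    cv=GroupKFold(n_splits=3),
    groups=df["session"],
    scoring="f1_weighted",
)
print(scores.mean())
```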


def _normalized_mae_score(model_mae, naive_mae):
"Normalizes the model MAE score, given the baseline score"
"""Normalizes the model MAE score, given the baseline score"""
# # Value range of MAE is [0, infinity), 0 is best
# 10, 5 ==> 0 because worse than naive
# 10, 20 ==> 0.5
@@ -74,17 +103,31 @@ def _normalized_mae_score(model_mae, naive_mae):
return 1 - (model_mae / naive_mae)


def _mae_normalizer(df, y, model_score):
"In case of MAE, calculates the baseline score for y and derives the PPS."
def _mae_normalizer(df, y, model_score, cv, **kwargs):
"""In case of MAE, calculates the baseline score for y and derives the PPS."""
df["naive"] = df[y].median()
baseline_score = mean_absolute_error(df[y], df["naive"]) # true, pred
# Rewritten: the baseline score is now computed with DummyRegressor using the median strategy
baseline_regr = DummyRegressor(strategy="median")
groups = kwargs.get("groups", None)
baseline_scores_cv = cross_val_score(
baseline_regr,
df,
df[y],
cv=cv,
scoring="neg_mean_absolute_error",
groups=groups,
)
baseline_score = np.mean(np.abs(baseline_scores_cv))

ppscore = _normalized_mae_score(abs(model_score), baseline_score)
return ppscore, baseline_score
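
A note on the sign flip above: sklearn's `neg_mean_absolute_error` scorer negates the MAE so that greater is better, so `np.mean(np.abs(...))` recovers the positive baseline MAE. A minimal sketch (hypothetical data) contrasting the old in-sample median baseline with the new cross-validated one:

```python
# Sketch: in-sample median baseline vs. the cross-validated dummy baseline.
import numpy as np
import pandas as pd
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

y = pd.Series(np.random.exponential(scale=2.0, size=500))
X = y.to_frame()  # the dummy ignores X; it only needs the right shape

# Old approach: one median computed on all rows (the baseline "sees" everything)
in_sample = mean_absolute_error(y, np.full(len(y), y.median()))

# New approach: median of each training fold, scored on its held-out fold
neg_maes = cross_val_score(
    DummyRegressor(strategy="median"), X, y,
    cv=4, scoring="neg_mean_absolute_error",
)
cross_validated = np.mean(np.abs(neg_maes))  # scorer is negated, flip the sign

print(in_sample, cross_validated)  # close values; the CV one is leak-free
```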


def _normalized_f1_score(model_f1, baseline_f1):
"Normalizes the model F1 score, given the baseline score"
"""Normalizes the model F1 score, given the baseline score"""
# # F1 ranges from 0 to 1
# # 1 is best
# 0.5, 0.7 ==> 0 because model is worse than naive baseline
@@ -98,21 +141,33 @@ def _normalized_f1_score(model_f1, baseline_f1):
return f1_diff / scale_range # 0.1/0.3 = 0.33
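
Since the function body is partly elided in this diff, here is a quick numeric check of the normalization as the comments describe it, assuming the formula is `(model_f1 - baseline_f1) / (1 - baseline_f1)` floored at 0:

```python
# Reconstruction from the comments above (assumed, not the PR's exact body).
def normalized_f1(model_f1, baseline_f1):
    if model_f1 < baseline_f1:
        return 0.0  # model is no better than the naive baseline
    scale_range = 1.0 - baseline_f1   # room left above the baseline
    f1_diff = model_f1 - baseline_f1  # improvement over the baseline
    return f1_diff / scale_range

assert normalized_f1(0.5, 0.7) == 0.0                # worse than baseline
assert abs(normalized_f1(0.8, 0.7) - 1 / 3) < 1e-9   # 0.1 / 0.3 = 0.33
assert normalized_f1(1.0, 0.7) == 1.0                # perfect model
```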


def _f1_normalizer(df, y, model_score):
"In case of F1, calculates the baseline score for y and derives the PPS."
label_encoder = preprocessing.LabelEncoder()
df["truth"] = label_encoder.fit_transform(df[y])
df["most_common_value"] = df["truth"].value_counts().index[0]
random = df["truth"].sample(frac=1)

baseline_score = max(
f1_score(df["truth"], df["most_common_value"], average="weighted"),
f1_score(df["truth"], random, average="weighted"),
def _f1_normalizer(df, y, model_score, cv, **kwargs):
"""In case of F1, calculates the baseline score for y and derives the PPS."""
baseline_clf = DummyClassifier(strategy="stratified")
groups = kwargs.get("groups", None)
baseline_scores_cv = cross_val_score(
baseline_clf, df, df[y], cv=cv, scoring="f1_weighted", groups=groups
)

baseline_score = baseline_scores_cv.mean()
ppscore = _normalized_f1_score(model_score, baseline_score)
return ppscore, baseline_score

# # TODO: code from master
# def _f1_normalizer(df, y, model_score):
# "In case of F1, calculates the baseline score for y and derives the PPS."
# label_encoder = preprocessing.LabelEncoder()
# df["truth"] = label_encoder.fit_transform(df[y])
# df["most_common_value"] = df["truth"].value_counts().index[0]
# random = df["truth"].sample(frac=1)

# baseline_score = max(
# f1_score(df["truth"], df["most_common_value"], average="weighted"),
# f1_score(df["truth"], random, average="weighted"),
# )
# ppscore = _normalized_f1_score(model_score, baseline_score)
# return ppscore, baseline_score
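
One caveat worth flagging (a sketch, not part of the PR): `DummyClassifier(strategy="stratified")` predicts by sampling from the training class distribution, so the baseline F1 varies slightly between runs unless a `random_state` is pinned:

```python
# Sketch: the "stratified" dummy samples its predictions, so the baseline
# fluctuates run to run unless random_state is fixed.
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score

y = pd.Series(np.random.choice(["a", "b"], size=500, p=[0.7, 0.3]))
X = y.to_frame()  # the dummy ignores X

def baseline_f1(seed=None):
    clf = DummyClassifier(strategy="stratified", random_state=seed)
    return cross_val_score(clf, X, y, cv=4, scoring="f1_weighted").mean()

print(baseline_f1(), baseline_f1())      # typically differ slightly
print(baseline_f1(0) == baseline_f1(0))  # True: a pinned seed is reproducible
```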


TASKS = {
"regression": {
@@ -164,7 +219,7 @@ def _dtype_represents_categories(series) -> bool:


def _infer_task(df, x, y):
"Returns str with the name of the inferred task based on the columns x and y"
"""Returns str with the name of the inferred task based on the columns x and y"""
if x == y:
return "predict_itself"

@@ -193,7 +248,7 @@ def _infer_task(df, x, y):


def _feature_is_id(df, x):
"Returns Boolean if the feature column x is an ID"
"""Returns Boolean if the feature column x is an ID"""
if not (is_string_dtype(df[x]) or is_categorical_dtype(df[x])):
return False

@@ -226,7 +281,7 @@ def _maybe_sample(df, sample):
return df


def score(df, x, y, task=None, sample=5000):
def score(df, x, y, task=None, sample=5000, cv=None, **kwargs):
"""
Calculate the Predictive Power Score (PPS) for "x predicts y"
The score always ranges from 0 to 1 and is data-type agnostic.
@@ -246,14 +301,17 @@ def score(df, x, y, task=None, sample=5000):
sample : int or ``None``
Number of rows for sampling. The sampling decreases the calculation time of the PPS.
If ``None`` there will be no sampling.
cv : int, iterable, or sklearn-compatible CV object
Cross-validation strategy to be used. If ``None``, cv defaults to
StratifiedKFold for classification and KFold for regression.


Returns
-------
Dict
A dict that contains multiple fields about the resulting PPS.
The dict enables introspection into the calculations that have been performed under the hood
"""

if not isinstance(df, pd.DataFrame):
raise TypeError(
f"The 'df' argument should be a pandas.DataFrame but you passed a {type(df)}\nPlease convert your input to a pandas.DataFrame"
@@ -279,6 +337,16 @@ def score(df, x, y, task=None, sample=5000):
"The attribute 'task' is no longer supported because it led to confusion and inconsistencies.\nThe task of the model is now determined based on the data types of the columns. If you want to change the task please adjust the data type of the column.\nFor more details, please refer to the README"
)

if cv is None:
# No CV was passed - fall back to the defaults:
# Cross-validation is StratifiedKFold for classification, KFold for regression
cv = CV_ITERATIONS

if isinstance(cv, int):
# Either an integer was passed for cv, or None was passed and cv was set to the integer CV_ITERATIONS above
# Shuffle data to imitate KFold(shuffle=True)
df = df.sample(frac=1, random_state=RANDOM_SEED, replace=False)
Owner: I have a notion that this might be too easy. For example, when the user just wants to use 6 folds and passes cv=6, we still need to perform the random resampling/shuffling in the case of a normal KFold or StratifiedKFold. Only in the case of a CV strategy that needs to keep the order of the rows in the dataset should there be no reshuffling. What do you think about this?

Author: I thought you wanted to keep it in the default case only, hence I added it back as per your comment above. Alternatively, one can pass KFold with shuffle either True or False, thus explicitly setting a CV procedure.

Author: In fact, setting CV to a train-test index generator as opposed to an integer means the user is doing it willingly, with an understanding of the consequences and a specific purpose in mind.

Author: Or maybe I just do not understand what you'd like me to do here :)

Owner: I think the most flexible and robust solution is to perform the default shuffling when the user passes no CV or an integer (assuming they want to adjust the number of folds). In all other cases, we do not have to shuffle because they have to pass a valid CV iterator. What do you think about this?

Author: OK, I see. As of now it shuffles only if no CV is passed, and you're saying "let's shuffle the data if no CV is passed or if an integer is passed", is that correct? If the above is true, then I see your point and will adjust the if-statement above in the code accordingly.

Owner: That looks fine, thank you :)
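
For reference, a minimal sketch of the dispatch rule the thread converges on (shuffle for `None` or an integer; trust explicit CV objects to encode their own ordering needs):

```python
# Sketch of the agreed rule; df is assumed to be a pandas DataFrame.
def resolve_cv(df, cv, n_default=4, seed=587136):
    if cv is None:
        cv = n_default  # fall back to sklearn's default splitter
    if isinstance(cv, int):
        # user asked only for a fold count: imitate KFold(shuffle=True)
        df = df.sample(frac=1, random_state=seed, replace=False)
    # any explicit CV object (e.g. TimeSeriesSplit) passes through untouched
    return df, cv
```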


if x == y:
task_name = "predict_itself"
else:
@@ -310,9 +378,17 @@ def score(df, x, y, task=None, sample=5000):
ppscore = 0
baseline_score = 0
else:

model_score = _calculate_model_cv_score_(df, target=y, feature=x, task=task)
ppscore, baseline_score = task["score_normalizer"](df, y, model_score)
model_score = _calculate_model_cv_score_(
df,
target=y,
feature=x,
task=task,
cv=cv,
**kwargs,
)
ppscore, baseline_score = task["score_normalizer"](
df, y, model_score, cv, **kwargs # TODO: kwargs needed?
)

return {
"x": x,
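Taken together, the new `cv` and `groups` arguments would be exercised like this (a sketch against the diff above; column names are hypothetical):

```python
# Sketch: using the new cv / groups arguments of pps.score.
import numpy as np
import pandas as pd
import ppscore as pps
from sklearn.model_selection import GroupKFold, TimeSeriesSplit

df = pd.DataFrame({"x": np.random.uniform(-2, 2, 1_000)})
df["y"] = df["x"] ** 2 + np.random.uniform(-0.5, 0.5, 1_000)
df["session"] = np.repeat(np.arange(100), 10)  # 100 groups of 10 rows

# Integer folds: the data is shuffled, then KFold / StratifiedKFold applies
pps.score(df, "x", "y", cv=5)

# Explicit CV object: no shuffling, the row order is preserved
pps.score(df, "x", "y", cv=TimeSeriesSplit(n_splits=5))

# Group-aware CV: groups is forwarded through **kwargs to cross_val_score
pps.score(df, "x", "y", cv=GroupKFold(n_splits=5), groups=df["session"])
```
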
77 changes: 71 additions & 6 deletions tests/test_calculation.py
@@ -1,10 +1,16 @@
# # -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-

import pytest
import pandas as pd
import numpy as np

import ppscore as pps
from sklearn.model_selection import (
KFold,
StratifiedKFold,
TimeSeriesSplit,
ShuffleSplit,
)


def test__normalized_f1_score():
@@ -71,7 +77,18 @@ def test__maybe_sample():
assert len(_maybe_sample(df, 10)) == 10


def test_score():
# StratifiedKFold doesn't work for regression. That's why we have an extra test case for
# a classification problem below
cv_regression_list = [
5,
KFold(n_splits=2, shuffle=True),
TimeSeriesSplit(n_splits=5),
ShuffleSplit(),
]

# TODO: unsure if we should parametrize the whole test
@pytest.mark.parametrize("cv", cv_regression_list)
def test_score_cv(cv):
df = pd.DataFrame()
df["x"] = np.random.uniform(-2, 2, 1_000)
df["error"] = np.random.uniform(-0.5, 0.5, 1_000)
@@ -144,11 +161,11 @@ def test_score():

# check scores
# feature is id
assert pps.score(df, "id", "y")["ppscore"] == 0
assert pps.score(df, "id", "y", cv=cv)["ppscore"] == 0

# numeric feature and target
assert pps.score(df, "x", "y")["ppscore"] > 0.5
assert pps.score(df, "y", "x")["ppscore"] < 0.05
assert pps.score(df, "x", "y", cv=cv)["ppscore"] > 0.5
assert pps.score(df, "y", "x", cv=cv)["ppscore"] < 0.05

# boolean feature or target
assert pps.score(df, "x", "x_greater_0_boolean")["ppscore"] > 0.6
@@ -220,7 +237,55 @@ def test_predictors():
# the underlying calculations are tested as part of test_score


def test_matrix():
def test_score_cv_on_classification():
df = pd.DataFrame()
df["x"] = np.random.uniform(-2, 2, 5_000)
df["error"] = np.random.uniform(-0.5, 0.5, 5_000)
df["y"] = df["x"] + df["error"] > 0

cv = StratifiedKFold(n_splits=5)

result_dict = pps.score(df, "x", "y", cv=cv)
assert result_dict["task"] == "classification"
assert result_dict["ppscore"] > 0.8


cv_list = [
5,
KFold(n_splits=2, shuffle=True),
StratifiedKFold(n_splits=3),
TimeSeriesSplit(n_splits=5),
ShuffleSplit(),
]

@pytest.mark.parametrize("cv", cv_list)
def test_score_cv_stable(cv):
df = pd.DataFrame()
df["x"] = np.random.uniform(-2, 2, 1_000)
df["error"] = np.random.uniform(-0.5, 0.5, 1_000)
df["y_binary"] = df["x"] + df["error"] > 0
df["y_numeric"] = df["x"] ** 2 + df["error"]

def compute_ppscore(target, df=df, x="x", cv=cv):
return pps.score(df=df, x=x, y=target, cv=cv)["ppscore"]

# classification
result_1 = compute_ppscore(target="y_binary")
result_2 = compute_ppscore(target="y_binary")
assert abs(result_1 - result_2) < 0.05

# regression
# StratifiedKFold doesn't work for regression.
if isinstance(cv, StratifiedKFold):
return
result_1 = compute_ppscore(target="y_numeric")
result_2 = compute_ppscore(target="y_numeric")
assert abs(result_1 - result_2) < 0.05


@pytest.mark.parametrize("cv", cv_list)
def test_matrix(cv):
df = pd.read_csv("examples/titanic.csv")
df = df[["Age", "Survived"]]
df["Age_datetime"] = pd.to_datetime(df["Age"], infer_datetime_format=True)