Limit model new #14

Open
wants to merge 2 commits into main
231 changes: 113 additions & 118 deletions prettymetrics/clf.py
@@ -43,8 +43,14 @@
# ("CheckingClassifier", sklearn.utils._mocking.CheckingClassifier),
("ClassifierChain", ClassifierChain),
("ComplementNB", ComplementNB),
("GradientBoostingClassifier",GradientBoostingClassifier,),
("GaussianProcessClassifier",GaussianProcessClassifier,),
(
"GradientBoostingClassifier",
GradientBoostingClassifier,
),
(
"GaussianProcessClassifier",
GaussianProcessClassifier,
),
# (
# "HistGradientBoostingClassifier",
# HistGradientBoostingClassifier,
@@ -70,6 +76,7 @@
CLASSIFIERS.append(("XGBClassifier", xgboost.XGBClassifier))
CLASSIFIERS.append(("LGBMClassifier", lightgbm.LGBMClassifier))
# CLASSIFIERS.append(('CatBoostClassifier',catboost.CatBoostClassifier))
CLASSIFIERS_DICT = {key : value for key, value in CLASSIFIERS}

numeric_transformer = Pipeline(
steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())]
@@ -111,15 +118,14 @@ def get_card_split(df, cols, n=11):
card_high : list-like
Columns with cardinality >= n
"""
cond = df[cols].nunique() > n
card_high = cols[cond]
card_low = cols[~cond]

cond = df[cols].nunique() > n
card_high = cols[cond]
card_low = cols[~cond]
return card_low, card_high
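# Illustrative sketch (default threshold n=11): a column with a handful of
# distinct values lands in card_low, a near-unique column in card_high.
#
#   df = pd.DataFrame({"gender": ["M", "F"] * 10,
#                      "zip_code": [str(10000 + i) for i in range(20)]})
#   low, high = get_card_split(df, df.columns)
#   # low  -> Index(['gender'], dtype='object')
#   # high -> Index(['zip_code'], dtype='object')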



class Classifier:
class LazyClassifier:
"""
This class helps in fitting all the classification algorithms that are available in Scikit-learn
Parameters
@@ -138,14 +144,14 @@ class Classifier:

Examples
--------
>>> from prettymetrics.clf import Classifier
>>> from prettymetrics.supervised import LazyClassifier
>>> from sklearn.datasets import load_breast_cancer
>>> from sklearn.model_selection import train_test_split
>>> data = load_breast_cancer()
>>> X = data.data
>>> y= data.target
>>> X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.5,random_state =123)
>>> clf = Classifier(verbose=0,ignore_warnings=True, custom_metric=None)
>>> clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
>>> models,predictions = clf.fit(X_train, X_test, y_train, y_test)
>>> model_dictionary = clf.provide_models(X_train,X_test,y_train,y_test)
>>> models
@@ -185,36 +191,20 @@ class Classifier:

def __init__(
self,
verbose = 0,
ignore_warnings = True,
custom_metric = None,
predictions = False,
random_state = 42,
classifiers = "all"
verbose=0,
ignore_warnings=True,
custom_metric=None,
predictions=False,
random_state=42,
classifiers = "all"
):
self.verbose = verbose
self.ignore_warnings = ignore_warnings
self.custom_metric = custom_metric
self.predictions = predictions
self.models = {}
self.random_state = random_state
self.classifiers = classifiers

def get_classifiers(self):

if self.classifiers == "all":
self.classifiers = CLASSIFIERS
return

try:
temp_list = []
for classifier in self.classifiers:
full_name = (classifier.__class__.__name__, classifier)
temp_list.append(full_name)
self.classifiers = temp_list
except Exception as exception:
print(exception)
print("Invalid Classifier(s)")
self.verbose = verbose
self.ignore_warnings = ignore_warnings
self.custom_metric = custom_metric
self.predictions = predictions
self.models = {}
self.random_state = random_state
self.classifiers = classifiers
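    # Usage sketch for the `classifiers` argument: either "all" (the default,
    # keeping every entry of CLASSIFIERS) or a list of names that must match
    # keys of CLASSIFIERS_DICT, e.g.
    #
    #   clf = LazyClassifier(
    #       verbose=0,
    #       ignore_warnings=True,
    #       classifiers=["XGBClassifier", "GradientBoostingClassifier"],
    #   )
    #   models, predictions = clf.fit(X_train, X_test, y_train, y_test)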

def fit(self, X_train, X_test, y_train, y_test):
"""Fit Classification algorithms to X_train and y_train, predict and score on X_test, y_test.
@@ -239,136 +229,141 @@ def fit(self, X_train, X_test, y_train, y_test):
predictions : Pandas DataFrame
Returns predictions of all the models in a Pandas DataFrame.
"""
accuracy_list = []
balanced_accuracy_list = []
roc_auc_list = []
f1_list = []
names = []
time_list = []
predictions = {}

if self.custom_metric:
Accuracy = []
B_Accuracy = []
ROC_AUC = []
F1 = []
names = []
TIME = []
predictions = {}

if self.custom_metric is not None:
CUSTOM_METRIC = []
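        # `custom_metric` is expected to be a named callable with the
        # sklearn.metrics signature metric(y_true, y_pred), since both the
        # callable and its __name__ are used below, e.g.
        #
        #   from sklearn.metrics import precision_score
        #   def weighted_precision(y_true, y_pred):
        #       return precision_score(y_true, y_pred, average="weighted")
        #
        #   LazyClassifier(custom_metric=weighted_precision)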

if isinstance(X_train, np.ndarray):
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
X_test = pd.DataFrame(X_test)

numeric_features = X_train.select_dtypes(include=[np.number]).columns
categorical_features = X_train.select_dtypes(include=["object"]).columns
numeric_features = X_train.select_dtypes(include=[np.number]).columns
categorical_features = X_train.select_dtypes(include=["object"]).columns

categorical_low, categorical_high = get_card_split(
X_train, categorical_features
)

preprocessor = ColumnTransformer(
transformers = [
transformers=[
("numeric", numeric_transformer, numeric_features),
("categorical_low", categorical_transformer_low, categorical_low),
("categorical_high", categorical_transformer_high, categorical_high),
]
)
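        # Dispatch sketch (the categorical transformers are defined at module
        # level, partly outside this hunk):
        #   numeric features             -> SimpleImputer(mean) + StandardScaler
        #   low-cardinality categorical  -> categorical_transformer_low
        #   high-cardinality categorical -> categorical_transformer_high
        # where low vs. high is decided by get_card_split (threshold n=11).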

self.get_classifiers()
if self.classifiers == "all":
self.classifiers = CLASSIFIERS
else:
try:
temp_list = [(classifier, CLASSIFIERS_DICT[classifier]) for classifier in self.classifiers]
self.classifiers = temp_list
except Exception as exception:
print(exception)
print("Invalid Classifier(s)")

for name, model in tqdm(self.classifiers):
start = time.time()
try:
if "random_state" in model().get_params().keys():
pipe = Pipeline(
steps = [
steps=[
("preprocessor", preprocessor),
("classifier", model(random_state=self.random_state)),
]
)
else:
pipe = Pipeline(
steps = [("preprocessor", preprocessor), ("classifier", model())]
steps=[("preprocessor", preprocessor), ("classifier", model())]
)

pipe.fit(X_train, y_train)

self.models[name] = pipe
y_pred = pipe.predict(X_test)
accuracy = accuracy_score(y_test, y_pred, normalize=True)
b_accuracy = balanced_accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")

self.models[name] = pipe
y_pred = pipe.predict(X_test)
accuracy = accuracy_score(y_test, y_pred, normalize=True)
b_accuracy = balanced_accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
try:
roc_auc = roc_auc_score(y_test, y_pred)
except Exception as exception:
roc_auc = None
if self.ignore_warnings is False:
if not self.ignore_warnings:
print("ROC AUC couldn't be calculated for " + name)
print(exception)

# indexes.append()
names.append(name)
accuracy_list.append(accuracy)
balanced_accuracy_list.append(b_accuracy)
roc_auc_list.append(roc_auc)
f1_list.append(f1)
time_list.append(time.time() - start)

if self.custom_metric:
Accuracy.append(accuracy)
B_Accuracy.append(b_accuracy)
ROC_AUC.append(roc_auc)
F1.append(f1)
TIME.append(time.time() - start)
if self.custom_metric is not None:
custom_metric = self.custom_metric(y_test, y_pred)
CUSTOM_METRIC.append(custom_metric)

if self.verbose > 0:
current_metric = {
"Model" : name,
"Accuracy" : accuracy,
"Balanced Accuracy" : b_accuracy,
"ROC AUC" : roc_auc,
"F1 Score" : f1,
"Time taken" : time.time() - start,
}

if self.custom_metric:
current_metric[self.custom_metric.__name__] = custom_metric

print(current_metric)

if self.custom_metric is not None:
print(
{
"Model": name,
"Accuracy": accuracy,
"Balanced Accuracy": b_accuracy,
"ROC AUC": roc_auc,
"F1 Score": f1,
self.custom_metric.__name__: custom_metric,
"Time taken": time.time() - start,
}
)
else:
print(
{
"Model": name,
"Accuracy": accuracy,
"Balanced Accuracy": b_accuracy,
"ROC AUC": roc_auc,
"F1 Score": f1,
"Time taken": time.time() - start,
}
)
if self.predictions:
predictions[name] = y_pred

except Exception as exception:
if self.ignore_warnings is False:
if not self.ignore_warnings:
print(name + " model failed to execute")
print(exception)

# indexes = scores.index[lambda x: x in scores.indexes()]

scores = pd.DataFrame(
{
"Model" : names,
"Accuracy" : accuracy_list,
"Balanced Accuracy" : balanced_accuracy_list,
"ROC AUC" : roc_auc_list,
"F1 Score" : f1_list,
"Time Taken" : time_list,
}
)

if self.custom_metric:
scores[self.custom_metric.__name__] = CUSTOM_METRIC

# Sort the final metrics by Balanced Accuracy
scores = scores.sort_values(
by = "Balanced Accuracy",
ascending = False,
# ignore_index = True  # This does not help with the indexing
).set_index(
if self.custom_metric is None:
scores = pd.DataFrame(
{
"Model": names,
"Accuracy": Accuracy,
"Balanced Accuracy": B_Accuracy,
"ROC AUC": ROC_AUC,
"F1 Score": F1,
"Time Taken": TIME,
}
)
else:
scores = pd.DataFrame(
{
"Model": names,
"Accuracy": Accuracy,
"Balanced Accuracy": B_Accuracy,
"ROC AUC": ROC_AUC,
"F1 Score": F1,
self.custom_metric.__name__: CUSTOM_METRIC,
"Time Taken": TIME,
}
)
scores = scores.sort_values(by="Balanced Accuracy", ascending=False).set_index(
"Model"
)

# TODO: We need to index the scores so we can see how many algorithms were used
indexes = scores.index.tolist()
# scores['L_Index'] = indexes

if self.predictions:
predictions_df = pd.DataFrame.from_dict(predictions)

return scores, predictions_df if self.predictions is True else scores

def provide_models(self, X_train, X_test, y_train, y_test):
@@ -396,6 +391,6 @@ def provide_models(self, X_train, X_test, y_train, y_test):
with model names as keys.
"""
if len(self.models.keys()) == 0:
self.fit(X_train, X_test, y_train,y_test)
self.fit(X_train,X_test,y_train,y_test)

return self.models
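# Usage sketch for provide_models: each value is the fitted sklearn Pipeline
# built in fit(), keyed by classifier name, e.g.
#
#   clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
#   models, predictions = clf.fit(X_train, X_test, y_train, y_test)
#   fitted = clf.provide_models(X_train, X_test, y_train, y_test)
#   fitted["ComplementNB"].predict(X_test)  # reuse any single fitted pipeline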