ProjectR/fishSpeciesClassifier.py at main · RedZapdos123/ProjectR · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#A Python program to classify fish species from a fish's weight and dimensions, using XGBoost tuned with Optuna (a Bayesian optimizer).
import numpy as np
import pandas as pd
from sklearn.exceptions import DataConversionWarning
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    precision_score,
    confusion_matrix,
)
from sklearn.preprocessing import LabelEncoder
import optuna
import warnings

#Ignoring the user and data conversion warnings, and limiting Optuna's logging to errors only,
#which hides the messages Optuna prints for every trial of the hyperparameter search.
#This keeps the terminal output limited to the results printed by this program.
warnings.simplefilter("ignore", category=UserWarning)
warnings.simplefilter("ignore", category=DataConversionWarning)
optuna.logging.set_verbosity(optuna.logging.ERROR)


#Asking the user for the dataset's file path, and loading the CSV file into a dataframe.
filePath = input("Enter the file path of the fish species dataset (CSV): ")
data = pd.read_csv(filePath)

#Replacing the species' names in the Species column with integer codes.
#The fitted encoder is kept, so that the codes can be turned back into names later.
labelEncoder = LabelEncoder()
data["Species"] = labelEncoder.fit_transform(data["Species"])

#Separating the target (Species) column from the features' columns.
#The target is flattened into a 1D NumPy array to remove a warning.
Y = data["Species"].to_numpy().ravel()
X = data.drop(columns = "Species")

#Holding out 20% of the rows for testing; the remaining 80% are used for training.
#The fixed random_state keeps the split the same on every run.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    test_size = 0.2,
    random_state = 17,
)

#The Optuna objective function.
def objective(trial):
    """Train an XGBoost classifier with trial-suggested hyperparameters and score it.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The sum of the weighted F1 score and the accuracy of the predictions
        made on Xtest. The study is created to maximize this value.
    """
    params = {
        #Fixed settings, shared by every trial.
        "objective": "multi:softprob",
        "eval_metric": "mlogloss",
        "tree_method": "hist",
        "num_class": len(labelEncoder.classes_),
        "seed": 17,
        #Hyperparameters searched by Optuna.
        "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "eta": trial.suggest_float("eta", 0.01, 0.3),
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
    }

    #Training the model on the training split.
    model = xgb.XGBClassifier(**params)
    model.fit(Xtrain, Ytrain)

    #Making predictions upon the test set.
    #NOTE(review): the trials are scored on the same Xtest/Ytest split that is used for the
    #final report, so the reported metrics may be optimistic; consider a separate validation split.
    predY = model.predict(Xtest)

    #Only the F1 and accuracy scores feed the objective, so no other metric is computed here.
    f1 = f1_score(Ytest, predY, average = "weighted")
    accuracy = accuracy_score(Ytest, predY)

    #Making Optuna attempt to maximize the F1 and accuracy scores together.
    return f1 + accuracy

#Making Optuna study (determine) the best hyperparameters.
#The sampler is seeded with the same seed as the other random steps of this program,
#so that the hyperparameter search is intended to be repeatable between runs.
study = optuna.create_study(
    direction = "maximize",
    sampler = optuna.samplers.TPESampler(seed = 17),
)
study.optimize(objective, n_trials = 100)

#Training the final model with the best hyperparameters found by the study.
#The fixed settings used by the objective function are not part of study.best_params,
#so they are added back here (including tree_method, to match the setup of the trials).
bestParameters = {
    **study.best_params,
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    "tree_method": "hist",
    "num_class": len(labelEncoder.classes_),
    "seed": 17,
}

bestModel = xgb.XGBClassifier(**bestParameters)
bestModel.fit(Xtrain, Ytrain)

#Evaluating the model on the test set.
predY = bestModel.predict(Xtest)

f1 = f1_score(Ytest, predY, average = "weighted")
accuracy = accuracy_score(Ytest, predY)
precision = precision_score(Ytest, predY, average = "weighted", zero_division = 0)
matrix = confusion_matrix(Ytest, predY)

#Printing the evaluation metrics.
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print("Confusion Matrix:")
print(matrix)

#Printing (up to) five random classifications from the test set.
#The rows are picked by their position in the test set, so that the true labels (Ytest)
#and the predictions computed above (predY) can be looked up directly, without predicting again.
#The sample size is capped, as choosing without replacement fails on a test set of fewer than five rows.
np.random.seed(17)
sampleCount = min(5, len(Xtest))
ra = np.random.choice(len(Xtest), size = sampleCount, replace = False)
actualSpecies = labelEncoder.inverse_transform(Ytest[ra])
predictedSpecies = labelEncoder.inverse_transform(predY[ra])
for actual, predicted in zip(actualSpecies, predictedSpecies):
    print(f"Actual Species = {actual}; Predicted Species = {predicted}")