-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmachine_learning.py
More file actions
executable file
·102 lines (86 loc) · 4.39 KB
/
machine_learning.py
File metadata and controls
executable file
·102 lines (86 loc) · 4.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.model_selection import GridSearchCV
import warnings
import numpy as np
def model_preprocessing(clf, df, df_df):
    """
    Train and score a model on every preprocessor/vectorizer feature set.

    For each cell of df_df the cell's content is used as the feature matrix,
    split 80/20 (random_state=21) together with df['type'], the estimator is
    fitted on the training part and its accuracy on the test part is stored.

    Parameters:
    - clf (sklearn.base.BaseEstimator): A scikit-learn estimator. It is refitted in place for every
    combination, so after the call it holds the fit of the last combination processed.
    - df (pandas.DataFrame): A pandas DataFrame containing the target variable 'type'.
    - df_df (pandas.DataFrame): A pandas DataFrame whose columns are vectorizers, whose index is
    preprocessors and whose cells hold the corresponding feature matrices. Each cell must have as
    many samples as df['type'] (required by train_test_split).

    Returns:
    - df_res (pandas.DataFrame): A pandas DataFrame with the same index and columns as df_df holding
    the accuracy score for each combination. It is empty of values if df_df has no rows or columns.
    """
    vectorizers = df_df.columns
    preprocessors = df_df.index
    df_res = pd.DataFrame(index=preprocessors, columns=vectorizers)

    for vectorizer in vectorizers:
        for preprocessor in preprocessors:
            X_train, X_test, y_train, y_test = train_test_split(
                df_df.at[preprocessor, vectorizer],
                df['type'],
                test_size=0.2,
                random_state=21,
            )
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            # Write with a single indexer: the chained form
            # df_res[col][row] = ... assigns into a temporary object under
            # pandas Copy-on-Write and would leave df_res unchanged.
            df_res.at[preprocessor, vectorizer] = accuracy_score(y_test, y_pred)

    return df_res
def grid_search_all(clf, df, df_df, parameters):
    """
    Run a 5-fold, accuracy-scored GridSearchCV for every cell of df_df and print the outcome.

    The cells are visited preprocessor by preprocessor (rows) and, within a row, vectorizer by
    vectorizer (columns). A fresh GridSearchCV is built for each cell and fitted on the cell's
    content against df['type'].

    Parameters:
    - clf (sklearn.base.BaseEstimator): A scikit-learn estimator handed to GridSearchCV.
    - df (pandas.DataFrame): A pandas DataFrame containing the target variable 'type'.
    - df_df (pandas.DataFrame): A pandas DataFrame with vectorizers as columns and preprocessors as
    index; each cell is used as the feature input of one search.
    - parameters (dict): The hyperparameter grid passed as param_grid to every search.

    Returns:
    None. For each combination the preprocessor and vectorizer labels, the best score and the best
    parameters are printed to the console.
    """
    for prep_label in df_df.index:
        for vec_label in df_df.columns:
            search = GridSearchCV(
                estimator=clf,
                param_grid=parameters,
                scoring='accuracy',
                cv=5,
            )
            # The search is fitted on the full cell content; no hold-out
            # split is made here.
            features = df_df[vec_label][prep_label]
            labels = df['type']
            search.fit(features, labels)

            report = (
                f'{prep_label}, {vec_label}:\n'
                f'accuracy: {search.best_score_:.6f},\n'
                f'params = {search.best_params_}\n'
            )
            print(report)
def grid_search(clf, df, df_x, parameters):
    """
    Run one 5-fold, accuracy-scored GridSearchCV on df_x against df['type'] and print the outcome.

    Parameters:
    - clf (sklearn.base.BaseEstimator): A scikit-learn estimator handed to GridSearchCV.
    - df (pandas.DataFrame): A pandas DataFrame containing the target variable 'type'.
    - df_x (pandas.DataFrame): The input features the search is fitted on.
    - parameters (dict): The hyperparameter grid passed as param_grid.

    Returns:
    None. The best score and the best parameters found are printed to the console.
    """
    # n_jobs is not passed, so GridSearchCV runs with its default setting.
    search = GridSearchCV(
        estimator=clf,
        param_grid=parameters,
        scoring='accuracy',
        cv=5,
    )
    target = df['type']
    search.fit(df_x, target)

    summary = f'accuracy: {search.best_score_:.6f},\nparams = {search.best_params_}'
    print(summary)
def custom_grid_search():
    """
    Placeholder function: takes no arguments and currently does nothing.

    Returns:
    None.
    """
    return None