-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdecision_tree.py
120 lines (96 loc) · 4.38 KB
/
decision_tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import streamlit as st
import base64
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from graphviz import Source
class DecisionTree:
    """Streamlit page that trains and inspects a decision-tree classifier.

    Constructing an instance renders the whole page: sidebar controls for
    the target column, the predictor columns and the tree depth, followed
    by the rendered tree, the feature importances, the list of test-set
    rows flagged by the model, and the quality metrics.

    NOTE(review): the page assumes a binary target. ``class_names`` has
    exactly two entries, ``predict_proba(...)[:, 1]`` needs a second
    class, rounding the probabilities for the report presumes 0/1 labels,
    and ``roc_curve`` is a binary metric -- confirm against the data that
    callers pass in.
    """

    def __init__(self, df, random_state=42):
        """Render the page for ``df``.

        Args:
            df: pandas DataFrame holding the predictors and the target.
            random_state: seed forwarded to the train/test split and to
                the tree. Streamlit re-executes the script on every widget
                interaction, so without a fixed seed merely moving the
                threshold slider would reshuffle the split and retrain a
                different tree. Pass ``None`` for an unseeded run.
        """
        st.title('Дерево решений')
        self.df = df
        self.random_state = random_state
        if not self.features_selection():
            # Nothing to train on; the warning is already on screen.
            return
        self.model_fit()
        self.show_feature_importances()
        self.show_students_with_problems()
        self.show_metrics()

    def features_selection(self):
        """Let the user pick target and predictors, then split the data.

        On success sets ``X``, ``y``, ``X_train``, ``X_test``, ``y_train``
        and ``y_test``; ``X`` is one-hot encoded with ``pd.get_dummies``.

        Returns:
            True when at least one predictor is selected and the split was
            made; False when no predictor is selected (a warning is shown
            and no attributes are set).
        """
        # The last column is preselected as the target.
        target_feature = st.sidebar.selectbox(
            'Целевая переменная:', self.df.columns, len(self.df.columns) - 1)
        st.sidebar.text('Предикторы:')
        # A plain list keeps the column labels as they are; an ndarray
        # would coerce mixed-type labels to strings and break the lookup.
        features = [f for f in self.df.columns if f != target_feature]
        # One checkbox per candidate predictor, ticked by default.
        selected_features = [
            f for f in features if st.sidebar.checkbox(f, True)]
        if not selected_features:
            st.warning('Выберите хотя бы один предиктор.')
            return False
        self.y = self.df[target_feature]
        self.X = pd.get_dummies(self.df[selected_features])
        (self.X_train, self.X_test,
         self.y_train, self.y_test) = train_test_split(
            self.X, self.y, test_size=.33, random_state=self.random_state)
        return True

    def model_fit(self):
        """Fit the tree on the training split and draw it on the page.

        The depth comes from a sidebar slider (1..10, default 3). The
        fitted estimator is stored in ``best_tree``.
        """
        max_depth = st.sidebar.slider('Глубина дерева', 1, 10, 3)
        self.best_tree = DecisionTreeClassifier(
            criterion='gini',
            max_depth=max_depth,
            min_samples_leaf=1,
            min_samples_split=2,
            random_state=self.random_state,
        )
        self.best_tree.fit(self.X_train, self.y_train)
        # export_graphviz matches class_names to the classes in ascending
        # order, so the smaller label is drawn as 'Problem'.
        dot_source = tree.export_graphviz(
            self.best_tree,
            out_file=None,
            feature_names=list(self.X),
            class_names=['Problem', 'No problem'],
            filled=True
        )
        st.image(Source(dot_source).pipe(format='png'))

    def show_feature_importances(self):
        """Show the fitted tree's feature importances, largest first."""
        feature_importances_df = pd.DataFrame({
            'feature_importances': self.best_tree.feature_importances_,
            'features': list(self.X_train)
        })
        st.subheader('Важность предикторов.')
        st.write(feature_importances_df.sort_values(
            'feature_importances', ascending=False))

    def show_students_with_problems(self):
        """List test-set rows flagged by the model and offer a CSV link.

        Stores the predicted probability of the second class in
        ``predictions`` (also read by ``show_metrics``). Rows whose
        probability is strictly below the slider threshold are shown.
        """
        st.subheader('Выявленные ученики в группе риска.')
        # Column 1 is the probability of the second entry of classes_.
        self.predictions = self.best_tree.predict_proba(self.X_test)[:, 1]
        threshold = st.slider('Пороговое значение:', .01, .99, .5)
        students_with_problems = self._select_at_risk(
            self.X_test, self.predictions, threshold)
        st.write(f'Всего: {students_with_problems.shape[0]}')
        st.write(students_with_problems)
        href = self._csv_download_link(
            students_with_problems, 'students_with_problems.csv')
        st.markdown(href, unsafe_allow_html=True)

    def show_metrics(self):
        """Show the classification report and the ROC curve.

        Reads ``predictions``, so ``show_students_with_problems`` must
        have run first.
        """
        st.subheader('Метрики качества модели.')
        # Probabilities are rounded to hard 0/1 labels for the report.
        st.code(classification_report(
            self.y_test, self.predictions.round(), zero_division=1))
        fpr, tpr, _ = roc_curve(self.y_test, self.predictions)
        roc_auc = auc(fpr, tpr)
        # Draw on an explicit Figure and close it afterwards: the script
        # is re-run on every interaction, so figures left open in pyplot's
        # global registry would pile up.
        fig, ax = plt.subplots()
        lw = 2
        ax.plot(fpr, tpr, color='darkorange',
                lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
        ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('ROC curve')
        ax.legend(loc="lower right")
        st.pyplot(fig)
        plt.close(fig)

    @staticmethod
    def _select_at_risk(features, predictions, threshold):
        """Return a copy of ``features`` with a 'predictions' column,
        keeping only rows whose prediction is strictly below ``threshold``.
        """
        scored = features.copy()
        scored['predictions'] = predictions
        # Filter by name, not by position: if a feature is itself called
        # 'predictions' the assignment above replaces it in place, and the
        # last column would then be an unrelated feature.
        return scored[scored['predictions'] < threshold]

    @staticmethod
    def _csv_download_link(frame, filename):
        """Return an HTML anchor embedding ``frame`` as base64 CSV data."""
        csv = frame.to_csv(index=False)
        b64 = base64.b64encode(csv.encode()).decode()
        return (f'<a href="data:file/csv;base64,{b64}" '
                f'download="{filename}">Download CSV</a>')