DecisionTreeRules/tree_extract_rule.py at master · SmartDataInnovationLab/DecisionTreeRules · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
import numpy as np
import pandas as pd
from collections import deque


def extract_rules(tree_given, features, dataset, target_dataset, show_test_dist=False,
                  target_class=None):
    """
    This function returns the the rules of the Decision Tree.
    Caution: multioutput problems are not included yet
    :param tree_given: decision Tree
    :param features: please use 'features=dtrain.columns' directly before training the tree and use the list as features
                Reasen: if a other dataset with different column order is used the result would be wrong if the feature
                order is not the same as in the train dataset. Thus the feature order should always have the same order
                than the train dataset.
    :param dataset: dataset the decisionTree got (Data) (can be test or train data) (important: Type: Dataframe)
    :param target_dataset: dataset the decisionTree got (Target) (can be test or train data)(important: Type: Series)
    :param show_test_dist: Only use if the dataset is the same dataset the tree is trained.
            If this is the case 'test_class_dist' should be the same as 'class_dist' in the dictionary.
    :param target_class: Name of class on which the rules point (only rules that point to special class).
            if None: all rules are printed
    :return rule_dict: dictionary containing the rules
    you want to preserve. DONT USE THIS IF YOU ARE NOT SURE WHAT YOU ARE DOING.
    """

    if not isinstance(dataset, pd.DataFrame):
        raise Exception("dataset has to be a Dataframe")

    if not isinstance(target_dataset, pd.Series):
        raise Exception("target_dataset has to be a Series")

    if target_dataset.name is None:
        target_dataset.name = 'target'

    weight = tree_given.class_weight  # if tree is uses weigths they are red here
    print(tree_given.class_weight)
    if weight is 'balanced':
        weight = (target_dataset.value_counts().sum() / (2 * target_dataset.value_counts())).to_dict()
    # features = dtrain.columns
    rule_dict = {}
    data_ges = pd.concat([dataset, target_dataset], axis=1)
    list_leaf = _extract_leafs(tree_given, tree_given.classes_, target_class)
    for count, leaf in enumerate(list_leaf):
        tree_path = _buildtree(tree_given, leaf, features)
        dist = _get_dist(tree_path, data_ges, target_dataset.name, weight)
        if tree_given.classes_ is None:  # recognizes if regressor is DecisionTree or RegressionTree
            leaf_class = tree_given.tree_.value[leaf][0]
            reg_tree = True
        else:
            leaf_class = tree_given.classes_[np.argmax(tree_given.tree_.value[leaf])]
            reg_tree = False
        dict_temp = _print_tree(tree_path, target_dataset.name, data_ges, dist, leaf_class, reg_tree)
        dict_temp = {count: dict_temp}
        if show_test_dist:  # test distribution is the one the tree saves for the trained dataset, it is calculated new in case the dataset is not the traindataset
            tree_dist_train = tree_given.tree_.value[leaf].tolist()  # zur ueberpruefung anschalten
            tree_classes_list = tree_given.classes_.tolist()
            dict_temp[count]['test_class_dist'] = dict(zip(tree_classes_list, *tree_dist_train))
        rule_dict.update(dict_temp)
    return rule_dict


def extract_elements_of_rule(data, rule):
    """
    This function returns the the rules of the Decision Tree as pandas.Dataframe.
    :param data: dataset to search in (panda)
    :param rule: string, rule used, all elements are given back pertain to this rule.
    :return target_elements: elements to be true for the rule.

    you want to preserve. DONT USE THIS IF YOU ARE NOT SURE WHAT YOU ARE DOING.
    """

    target_elements = data.copy()
    print(target_elements.columns)
    if isinstance(target_elements.columns,
                  pd.RangeIndex):  # if the dataset has no columnnames cast the numbers to strings
        target_elements.columns = [str(i) for i in target_elements.columns]
    if isinstance(target_elements.columns,
                  object):
        target_elements.columns = [str(i) for i in target_elements.columns]
    tree = _build_tree_out_of_string(rule)
    for i in tree.index:  # going through all features
        if tree.true_false[i]:
            target_elements = target_elements[
                target_elements[tree.feature[i]] <= tree.condition[i]]  # delete all rows which not fullfill the crite
        else:
            target_elements = target_elements[target_elements[tree.feature[i]] > tree.condition[i]]
    return target_elements


def cut_tree_rules(dict_of_rules_to_cut, data, target_variabel_name, cut_feature_str=None, max_precision=None,
                   min_recall=None, weight_classes=None):
    """
    This function returns the the rules of the Decision Tree.
    The methode first filters for feature_strings, than precision and then recall values.
    Caution: The target class can change!!!! Only use it for Decision Tree not for Regression Tree
    :param target_variabel_name: string of the target feature
    :param data: dataset to search in (panda)
    :param dict_of_rules_to_cut: list of strings, rule you want to use
    :param cut_feature_str: string of feature: feature at which the tree is cutted to prevent further splitting.
    :param max_precision: float value: if the precision is higher than this value the tree is cutted
    :param min_recall: float value: if the recall is gets smaller than this value the tree is cutted before
    :param weight_classes: if weighted classes are used insert the dictionary used to train the DecisionTree
    return cutted_tree_rules: dictionary with rules true for the condition

    you want to preserve. DONT USE THIS IF YOU ARE NOT SURE WHAT YOU ARE DOING.
    """
    new_rules = dict_of_rules_to_cut.copy()
    for i in dict_of_rules_to_cut.keys():
        rule = dict_of_rules_to_cut[i]
        tree = _build_tree_out_of_string(rule['rule'])
        if cut_feature_str is not None:
            intersect = set(cut_feature_str).intersection(set(tree.feature.tolist()))
            if not intersect:
                new_rules.update({i: rule})
            for k in tree.index:
                if tree.loc[k, 'feature'] in list(intersect):
                    data_temp = data.copy()
                    new_tree = tree.drop(range(k, max(tree.index) + 1))
                    dist = _get_dist(new_tree, data, target_variabel_name, weight_classes)
                    dict_temp = _print_tree(new_tree, target_variabel_name, data_temp, dist)
                    new_rules.update({i: dict_temp})
                    break
        rule = new_rules[i]
        tree = _build_tree_out_of_string(rule['rule'])
        precision_recall = None
        if any(c is not None for c in [min_recall, max_precision]):
            precision_recall = _calculate_precision(tree, data, target_variabel_name, weight_classes)
        if max_precision is not None:
            if rule['precision'] > max_precision:
                for k in tree.index:
                    if precision_recall.loc[k, 'precision'] > min_recall:
                        data_temp = data.copy()
                        new_tree = tree.drop(range(k + 1, max(tree.index) + 1))
                        dist = _get_dist(new_tree, data, target_variabel_name, weight_classes)
                        dict_temp = _print_tree(new_tree, target_variabel_name, data_temp, dist)
                        new_rules.update({i: dict_temp})
                        break

        if min_recall is not None:
            for k in tree.index:
                if precision_recall.loc[k, 'recall'] < min_recall:
                    data_temp = data.copy()
                    new_tree = tree.drop(range(k, max(tree.index) + 1))
                    dist = _get_dist(new_tree, data, target_variabel_name, weight_classes)
                    dict_temp = _print_tree(new_tree, target_variabel_name, data_temp, dist)
                    new_rules.update({i: dict_temp})
                    break
    new_rules_sorted = []
    cutted_tree_rules = {}
    for i in new_rules:
        if str(new_rules[i]) not in new_rules_sorted:
            new_rules_sorted.append(str(new_rules[i]))

    for idx, i in enumerate(new_rules_sorted):
        cutted_tree_rules.update({idx: eval(i)})
    return cutted_tree_rules


# returns the parent leaf and the value if left or right bought is used
def _parent(child_left, child_right, actual):
    newvalue = -1
    tr_fl = -1
    if actual in child_left:
        for idx, i in enumerate(child_left):
            if i == actual:
                newvalue = idx
                tr_fl = True
    if actual in child_right:
        for idx, i in enumerate(child_right):
            if i == actual:
                newvalue = idx
                tr_fl = False
    if newvalue == -1:
        raise Exception("Error in tree! could not find leaf")
    if tr_fl == -1:
        raise Exception("Error in tree! could not find leaf")
    return newvalue, tr_fl


# returns the basis structure of the rules
def _buildtree(tree_given, start, features_names):
    child_left = tree_given.tree_.children_left
    child_right = tree_given.tree_.children_right
    features = tree_given.tree_.feature
    treshold = tree_given.tree_.threshold
    # number of the feature of the first node is the class of the rule
    class_number = np.argmax(tree_given.tree_.value[start])
    tree_path = pd.DataFrame([[start, class_number, np.NaN, -5]],
                             columns=['node', 'feature', 'condition', 'true_false'])
    node = start
    while node != 0:  # going backwards in the tree to the beginning of the tree
        node, tr_fl = _parent(child_left, child_right, node)  # get the parent_leaf
        feature = features_names[features[node]]
        condition = treshold[node]
        tree_temp = pd.DataFrame([[node, feature, condition, tr_fl]],
                                 columns=['node', 'feature', 'condition', 'true_false'])
        tree_path = tree_path.append(tree_temp, ignore_index=True)

    tree_path = tree_path.sort_index(axis=0, ascending=False)
    tree_path.drop(0, inplace=True)
    tree_path.index = range(tree_path.shape[0])
    return tree_path


# returns all leafs which are in the class(rule), if rule is None all rules for all classes are returned
def _extract_leafs(tree_given, classes, rule):
    features = tree_given.tree_.feature
    list_leafs = []
    list_leafs_end = []
    for idx, i in enumerate(features):
        if i == -2:
            list_leafs.append(idx)
    if rule is not None:
        for i in list_leafs:
            if rule == classes[np.argmax(tree_given.tree_.value[i])]:
                list_leafs_end.append(i)
    else:
        list_leafs_end = list_leafs
    return list_leafs_end


# returns the distribution of the rule  mit Panda!!!!
def _get_dist(tree_path, data_ges, target_variabel_name, weight=None):
    classes = data_ges[target_variabel_name].unique()
    if weight is None:
        weight = {k: 1 for k in classes}
    dist = pd.DataFrame(columns=['elements', 'uniques'])  # index= class

    for i in range(tree_path.shape[0]):  # going through all features
        if tree_path.true_false[i]:
            data_ges = data_ges[data_ges[tree_path.feature[i]] <= tree_path.condition[
                i]]  # delete all rows which not fullfill the criterion
        else:
            data_ges = data_ges[data_ges[tree_path.feature[i]] > tree_path.condition[i]]
    for i in classes:  # going through all classes
        dist_temp = data_ges[data_ges[target_variabel_name] == i]  # delete if the row is not in the class
        if dist_temp.shape[0] == 0:
            dist.loc[i] = [0, 0]
        else:
            dist.loc[i] = [dist_temp.shape[0] * weight[i], dist_temp.drop_duplicates().shape[0] * weight[i]]
            # [number of elements, unique elements]
    return dist


# returns a string of the rule and distribution
def _print_tree(tree_path, target_variabel_name, data_ges, dist, leaf_class=None, reg_tree=False):
    if leaf_class is None:
        dist['elements'] = dist['elements'].apply(pd.to_numeric, errors='coerce')
        leaf_class = dist.elements.idxmax()

    rule = []
    classes = data_ges[target_variabel_name].unique()
    ttrain = data_ges[target_variabel_name]
    for i in tree_path.index:
        if tree_path.true_false[i]:
            rule.append('If ' + str(tree_path.feature[i]) + ' <= ' + str(tree_path.condition[i]) + '\n')
        else:
            rule.append('If ' + str(tree_path.feature[i]) + ' > ' + str(tree_path.condition[i]) + '\n')
    rule = ''.join(rule)

    if reg_tree:
        return {'rule': rule, 'targetclass': leaf_class[0]}
    else:
        dist_dict = {}
        for cl in classes:
            dist_dict.update({cl: float(dist.loc[cl, 'elements'])})

        # precision
        data_sum = dist['elements'].sum()
        data_class = np.NaN
        for i in dist.index:
            if str(i) == str(leaf_class):
                data_class = dist['elements'][i]
        if data_sum == 0:
            precision = np.NaN
        else:
            precision = float(data_class) / float(data_sum)
        # recall
        ttrain_regel = len([i for i in ttrain if i == leaf_class])  # deletes all rows, which are not of the wanted
        #  (rule) class
        recall = 'Error'
        for i in dist.index:
            if str(i) == str(leaf_class):
                if ttrain_regel == 0:
                    recall = np.NaN
                else:
                    recall = float(dist['elements'][i]) / float(ttrain_regel)
    return {'rule': rule, 'targetclass': leaf_class, 'class_dist': dist_dict, 'precision': precision,
            'recall': recall}


def _build_tree_out_of_string(rule):
    tree = pd.DataFrame(columns=['feature', 'true_false', 'condition'])
    rule = deque(rule.split())
    i = 0
    while rule != deque([]):
        rule.popleft()
        tree.loc[i] = [str(rule.popleft()), True if str(rule.popleft()) == '<=' else False, float(rule.popleft())]
        i += 1
    return tree


def _calculate_precision(tree, data, target_variabel_name, weight_classes):
    pre_rec = pd.DataFrame(columns=['precision', 'recall'])
    number_elemts = pd.DataFrame(columns=['elements'])
    classes = data[target_variabel_name].unique()
    if weight_classes is None:
        weight_classes = {k: 1 for k in classes}
    ttrain = data[target_variabel_name]
    data_ges = data.copy()
    for i in range(tree.shape[0]):  # going through all features
        if tree.true_false[i]:
            data_ges = data_ges[data_ges[tree.feature[i]] <= tree.condition[
                i]]  # delete all rows which not fullfill the criterion
        else:
            data_ges = data_ges[data_ges[tree.feature[i]] > tree.condition[i]]
        for k in classes:  # going through all classes
            dist_temp = data_ges[data_ges[target_variabel_name] == k]  # delete if the row is not in the class
            if dist_temp.shape[0] == 0:
                number_elemts.loc[k] = [0]
            else:
                number_elemts.loc[k] = [dist_temp.shape[0] * weight_classes[k]]

        # precision
        number_elemts['elements'] = number_elemts['elements'].apply(pd.to_numeric, errors='coerce')
        rule_class = number_elemts['elements'].idxmax()
        data_sum = number_elemts['elements'].sum()
        data_class = np.NaN
        for k in number_elemts.index:
            if k == rule_class:
                data_class = number_elemts['elements'][k]
        if data_sum == 0:
            precision = np.NaN
        else:
            precision = float(data_class) / float(data_sum)
        ttrain_regel = len(
            [j for j in ttrain if j == rule_class])
        recall = np.NaN
        for l in number_elemts.index:
            if l == rule_class:
                if ttrain_regel == 0:
                    recall = np.NaN
                else:
                    recall = float(number_elemts['elements'][l]) / float(ttrain_regel)
        pre_rec.loc[i, 'precision'] = precision
        pre_rec.loc[i, 'recall'] = recall
    return pre_rec