-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathcomparators.py
More file actions
106 lines (90 loc) · 3.31 KB
/
comparators.py
File metadata and controls
106 lines (90 loc) · 3.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from __future__ import annotations
from typing import Literal
import numpy as np
from ..dataset import ABCRole, Dataset
from ..utils.constants import NUMBER_TYPES_LIST
from .abstract import Comparator
# Bucket count used by PSI.calc: the bin width for a column is derived as
# (max - min) / NUM_OF_BUCKETS.
NUM_OF_BUCKETS = 10
class GroupDifference(Comparator):
    """Comparator reporting the means of two datasets and how they differ.

    ``calc`` returns the control mean, the test mean, their absolute
    difference and the relative difference in percent.
    """

    def __init__(
        self,
        compare_by: Literal[
            "groups", "columns", "columns_in_groups", "cross"
        ] = "groups",
        grouping_role: ABCRole | None = None,
        target_roles: ABCRole | list[ABCRole] | None = None,
    ):
        """Forward all settings unchanged to the ``Comparator`` constructor.

        Args:
            compare_by: Comparison mode; one of the four listed literals.
            grouping_role: Role handed to the base class as ``grouping_role``.
            target_roles: Role or roles handed to the base class as
                ``target_roles``.
        """
        super().__init__(
            compare_by=compare_by,
            grouping_role=grouping_role,
            target_roles=target_roles,
        )

    @property
    def search_types(self) -> list[type] | None:
        """Return the shared ``NUMBER_TYPES_LIST`` constant."""
        return NUMBER_TYPES_LIST

    @classmethod
    def calc(cls, data: Dataset, test_data: Dataset | None = None, **kwargs) -> dict:
        """Compute mean-based difference metrics for control vs. test data.

        Args:
            data: Control dataset.
            test_data: Test dataset; passed through ``_check_test_data``
                before use (presumably a presence/validity check -- confirm
                against the base class).
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A dict with the keys ``"control mean"``, ``"test mean"``,
            ``"difference"`` and ``"difference %"``. The percentage is
            ``None`` when the control mean equals zero, because the ratio
            is undefined in that case.
        """
        checked_test = cls._check_test_data(test_data)
        mean_control = data.mean()
        mean_test = checked_test.mean()

        absolute_change = mean_test - mean_control
        if mean_control != 0:
            relative_change = (mean_test / mean_control - 1) * 100
        else:
            # Relative change against a zero baseline is undefined.
            relative_change = None

        return {
            "control mean": mean_control,
            "test mean": mean_test,
            "difference": absolute_change,
            "difference %": relative_change,
        }
class GroupSizes(Comparator):
    """Comparator reporting the row counts of the control and test datasets.

    ``calc`` returns both absolute sizes and each size as a percentage of
    the combined row count.
    """

    def __init__(
        self,
        compare_by: Literal[
            "groups", "columns", "columns_in_groups", "cross"
        ] = "groups",
        grouping_role: ABCRole | None = None,
    ):
        """Configure the comparator.

        Args:
            compare_by: Comparison mode; one of the four listed literals.
            grouping_role: Role handed to the base class both as
                ``grouping_role`` and as ``target_roles``.
        """
        # The grouping role doubles as the target role: sizes are counted on
        # the grouping itself, so no separate target is accepted.
        super().__init__(
            compare_by=compare_by,
            grouping_role=grouping_role,
            target_roles=grouping_role,
        )

    @classmethod
    def calc(cls, data: Dataset, test_data: Dataset | None = None, **kwargs) -> dict:
        """Compute absolute and relative sizes of the two datasets.

        Args:
            data: Control dataset.
            test_data: Test dataset. Anything that is not a ``Dataset``
                instance (including ``None``) is counted as size 0.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A dict with the keys ``"control size"``, ``"test size"``,
            ``"control size %"`` and ``"test size %"``. Both percentages are
            ``None`` when the combined size is 0, since the shares are
            undefined (previously this case raised ``ZeroDivisionError``).
        """
        size_a = len(data)
        size_b = len(test_data) if isinstance(test_data, Dataset) else 0
        total = size_a + size_b

        if total:
            control_share = (size_a / total) * 100
            test_share = (size_b / total) * 100
        else:
            # Both groups are empty: avoid dividing by zero and report the
            # shares as undefined instead of crashing.
            control_share = test_share = None

        return {
            "control size": size_a,
            "test size": size_b,
            "control size %": control_share,
            "test size %": test_share,
        }
class PSI(Comparator):
    """Comparator computing the Population Stability Index (PSI).

    The first column of each dataset is split into ``NUM_OF_BUCKETS``
    equal-width buckets spanning that dataset's own min..max range. With
    ``c_i`` and ``t_i`` the share of control / test rows in bucket ``i``::

        PSI = sum((t_i - c_i) * ln(t_i / c_i))
    """

    @staticmethod
    def _bucket_shares(dataset: Dataset) -> list:
        """Return, per bucket, the share of ``dataset`` rows that fall in it.

        Only the first column of ``dataset`` is used.
        """
        column = dataset.iloc[:, 0]
        lowest, highest = column.min(), column.max()
        # linspace yields exactly NUM_OF_BUCKETS + 1 edges INCLUDING the
        # maximum. np.arange(min, max, step) excludes the stop value, which
        # left only NUM_OF_BUCKETS - 1 intervals and dropped the top of the
        # range, and its length is rounding-sensitive for float steps.
        # NOTE(review): a constant column (min == max) produces identical
        # edges; how ``cut`` reacts to that is not visible here -- confirm.
        edges = np.linspace(lowest, highest, NUM_OF_BUCKETS + 1)
        bucket_labels = column.cut(edges).get_values(column=dataset.columns[0])
        total_rows = len(dataset)
        return [
            group[1].count() / total_rows for group in column.groupby(bucket_labels)
        ]

    @classmethod
    def calc(
        cls, data: Dataset, test_data: Dataset | None = None, **kwargs
    ) -> dict[str, float]:
        """Compute the PSI between the control and the test dataset.

        Args:
            data: Control (expected) dataset; its first column is analysed.
            test_data: Test (actual) dataset; passed through
                ``_check_test_data`` before use.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A dict with the single key ``"PSI"``.
        """
        test_data = cls._check_test_data(test_data=test_data)
        # NOTE(review): the return values of ``sort`` are discarded; this only
        # has an effect if ``Dataset.sort`` works in place -- confirm. Bucket
        # counts do not depend on row order either way.
        data.sort(ascending=False)
        test_data.sort(ascending=False)

        control_shares = cls._bucket_shares(data)
        test_shares = cls._bucket_shares(test_data)

        # NOTE(review): buckets are derived from each dataset's own range, and
        # shares are paired purely by position. If ``groupby`` omits empty
        # buckets the pairing can shift, and a zero share would make the
        # ratio / logarithm undefined -- verify against the Dataset API.
        psi_terms = [
            (actual - expected) * np.log(actual / expected)
            for expected, actual in zip(control_shares, test_shares)
        ]
        return {"PSI": sum(psi_terms)}