-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathprianalysis.py
More file actions
116 lines (79 loc) · 4.09 KB
/
prianalysis.py
File metadata and controls
116 lines (79 loc) · 4.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
"""
This module analyses priority behaviour in our dataset.
"""
import logging
import math
import pandas as pd
import numpy as np
from scipy import stats
import gtconfig
import time
import simdata
import simdriver
if gtconfig.is_windows:
import winsound
# Module-wide logger obtained from the project's gtconfig helper, at INFO level.
# NOTE(review): "priority_analysis.txt" is presumably the log file name -- confirm in gtconfig.get_logger.
logger = gtconfig.get_logger("priority_analysis", "priority_analysis.txt", level=logging.INFO)
def get_effect_size(one_sample, other_sample):
    """
    Returns the Cohen's d statistic for two independent samples.

    The difference between the sample means is divided by the pooled standard deviation:
    sqrt(((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2))

    :param one_sample: First sample. Must offer count(), std() and mean(), e.g. a pandas Series.
    :param other_sample: Second sample, with the same requirements.
    :return: Cohen's d. Its sign follows mean(one_sample) - mean(other_sample).
    """
    # Degrees of freedom per sample (n - 1). They double as the weights of each variance.
    one_sample_dof, other_sample_dof = one_sample.count() - 1, other_sample.count() - 1

    # The sum of the degrees of freedom already equals n1 + n2 - 2, so nothing else must be
    # subtracted here: removing 2 again would inflate the pooled deviation and understate d.
    # NOTE(review): assumes std() is the sample standard deviation (ddof=1), as in pandas.
    pooled_variance = (one_sample_dof * one_sample.std() ** 2 +
                       other_sample_dof * other_sample.std() ** 2) / (one_sample_dof + other_sample_dof)
    pooled_std = math.sqrt(pooled_variance)

    effect_size = (one_sample.mean() - other_sample.mean()) / pooled_std
    return effect_size
def perform_parametric_test(first_sample, second_sample, threshold):
    """
    Runs Welch's t-test (unequal variances) over two samples and logs the outcome.

    :param first_sample: First group of observations.
    :param second_sample: Second group of observations.
    :param threshold: Significance level. The null hypothesis is kept only when the p-value is above it.
    :return: None. The statistic, p-value, effect size and verdict go to the module logger.
    """
    welch_result = stats.ttest_ind(first_sample, second_sample, equal_var=False)
    t_statistic, p_value = welch_result
    logger.info("Welch t-test result: t_statistic " + str(t_statistic) + " p_value " + str(p_value))

    # The effect size is reported regardless of the significance verdict.
    cohens_d = get_effect_size(first_sample, second_sample)
    logger.info("Effect size (Cohen's d): " + str(cohens_d))

    verdict = ("We CANNOT REJECT the null hypothesis of identical average scores"
               if p_value > threshold
               else "We REJECT the null hypothesis of equal averages")
    logger.info(verdict)
def perform_nonparametric_test(first_sample, second_sample, threshold):
    """
    Runs a two-sided Mann-Whitney rank test over two samples and logs the outcome.

    :param first_sample: First group of observations.
    :param second_sample: Second group of observations.
    :param threshold: Significance level. The samples are reported as different only when the p-value is below it.
    :return: None. The statistic, p-value and verdict go to the module logger.
    """
    rank_test_result = stats.mannwhitneyu(first_sample, second_sample, alternative="two-sided")
    u_statistic, p_value = rank_test_result
    logger.info("Mann-Whitney rank test result: u_statistic " + str(u_statistic) + " p_value " + str(p_value))

    verdict = ("The two samples are significantly DIFFERENT"
               if p_value < threshold
               else "NO DIFFERENCE between the two samples.")
    logger.info(verdict)
def main():
    """
    Compares the resolution times of non-severe and severe issues found in the training split.

    Loads the issue CSV, adds the calculated columns, keeps the training portion of the valid
    projects and, for every non-NaN simplified priority, collects the resolution times of its
    resolved issues. The non-severe and severe samples then go through the parametric and the
    non-parametric tests at a 0.05 significance level.

    :return: None. Every result is reported through the module logger.
    """
    logger.info("Starting priority analysis ...")

    logger.info("Loading information from " + simdata.ALL_ISSUES_CSV)
    raw_reports = pd.read_csv(simdata.ALL_ISSUES_CSV)

    logger.info("Adding calculated fields...")
    reports = simdata.enhace_report_dataframe(raw_reports)

    projects = simdriver.get_valid_projects(reports, threshold=simdriver.VALID_THRESHOLD)
    dataset_split = simdriver.split_bug_dataset(reports, test_size=simdriver.TEST_SIZE,
                                                valid_projects=projects)
    # Only the second element of the split is used by this analysis.
    _, training_reports, _, _ = dataset_split

    priorities = training_reports[simdata.SIMPLE_PRIORITY_COLUMN]
    logger.info("Simplified Priorities in Training Range: \n " + str(priorities.value_counts()))

    resolved_reports = simdata.filter_resolved(training_reports, only_with_commits=True,
                                               only_valid_resolution=True)

    resolution_times = {}
    for priority_value in priorities.unique():
        # Reports without a simplified priority do not form a sample.
        if np.isnan(priority_value):
            continue

        has_priority = resolved_reports[simdata.SIMPLE_PRIORITY_COLUMN] == priority_value
        times = resolved_reports[has_priority][simdata.RESOLUTION_TIME_COLUMN].dropna()

        label = "Priority_" + str(priority_value)
        logger.info("Resolution times in Training Range for " + label + ": \n" + str(times.describe()))

        resolution_times[priority_value] = times

    significance_level = 0.05
    non_severe_times = resolution_times[simdata.NON_SEVERE_PRIORITY]
    severe_times = resolution_times[simdata.SEVERE_PRIORITY]

    perform_parametric_test(non_severe_times, severe_times, significance_level)
    perform_nonparametric_test(non_severe_times, severe_times, significance_level)
if __name__ == "__main__":
    # Script entry point: runs the analysis and reports how long it took.
    start_time = time.time()
    try:
        main()
    finally:
        # Audible end-of-run signal, also emitted when main() fails.
        # winsound is only imported when gtconfig.is_windows is set.
        if gtconfig.is_windows:
            winsound.Beep(2500, 1000)

    # A single-argument print call is valid in both Python 2 and Python 3; the print statement
    # is a SyntaxError in Python 3. The two spaces after the colon reproduce the separator the
    # original comma-separated print statement inserted between its items.
    print("Execution time in seconds:  " + str(time.time() - start_time))