Skip to content

Commit 0a68639

Browse files
authored
Merge pull request #65 from ACCLAB/v0.2.5
v0.2.5
2 parents dfea0dd + 0db73e2 commit 0a68639

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

65 files changed

+637
-435
lines changed

dabest/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,4 @@
2323
from ._stats_tools import effsize as effsize
2424
from ._classes import TwoGroupsEffectSize
2525

26-
__version__ = "0.2.4"
26+
__version__ = "0.2.5"

dabest/_classes.py

+33-9
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,24 @@ def __init__(self, data, idx, x, y, paired, id_col, ci, resamples,
4141
if all([isinstance(i, str) for i in idx]):
4242
# flatten out idx.
4343
all_plot_groups = pd.unique([t for t in idx]).tolist()
44+
if len(idx) > len(all_plot_groups):
45+
err0 = '`idx` contains duplicated groups. Please remove any duplicates and try again.'
46+
raise ValueError(err0)
47+
4448
# We need to re-wrap this idx inside another tuple so as to
4549
# easily loop thru each pairwise group later on.
4650
self.__idx = (idx,)
4751

4852
elif all([isinstance(i, (tuple, list)) for i in idx]):
4953
all_plot_groups = pd.unique([tt for t in idx for tt in t]).tolist()
54+
55+
actual_groups_given = sum([len(i) for i in idx])
56+
57+
if actual_groups_given > len(all_plot_groups):
58+
err0 = 'Groups are repeated across tuples,'
59+
err1 = ' or a tuple has repeated groups in it.'
60+
err2 = ' Please remove any duplicates and try again.'
61+
raise ValueError(err0 + err1 + err2)
5062

5163
else: # mix of string and tuple?
5264
err = 'There seems to be a problem with the idx you'
@@ -91,9 +103,14 @@ def __init__(self, data, idx, x, y, paired, id_col, ci, resamples,
91103
# check all the idx can be found in data_in[x]
92104
for g in all_plot_groups:
93105
if g not in data_in[x].unique():
94-
raise IndexError('{0} is not a group in `{1}`.'.format(g, x))
106+
err0 = '"{0}" is not a group in the column `{1}`.'.format(g, x)
107+
err1 = " Please check `idx` and try again."
108+
raise IndexError(err0 + err1)
95109

110+
# Select only rows where the value in the `x` column
111+
# is found in `idx`.
96112
plot_data = data_in[data_in.loc[:, x].isin(all_plot_groups)].copy()
113+
97114
# plot_data.drop("index", inplace=True, axis=1)
98115

99116
# Assign attributes
@@ -113,8 +130,10 @@ def __init__(self, data, idx, x, y, paired, id_col, ci, resamples,
113130
# First, check we have all columns in the dataset.
114131
for g in all_plot_groups:
115132
if g not in data_in.columns:
116-
raise IndexError('{0} is not a column in `data`.'.format(g))
117-
133+
err0 = '"{0}" is not a column in `data`.'.format(g)
134+
err1 = " Please check `idx` and try again."
135+
raise IndexError(err0 + err1)
136+
118137
set_all_columns = set(data_in.columns.tolist())
119138
set_all_plot_groups = set(all_plot_groups)
120139
id_vars = set_all_columns.difference(set_all_plot_groups)
@@ -139,8 +158,8 @@ def __init__(self, data, idx, x, y, paired, id_col, ci, resamples,
139158
categories=all_plot_groups,
140159
ordered=True)
141160

142-
# Line 143 added in v0.2.4.
143-
plot_data.dropna(inplace=True)
161+
# # The line below was added in v0.2.4, removed in v0.2.5.
162+
# plot_data.dropna(inplace=True)
144163

145164
self.__plot_data = plot_data
146165

@@ -990,8 +1009,11 @@ def __pre_calc(self):
9901009
self.__random_seed)
9911010
r_dict = result.to_dict()
9921011

993-
r_dict["control"] = cname
994-
r_dict["test"] = tname
1012+
r_dict["control"] = cname
1013+
r_dict["test"] = tname
1014+
r_dict["control_N"] = int(len(control))
1015+
r_dict["test_N"] = int(len(test))
1016+
9951017
out.append(r_dict)
9961018

9971019
if j == len(idx)-1 and ix == len(current_tuple)-2:
@@ -1020,7 +1042,8 @@ def __pre_calc(self):
10201042

10211043
out_ = pd.DataFrame(out)
10221044

1023-
columns_in_order = ['control', 'test', 'effect_size', 'is_paired',
1045+
columns_in_order = ['control', 'test', 'control_N', 'test_N',
1046+
'effect_size', 'is_paired',
10241047
'difference', 'ci',
10251048

10261049
'bca_low', 'bca_high', 'bca_interval_idx',
@@ -1256,7 +1279,8 @@ def statistical_tests(self):
12561279
stats_columns = [c for c in results_df.columns
12571280
if c.startswith("statistic") or c.startswith("pvalue")]
12581281

1259-
default_cols = ['control', 'test', 'effect_size', 'is_paired',
1282+
default_cols = ['control', 'test', 'control_N', 'test_N',
1283+
'effect_size', 'is_paired',
12601284
'difference', 'ci', 'bca_low', 'bca_high']
12611285

12621286
cols_of_interest = default_cols + stats_columns

dabest/_stats_tools/confint_1group.py

+29-22
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,28 @@
22
# -*-coding: utf-8 -*-
33
# Author: Joses Ho
44
5+
"""
6+
A range of functions to compute bootstraps for a single sample.
7+
"""
58

9+
def create_bootstrap_indexes(array, resamples=5000, random_seed=12345):
    """Given an array-like, returns a generator of bootstrap indexes
    to be used for resampling.

    Parameters
    ----------
    array : array-like
        Only its length is used; each resample has ``len(array)`` entries
        drawn with replacement from ``0 .. len(array) - 1``.
    resamples : int, default 5000
        Number of index arrays the generator will yield.
    random_seed : int or None, default 12345
        Seed for the private random number generator used for the draws.
        Passing the same seed yields the same sequence of index arrays.

    Returns
    -------
    generator
        Lazily yields ``resamples`` numpy arrays of integer indexes.
    """
    import numpy as np

    # The indexes are produced lazily, so seeding (and then re-seeding) the
    # global numpy RNG around the generator *definition* has no effect on
    # the draws themselves: they only happen once the generator is consumed,
    # i.e. after the seed has already been reset. A private RandomState
    # keeps the draws reproducible and leaves the caller's global RNG state
    # untouched.
    rng = np.random.RandomState(random_seed)

    indexes = np.arange(len(array))
    n_indexes = len(indexes)

    return (rng.choice(indexes, n_indexes, replace=True)
            for _ in range(0, resamples))
727

828
def compute_1group_jackknife(x, func, *args, **kwargs):
929
"""
@@ -20,22 +40,6 @@ def compute_1group_jackknife(x, func, *args, **kwargs):
2040
def compute_1group_acceleration(jack_dist):
2141
from . import confint_2group_diff as ci_2g
2242
return ci_2g._calc_accel(jack_dist)
23-
24-
25-
26-
def _create_bootstrap_indexes(array, resamples=5000):
27-
"""Given an array-like, returns a generator of bootstrap indexes
28-
to be used for resampling.
29-
"""
30-
import numpy as np
31-
32-
indexes = range(0, len(array))
33-
34-
out = (np.random.choice(indexes, len(indexes), replace=True)
35-
for i in range(0, resamples))
36-
37-
return out
38-
3943

4044

4145

@@ -49,8 +53,9 @@ def compute_1group_bootstraps(x, func, resamples=5000, random_seed=12345,
4953
np.random.seed(random_seed)
5054

5155
# Create bootstrap indexes.
52-
boot_indexes = _create_bootstrap_indexes(x, resamples)
53-
56+
boot_indexes = create_bootstrap_indexes(x, resamples=resamples,
57+
random_seed=random_seed)
58+
5459
out = [func(x[b], *args, **kwargs) for b in boot_indexes]
5560

5661
del boot_indexes
@@ -123,11 +128,13 @@ def summary_ci_1group(x, func, resamples=5000, alpha=0.05, random_seed=12345,
123128
from . import confint_2group_diff as ci2g
124129
from numpy import sort as npsort
125130

126-
boots = compute_1group_bootstraps(x, func, resamples, random_seed)
127-
bias = compute_1group_bias_correction(x, boots, func)
131+
boots = compute_1group_bootstraps(x, func, resamples=resamples,
132+
random_seed=random_seed,
133+
*args, **kwargs)
134+
bias = compute_1group_bias_correction(x, boots, func)
128135

129-
jk = compute_1group_jackknife(x, func)
130-
accel = ci2g._calc_accel(jk)
136+
jk = compute_1group_jackknife(x, func, *args, **kwargs)
137+
accel = compute_1group_acceleration(jk)
131138
del jk
132139

133140
ci_idx = ci2g.compute_interval_limits(bias, accel, resamples, alpha)

dabest/_stats_tools/confint_2group_diff.py

+43-6
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
# -*-coding: utf-8 -*-
33
# Author: Joses Ho
44
5-
6-
5+
"""
6+
A range of functions to compute bootstraps for the mean difference
7+
between two groups.
8+
"""
79

810
def create_jackknife_indexes(data):
911
"""
@@ -103,9 +105,34 @@ def _calc_accel(jack_dist):
103105

104106

105107

108+
# def compute_bootstrapped_diff(x0, x1, is_paired, effect_size,
109+
# resamples=5000, random_seed=12345):
110+
# """Bootstraps the effect_size for 2 groups."""
111+
# from . import effsize as __es
112+
# import numpy as np
113+
#
114+
# np.random.seed(random_seed)
115+
#
116+
# out = np.repeat(np.nan, resamples)
117+
# x0_len = len(x0)
118+
# x1_len = len(x1)
119+
#
120+
# for i in range(int(resamples)):
121+
# x0_boot = np.random.choice(x0, x0_len, replace=True)
122+
# x1_boot = np.random.choice(x1, x1_len, replace=True)
123+
# out[i] = __es.two_group_difference(x0_boot, x1_boot,
124+
# is_paired, effect_size)
125+
#
126+
# # reset seed
127+
# np.random.seed()
128+
#
129+
# return out
130+
131+
106132
def compute_bootstrapped_diff(x0, x1, is_paired, effect_size,
107133
resamples=5000, random_seed=12345):
108134
"""Bootstraps the effect_size for 2 groups."""
135+
109136
from . import effsize as __es
110137
import numpy as np
111138

@@ -114,11 +141,20 @@ def compute_bootstrapped_diff(x0, x1, is_paired, effect_size,
114141
out = np.repeat(np.nan, resamples)
115142
x0_len = len(x0)
116143
x1_len = len(x1)
117-
144+
118145
for i in range(int(resamples)):
119-
x0_boot = np.random.choice(x0, x0_len, replace=True)
120-
x1_boot = np.random.choice(x1, x1_len, replace=True)
121-
out[i] = __es.two_group_difference(x0_boot, x1_boot,
146+
147+
if is_paired:
148+
if x0_len != x1_len:
149+
raise ValueError("The two arrays do not have the same length.")
150+
random_idx = np.random.choice(x0_len, x0_len, replace=True)
151+
x0_sample = x0[random_idx]
152+
x1_sample = x1[random_idx]
153+
else:
154+
x0_sample = np.random.choice(x0, x0_len, replace=True)
155+
x1_sample = np.random.choice(x1, x1_len, replace=True)
156+
157+
out[i] = __es.two_group_difference(x0_sample, x1_sample,
122158
is_paired, effect_size)
123159

124160
# reset seed
@@ -128,6 +164,7 @@ def compute_bootstrapped_diff(x0, x1, is_paired, effect_size,
128164

129165

130166

167+
131168
def compute_meandiff_bias_correction(bootstraps, effsize):
132169
"""
133170
Computes the bias correction required for the BCa method

dabest/plotter.py

+8-11
Original file line numberDiff line numberDiff line change
@@ -670,16 +670,13 @@ def EffectSizeDataFramePlotter(EffectSizeDataFrame, **plot_kwargs):
670670

671671

672672

673-
674-
# Place raw axes y-label.
675-
if plot_kwargs['swarm_label'] is not None:
676-
swarm_label = plot_kwargs['swarm_label']
677-
else:
673+
# Set raw axes y-label.
674+
swarm_label = plot_kwargs['swarm_label']
675+
if swarm_label is None and yvar is None:
676+
swarm_label = "value"
677+
elif swarm_label is None and yvar is not None:
678678
swarm_label = yvar
679-
rawdata_axes.set_ylabel(swarm_label)
680-
681-
682-
679+
683680
# Place contrast axes y-label.
684681
contrast_label_dict = {'mean_diff' : "mean difference",
685682
'median_diff' : "median difference",
@@ -702,8 +699,8 @@ def EffectSizeDataFramePlotter(EffectSizeDataFrame, **plot_kwargs):
702699
contrast_axes.yaxis.set_label_position("right")
703700

704701

705-
# Set the rawdata axes labels appropriately
706-
rawdata_axes.set_ylabel(plot_kwargs["swarm_label"])
702+
# Set the rawdata axes labels appropriately
703+
rawdata_axes.set_ylabel(swarm_label)
707704
rawdata_axes.set_xlabel("")
708705

709706

dabest/tests/test_02_edge_cases.py

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#!/usr/bin/python
2+
# -*-coding: utf-8 -*-
3+
# Author: Joses Ho
4+
5+
6+
7+
import sys
8+
import numpy as np
9+
import scipy as sp
10+
import pytest
11+
import pandas as pd
12+
from .._api import load
13+
14+
15+
16+
def test_unrelated_columns(N=60, random_seed=12345):
    """
    Check that a column unrelated to the analysis does not break it.

    A dataframe with three random groups, a random color column and random
    values is given an extra all-NaN column. The mean difference between
    'Group 1' and 'Group 2' is then compared against reference values.

    See https://github.com/ACCLAB/DABEST-python/issues/44
    NOTE(review): the original note cited "Issue 43" next to this link;
    confirm which issue number is the right one.

    Added in v0.2.5.
    """
    sample_shape = (N,)

    # Draw order matters for reproducibility: groups, then color, then value.
    np.random.seed(random_seed)
    group_labels = np.random.choice(['Group 1', 'Group 2', 'Group 3'],
                                    size=sample_shape)
    color_labels = np.random.choice(['green', 'red', 'purple'],
                                    size=sample_shape)
    values = np.random.random(size=sample_shape)
    np.random.seed()

    df = pd.DataFrame({'groups': group_labels,
                       'color': color_labels,
                       'value': values})

    # The column that must not interfere with the analysis.
    df['unrelated'] = np.nan

    loaded = load(data=df, x='groups', y='value',
                  idx=['Group 1', 'Group 2'])
    md = loaded.mean_diff.results

    assert md.difference[0] == pytest.approx(0.1115, abs=1e-6)
    assert md.bca_low[0] == pytest.approx(-0.042835, abs=1e-6)
    assert md.bca_high[0] == pytest.approx(0.264542, abs=1e-6)
File renamed without changes.

0 commit comments

Comments
 (0)