Commit 88d7547

Merge pull request #3 from Nance-Lab/Nels
Nels
2 parents bdbbf6f + 387c027


41 files changed: +200087 additions, -52725 deletions

diff_predictor/data_process.py (26 additions, 21 deletions)

@@ -28,36 +28,42 @@ def generate_fullstats(dataset_path, filelist, targets, target_col_name='Target'
     video_num = 0
     for filename in filelist:
         fstats = pd.read_csv(dataset_path + filename, encoding = "ISO-8859-1", index_col='Unnamed: 0')
-        print('{} size: {}'.format(filename, fstats.shape))
+        #print('{} size: {}'.format(filename, fstats.shape))
 
         for i in range(0, len(targets)):
             if targets[i] in filename:
+                print('Adding file {} size: {}'.format(filename, fstats.shape))
                 fstats[target_col_name] = pd.Series(fstats.shape[0]*[targets[i]], index=fstats.index)
-                break
+                fstats['Filename'] = pd.Series(fstats.shape[0]*[filename], index=fstats.index)
+                fstats['Video Number'] = pd.Series(fstats.shape[0]*[video_num], index=fstats.index)
+                if fstats_tot is None:
+                    fstats_tot = fstats
+                else:
+                    fstats_tot = fstats_tot.append(fstats, ignore_index=True)
+                video_num += 1
+                #break
 
-        fstats['Video Number'] = pd.Series(fstats.shape[0]*[video_num], index=fstats.index)
-        if fstats_tot is None:
-            fstats_tot = fstats
-        else:
-            fstats_tot = fstats_tot.append(fstats, ignore_index=True)
-        video_num += 1
 
     return fstats_tot
 
 def balance_data(df, target, **kwargs):
     """
-    Balances the dataset so there are equal number of rows for each class
-    Parameters:
+    Balance spatial data using undersampling. Assumes input will
+    be a dataframe and data will be used for categorical classification
+    Parameters
     ----------
-    df: pandas.DataFrame
-        dataframe to be balanced
-    target: string
-        name of dataframe column that represents that class the row is from
-
-    Returns:
-    --------
-    bal_df: pandas.DataFrame
-        dataframe with equal number of rows per unique class
+    df : pandas.DataFrame
+        pandas dataframe to be balanced
+    target : string
+        the name of the target/tag/y-value column to balance data around
+
+    Optional Parameters
+    -------------------
+    random_state : int : 1
+        seed to base random sampling from
+    Returns
+    -------
+    A fully balanced pandas dataframe
     """
     if 'random_state' not in kwargs:
         random_state = 1

@@ -140,5 +146,4 @@ def split_data(df, target, train_split, test_val_split=1.0, seed=1234):
     y_train = X_train['encoded_target']
     y_test = X_test['encoded_target']
     result = np.append([(X_train, y_train), (X_test, y_test)], result)
-    return result, le
-
+    return result, le
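
A minimal usage sketch of the updated generate_fullstats / balance_data flow; the dataset path, file names, and target labels below are hypothetical placeholders, not part of this commit.

from diff_predictor import data_process

# Hypothetical per-video feature CSVs and class labels; substitute real ones.
dataset_path = 'data/'
filelist = ['features_video_NT_01.csv', 'features_video_HT_01.csv']
targets = ['NT', 'HT']

# Each matching file now gains Target, Filename, and Video Number columns
# before being appended to the combined dataframe.
fstats_tot = data_process.generate_fullstats(dataset_path, filelist, targets,
                                             target_col_name='Target')

# Undersample so every class in the Target column has the same number of rows.
bal_df = data_process.balance_data(fstats_tot, 'Target', random_state=1)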

diff_predictor/eval.py (63 additions, 2 deletions)

@@ -1,5 +1,66 @@
 import sys
+import numpy
+import scipy.stats
+from seaborn import heatmap
 
+if 'diff_predictor.core' not in sys.modules:
+    from diff_predictor import core
 
-if 'core' not in sys.modules:
-    import core
+
+def perf_meas(y_actual, y_pred, cls, verbose=True):
+    '''
+    Shows the performance measurements of resulting prediction.
+    Performance measures include true positive, true negative,
+    false positive, false negative
+    Parameters
+    ----------
+    y_actual : list
+        Actual values of y
+    y_pred : list
+        Predicted values of y
+    cls : int
+        class to run performance measure on
+    verbose : boolean : True
+        report performance as a string
+    Returns
+    -------
+    tuple of four performance values (TP, FP, TN, FN)
+    '''
+
+    assert len(y_actual) == len(y_pred), 'Must be same number of actual and predicted values'
+
+    TP = 0
+    FP = 0
+    TN = 0
+    FN = 0
+    for i in range(len(y_actual)):
+        if (y_actual[i]==y_pred[i]) and (y_pred[i]==cls):
+            TP += 1
+        if (y_pred[i]==cls) and (y_actual[i]!=y_pred[i]):
+            FP += 1
+        if (y_actual[i]==y_pred[i]) and (y_pred[i]!=cls):
+            TN += 1
+        if (y_pred[i]!=cls) and (y_actual[i]!=y_pred[i]):
+            FN += 1
+    if verbose is True:
+        print(f'(TP, FP, TN, FN) = {(TP, FP, TN, FN)}')
+    return(TP, FP, TN, FN)
+
+
+def corrmat(df, method='pearson', show_plot=True, **kwargs):
+    '''
+
+    '''
+    plot_options = {'annot': True,
+                    'fmt': "f",
+                    }
+    plot_options.update(kwargs)
+    error_msg = "Correlation type not available. Select" +\
+                "from pearson, spearman, or kendall corr."
+    switch_case = {'pearson': df.corr(),
+                   'spearman': df.corr(method=method),
+                   'kendall': df.corr(method=method)}
+    corr_mat = switch_case.get(method, lambda: error_msg)
+    if show_plot:
+        return heatmap(corr_mat, **plot_options)
+    return corr_mat
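
A short sketch of how the new perf_meas and corrmat helpers can be called; the toy labels and dataframe below are illustrative only.

import pandas as pd
from diff_predictor.eval import perf_meas, corrmat

# Toy 3-class predictions; class 1 is the class being measured.
y_actual = [0, 1, 2, 1, 0, 1]
y_pred = [0, 1, 1, 1, 0, 2]

# Counts true/false positives and negatives for class 1.
tp, fp, tn, fn = perf_meas(y_actual, y_pred, cls=1, verbose=True)
# prints: (TP, FP, TN, FN) = (2, 1, 2, 1)

# Pearson correlation matrix of a small illustrative feature frame,
# drawn as a seaborn heatmap when show_plot=True.
df = pd.DataFrame({'feat_a': [1.0, 2.0, 3.0, 4.0],
                   'feat_b': [0.5, 0.9, 1.4, 2.1],
                   'feat_c': [2.0, 4.1, 6.2, 7.9]})
ax = corrmat(df, method='pearson', show_plot=True)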

diff_predictor/predxgboost.py (14 additions, 8 deletions)

@@ -3,8 +3,14 @@
 import json
 import numpy as np
 import pandas as pd
-from os import path
-from sklearn.metrics import accuracy_score
+import xgboost as xgb
+import shap
+from matplotlib import colors as plt_colors
+import operator
+
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
+from sklearn import preprocessing
 
 import xgboost as xgb
 from xgboost import callback, DMatrix, Booster

@@ -71,8 +77,8 @@ def mknfold(X_train, y_train, nfold, param, evals=(), features=None):
     wt_list : list
         list of weights for each fold. This is the size of each fold
     '''
-    if not features:
-        features = X_train.columns
+    #if not features:
+        #features = X_train.columns
     out_idset, wt_list = bin_fold(X_train, nfold)
     in_idset = [np.concatenate([out_idset[i]
                                 for i in range(nfold) if k != i])

@@ -469,7 +475,7 @@ def _gs_helper(var1n, var2n, best_model, best_param,
     return best_model, best_param, best_eval, best_boost_rounds
 
 
-def train(param, dtrain, dtest, dval=None, evals=None, num_round=2000):
+def train(param, dtrain, dtest, dval=None, evals=None, num_round=2000, verbose=True):
     '''
     Parameters
     ----------

@@ -514,13 +520,13 @@ def train(param, dtrain, dtest, dval=None, evals=None, num_round=2000):
         evals = [(dtrain, 'train')]
     if dval is not None and (dval, 'eval') not in evals:
         evals += [(dval, 'eval')]
-    model = xgb.train(param, dtrain, num_round, evals, )
+    model = xgb.train(param, dtrain, num_round, evals, verbose_eval=verbose)
     true_label = dtest.get_label()
     ypred = model.predict(dtest)
    preds = [np.where(x == np.max(x))[0][0] for x in ypred]
     acc = accuracy_score(true_label, preds)
-    print("Accuracy:", acc)
-    return model, acc
+    print("Accuracy:",acc)
+    return model, acc, true_label, preds
 
 
 def save(model, filename):
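
A minimal sketch of the updated train signature, which now threads verbose through to xgb.train and additionally returns the test labels and argmax predictions; the synthetic data and parameter values below are assumptions for illustration, not taken from this commit.

import numpy as np
import xgboost as xgb
from diff_predictor import predxgboost

# Synthetic 3-class data standing in for the real trajectory features.
rng = np.random.default_rng(1234)
X = rng.normal(size=(300, 10))
y = rng.integers(0, 3, size=300)
dtrain = xgb.DMatrix(X[:200], label=y[:200])
dtest = xgb.DMatrix(X[200:], label=y[200:])

# Assumed multiclass softprob parameters.
param = {'objective': 'multi:softprob', 'num_class': 3,
         'max_depth': 3, 'eta': 0.1, 'eval_metric': 'mlogloss'}

# train() now returns true labels and predictions alongside the model and
# accuracy, so they can be passed straight to eval.perf_meas.
model, acc, true_label, preds = predxgboost.train(param, dtrain, dtest,
                                                  num_round=50, verbose=False)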
