Skip to content

Commit a7d919a

Browse files
update subgroup experiment with gb
1 parent b101c54 commit a7d919a

File tree

11 files changed

+655
-212
lines changed

11 files changed

+655
-212
lines changed

feature_importance/correlation-bias/correlation.ipynb

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

feature_importance/correlation-bias/correlation_gb.ipynb

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

feature_importance/counterfactuals/knn-results.ipynb

Lines changed: 429 additions & 67 deletions
Large diffs are not rendered by default.

feature_importance/subgroup/get-values/investigation.sh renamed to feature_importance/subgroup/get-values/investigation-gb.sh

File renamed without changes.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
#!/bin/bash
#SBATCH --partition=yugroup
#SBATCH --cpus-per-task=4

# Activate the conda environment with the experiment dependencies.
source activate mdi

# Build the gradient-boosting investigation command.
# Args: $1 = dataset name/id, $2 = random seed, $3 = tree method (expects "gb").
# NOTE(review): this commit renames investigation.py -> investigation_rf.py,
# so the old target "investigation.py" no longer exists; the gb pipeline
# script is investigation_gb.py.
command="investigation_gb.py --dataname ${1} --seed ${2} --method ${3}"

# Execute the command ($command is intentionally unquoted so it word-splits
# into the script name and its arguments).
python $command
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
#!/bin/bash

# Submit one SLURM job per seed for the gradient-boosting (gb) subgroup
# experiment on a fixed OpenML dataset.

slurm_script="investigation-gb.sh"

# OpenML dataset id and the seeds to sweep over.
id=361260
seeds=(0 1 2 3 4)
method="gb"

for seed in "${seeds[@]}"; do
    # Quote every expansion so ids/paths with special characters stay intact.
    sbatch "$slurm_script" "$id" "$seed" "$method" # submit SLURM job using the specified script
done

feature_importance/subgroup/get-values/investigation-runner.sh renamed to feature_importance/subgroup/get-values/investigation-runner-rf.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/bin/bash
22

3-
slurm_script="investigation.sh"
3+
slurm_script="investigation-rf.sh"
44

55
id=361260
66
seeds=(0 1 2 3 4)
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
"""Gradient-boosting (gb) subgroup experiment pipeline.

Loads an OpenML regression dataset, fits a gb model plus a gb+ (elastic net)
model, computes local feature importances (LMDI+ variants, SHAP, LIME) on the
test split, and writes one CSV of values per method to
``<parent>/lfi-values/gb/seed<seed>/<dataname>/``.

The random-forest counterpart of this script is ``investigation_rf.py``.
"""

# standard data science packages
import numpy as np

# feature-importance explainers
import shap

# sklearn imports
from sklearn.model_selection import train_test_split

# for saving results
import argparse
import os
from os.path import join as oj
import time

# subgroup imports
from subgroup import fit_gb_models, create_lmdi_variant_map, get_lmdi_explainers, \
    get_lmdi, get_shap, get_lime

if __name__ == '__main__':

    # store command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataname', type=str, default=None)
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--method', type=str, default=None)
    args = parser.parse_args()

    # convert namespace to a dictionary
    args_dict = vars(args)

    # assign the arguments to variables
    dataname = args_dict['dataname']
    seed = args_dict['seed']
    tree_method = args_dict['method']

    # this script implements only the gb pipeline; the rf pipeline lives in
    # investigation_rf.py, so anything other than "gb" is a caller error
    if tree_method != "gb":
        raise ValueError("Invalid tree method. Please choose 'gb'.")

    print("Running Pipeline w/ " + dataname)

    dir_data = "../data_openml"

    # [1:] drops the first row — presumably a header row that np.loadtxt
    # would otherwise choke on; verify against the data files
    X = np.loadtxt(oj(dir_data, f"X_{dataname}.csv"), delimiter=",")[1:,:]
    y = np.loadtxt(oj(dir_data, f"y_{dataname}.csv"), delimiter=",")[1:]

    # cast to np.float32
    X = X.astype(np.float32)
    y = y.astype(np.float32)

    print("Step 1")

    starttime = time.time()

    # split data into training and testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5,
                                                        random_state = seed)

    # fit gradient boosting models: plain gb and gb+ with elastic net
    gb, gb_plus_elastic = fit_gb_models(X_train, y_train, "regression")

    endtime = time.time()

    print("Step 2: " + str(endtime - starttime) + " seconds")

    starttime = time.time()

    # create list of lmdi variants
    lmdi_variants = create_lmdi_variant_map()

    # obtain lmdi+ feature importances
    lmdi_explainers = get_lmdi_explainers(gb_plus_elastic, lmdi_variants)

    endtime = time.time()

    print("Step 3: " + str(endtime - starttime) + " seconds")

    starttime = time.time()

    # we don't actually want to use the training values, but for leaf averaging
    # variants, we need to have the training data to compute the leaf averages
    lfi_values, lfi_rankings = get_lmdi(X_test, None, lmdi_variants,
                                        lmdi_explainers)

    endtime = time.time()

    print("Step 4: " + str(endtime - starttime) + " seconds")

    starttime = time.time()

    # obtain shap feature importances
    shap_explainer = shap.TreeExplainer(gb)
    shap_values, shap_rankings = get_shap(X_test, shap_explainer, "regression")

    endtime = time.time()

    print("Step 5: " + str(endtime - starttime) + " seconds")

    starttime = time.time()

    # obtain lime feature importances
    lime_values, lime_rankings = get_lime(X_test, gb, "regression")

    endtime = time.time()

    print("Step 6: " + str(endtime - starttime) + " seconds")

    # get the path to the parent directory of the current file
    parent_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    result_dir = oj(parent_dir, "lfi-values", "gb", f"seed{seed}")

    # create the output directory if needed; exist_ok avoids a race between
    # the exists() check and makedirs() when jobs run concurrently
    os.makedirs(oj(result_dir, dataname), exist_ok=True)

    # print result directory
    print("Writing results to: " + oj(result_dir, dataname))

    # for each variant write the LFI values to a csv
    for variant in lfi_values.keys():
        np.savetxt(oj(result_dir, dataname, f"{variant}.csv"), lfi_values[variant], delimiter=",")

    np.savetxt(oj(result_dir, dataname, "shap.csv"), shap_values, delimiter=",")
    np.savetxt(oj(result_dir, dataname, "lime.csv"), lime_values, delimiter=",")

feature_importance/subgroup/get-values/investigation.py renamed to feature_importance/subgroup/get-values/investigation_rf.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515
import time
1616

1717
# subgroup imports
18-
from subgroup import fit_models, create_lmdi_variant_map, get_lmdi_explainers, \
19-
get_lmdi, get_shap, get_lime, get_maple
18+
from subgroup import fit_rf_models, create_lmdi_variant_map, get_lmdi_explainers, \
19+
get_lmdi, get_shap, get_lime
2020

2121
if __name__ == '__main__':
2222

@@ -61,7 +61,7 @@
6161
random_state = seed)
6262

6363
# fit random forest models
64-
rf, rf_plus_baseline, rf_plus_elastic = fit_models(X_train, y_train, "regression")
64+
rf, rf_plus_elastic = fit_rf_models(X_train, y_train, "regression")
6565

6666
endtime = time.time()
6767

@@ -73,8 +73,7 @@
7373
lmdi_variants = create_lmdi_variant_map()
7474

7575
# obtain lmdi+ feature importances
76-
lmdi_explainers = get_lmdi_explainers(rf_plus_baseline, rf_plus_elastic,
77-
lmdi_variants)
76+
lmdi_explainers = get_lmdi_explainers(rf_plus_elastic, lmdi_variants)
7877

7978
endtime = time.time()
8079

@@ -110,14 +109,17 @@
110109

111110
print("Step 6: " + str(endtime - starttime) + " seconds")
112111

113-
# obtain maple feature importances
114-
maple_values, maple_rankings = get_maple(X_train, y_train, X_test, rf)
112+
starttime = time.time()
115113

116114
_, lmdi_sutera_values = local_mdi_score(X_train, X_test, model=rf, absolute=False)
117115

116+
endtime = time.time()
117+
118+
print("Step 7: " + str(endtime - starttime) + " seconds")
119+
118120
# get the path to the parent directory of the current file
119121
parent_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
120-
result_dir = oj(parent_dir, "lfi-values", f"seed{seed}")
122+
result_dir = oj(parent_dir, "lfi-values", "rf", f"seed{seed}")
121123

122124
# if the path does not exist, create it
123125
if not os.path.exists(oj(result_dir, dataname)):
@@ -132,5 +134,4 @@
132134

133135
np.savetxt(oj(result_dir, dataname, "shap.csv"), shap_values, delimiter=",")
134136
np.savetxt(oj(result_dir, dataname, "lime.csv"), lime_values, delimiter=",")
135-
np.savetxt(oj(result_dir, dataname, "maple.csv"), maple_values, delimiter=",")
136137
np.savetxt(oj(result_dir, dataname, "lmdi_sutera.csv"), lmdi_sutera_values, delimiter=",")

feature_importance/subgroup/get-values/skmaple.py

Lines changed: 0 additions & 54 deletions
This file was deleted.

0 commit comments

Comments
 (0)