Skip to content

Commit fb2e842

Browse files
subgroup experiments
1 parent 56e5e9f commit fb2e842

File tree

77 files changed

+28379
-201818
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

77 files changed

+28379
-201818
lines changed

feature_importance/correlation-bias/correlation.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#!/bin/bash
2-
#SBATCH --partition=jsteinhardt
32

43
njobs=8
54

feature_importance/subgroup/miami-housing/compile-results/case-study.ipynb renamed to feature_importance/subgroup/compile-results/case-study.ipynb

Lines changed: 100 additions & 101 deletions
Large diffs are not rendered by default.

feature_importance/subgroup/miami-housing/compile-results/compile-results.py renamed to feature_importance/subgroup/compile-results/compile-results.py

Lines changed: 32 additions & 119 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,12 @@
55
# sklearn imports
66
from sklearn.model_selection import train_test_split
77
from sklearn.linear_model import LinearRegression
8-
from sklearn.metrics import mean_squared_error, r2_score
9-
from sklearn.tree import DecisionTreeRegressor
10-
from sklearn.ensemble import RandomForestRegressor
8+
from sklearn.metrics import mean_squared_error
119
from sklearn.cluster import KMeans
12-
from sklearn.preprocessing import StandardScaler
1310

1411
# hierarchical clustering imports
1512
from scipy.cluster import hierarchy
1613

17-
# data getter imports
18-
from data_loader import load_regr_data
19-
2014
# filesystem imports
2115
import os
2216
from os.path import join as oj
@@ -37,8 +31,8 @@
3731
parser.add_argument('--seed', type=int, default=None)
3832
parser.add_argument('--clustertype', type=str, default=None)
3933
parser.add_argument('--clustermodel', type=str, default=None)
40-
parser.add_argument('--datafolder', type=str, default=None)
41-
parser.add_argument('--methodname', type=str, default=None)
34+
# parser.add_argument('--datafolder', type=str, default=None)
35+
# parser.add_argument('--methodname', type=str, default=None)
4236
args = parser.parse_args()
4337

4438
# convert namespace to a dictionary
@@ -49,26 +43,26 @@
4943
seed = args_dict['seed']
5044
clustertype = args_dict['clustertype']
5145
clustermodel = args_dict['clustermodel']
52-
datafolder = args_dict['datafolder']
53-
methodname = args_dict['methodname']
46+
# datafolder = args_dict['datafolder']
47+
# methodname = args_dict['methodname']
5448

5549
# check that clustertype is either 'hierarchical' or 'kmeans'
5650
if clustertype not in ['hierarchical', 'kmeans']:
5751
raise ValueError("clustertype must be either 'hierarchical' or 'kmeans'")
5852

59-
# check that clustermodel is either 'linear' or 'tree'
60-
if clustermodel not in ['linear', 'tree', 'rf']:
61-
raise ValueError("clustermodel must be either 'linear', 'tree', or 'rf'")
53+
# check that clustermodel is 'linear'
54+
if clustermodel != 'linear':
55+
raise ValueError("clustermodel must be 'linear'")
6256

63-
# check that methodname is either rf or gb
64-
if methodname not in ['rf', 'gb']:
65-
raise ValueError("methodname must be either 'rf' or 'gb'")
57+
# check that methodname is rf
58+
# if methodname != 'rf':
59+
# raise ValueError("methodname must be 'rf'")
6660

6761
print("Compiling results for " + dataname + " with " + clustertype + \
6862
" clustering and " + clustermodel + " cluster model")
6963

7064
# if dataname not in results folder, skip
71-
if not os.path.exists(f"../lfi-values/{datafolder}/{methodname}/seed{seed}/{dataname}"):
65+
if not os.path.exists(f"../lfi-values/seed{seed}/{dataname}"):
7266
print("No results for " + dataname)
7367
else:
7468

@@ -79,88 +73,40 @@
7973
X = X.astype(np.float32)
8074
y = y.astype(np.float32)
8175

82-
# if the data is standardized, we need to standardize it again here
83-
if datafolder == "standardized-fulldata":
84-
scaler = StandardScaler()
85-
X = scaler.fit_transform(X)
86-
y = (y - np.mean(y)) / np.std(y)
87-
if datafolder == "standardizedX-fulldata":
88-
scaler = StandardScaler()
89-
X = scaler.fit_transform(X)
90-
91-
# if X has more than 5k rows, sample 5k rows of X and y
92-
# if X.shape[0] > 5000:
93-
# np.random.seed(42)
94-
# indices = np.random.choice(X.shape[0], 5000, replace=False)
95-
# X = X[indices]
96-
# y = y[indices]
97-
9876
# split data into training and testing
9977
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5,
10078
random_state = seed)
10179

102-
103-
# X, y, names_covariates = load_regr_data(dataname, dir_data)
104-
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
105-
# random_state = seed)
106-
# read in lmdi variants
107-
# glm = ["ridge", "lasso", "elastic"]
108-
# normalize = {True: "normed", False: "nonnormed"}
109-
# square = {True: "squared", False: "nosquared"}
110-
# leaf_average = {True: "leafavg", False: "noleafavg"}
111-
# ranking = {True: "rank", False: "norank"}
11280
glm = ["elastic"]
113-
normalize = {False: "nonnormed"}
114-
square = {False: "nosquared"}
115-
leaf_average = {False: "noleafavg"}
11681
ranking = {False: "norank"}
11782

11883
# create the mapping of variants to argument mappings
11984
lfi_methods = []
12085
for g in glm:
121-
for n in normalize:
122-
for s in square:
123-
for r in ranking:
124-
if (not n) and (s):
125-
continue
126-
# create the name the variant will be stored under
127-
variant_name = f"{g}_{normalize[n]}_{square[s]}_{ranking[r]}"
128-
# store the arguments for the lmdi+ explainer
129-
arg_map = {"glm": g, "normalize": n, "square": s,
130-
"ranking": r}
131-
lfi_methods.append(variant_name)
86+
for r in ranking:
87+
# create the name the variant will be stored under
88+
variant_name = f"{g}_{ranking[r]}"
89+
# store the arguments for the lmdi+ explainer
90+
arg_map = {"glm": g, "ranking": r}
91+
lfi_methods.append(variant_name)
13292
lfi_methods.append("lmdi_baseline")
13393

13494
# for each variant, read in the array
13595
lfi_value_dict = {}
13696
for variant in lfi_methods:
13797
# read in the variant
138-
lmdi = np.loadtxt(f"../lfi-values/{datafolder}/{methodname}/seed{seed}/{dataname}/{variant}.csv", delimiter = ",")
98+
lmdi = np.loadtxt(f"../lfi-values/seed{seed}/{dataname}/{variant}.csv", delimiter = ",")
13999
# get the mse of the variant
140100
lfi_value_dict[variant] = lmdi
141101

142102
lfi_value_dict["rawdata"] = X_test
143103
lfi_value_dict["random"] = X_test
144-
lfi_value_dict["shap"] = np.loadtxt(f"../lfi-values/{datafolder}/{methodname}/seed{seed}/{dataname}/shap.csv", delimiter = ",")
145-
lfi_value_dict["lime"] = np.loadtxt(f"../lfi-values/{datafolder}/{methodname}/seed{seed}/{dataname}/lime.csv", delimiter = ",")
104+
lfi_value_dict["shap"] = np.loadtxt(f"../lfi-values/seed{seed}/{dataname}/shap.csv", delimiter = ",")
105+
lfi_value_dict["lime"] = np.loadtxt(f"../lfi-values/seed{seed}/{dataname}/lime.csv", delimiter = ",")
146106

147107
# metrics when predicting according to decision tree
148108
variant_mse_means = []
149109
variant_mse_sds = []
150-
# variant_r2_means = []
151-
# variant_r2_sds = []
152-
153-
# within cluster variance
154-
# variant_variance_means = []
155-
# variant_variance_sds = []
156-
157-
# metrics when predicting mean of cluster
158-
# variant_avg_mse_means = []
159-
# variant_avg_mse_sds = []
160-
# variant_avg_r2_means = []
161-
# variant_avg_r2_sds = []
162-
163-
# k_size_info_maps = {}
164110

165111
for k in range(1, 11):
166112

@@ -187,7 +133,7 @@
187133
cluster_coefs = np.full((100, k, X_test.shape[1]), np.nan)
188134
cluster_sizes = []
189135

190-
if variant_name == "elastic_nonnormed_nosquared_norank":
136+
if variant_name == "elastic_norank":
191137
# create mappings with the random seeds as keys and a
192138
# list of numpy arrays as values
193139
global_train_X = defaultdict(list)
@@ -209,54 +155,36 @@
209155
# randomly split the data into train and test (50/50)
210156
X_train_cluster, X_test_cluster, y_train_cluster, y_test_cluster = \
211157
train_test_split(X_cluster, y_cluster, test_size=0.5, random_state=rand)
212-
213-
if variant_name == "elastic_nonnormed_nosquared_norank":
158+
159+
# let global model use same train/test split as LMDI+
160+
if variant_name == "elastic_norank":
214161
# add the train and test data to the lists
215162
global_train_X[rand].append(X_train_cluster)
216163
global_train_y[rand].append(y_train_cluster)
217164
global_test_X[rand].append(X_test_cluster)
218165
global_test_y[rand].append(y_test_cluster)
219166

220167
# fit cluster model
221-
if clustermodel == 'linear':
222-
est = LinearRegression()
223-
elif clustermodel == 'tree':
224-
est = DecisionTreeRegressor(max_depth=3,
225-
random_state=42)
226-
else:
227-
est = RandomForestRegressor(n_estimators=100,
228-
max_depth=3,
229-
random_state=42)
168+
est = LinearRegression()
230169
est.fit(X_train_cluster, y_train_cluster)
231170

232171
# get coefs
233-
if clustermodel == 'linear':
234-
cluster_coefs[rand, clust, :] = est.coef_
172+
cluster_coefs[rand, clust, :] = est.coef_
235173

236174
# get predictions
237175
y_pred = est.predict(X_test_cluster)
238176

239177
# get performance
240178
cluster_mses[rand, clust] = mean_squared_error(y_test_cluster, y_pred)
241179

242-
# average the cluster coefs
243-
if clustermodel == 'linear':
244-
cluster_coefs_avg = np.mean(cluster_coefs, axis=0)
245-
# if k == 5:
246-
# if seed == 0:
247-
# result_dir = f"../cluster-results/{methodname}"
248-
# if not os.path.exists(oj(result_dir, clustertype, clustermodel, dataname, f"seed{seed}")):
249-
# os.makedirs(oj(result_dir, clustertype, clustermodel, dataname, f"seed{seed}"))
250-
# # write the cluster labels along with the first two columns of X to csv
251-
# np.savetxt(f"{result_dir}/{clustertype}/{clustermodel}/{dataname}/seed{seed}/{k}clusters_clust{clust}_{variant_name}_coefs.csv", cluster_coefs_avg, delimiter=",")
252180
if k == 4:
253-
result_dir = f"../cluster-results/{methodname}"
181+
result_dir = f"../cluster-results"
254182
if not os.path.exists(oj(result_dir, clustertype, clustermodel, dataname, f"seed{seed}")):
255183
os.makedirs(oj(result_dir, clustertype, clustermodel, dataname, f"seed{seed}"))
256184
# write the cluster labels along with the first two columns of X to csv
257185
np.savetxt(f"{result_dir}/{clustertype}/{clustermodel}/{dataname}/seed{seed}/k{k}_{variant_name}_labels.csv", labels, delimiter=",")
258186

259-
if variant_name == "elastic_nonnormed_nosquared_norank":
187+
if variant_name == "elastic_norank":
260188
# combine the train and test data for each seed
261189
for key in range(100):
262190
global_train_X[key] = np.concatenate(global_train_X[key])
@@ -265,15 +193,7 @@
265193
global_test_y[key] = np.concatenate(global_test_y[key])
266194

267195
# fit model on global data
268-
if clustermodel == 'linear':
269-
est = LinearRegression()
270-
elif clustermodel == 'tree':
271-
est = DecisionTreeRegressor(max_depth=3,
272-
random_state=42)
273-
else:
274-
est = RandomForestRegressor(n_estimators=100,
275-
max_depth=3,
276-
random_state=42)
196+
est = LinearRegression()
277197
est.fit(global_train_X[key], global_train_y[key])
278198

279199
# get predictions
@@ -284,19 +204,15 @@
284204
else:
285205
variant_mse["global_" + variant_name].append(mean_squared_error(global_test_y[key], y_pred_global))
286206
variant_mse["global_" + variant_name] = np.array(variant_mse["global_" + variant_name])
287-
print(variant_mse["global_" + variant_name])
288207

289208
variant_mse[variant_name] = np.average(cluster_mses, axis=1, weights=cluster_sizes)
290-
# print(variant_mse)
291209

292210
# turn variant_mse into a dataframe with key as column name and mse as value
293211
variant_mse_df = pd.DataFrame(variant_mse)
294-
# print(variant_mse_df.shape)
212+
295213
# take the average of each column
296214
variant_mse_mean = variant_mse_df.mean(axis=0)
297215
# take the sd of each column
298-
# print(variant_mse_df.shape)
299-
# print(variant_mse_df)
300216
variant_mse_sd = variant_mse_df.std(axis=0)
301217

302218
# save to list
@@ -306,13 +222,10 @@
306222
# aggregate the list of pd.Series into a dataframe
307223
variant_mse_means_df = pd.DataFrame(variant_mse_means)
308224
variant_mse_sds_df = pd.DataFrame(variant_mse_sds)
309-
# print(variant_mse_means_df)
310-
# print(variant_mse_sds_df)
311225

312226
# write each of the dataframes to a csv
313227
# if the path does not exist, create it
314-
# result_dir = f"../cluster-results/{datafolder}/{methodname}/split-post-cluster"
315-
result_dir = f"../cluster-results/{methodname}"
228+
result_dir = f"../cluster-results"
316229
if not os.path.exists(oj(result_dir, clustertype, clustermodel, dataname, f"seed{seed}")):
317230
os.makedirs(oj(result_dir, clustertype, clustermodel, dataname, f"seed{seed}"))
318231
variant_mse_means_df.to_csv(f"{result_dir}/{clustertype}/{clustermodel}/{dataname}/seed{seed}/cluster_mse_mean.csv")
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash
2+
#SBATCH --partition=jsteinhardt
3+
4+
dataname="361260"
5+
clustertype="kmeans"
6+
clustermodel="linear"
7+
# methodname="rf"
8+
# datafolder="fulldata"
9+
10+
source activate mdi
11+
command="compile-results.py --dataname $dataname --seed ${1} --clustertype $clustertype --clustermodel $clustermodel" # --methodname ${5} --datafolder $datafolder"
12+
# command="compile-results.py --dataname $dataname --seed $seed --clustertype $clustertype --clustermodel $clustermodel --methodname $methodname --datafolder $datafolder"
13+
14+
# Execute the command
15+
python $command
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/bin/bash
2+
#SBATCH --partition=jsteinhardt
3+
4+
slurm_script="compile-results.sh"
5+
modeltype=("linear")
6+
ids=("361260")
7+
clusttype=("kmeans")
8+
seeds=(0 1 2 3 4)
9+
10+
for seed in "${seeds[@]}"; do
11+
sbatch $slurm_script $seed # Submit SLURM job using the specified script
12+
done

0 commit comments

Comments
 (0)