|
5 | 5 | # sklearn imports |
6 | 6 | from sklearn.model_selection import train_test_split |
7 | 7 | from sklearn.linear_model import LinearRegression |
8 | | -from sklearn.metrics import mean_squared_error, r2_score |
9 | | -from sklearn.tree import DecisionTreeRegressor |
10 | | -from sklearn.ensemble import RandomForestRegressor |
| 8 | +from sklearn.metrics import mean_squared_error |
11 | 9 | from sklearn.cluster import KMeans |
12 | | -from sklearn.preprocessing import StandardScaler |
13 | 10 |
|
14 | 11 | # hierarchical clustering imports |
15 | 12 | from scipy.cluster import hierarchy |
16 | 13 |
|
17 | | -# data getter imports |
18 | | -from data_loader import load_regr_data |
19 | | - |
20 | 14 | # filesystem imports |
21 | 15 | import os |
22 | 16 | from os.path import join as oj |
|
37 | 31 | parser.add_argument('--seed', type=int, default=None) |
38 | 32 | parser.add_argument('--clustertype', type=str, default=None) |
39 | 33 | parser.add_argument('--clustermodel', type=str, default=None) |
40 | | - parser.add_argument('--datafolder', type=str, default=None) |
41 | | - parser.add_argument('--methodname', type=str, default=None) |
| 34 | + # parser.add_argument('--datafolder', type=str, default=None) |
| 35 | + # parser.add_argument('--methodname', type=str, default=None) |
42 | 36 | args = parser.parse_args() |
43 | 37 |
|
44 | 38 | # convert namespace to a dictionary |
|
49 | 43 | seed = args_dict['seed'] |
50 | 44 | clustertype = args_dict['clustertype'] |
51 | 45 | clustermodel = args_dict['clustermodel'] |
52 | | - datafolder = args_dict['datafolder'] |
53 | | - methodname = args_dict['methodname'] |
| 46 | + # datafolder = args_dict['datafolder'] |
| 47 | + # methodname = args_dict['methodname'] |
54 | 48 |
|
55 | 49 | # check that clustertype is either 'hierarchical' or 'kmeans' |
56 | 50 | if clustertype not in ['hierarchical', 'kmeans']: |
57 | 51 | raise ValueError("clustertype must be either 'hierarchical' or 'kmeans'") |
58 | 52 |
|
59 | | - # check that clustermodel is either 'linear' or 'tree' |
60 | | - if clustermodel not in ['linear', 'tree', 'rf']: |
61 | | - raise ValueError("clustermodel must be either 'linear', 'tree', or 'rf'") |
| 53 | + # check that clustermodel is 'linear' |
| 54 | + if clustermodel != 'linear': |
| 55 | + raise ValueError("clustermodel must be 'linear'") |
62 | 56 |
|
63 | | - # check that methodname is either rf or gb |
64 | | - if methodname not in ['rf', 'gb']: |
65 | | - raise ValueError("methodname must be either 'rf' or 'gb'") |
| 57 | + # methodname is no longer parsed, so this check stays disabled
| 58 | + # if methodname != 'rf': |
| 59 | + # raise ValueError("methodname must be 'rf'") |
66 | 60 |
|
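Note: the two remaining checks could equivalently be enforced at parse time with argparse's built-in choices parameter; a minimal sketch using the argument names from this diff (behavior-equivalent, not what the script currently does):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=None)
    # choices= makes argparse reject anything outside the allowed set,
    # subsuming the manual ValueError checks above
    parser.add_argument('--clustertype', type=str,
                        choices=['hierarchical', 'kmeans'])
    parser.add_argument('--clustermodel', type=str, choices=['linear'])
    args = parser.parse_args()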
67 | 61 | print("Compiling results for " + dataname + " with " + clustertype + \ |
68 | 62 | " clustering and " + clustermodel + " cluster model") |
69 | 63 |
|
70 | 64 | # if dataname not in results folder, skip |
71 | | - if not os.path.exists(f"../lfi-values/{datafolder}/{methodname}/seed{seed}/{dataname}"): |
| 65 | + if not os.path.exists(f"../lfi-values/seed{seed}/{dataname}"): |
72 | 66 | print("No results for " + dataname) |
73 | 67 | else: |
74 | 68 |
|
|
79 | 73 | X = X.astype(np.float32) |
80 | 74 | y = y.astype(np.float32) |
81 | 75 |
|
82 | | - # if the data is standardize it, we need to standardize again here |
83 | | - if datafolder == "standardized-fulldata": |
84 | | - scaler = StandardScaler() |
85 | | - X = scaler.fit_transform(X) |
86 | | - y = (y - np.mean(y)) / np.std(y) |
87 | | - if datafolder == "standardizedX-fulldata": |
88 | | - scaler = StandardScaler() |
89 | | - X = scaler.fit_transform(X) |
90 | | - |
91 | | - # if X has more than 5k rows, sample 5k rows of X and y |
92 | | - # if X.shape[0] > 5000: |
93 | | - # np.random.seed(42) |
94 | | - # indices = np.random.choice(X.shape[0], 5000, replace=False) |
95 | | - # X = X[indices] |
96 | | - # y = y[indices] |
97 | | - |
98 | 76 | # split data into training and testing |
99 | 77 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, |
100 | 78 | random_state = seed) |
101 | 79 |
|
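Note: test_size=0.5 gives the 50/50 split used throughout this script, and passing the command-line seed as random_state makes reruns reproducible. A self-contained toy example (synthetic data, seed 0):

    import numpy as np
    from sklearn.model_selection import train_test_split

    X = np.arange(20, dtype=np.float32).reshape(10, 2)  # 10 rows, 2 features
    y = np.arange(10, dtype=np.float32)

    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.5,
                                              random_state=0)
    print(X_tr.shape, X_te.shape)  # (5, 2) (5, 2)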
102 | | - |
103 | | - # X, y, names_covariates = load_regr_data(dataname, dir_data) |
104 | | - # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, |
105 | | - # random_state = seed) |
106 | | - # read in lmdi variants |
107 | | - # glm = ["ridge", "lasso", "elastic"] |
108 | | - # normalize = {True: "normed", False: "nonnormed"} |
109 | | - # square = {True: "squared", False: "nosquared"} |
110 | | - # leaf_average = {True: "leafavg", False: "noleafavg"} |
111 | | - # ranking = {True: "rank", False: "norank"} |
112 | 80 | glm = ["elastic"] |
113 | | - normalize = {False: "nonnormed"} |
114 | | - square = {False: "nosquared"} |
115 | | - leaf_average = {False: "noleafavg"} |
116 | 81 | ranking = {False: "norank"} |
117 | 82 |
|
118 | 83 | # create the mapping of variants to argument mappings |
119 | 84 | lfi_methods = [] |
120 | 85 | for g in glm: |
121 | | - for n in normalize: |
122 | | - for s in square: |
123 | | - for r in ranking: |
124 | | - if (not n) and (s): |
125 | | - continue |
126 | | - # create the name the variant will be stored under |
127 | | - variant_name = f"{g}_{normalize[n]}_{square[s]}_{ranking[r]}" |
128 | | - # store the arguments for the lmdi+ explainer |
129 | | - arg_map = {"glm": g, "normalize": n, "square": s, |
130 | | - "ranking": r} |
131 | | - lfi_methods.append(variant_name) |
| 86 | + for r in ranking: |
| 87 | + # create the name the variant will be stored under |
| 88 | + variant_name = f"{g}_{ranking[r]}" |
| 89 | + # store the arguments for the lmdi+ explainer |
| 90 | + arg_map = {"glm": g, "ranking": r} |
| 91 | + lfi_methods.append(variant_name) |
132 | 92 | lfi_methods.append("lmdi_baseline") |
133 | 93 |
|
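Note: with the grid trimmed to one GLM and one ranking setting, the nested loop collapses to a single LMDI+ variant plus the baseline. A runnable sketch of exactly what lfi_methods contains afterwards:

    glm = ["elastic"]
    ranking = {False: "norank"}

    lfi_methods = []
    for g in glm:
        for r in ranking:  # iterating a dict yields its keys (here: False)
            lfi_methods.append(f"{g}_{ranking[r]}")
    lfi_methods.append("lmdi_baseline")

    print(lfi_methods)  # ['elastic_norank', 'lmdi_baseline']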
134 | 94 | # for each variant, read in the array |
135 | 95 | lfi_value_dict = {} |
136 | 96 | for variant in lfi_methods: |
137 | 97 | # read in the variant |
138 | | - lmdi = np.loadtxt(f"../lfi-values/{datafolder}/{methodname}/seed{seed}/{dataname}/{variant}.csv", delimiter = ",") |
| 98 | + lmdi = np.loadtxt(f"../lfi-values/seed{seed}/{dataname}/{variant}.csv", delimiter = ",") |
139 | 99 | # store the variant's attribution matrix
140 | 100 | lfi_value_dict[variant] = lmdi |
141 | 101 |
|
142 | 102 | lfi_value_dict["rawdata"] = X_test |
143 | 103 | lfi_value_dict["random"] = X_test |
144 | | - lfi_value_dict["shap"] = np.loadtxt(f"../lfi-values/{datafolder}/{methodname}/seed{seed}/{dataname}/shap.csv", delimiter = ",") |
145 | | - lfi_value_dict["lime"] = np.loadtxt(f"../lfi-values/{datafolder}/{methodname}/seed{seed}/{dataname}/lime.csv", delimiter = ",") |
| 104 | + lfi_value_dict["shap"] = np.loadtxt(f"../lfi-values/seed{seed}/{dataname}/shap.csv", delimiter = ",") |
| 105 | + lfi_value_dict["lime"] = np.loadtxt(f"../lfi-values/seed{seed}/{dataname}/lime.csv", delimiter = ",") |
146 | 106 |
|
147 | 107 | # metrics when predicting with the per-cluster models
148 | 108 | variant_mse_means = [] |
149 | 109 | variant_mse_sds = [] |
150 | | - # variant_r2_means = [] |
151 | | - # variant_r2_sds = [] |
152 | | - |
153 | | - # within cluster variance |
154 | | - # variant_variance_means = [] |
155 | | - # variant_variance_sds = [] |
156 | | - |
157 | | - # metrics when predicting mean of cluster |
158 | | - # variant_avg_mse_means = [] |
159 | | - # variant_avg_mse_sds = [] |
160 | | - # variant_avg_r2_means = [] |
161 | | - # variant_avg_r2_sds = [] |
162 | | - |
163 | | - # k_size_info_maps = {} |
164 | 110 |
|
165 | 111 | for k in range(1, 11): |
166 | 112 |
|
|
187 | 133 | cluster_coefs = np.full((100, k, X_test.shape[1]), np.nan) |
188 | 134 | cluster_sizes = [] |
189 | 135 |
|
190 | | - if variant_name == "elastic_nonnormed_nosquared_norank": |
| 136 | + if variant_name == "elastic_norank": |
191 | 137 | # create mappings with the random seeds as keys and a |
192 | 138 | # list of numpy arrays as values |
193 | 139 | global_train_X = defaultdict(list) |
|
209 | 155 | # randomly split the data into train and test (50/50) |
210 | 156 | X_train_cluster, X_test_cluster, y_train_cluster, y_test_cluster = \ |
211 | 157 | train_test_split(X_cluster, y_cluster, test_size=0.5, random_state=rand) |
212 | | - |
213 | | - if variant_name == "elastic_nonnormed_nosquared_norank": |
| 158 | + |
| 159 | + # let the global model use the same train/test splits as LMDI+
| 160 | + if variant_name == "elastic_norank": |
214 | 161 | # add the train and test data to the lists |
215 | 162 | global_train_X[rand].append(X_train_cluster) |
216 | 163 | global_train_y[rand].append(y_train_cluster) |
217 | 164 | global_test_X[rand].append(X_test_cluster) |
218 | 165 | global_test_y[rand].append(y_test_cluster) |
219 | 166 |
|
220 | 167 | # fit cluster model |
221 | | - if clustermodel == 'linear': |
222 | | - est = LinearRegression() |
223 | | - elif clustermodel == 'tree': |
224 | | - est = DecisionTreeRegressor(max_depth=3, |
225 | | - random_state=42) |
226 | | - else: |
227 | | - est = RandomForestRegressor(n_estimators=100, |
228 | | - max_depth=3, |
229 | | - random_state=42) |
| 168 | + est = LinearRegression() |
230 | 169 | est.fit(X_train_cluster, y_train_cluster) |
231 | 170 |
|
232 | 171 | # get coefs |
233 | | - if clustermodel == 'linear': |
234 | | - cluster_coefs[rand, clust, :] = est.coef_ |
| 172 | + cluster_coefs[rand, clust, :] = est.coef_ |
235 | 173 |
|
236 | 174 | # get predictions |
237 | 175 | y_pred = est.predict(X_test_cluster) |
238 | 176 |
|
239 | 177 | # get performance |
240 | 178 | cluster_mses[rand, clust] = mean_squared_error(y_test_cluster, y_pred) |
241 | 179 |
|
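Note: now that the tree and random-forest branches are gone, every cluster model is a plain LinearRegression, so est.coef_ always exists and the old clustermodel == 'linear' guard around it is unnecessary. A minimal synthetic sketch of the per-cluster fit / coef_ / MSE pattern (made-up data):

    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error

    rng = np.random.default_rng(0)
    X_c = rng.normal(size=(40, 3))           # stand-in for one cluster's rows
    y_c = X_c @ np.array([1.0, -2.0, 0.5])   # noiseless linear target

    est = LinearRegression().fit(X_c, y_c)
    print(est.coef_)                                  # close to [1, -2, 0.5]
    print(mean_squared_error(y_c, est.predict(X_c)))  # close to 0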
242 | | - # average the cluster coefs |
243 | | - if clustermodel == 'linear': |
244 | | - cluster_coefs_avg = np.mean(cluster_coefs, axis=0) |
245 | | - # if k == 5: |
246 | | - # if seed == 0: |
247 | | - # result_dir = f"../cluster-results/{methodname}" |
248 | | - # if not os.path.exists(oj(result_dir, clustertype, clustermodel, dataname, f"seed{seed}")): |
249 | | - # os.makedirs(oj(result_dir, clustertype, clustermodel, dataname, f"seed{seed}")) |
250 | | - # # write the cluster labels along with the first two columns of X to csv |
251 | | - # np.savetxt(f"{result_dir}/{clustertype}/{clustermodel}/{dataname}/seed{seed}/{k}clusters_clust{clust}_{variant_name}_coefs.csv", cluster_coefs_avg, delimiter=",") |
252 | 180 | if k == 4: |
253 | | - result_dir = f"../cluster-results/{methodname}" |
| 181 | + result_dir = "../cluster-results"
254 | 182 | if not os.path.exists(oj(result_dir, clustertype, clustermodel, dataname, f"seed{seed}")): |
255 | 183 | os.makedirs(oj(result_dir, clustertype, clustermodel, dataname, f"seed{seed}")) |
256 | 184 | # write the cluster labels to csv
257 | 185 | np.savetxt(f"{result_dir}/{clustertype}/{clustermodel}/{dataname}/seed{seed}/k{k}_{variant_name}_labels.csv", labels, delimiter=",") |
258 | 186 |
|
259 | | - if variant_name == "elastic_nonnormed_nosquared_norank": |
| 187 | + if variant_name == "elastic_norank": |
260 | 188 | # combine the train and test data for each seed |
261 | 189 | for key in range(100): |
262 | 190 | global_train_X[key] = np.concatenate(global_train_X[key]) |
|
265 | 193 | global_test_y[key] = np.concatenate(global_test_y[key]) |
266 | 194 |
|
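Note: the global baseline reuses the exact per-cluster splits keyed by the resampling seed rand, then concatenates them back into one pooled matrix per seed. A toy sketch of the defaultdict / np.concatenate pattern:

    import numpy as np
    from collections import defaultdict

    global_train_X = defaultdict(list)
    # two clusters contribute their train splits under the same rand key
    global_train_X[0].append(np.ones((3, 2)))
    global_train_X[0].append(np.zeros((2, 2)))

    print(np.concatenate(global_train_X[0]).shape)  # (5, 2)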
267 | 195 | # fit model on global data |
268 | | - if clustermodel == 'linear': |
269 | | - est = LinearRegression() |
270 | | - elif clustermodel == 'tree': |
271 | | - est = DecisionTreeRegressor(max_depth=3, |
272 | | - random_state=42) |
273 | | - else: |
274 | | - est = RandomForestRegressor(n_estimators=100, |
275 | | - max_depth=3, |
276 | | - random_state=42) |
| 196 | + est = LinearRegression() |
277 | 197 | est.fit(global_train_X[key], global_train_y[key]) |
278 | 198 |
|
279 | 199 | # get predictions |
|
284 | 204 | else: |
285 | 205 | variant_mse["global_" + variant_name].append(mean_squared_error(global_test_y[key], y_pred_global)) |
286 | 206 | variant_mse["global_" + variant_name] = np.array(variant_mse["global_" + variant_name]) |
287 | | - print(variant_mse["global_" + variant_name]) |
288 | 207 |
|
289 | 208 | variant_mse[variant_name] = np.average(cluster_mses, axis=1, weights=cluster_sizes) |
290 | | - # print(variant_mse) |
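Note: np.average with weights= computes sum(w_i * x_i) / sum(w_i) along the given axis, so larger clusters contribute proportionally more to each repetition's MSE. A sketch with synthetic numbers:

    import numpy as np

    # 2 repetitions (rows) x 3 clusters (columns)
    cluster_mses = np.array([[1.0, 2.0, 4.0],
                             [2.0, 2.0, 2.0]])
    cluster_sizes = [50, 30, 20]  # weights shared across repetitions

    # row 0: (50*1 + 30*2 + 20*4) / 100 = 1.9
    print(np.average(cluster_mses, axis=1, weights=cluster_sizes))  # [1.9 2. ]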
291 | 209 |
|
292 | 210 | # turn variant_mse into a dataframe with key as column name and mse as value |
293 | 211 | variant_mse_df = pd.DataFrame(variant_mse) |
294 | | - # print(variant_mse_df.shape) |
| 212 | + |
295 | 213 | # take the average of each column |
296 | 214 | variant_mse_mean = variant_mse_df.mean(axis=0) |
297 | 215 | # take the sd of each column |
298 | | - # print(variant_mse_df.shape) |
299 | | - # print(variant_mse_df) |
300 | 216 | variant_mse_sd = variant_mse_df.std(axis=0) |
301 | 217 |
|
302 | 218 | # save to list |
|
306 | 222 | # aggregate the list of pd.Series into a dataframe |
307 | 223 | variant_mse_means_df = pd.DataFrame(variant_mse_means) |
308 | 224 | variant_mse_sds_df = pd.DataFrame(variant_mse_sds) |
309 | | - # print(variant_mse_means_df) |
310 | | - # print(variant_mse_sds_df) |
311 | 225 |
|
312 | 226 | # write each of the dataframes to a csv |
313 | 227 | # if the path does not exist, create it |
314 | | - # result_dir = f"../cluster-results/{datafolder}/{methodname}/split-post-cluster" |
315 | | - result_dir = f"../cluster-results/{methodname}" |
| 228 | + result_dir = "../cluster-results"
316 | 229 | if not os.path.exists(oj(result_dir, clustertype, clustermodel, dataname, f"seed{seed}")): |
317 | 230 | os.makedirs(oj(result_dir, clustertype, clustermodel, dataname, f"seed{seed}")) |
318 | 231 | variant_mse_means_df.to_csv(f"{result_dir}/{clustertype}/{clustermodel}/{dataname}/seed{seed}/cluster_mse_mean.csv") |
|
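Note: the exists-then-makedirs pair above (used for both the k == 4 label dump and these final csvs) can be collapsed with the standard-library exist_ok flag; a sketch with hypothetical placeholder values standing in for the script's variables:

    import os
    from os.path import join as oj

    result_dir = "../cluster-results"
    out_dir = oj(result_dir, "kmeans", "linear", "somedata", "seed0")
    # exist_ok=True makes the separate os.path.exists() check unnecessary
    os.makedirs(out_dir, exist_ok=True)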