Skip to content

Commit 1ebb80e

Browse files
authored
Merge pull request #18 from tamnva/development
add feature importance
2 parents 153b392 + dc4ec60 commit 1ebb80e

File tree

12 files changed

+185
-49
lines changed

12 files changed

+185
-49
lines changed

hydroecolstm/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
__version__ = "0.3.4"
1+
__version__ = "0.3.5"
22
__author__ = 'Tam V. Nguyen'
33
__credits__ = 'Helmholtz Centre for Environmental Research (UFZ)'
44

55
from .import data, interface, model, utility, train
6-
__all__ = ["data", "interface", "model", "utility", "train"]
6+
__all__ = ["data", "interface", "model", "utility", "train", "feat_importance"]

hydroecolstm/data/read_data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,4 +225,4 @@ def read_scale_data(config):
225225
data['y_train_scale'] = data['y_scaler'].transform(x=data["y_train"])
226226
data['y_valid_scale'] = data['y_scaler'].transform(x=data["y_valid"])
227227

228-
return data
228+
return data

hydroecolstm/data/scaler.py

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,37 +23,55 @@ def fit(self, x=None, method=None):
2323

2424
for i, method_name in zip(range(len(method)), method):
2525
if method_name == "MinMaxScaler":
26+
27+
# If cannot scale by MinMaxScaler then do not scale
28+
if torch.any(self.maxs[i] - self.mins[i] == 0.0):
29+
print("max-min is zero, cannot use MinMaxScaler, no transform")
30+
self.mins[self.maxs[i] - self.mins[i] == 0.0] = 0.0
31+
self.maxs[self.maxs[i] - self.mins[i] == 0.0] = 1.0
32+
2633
scaler_a.append(self.mins[i])
2734
scaler_b.append(self.maxs[i] - self.mins[i])
35+
2836
elif method_name=="Z-score":
37+
38+
if torch.any(self.stds[i] == 0.0):
39+
print("standard deviation is zero, cannot use Z-score, no transform")
40+
self.means[self.stds[i] == 0.] = 0.0
41+
self.stds[self.stds[i] == 0.] = 1.0
42+
2943
scaler_a.append(self.means[i])
3044
scaler_b.append(self.stds[i])
45+
3146
elif method_name=="None":
3247
scaler_a.append(0.0)
3348
scaler_b.append(1.0)
49+
3450
else:
3551
print("Error: unknown scaler")
3652
SystemExit("Program stop, please change scaler")
3753

3854
scaler_ab = torch.cat((torch.tensor(scaler_a, dtype=torch.float32),
3955
torch.tensor(scaler_b, dtype=torch.float32)), 0)
4056

41-
self.scaler_parameter = torch.reshape(scaler_ab,
42-
(2,len(scaler_a)))
57+
self.scaler_parameter = torch.reshape(
58+
scaler_ab,(2,len(scaler_a)))
4359

44-
def transform(self, x:dict[str:torch.tensor]=None) -> list:
60+
def transform(self, x:dict[str:torch.tensor]=None) -> list:
4561
x_scale = {}
4662
for object_id in x:
47-
x_scale[object_id] = torch.div(torch.sub(x[object_id],
48-
self.scaler_parameter[0,:]),
49-
self.scaler_parameter[1,:])
63+
x_scale[object_id] = torch.div(
64+
torch.sub(x[object_id],self.scaler_parameter[0,:]),
65+
self.scaler_parameter[1,:])
5066
return x_scale
5167

5268
def inverse(self, x:list=None) -> list:
5369
x_inverse = {}
5470
for object_id in x:
55-
x_inverse[object_id] = torch.add(self.scaler_parameter[0,:],
56-
x[object_id]*self.scaler_parameter[1,:])
71+
x_inverse[object_id] = torch.add(
72+
self.scaler_parameter[0,:],
73+
x[object_id]*self.scaler_parameter[1,:])
74+
5775
return x_inverse
5876

5977
def _column_mins(input_tensor: torch.tensor=None):
@@ -105,7 +123,8 @@ def get_scaler_name(config):
105123
scaler_name_input.append(name)
106124

107125
# scaler name target
108-
scaler_name_target = config["scaler_target_features"]*len(config["target_features"])
126+
scaler_name_target = config["scaler_target_features"]*len(
127+
config["target_features"])
109128

110129
return scaler_name_input, scaler_name_target
111130

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .perm_feat_importance import pfib
2+
3+
__all__= ["pfib"]
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
2+
import copy
3+
import torch
4+
import pandas as pd
5+
from hydroecolstm.utility.evaluation_function import EvaluationFunction
6+
7+
#features=features
8+
#x_test_scale=data["x_test_scale"]
9+
#y_test=data["y_test"]
10+
#y_scaler=data["y_scaler"]
11+
#trained_model=model
12+
#objective_function_name="NSE"
13+
#y_column_name=data["y_column_name"]
14+
#nskip=config["warmup_length"]
15+
#seed=100
16+
17+
# Permutation feature important basin wise
18+
def pfib(features: str, x_test_scale:dict[str, torch.Tensor],
19+
y_test:dict[str, torch.Tensor], y_scaler,
20+
trained_model, objective_function_name:str,
21+
nskip:int, y_column_name:str, seed:int=None):
22+
23+
# Evaluation function
24+
objective = EvaluationFunction(function_name=objective_function_name,
25+
nskip=nskip, y_column_name=y_column_name)
26+
27+
#obj = objective(y_test,
28+
# y_scaler.inverse(trained_model.evaluate(x_test_scale)))
29+
30+
# Loop over features
31+
for i in range(len(features)):
32+
x_perm = {}
33+
34+
for key, x in zip(x_test_scale.keys(), x_test_scale.values()):
35+
36+
# Shuffle index of feature i
37+
if seed is not None:
38+
torch.manual_seed(0)
39+
40+
idx = torch.randperm(x.shape[0])
41+
42+
# Shuffle data
43+
x_copy = copy.deepcopy(x)
44+
x_copy[:,i] = x_copy[idx, i]
45+
46+
# Save permutated data for each key
47+
x_perm[key] = copy.deepcopy(x_copy)
48+
49+
prediction = y_scaler.inverse(trained_model.evaluate(x_perm))
50+
51+
if i == 0:
52+
output = objective(y_test, prediction)
53+
output.columns = features[i] + "_" + output.columns
54+
#output.rename(columns={output.columns[0]: features[i]},
55+
# inplace=True)
56+
else:
57+
temp = objective(y_test, prediction)
58+
temp.columns = features[i] + "_" + temp.columns
59+
60+
output = pd.concat([output, temp], axis=1)
61+
#output[features[i]] = objective(
62+
# y_test, prediction)["objective_function_value"]
63+
64+
#output.columns = ["s"]
65+
66+
return output #.subtract(obj['objective_function_value'], axis=0)
67+
68+

hydroecolstm/interface/utility.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -341,9 +341,12 @@ def config_to_text(config):
341341
elif key == "valid_period":
342342
out_text.append(" - " + str(config["valid_period"][0])[:16] + "\n")
343343
out_text.append(" - " + str(config["valid_period"][1])[:16] + "\n")
344-
else:
344+
elif key == "test_period":
345345
out_text.append(" - " + str(config["test_period"][0])[:16] + "\n")
346-
out_text.append(" - " + str(config["test_period"][1])[:16] + "\n")
346+
out_text.append(" - " + str(config["test_period"][1])[:16] + "\n")
347+
else:
348+
out_text.append(" - " + str(config["forecast_period"][0])[:16] + "\n")
349+
out_text.append(" - " + str(config["forecast_period"][1])[:16] + "\n")
347350
except:
348351
# Non-list object written in 1 line
349352
out_text.append(key +": " + str(config[key]) + "\n")

hydroecolstm/train/custom_loss.py

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@ def __init__(self, loss_function:str):
88
# Dict of all available loss functions
99
loss_functions = {"MSE": self.MSE,
1010
"RMSE": self.RMSE,
11+
"RMSE_normalize": self.RMSE_normalize,
1112
"MAE": self.MAE,
1213
"NSE_complement": self.NSE_complement}
1314

1415
# Use this loss function
15-
1616
self.loss_function = loss_functions[loss_function]
1717

1818
def forward(self, y_true:torch.Tensor, y_predict:torch.Tensor) -> torch.Tensor:
@@ -23,7 +23,10 @@ def forward(self, y_true:torch.Tensor, y_predict:torch.Tensor) -> torch.Tensor:
2323

2424
mask = ~torch.isnan(y_true)
2525
loss = self.loss_function(y_true, y_predict, mask)
26-
26+
27+
if torch.isnan(loss).any():
28+
raise ValueError("loss is nan, cannot train the model, check training data")
29+
2730
return loss
2831

2932
# Mean square error
@@ -52,8 +55,18 @@ def RMSE(self, y_true:torch.Tensor, y_predict:torch.Tensor,
5255

5356
# Root Mean Square Error
5457
rmse = self.MSE(y_true, y_predict, mask)**0.5
58+
5559
return rmse
5660

61+
def RMSE_normalize(self, y_true:torch.Tensor, y_predict:torch.Tensor,
62+
mask:torch.Tensor)-> torch.Tensor:
63+
64+
# Root Mean Square Error
65+
rmse_normalize = self.MSE(y_true, y_predict, mask)**0.5/ torch.mean(
66+
y_true[mask])
67+
68+
return rmse_normalize
69+
5770
# Complement to 1 of the Nash-Sutcliffe (or 1- Nash sutcliffe)
5871
def NSE_complement(self, y_true:torch.Tensor, y_predict:torch.Tensor,
5972
mask:torch.Tensor)-> torch.Tensor:
@@ -66,14 +79,3 @@ def NSE_complement(self, y_true:torch.Tensor, y_predict:torch.Tensor,
6679

6780
# Minimize loss, so output should be sse/ssd, which is 1 - NSE
6881
return sse/ssd
69-
70-
71-
#x = CustomLoss(config["loss_function"])
72-
73-
74-
75-
76-
77-
78-
79-

hydroecolstm/train/trainer.py

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
import numpy as np
22
import pandas as pd
33
from ray import train
4+
import ray
45
import tempfile
56
import os
67
import copy
78
import torch
89
from torch.utils.data import DataLoader
9-
from pathlib import Path
1010
from hydroecolstm.train.custom_loss import CustomLoss
1111
from hydroecolstm.data.custom_dataset import CustomDataset
1212

@@ -124,7 +124,7 @@ def train(self,
124124

125125
print(f"Epoch [{epoch+1}/{self.n_epochs}], ",
126126
f"average_train_loss = {train_loss_epoch[-1]:.8f}, ",
127-
f"avearge_valid_loss = {valid_loss_epoch[-1]:.8f}")
127+
f"average_valid_loss = {valid_loss_epoch[-1]:.8f}")
128128

129129
# Early stopping based on validation loss and make checkpoint
130130
flag = early_stopping(valid_loss_epoch[-1], self.model)
@@ -157,11 +157,6 @@ def train(self,
157157
valid_loss_epoch,
158158
check_point)
159159

160-
# Save loss_epoch incase of automatic hyperparam optim with tune
161-
#self.loss_epoch.to_csv(
162-
# Path(self.out_dir, str(np.random.randint(1, 1e9)) + ".txt"),
163-
# sep='\t')
164-
165160
return self.model
166161

167162
# Save intermediate result at check points
@@ -183,9 +178,10 @@ def _save_check_point(self, train_loss_epoch, valid_loss_epoch,
183178
valid_loss_epoch,
184179
check_point)
185180

186-
train.report({'loss': train_loss_epoch[-1],
187-
'loss_epoch': loss_epoch},
188-
checkpoint=checkpoint)
181+
if ray.train._internal.session.get_session():
182+
train.report({'loss': train_loss_epoch[-1],
183+
'loss_epoch': loss_epoch},
184+
checkpoint=checkpoint)
189185

190186
# Create data frame of epoch number, train loss, valid loss
191187
def _create_train_loss_df(self, train_loss_epoch, valid_loss_epoch,

hydroecolstm/utility/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from .evaluation_function import EvaluationFunction
22
from .plot import plot
3+
from .format_conversion import tensor_to_pandas_df
34

45
__all__= ["EvaluationFunction", "plot"]

hydroecolstm/utility/evaluation_function.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import torch
2+
import pandas as pd
23

34
class EvaluationFunction():
4-
def __init__(self, function_name:str, nskip:int):
5+
def __init__(self, function_name:str, nskip:int, y_column_name:str):
56

67
# Dict of all available evaluation functions
78
evaluation_functions = {"MSE": self.MSE, "RMSE": self.RMSE,
@@ -10,6 +11,8 @@ def __init__(self, function_name:str, nskip:int):
1011
# Selected evaluation function
1112
self.eval_function = evaluation_functions[function_name]
1213
self.nskip = nskip
14+
self.function_name = function_name
15+
self.y_column_name = y_column_name
1316

1417
def __call__(self, y_true:torch.Tensor, y_predict:torch.Tensor) -> torch.Tensor:
1518

@@ -19,16 +22,21 @@ def __call__(self, y_true:torch.Tensor, y_predict:torch.Tensor) -> torch.Tensor:
1922
for key in y_true.keys():
2023
eval_values[key] = self.eval_function(y_true[key][self.nskip:,],
2124
y_predict[key][self.nskip:,])
22-
23-
avg_eval_values = sum(sum(eval_values.values()))/((len(eval_values))*eval_values[next(iter(eval_values))].shape[0])
2425

25-
return eval_values, avg_eval_values
26+
27+
df = pd.DataFrame(torch.stack(list(eval_values.values())).numpy())
28+
df.index = eval_values.keys()
29+
df.columns = [self.function_name + "_" + name
30+
for name in self.y_column_name]
31+
32+
return df
2633

2734
def MSE(self, ytrue:torch.Tensor, ypredict:torch.Tensor):
2835
mask = ~torch.isnan(ytrue)
2936
mse = []
3037
for i in range(ytrue.shape[1]):
31-
mse.append(torch.mean((ytrue[:,i][mask[:,i]] - ypredict[:,i][mask[:,i]])**2))
38+
mse.append(torch.mean((ytrue[:,i][mask[:,i]] -
39+
ypredict[:,i][mask[:,i]])**2))
3240
mse = torch.stack(mse)
3341
return mse
3442

@@ -38,23 +46,20 @@ def RMSE(self, ytrue:torch.Tensor, ypredict:torch.Tensor):
3846
rmse = mse**0.5
3947
return rmse
4048

41-
# 1 - Nash–Sutcliffe efficiency (NSE)
49+
# Nash–Sutcliffe efficiency (NSE)
4250
def NSE(self, ytrue:torch.Tensor, ypredict:torch.Tensor):
4351
mask = ~torch.isnan(ytrue)
4452

4553
# Sum of Square Error (sse) = sum((true-predict)^2)
4654
# Sum of Square Difference around mean (ssd) = sum((true-mean_true)^2)
4755
sse = []
4856
ssd = []
57+
4958
for i in range(ytrue.shape[1]):
5059
sse.append(torch.sum((ytrue[:,i][mask[:,i]] - ypredict[:,i][mask[:,i]])**2))
5160
ssd.append(torch.sum((ytrue[:,i][mask[:,i]] - torch.nanmean(ytrue[:,i]))**2))
5261

53-
# get 1 - nse, here I call it as nse
5462
nse = 1.0 - torch.stack(sse)/torch.stack(ssd)
55-
56-
if torch.isnan(nse).any():
57-
raise ValueError("nan values found when calculating NSE - zero division")
5863

5964
return nse
6065

0 commit comments

Comments
 (0)