Description
The current implementation of `load_dataset()` in `data.py` instantiates a `StandardScaler` by default:
```python
def load_dataset(dataset_dir, batch_size, val_batch_size=None, test_batch_size=None):
    if val_batch_size is None:
        val_batch_size = batch_size
    if test_batch_size is None:
        test_batch_size = batch_size
    data = {}
    # Load the raw train/val/test splits from disk.
    for category in ["train", "val", "test"]:
        cat_data = np.load(os.path.join(dataset_dir, category + ".npz"))
        data["x_" + category] = cat_data["x"]
        data["y_" + category] = cat_data["y"]
    # The scaler is hard-wired here: fit on the training inputs,
    # then applied in place to every split.
    scaler = StandardScaler(data["x_train"][..., 0])
    for category in ["train", "val", "test"]:
        data["x_" + category][..., 0] = scaler.transform(data["x_" + category][..., 0])
        data["y_" + category][..., 0] = scaler.transform(data["y_" + category][..., 0])
    data_train = PaddedDataset(batch_size, data["x_train"], data["y_train"])
    data["train_loader"] = DataLoader(data_train, batch_size, shuffle=True)
    data_val = PaddedDataset(val_batch_size, data["x_val"], data["y_val"])
    data["val_loader"] = DataLoader(data_val, val_batch_size, shuffle=False)
    data_test = PaddedDataset(test_batch_size, data["x_test"], data["y_test"])
    data["test_loader"] = DataLoader(data_test, test_batch_size, shuffle=False)
    data["scaler"] = scaler
    return data
```
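For reference, the `StandardScaler` used above is not shown in the snippet. A minimal sketch consistent with how it is called here (fit on the array passed at construction, `transform` to normalize) could look like the following; the actual implementation in the repo may differ:

```python
class StandardScaler:
    """Z-score normalization fit from the data passed at construction.

    A sketch only: the repo's actual StandardScaler may store mean/std
    differently or accept them as explicit arguments.
    """

    def __init__(self, data):
        self.mean = data.mean()
        self.std = data.std()

    def transform(self, data):
        return (data - self.mean) / self.std

    def inverse_transform(self, data):
        return data * self.std + self.mean
```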
The goal is to isolate the scaler from the data loading method, and to support more scalers eventually.
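As a starting point, here is a minimal sketch of that separation, reusing the `StandardScaler` sketch above. The `MinMaxScaler` class, the `get_scaler()` factory, and the `scaler_name` parameter are hypothetical names for illustration, not existing code:

```python
class MinMaxScaler:
    """Hypothetical second scaler: rescales values to [0, 1]."""

    def __init__(self, data):
        self.min = data.min()
        self.max = data.max()

    def transform(self, data):
        return (data - self.min) / (self.max - self.min)

    def inverse_transform(self, data):
        return data * (self.max - self.min) + self.min


def get_scaler(name, fit_data):
    """Factory that keeps scaler construction out of load_dataset()."""
    scalers = {"standard": StandardScaler, "minmax": MinMaxScaler}
    try:
        return scalers[name](fit_data)
    except KeyError:
        raise ValueError(f"Unknown scaler: {name!r}") from None


# load_dataset() would then take the scaler choice as a parameter and
# only apply it, e.g.:
#
#     def load_dataset(dataset_dir, batch_size, ..., scaler_name="standard"):
#         ...
#         scaler = get_scaler(scaler_name, data["x_train"][..., 0])
```

With this split, a new scaler only needs to be registered in the factory, and `load_dataset()` no longer hard-codes `StandardScaler`.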