Description
I'm using Keras to train a model in Google Colab. During training, I achieve high performance with a small mean absolute error (MAE) as shown by the training metrics. However, when I use model.predict() on the training data, the performance significantly drops (i.e., the MAE is much higher). I'm trying to understand why this discrepancy occurs.
I'm using cross-validation, and the issue persists across almost all folds. For example, in the graph below from fold 3, the training MAE decreases to around 3.5, but when I calculate the MAE using mean_absolute_error on the predictions from model.predict(), it increases to about 5.6.
I've checked the shape and type of both y_train and y_train_pred, as I noticed others have faced similar issues due to shape mismatches. Despite ensuring they match, the discrepancy still exists.
Here's my code for the training loop and result display:
# --- Cross-validation configuration ---------------------------------------
n_folds = 17  # number of stratified folds
n_bins = 4    # number of quantile bins used only for stratification

# The target is continuous, so it is discretized into ordinal quantile bins
# and the folds are stratified on the bin labels.
kbd = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile')
y_binned = kbd.fit_transform(all_y).flatten()

# Shuffled, seeded stratified splitter.
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed_value)

# Per-fold result accumulators (one entry appended per fold).
train_mae_list, test_mae_list = [], []
train_r2_list, test_r2_list = [], []
train_loss_list, test_loss_list = [], []
# Loop through the folds: train a fresh model per fold, score it on the train
# and test splits, record the results, and plot the learning curves.
for fold, (train_index, test_index) in enumerate(skf.split(X_ppg_norm, y_binned), 1):
    # Split the data into train and test sets
    X_train_ppg, X_test_ppg = X_ppg_norm[train_index], X_ppg_norm[test_index]
    # X_train_gaussian, X_test_gaussian = X_gaussian_norm[train_index], X_gaussian_norm[test_index]
    X_train_demo, X_test_demo = X_demo_norm[train_index], X_demo_norm[test_index]
    y_train, y_test = all_y[train_index], all_y[test_index]
    print(f"test_indices: {test_index}")

    # Create a new model for this fold so no weights carry over between folds
    model = create_model((10, 4, 243), (4,))

    # Define the optimizer. No gradient clipping is configured here; if it is
    # wanted, it belongs on the optimizer (e.g. Adam(..., clipnorm=1.0)),
    # not on model.compile().
    optimizer = Adam(learning_rate=0.01)

    # Compile the model
    model.compile(optimizer=optimizer, loss=Huber(delta=1.0), metrics=['mae'])

    # Halve the learning rate when the *training* loss plateaus
    lr_scheduler = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=4, min_lr=1e-15, verbose=1)

    # NOTE(review): early stopping monitors val_mae, and validation_data below
    # is this fold's test split, so the test fold takes part in model
    # selection and the reported test MAE is likely optimistic.
    early_stopping = EarlyStopping(
        monitor='val_mae',
        patience=150,
        restore_best_weights=True,
        verbose=1
    )

    # Train the model
    history = model.fit(
        [X_train_ppg, X_train_demo],
        y_train,
        epochs=300,
        batch_size=16,
        validation_data=([X_test_ppg, X_test_demo], y_test),
        callbacks=[lr_scheduler, early_stopping],
        verbose=0
    )

    # Evaluate the model
    # NOTE(review): these numbers are not expected to equal history['mae'].
    # Per the Keras docs, the training metric logged by fit() is averaged over
    # the batches of an epoch while the weights are still changing (and with
    # training-mode layers such as dropout, if create_model uses any), whereas
    # predict() runs in inference mode on the final weights. In addition,
    # restore_best_weights=True may roll the weights back to the epoch with the
    # best val_mae, which is not necessarily the epoch with the best train MAE.
    # Confirm against the installed Keras version.
    y_train_pred = model.predict([X_train_ppg, X_train_demo])
    y_test_pred = model.predict([X_test_ppg, X_test_demo])

    # Debug output: compare shapes/types of targets and predictions
    print(f"Shape of y_train_pred: {y_train_pred.shape}")
    print(f"type of y_train_pred: {type(y_train_pred)}")
    print(f"Shape of y_train: {y_train.shape}")
    # Fixed: this previously printed type(y_train_pred) under the y_train label
    print(f"type of y_train: {type(y_train)}")

    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    train_mae_list.append(train_mae)
    test_mae_list.append(test_mae)
    train_r2_list.append(train_r2)
    test_r2_list.append(test_r2)
    train_loss_list.append(history.history['loss'])
    test_loss_list.append(history.history['val_loss'])

    # Debug print for each fold
    print(f"Fold {fold} - Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}")

    # Plot training history for the current fold
    plt.figure(figsize=(12, 4))

    # Left panel: loss curves
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Test Loss')
    plt.title(f'Model Loss - Fold {fold}')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.ylim([0, 20])
    plt.legend()

    # Right panel: MAE curves
    plt.subplot(1, 2, 2)
    plt.plot(history.history['mae'], label='Train MAE')
    plt.plot(history.history['val_mae'], label='Test MAE')
    plt.title(f'Model MAE - Fold {fold}')
    plt.xlabel('Epoch')
    plt.ylabel('MAE')
    plt.ylim([0, 20])
    plt.legend()

    plt.tight_layout()
    plt.show()

    # Plot scatter plot for the current fold (true vs predicted on the test split)
    plt.figure(figsize=(6, 4))
    plt.scatter(y_test, y_test_pred)
    # Dashed red identity line: points on it are perfect predictions
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    plt.xlabel('True Values')
    plt.ylabel('Predictions')
    plt.title(f'True vs Predicted Values - Fold {fold}')
    plt.axis('equal')
    plt.axis('square')
    plt.xlim([y_test.min(), y_test.max()])
    plt.ylim([y_test.min(), y_test.max()])
    plt.show()