TOPMODELvsML_CCH/CART.R at main · Jackbakker4444/TOPMODELvsML_CCH · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
# This statement removes everything from R's memory
rm(list=ls())

# Select your working directory (Exercise folder)
setwd("C://Data/CCH/Synthesis_report/CART")

# Install libraries if not already installed

if (!requireNamespace("dplyr", quietly = T)) {install.packages("dplyr")}
if (!requireNamespace("sf", quietly = T)) {install.packages("sf")}
if (!requireNamespace("caret", quietly = T)) {install.packages("caret")}
if (!requireNamespace("hydroGOF", quietly = T)) {install.packages("hydroGOF")}
if (!requireNamespace("rpart", quietly = T)) {install.packages("rpart")}
if (!requireNamespace("rpart.plot", quietly = T)) {install.packages("rpart.plot")}
if (!requireNamespace("leaflet", quietly = T)) {install.packages("leaflet")}
if (!requireNamespace("ggplot2", quietly = T)) {install.packages("ggplot2")}
if (!requireNamespace("patchwork", quietly = T)) {install.packages("patchwork")}
if (!requireNamespace("plotly", quietly = T)) {install.packages("plotly")}
if (!requireNamespace("reshape2", quietly = T)) {install.packages("reshape2")}
if (!requireNamespace("randomForest", quietly = T)) {install.packages("randomForest")}

# Load required libraries
library(dplyr)             # Data manipulation
library(sf)                # Simple feature access
library(caret)             # Classification and regression training
library(hydroGOF)          # Goodness-of-fit for model evaluation
library(rpart)             # Recursive partitioning and decision trees
library(rpart.plot)        # Plot 'rpart' models
library(leaflet)           # Interactive web maps
library(ggplot2)           # Elegent data visualization
library(patchwork)         # Combining multiple plots
library(plotly)            # Interactive web graphics
library(reshape2)          # Flexibly reshape data
library(randomForest)

# Load functions (for later)
source("./aux_fun/map_lux.R")         # Creating iceland map with basins
source("./aux_fun/plots.R")               # Visualize data and results
source("./aux_fun/merge_clean_data.R")    # Preprocess catchment data
source("./aux_fun/transfer_lux.R")    # Transfer learning

# Read gauges shapefile
gauges   <- st_read("./Data/CAMELS-LUX_shapefiles/stream-gauges_CAMELS-LUX.shp")

# Plot Luxembourg with catchments and gauges
#map_lux()          # Require internet to load background layer!

# Fill in ID number of the selected catchment
# Creatng a list to loop over
basin_list = c(52, 15, 36)
all_performance = list()
all_peak_performance = list()

for (ID in basin_list){
  print(paste("================================"))
  print(paste("START CATCHMENT ID:", ID))
  print(paste("================"))

  # load dataset
  load_lux_catchment <- function(ID) {

    file_path <- paste0("./Data/CAMELS-LUX/timeseries/hourly/CAMELS_LUX_hydromet_timeseries_ID_", ID, ".csv")

    # Read the CSV file
    df <- read.csv(file_path, sep = ",", header = TRUE, stringsAsFactors = FALSE)

    # Convert the date
    df$Date <- as.POSIXct(df$Date, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")

    # Remove NAs
    clean_df <- na.omit(df)

    return(clean_df)
  }

  catchment_ts = load_lux_catchment(ID)

  # include a lag to incorporate "memory"
  catchment_ts <- catchment_ts %>%
    arrange(Date) %>%
    mutate(
      RR_rad_lag1  = lag(RR_rad, 1),   # Rain 1 hour ago
      RR_rad_lag3  = lag(RR_rad, 3),   # Rain 3 hours ago
      RR_rad_lag6  = lag(RR_rad, 6),   # Rain 6 hours ago
      RR_rad_lag12 = lag(RR_rad, 12),  # Rain 12 hours ago
      RR_rad_lag24 = lag(RR_rad, 24)   # Rain 24 hours ago
    )
  catchment_ts <- na.omit(catchment_ts)

  #dim(catchment_ts)

  # inspect data
  #ts_plot(catchment_ts)

  ####### split data into training and testing data ################
  # training data
  train_data <- catchment_ts %>%
    filter(Date >= as.POSIXct("2004-11-01", tz = "Etc/GMT-1") &
             Date < as.POSIXct("2011-01-01", tz = "Etc/GMT-1"))
  #ts_plot(train_data)
  # test data 2020
  test_data <- catchment_ts %>%
    filter(Date >= as.POSIXct("2020-01-01", tz = "Etc/GMT-1") &
             Date < as.POSIXct("2021-01-01", tz = "Etc/GMT-1"))

  # Plot histograms for all variables in training set
  #hist_df(train_data)

  # Plot histograms for all variables in test set
  #hist_df(test_data)

  ######### Train the rpart model ########################
  # 1 Create cp parameter space
  grid_rpart <- expand.grid(
    cp = seq(0.001, 0.03, by = 0.001)               # This is example!
  )

  # 2 Selecting the sampling method and fold or iterations
  train_control_rpart <- trainControl(method = "cv",     # Cross-validation
                                      number = 5)      # K value = ...

  # 3 Train your model
  model_rpart <- train(Q ~ .- Date - Qspec - Qflag,
                       data = train_data,             # The target variable
                       method    = "rpart",              # The selected algorithm
                       trControl = train_control_rpart,        # The selected sampling method
                       tuneGrid  = grid_rpart)                 # The created parameter space

  # Cross-validation plot cp vs performance
  #cv_plot(model_rpart)
  print(paste0("The cp value with highest performance is ", model_rpart$bestTune$cp))

  # train best model
  best_model_rpart <- rpart(Q ~ .- Date - Qspec - Qflag,
                            data = train_data,
                            cp = model_rpart$bestTune$cp)

  # Plot your tree
  print(rpart.plot(best_model_rpart))

  ######### Train the rf model ########################
  # 1 Create cp parameter space
  grid_rf <- expand.grid(
    mtry = c(4, 6, 8)
  )

  # 2 Selecting the sampling method and fold or iterations
  train_control_rf <- trainControl(method = "cv",     # Cross-validation
                                   number = 3)      # K value = ...

  # 3 Train your model
  model_rf <- train(Q ~ .- Date - Qspec - Qflag,
                    data = train_data,             # The target variable
                    method    = "rf",              # The selected algorithm
                    trControl = train_control_rf,  # The selected sampling method
                    tuneGrid  = grid_rf,
                    ntree = 50)                   # The created parameter space

  print(paste0("The mtry value with highest performance is ", model_rf$bestTune$mtry))

  # train best model
  best_model_rf <- randomForest(Q ~ .- Date - Qspec - Qflag,
                                data = train_data,
                                mtry = model_rf$bestTune$mtry,
                                ntree = 50)


  ####### Test the models ################
  # Simulated discharge for the training and test sets
  train_pred_rpart  = predict(best_model_rpart, train_data)
  test_pred_rpart   = predict(best_model_rpart, test_data)
  train_pred_rf  = predict(best_model_rf, train_data)
  test_pred_rf   = predict(best_model_rf, test_data)


  # Check the performance of the model.
  Performance <- data.frame(
    Basin = ID,
    Model = c("CART", "CART", "Random Forest", "Random Forest"),
    Set   = c("Training", "Test", "Training", "Test"),
    RMSE  = c(
      rmse(train_pred_rpart, train_data$Q),      # RPART Train
      rmse(test_pred_rpart, test_data$Q),        # RPART Test
      rmse(train_pred_rf, train_data$Q),         # RF Train
      rmse(test_pred_rf, test_data$Q)            # RF Test
    ),
    KGE   = c(
      KGE(train_pred_rpart, train_data$Q),       # RPART Train
      KGE(test_pred_rpart, test_data$Q),         # RPART Test
      KGE(train_pred_rf, train_data$Q),          # RF Train
      KGE(test_pred_rf, test_data$Q)             # RF Test
    )
  )
  # save in global dataset
  all_performance[[as.character(ID)]] = Performance

  #Performance %>%
  #knitr::kable(caption = paste0("Regression tree model performance metrics for
  #discharge prediction at catchment ", ID, "."))
  ############### Inspect results ###################

  # Feature Importance
  feature_importance(best_model_rpart)

  print(varImpPlot(best_model_rf, main = "Random Forest Feature Importance"))

  #################### Test Flood accuracy on 5 peaks ################

  # find 5 peaks
  temp_Q = test_data$Q
  peak_indices = c()

  # loop 5 times
  for(i in 1:5) {
    # find highest discharge
    current_max_idx = which.max(temp_Q)
    peak_indices = c(peak_indices, current_max_idx)

    # mask out to not pick again
    window_start = max(1, current_max_idx - 24)
    window_end   = min(length(temp_Q), current_max_idx + 24)

    temp_Q[window_start:window_end] <- NA
  }

  # indices for all peaks
  event_indices = c()
  for(idx in peak_indices) {
    window_start = max(1, idx - 24)
    window_end   = min(nrow(test_data), idx + 24)
    event_indices = c(event_indices, window_start:window_end)
  }

  # sort them
  event_indices = sort(unique(event_indices))

  # subset data using indices
  obs_peaks        = test_data$Q[event_indices]
  rpart_pred_peaks = test_pred_rpart[event_indices]
  rf_pred_peaks    = test_pred_rf[event_indices]

  # calculate KGE metrics
  Peak_Performance <- data.frame(
    Basin = ID,
    Model = c("CART", "Random Forest"),
    Set   = c("Top 5 floods", "Top 5 floods"),
    KGE   = c(
      KGE(rpart_pred_peaks, obs_peaks),
      KGE(rf_pred_peaks, obs_peaks)
    )
  )
  # save in global dataset
  all_peak_performance[[as.character(ID)]] = Peak_Performance

  # Plot one of the peaks to visually see how the models did!
  # Let's plot the absolute highest peak (the first one we found)
  # Script below was produced by Google Gemini on 20/04/2026
  first_peak_idx <- peak_indices[1]
  plot_window <- max(1, first_peak_idx - 24):min(nrow(test_data), first_peak_idx + 24)

  plot_df <- data.frame(
    Date = test_data$Date[plot_window],
    Observed = test_data$Q[plot_window],
    CART = test_pred_rpart[plot_window],
    RF = test_pred_rf[plot_window]
  )

  print(ggplot(plot_df, aes(x = Date)) +
    geom_line(aes(y = Observed, color = "Observed (Q)"), size = 1) +
    geom_line(aes(y = CART, color = "CART Predicted"), linetype = "dashed") +
    geom_line(aes(y = RF, color = "RF Predicted"), linetype = "twodash", size = 1) +
    theme_bw() +
    scale_color_manual(values = c("Observed (Q)" = "black", "CART Predicted" = "red", "RF Predicted" = "blue")) +
    labs(title = "Highest Peak Event: Observed vs Predicted", y = "Discharge (Q)", x = "Date")
  )
}


final_performance <- bind_rows(all_performance)
final_peak_performance <- bind_rows(all_peak_performance)

print("OVERALL PERFORMANCE:")
print(knitr::kable(final_performance))

print("PEAK FLOW PERFORMANCE:")
print(knitr::kable(final_peak_performance))

# 1. Create a dataframe for the full test year (2020)
plot_df_full <- data.frame(
  Date = test_data$Date,
  Observed = test_data$Q,
  CART = test_pred_rpart,
  RF = test_pred_rf
)

# 2. Filter the dataframe to only include Jan 1st through April 30th
plot_df_subset <- plot_df_full %>%
  filter(Date < as.POSIXct("2020-03-15", tz = "Etc/GMT-1") &
           Date > as.POSIXct("2020-01-20", tz = "Etc/GMT-1"))

# 3. Extract the start and end dates for the 5 peak windows
peak_windows <- data.frame(
  xmin = test_data$Date[pmax(1, peak_indices - 24)],
  xmax = test_data$Date[pmin(nrow(test_data), peak_indices + 24)]
)

# 4. Generate the plot
year_plot <- ggplot() +
  # Add shaded yellow rectangles for the top 5 peak windows
  geom_rect(data = peak_windows, aes(xmin = xmin, xmax = xmax, ymin = -Inf, ymax = Inf),
            fill = "gold", alpha = 0.3, inherit.aes = FALSE) +

  # Add the Observed and Predicted lines using the SUBSETTED data
  geom_line(data = plot_df_subset, aes(x = Date, y = Observed, color = "Observed (Q)"), size = 0.6) +
  geom_line(data = plot_df_subset, aes(x = Date, y = CART, color = "CART Predicted"), size = 0.4) +
  geom_line(data = plot_df_subset, aes(x = Date, y = RF, color = "RF Predicted"), size = 0.4) +

  # Aesthetics and theme
  theme_bw() +
  # Force the plot window to only show the limits of our 4-month subset
  coord_cartesian(xlim = c(min(plot_df_subset$Date), max(plot_df_subset$Date))) +
  scale_color_manual(name = "Legend",
                     values = c("Observed (Q)" = "black",
                                "CART Predicted" = "red",
                                "RF Predicted" = "blue")) +
  labs(title = paste("2.5 Months of 2020 Discharge & Flood Peaks (Catchment", ID, ")"),
       subtitle = "Yellow areas correspond to the +/- 24h window around peak events",
       y = "Discharge (Q)",
       x = "Date") +
  theme(legend.position = "bottom")

# Print the plot
print(year_plot)