-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathfloodhub-settings-config.yml
More file actions
383 lines (325 loc) · 13.2 KB
/
floodhub-settings-config.yml
File metadata and controls
383 lines (325 loc) · 13.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# GoogleHydrology Configuration File
# ==============================================================================
# This YAML file defines all parameters for a training run for the Mean Embedding
# Forecast LSTM, which as of 2025 is the operational river forecast model
# powering the Google FloodHub. This file represents something that is very close
# to replicating the training settings for the operational model. There are some
# minor differences, due to small differences in the way this open source pipeline
# is set up compared to the operational pipeline (which is an entirely different
# codebase), with this current codebase focusing on usability and efficiency.
# This config file will get you relatively close if you train on the whole
# MultiMet Caravan dataset.
# The model is described in this paper:
# Gauch, M., et al. "How to deal with missing input data."
# Hydrology and Earth System Sciences 29.21 (2025): 6221-6235.
# https://hess.copernicus.org/articles/29/6221/2025/
# --- 1. General Experiment Settings -------------------------------------------
# Basic identifiers and housekeeping for the experiment.
# Unique name for this experiment. Output folders will use this name.
experiment_name: google-floodhub-settings
# Directory where model artifacts (weights, config copy, logs) will be saved.
# IMPORTANT: You must ensure this path exists or can be created.
run_dir:
# Controls how much information is printed to the console during training.
# Options: DEBUG, INFO, WARNING, ERROR. 'INFO' is good standard practice.
logging_level: INFO
# If true, PyTorch will try to find operations that cause NaN or Inf values.
# Slows down training; usually keep false unless debugging crashes.
detect_anomaly: false
# If true, warnings (like data storage issues) are only printed once.
print_warnings_once: true
# --- 2. Data Configuration ----------------------------------------------------
# Defines what data is used, where it comes from, and how it's split.
# Specifies the class of dataset loader to use. 'multimet' is used for Caravan
# data with multiple meteorological forcing products.
dataset: multimet
# -- File Paths (UPDATE THESE TO MATCH YOUR SYSTEM) --
# Paths to text files containing lists of basin IDs for training, validation, and testing.
train_basin_file: ~/flood-forecasting/example-configs/multimet-basins-list-without-chirps.txt
validation_basin_file: ~/flood-forecasting/example-configs/multimet-basins-list-without-chirps.txt
test_basin_file: ~/flood-forecasting/example-configs/multimet-basins-list-without-chirps.txt
# Directory containing the target data (streamflow observations).
targets_data_dir: Caravan-nc
# Directory containing static catchment attributes (e.g., area, elevation).
statics_data_dir: Caravan-nc
# Directory containing dynamic meteorological forcing data (rain, temp, etc.).
# 'gs://' indicates data is streamed directly from Google Cloud Storage.
dynamics_data_dir: gs://caravan-multimet/v1.1
# -- Time Periods --
# Define the start and end dates for each data split (format: dd/mm/yyyy).
# Training Period: The model learns from data in this date range.
# It adjusts its internal weights to minimize prediction errors during this time.
train_start_date: 01/01/1982
train_end_date: 31/12/2023
# Validation Period: Data in this range is used to check model performance *during* training.
# It helps prevent overfitting (learning training data too perfectly but failing on new data)
# and is used to tune hyperparameters like the learning rate.
validation_start_date: 01/01/1982
validation_end_date: 31/12/2023
# Test Period: A completely separate period used ONLY after training is finished.
# It provides the final, unbiased evaluation of how well the model performs on
# unseen future data.
test_start_date: 01/01/1982
test_end_date: 31/12/2023
# -- Input Features (Dynamic) --
# 'hindcast_inputs': Historical weather data the model sees to learn current state.
# Each top-level key is a meteorological product; its list holds that
# product's input feature names.
hindcast_inputs:
  hres:  # ECMWF High-Resolution operational forecast
    - hres_surface_net_solar_radiation
    - hres_surface_net_thermal_radiation
    - hres_surface_pressure
    - hres_temperature_2m
    - hres_total_precipitation
  graphcast:  # AI-based weather forecast model inputs
    - graphcast_temperature_2m
    - graphcast_total_precipitation
  imerg:  # Satellite precipitation product
    - imerg_precipitation
  cpc:  # Gauge-based precipitation product
    - cpc_precipitation
# 'forecast_inputs': Future weather data (forecasts) the model uses to predict ahead.
# Same per-product layout as 'hindcast_inputs'; only products with true
# forecasts (not purely observational ones) appear here.
forecast_inputs:
  hres:
    - hres_surface_net_solar_radiation
    - hres_surface_net_thermal_radiation
    - hres_surface_pressure
    - hres_temperature_2m
    - hres_total_precipitation
  graphcast:
    - graphcast_temperature_2m
    - graphcast_total_precipitation
# 'union_mapping': Fills missing values (NaNs) in one dataset with values from another.
# The primary dataset (key) uses data from the fallback dataset (value) to fill gaps.
# Format: {primary_feature_with_gaps: fallback_feature_to_fill_from}
union_mapping:
  cpc_precipitation: era5land_total_precipitation
  imerg_precipitation: era5land_total_precipitation
  graphcast_temperature_2m: era5land_temperature_2m
  graphcast_total_precipitation: era5land_total_precipitation
  hres_surface_net_solar_radiation: era5land_surface_net_solar_radiation
  hres_surface_net_thermal_radiation: era5land_surface_net_thermal_radiation
  hres_surface_pressure: era5land_surface_pressure
  hres_temperature_2m: era5land_temperature_2m
  hres_total_precipitation: era5land_total_precipitation
# -- Input Features (Static) --
# Catchment characteristics that do not change over time.
static_attributes:
- p_mean
- pet_mean_ERA5_LAND
- aridity_ERA5_LAND
- frac_snow
- moisture_index_ERA5_LAND
- seasonality_ERA5_LAND
- high_prec_freq
- high_prec_dur
- low_prec_freq
- low_prec_dur
- aet_mm_syr
- ari_ix_sav
- crp_pc_sse
- ele_mt_sav
- ero_kh_sav
- for_pc_sse
- gdp_ud_ssu
- gla_pc_sse
- glc_pc_s01
- glc_pc_s02
- glc_pc_s03
- glc_pc_s04
- glc_pc_s05
- glc_pc_s06
- glc_pc_s07
- glc_pc_s08
- glc_pc_s09
- glc_pc_s10
- glc_pc_s11
- glc_pc_s12
- glc_pc_s13
- glc_pc_s14
- glc_pc_s15
- glc_pc_s16
- glc_pc_s17
- glc_pc_s18
- glc_pc_s19
- glc_pc_s20
- glc_pc_s21
- glc_pc_s22
- hft_ix_s09
- hft_ix_s93
- inu_pc_slt
- inu_pc_smn
- inu_pc_smx
- ire_pc_sse
- kar_pc_sse
- lka_pc_sse
- nli_ix_sav
- pac_pc_sse
- pet_mm_syr
- pnv_pc_s01
- pnv_pc_s02
- pnv_pc_s03
- pnv_pc_s04
- pnv_pc_s05
- pnv_pc_s06
- pnv_pc_s07
- pnv_pc_s08
- pnv_pc_s09
- pnv_pc_s10
- pnv_pc_s11
- pnv_pc_s12
- pnv_pc_s13
- pnv_pc_s14
- pnv_pc_s15
- ppd_pk_sav
- pre_mm_syr
- prm_pc_sse
- rdd_mk_sav
- snw_pc_syr
- swc_pc_syr
- tmp_dc_syr
- urb_pc_sse
- wet_pc_s01
- wet_pc_s02
- wet_pc_s03
- wet_pc_s04
- wet_pc_s05
- wet_pc_s06
- wet_pc_s07
- wet_pc_s08
- wet_pc_s09
- wet_pc_sg1
- wet_pc_sg2
# -- Targets --
# The variable(s) we are trying to predict.
target_variables:
- streamflow
# --- 3. Model Architecture ----------------------------------------------------
# Defines the structure of the neural network.
# The specific model class to use from the model zoo.
# 'mean_embedding_forecast_lstm' uses separate LSTMs for past and future,
# averaging embeddings from multiple weather products.
model: mean_embedding_forecast_lstm
# Size of the hidden state vector in the LSTM layers. Higher = more capacity but
# higher risk of overfitting.
hidden_size: 512
# The type of output head.
# These CMAL settings come from https://hess.copernicus.org/articles/26/1673/2022
head: cmal
n_distributions: 3
n_samples: 7500
negative_sample_handling: clip
# Number of days of past data the model uses as input for a single prediction.
seq_length: 365
# Number of days into the future to predict.
lead_time: 7
# Number of days the hindcast (past) and forecast (future) LSTMs overlap.
forecast_overlap: 365
# If true, adds a feature indicating how many days ahead the forecast is for.
timestep_counter: true
# -- Dropout --
# Regularization technique to prevent overfitting.
output_dropout: 0.4 # Fraction of neurons to randomly deactivate in the output layer.
# -- Initialization --
# Sets initial values for model weights before training starts.
initial_forget_bias: 3 # High initial forget gate bias helps LSTMs learn long-term dependencies.
weight_init_opts: # specific initialization schemes for different parts of the network.
- lstm-ih-xavier
- lstm-hh-orthogonal
- fc-xavier
# -- Sub-network Configurations --
# Architectures for the small feed-forward networks that process specific input types
# before they go into the main LSTMs.
statics_embedding:
  type: fc  # Fully Connected (dense) network
  hiddens: [100, 100, 20]  # Three layers with 100, 100, and 20 neurons each
  activation: [tanh, tanh, linear]  # Activation functions for each layer
  dropout: 0.0
hindcast_embedding:
  type: fc
  hiddens: [100, 20]
  activation: [tanh, linear]
  dropout: 0.0
forecast_embedding:
  type: fc
  hiddens: [20, 20, 20, 20]
  activation: [tanh, tanh, tanh, linear]
  dropout: 0.0
# --- 4. Training Configuration ------------------------------------------------
# Controls how the model learns from data.
# Choose which GPU to run the code with. On most systems you can see the
# available GPUs using `nvidia-smi`. The most common naming scheme for GPUs
# is `cuda:0`, `cuda:1`, etc.
# Set this argument to `cpu` for training on a CPU.
device: cuda:0
# The optimization algorithm. Adam is a standard, robust choice.
optimizer: Adam
# The loss function to minimize during training.
loss: cmalloss
# -- Training Loop --
# Number of times the model sees the entire training dataset.
epochs: 125
# Number of samples processed at once before updating weights.
batch_size: 512
# Maximum number of weight updates per epoch (useful for very large datasets).
max_updates_per_epoch: 2000
# -- Learning Rate Scheduler --
# Adjusts the learning rate during training to improve convergence.
learning_rate_strategy: ReduceLROnPlateau # Lowers LR when validation loss stops improving.
initial_learning_rate: 0.0005 # Starting step size for weight updates.
learning_rate_drop_factor: 0.5 # Multiply LR by this when dropping (unused).
learning_rate_epochs_drop: 200 # Effectively infinite (since larger than # epochs)
# -- Regularization & Stability --
# Prevents exploding gradients which can destabilize training.
clip_gradient_norm: 1
# Adds small random noise to targets during training to improve robustness.
target_noise_std: 0.005
# --- 5. Validation & Evaluation -----------------------------------------------
# Settings for checking model performance during and after training.
# Metrics to calculate during validation and print to the console.
metrics:
- NSE # Nash-Sutcliffe Efficiency (standard hydrology metric, 1=perfect, <0=worse than mean).
- KGE # Kling-Gupta Efficiency (balances correlation, variability error, and bias error. 1=perfect).
#- Alpha-NSE # Variability ratio (std_sim / std_obs). Ideal value is 1.
#- Beta-NSE # Standardized bias ( (mean_sim - mean_obs) / std_obs ). Ideal value is 0.
#- Beta-KGE # Bias ratio (mean_sim / mean_obs). Ideal value is 1.
#- Pearson-r # Pearson correlation coefficient. Measures timing/shape agreement only. Ideal value is 1.
# How often (in epochs) to run validation.
validate_every: 1
# Number of basins to use for validation. -1 means use ALL validation basins.
validate_n_random_basins: -1
# During testing/validation, ignore periods where observations are completely missing.
tester_skip_obs_all_nan: true
# Method for aggregating multiple predictions if ensemble is used (median is robust).
tester_sample_reduction: median
# Forces negative predictions to zero for these variables (physically realistic for streamflow).
clip_targets_to_zero:
- streamflow
# Defines which time steps are used to calculate loss. 8 means only the last 8 days
# of the sequence contributes to the error (focuses model on the forecast period).
predict_last_n: 8
# If true, samples with all-zero inputs are treated as invalid data gaps.
allzero_samples_are_invalid: false
# --- 6. System & Runtime ------------------------------------------------------
# Computational settings.
# Number of CPU processes used to load data in parallel. Higher can be faster.
num_workers: 0
# How often to print training loss to Tensorboard.
log_loss_every_nth_update: 50
# Data caching settings to speed up training by keeping data in RAM.
cache:
  enabled: false  # Set to true if you have enough RAM (~250GB for Caravan).
  byte_limit: 10000000000  # Max RAM to use for cache (in bytes, ~10GB here)