1- export get_backend, calibrate
1+ export get_backend, calibrate, model_run
22
"""
    AbstractBackend

Supertype of every calibration backend. Backends are passed around as types
(for example `calibrate(JuliaBackend, config)`), never as instances.
"""
abstract type AbstractBackend end

"""
    JuliaBackend

Backend that is not an [`HPCBackend`](@ref); it does not dispatch to the
scheduler-based `calibrate` methods.
"""
struct JuliaBackend <: AbstractBackend end

"""
    HPCBackend

Supertype of backends whose `calibrate` methods submit each forward model run
through `model_run`.
"""
abstract type HPCBackend <: AbstractBackend end

"""
    SlurmBackend

Supertype of HPC backends whose `model_run` method forwards to `slurm_model_run`.
"""
abstract type SlurmBackend <: HPCBackend end

"""
    CaltechHPCBackend

Slurm backend selected for the `login[1-4].cm.cluster` and `hpc-XX-XX.cm.cluster` hosts.
"""
struct CaltechHPCBackend <: SlurmBackend end

"""
    ClimaGPUBackend

Slurm backend selected for the `clima.gps.caltech.edu` host.
"""
struct ClimaGPUBackend <: SlurmBackend end

"""
    DerechoBackend

HPC backend whose `model_run` method forwards to `pbs_model_run`. It is an
`HPCBackend` but deliberately not a `SlurmBackend`.
"""
struct DerechoBackend <: HPCBackend end
14+
1015"""
1116 get_backend()
1217
@@ -18,6 +23,8 @@ function get_backend()
1823 (r" ^clima.gps.caltech.edu$" , ClimaGPUBackend),
1924 (r" ^login[1-4].cm.cluster$" , CaltechHPCBackend),
2025 (r" ^hpc-(\d\d )-(\d\d ).cm.cluster$" , CaltechHPCBackend),
26+ (r" derecho([1-8])$" , DerechoBackend),
27+ (r" dec(\d\d\d\d )$" , DerechoBackend), # This should be more specific
2128 ]
2229
2330 for (pattern, backend) in HOSTNAMES
@@ -28,12 +35,12 @@ function get_backend()
2835end
2936
"""
    module_load_string(backend)

Return a string that loads the correct modules for a given backend when executed via bash.

`backend` is a backend type (here `CaltechHPCBackend`); each supported backend
supplies its own method.
"""
function module_load_string(::Type{CaltechHPCBackend})
    # The dollar sign is escaped so that MODULEPATH is expanded by bash when the
    # script runs, not interpolated by Julia when this string is built.
    return """export MODULEPATH="/groups/esm/modules:\$MODULEPATH"
    module purge
    module load climacommon/2024_05_27"""
end
@@ -43,32 +50,14 @@ function module_load_string(::Type{ClimaGPUBackend})
4350 module load julia/1.10.0 cuda/julia-pref openmpi/4.1.5-mpitrampoline"""
4451end
4552
46- """
47- calibrate(::Type{JuliaBackend}, config::ExperimentConfig)
48- calibrate(::Type{JuliaBackend}, experiment_dir::AbstractString)
49-
50- Run a calibration in Julia.
51-
52- Takes an ExperimentConfig or an experiment folder.
53- If no backend is passed, one is chosen via `get_backend`.
54- This function is intended for use in a larger workflow, assuming that all needed
55- model interface and observation map functions are set up for the calibration.
56-
57- # Example
58- Run: `julia --project=experiments/surface_fluxes_perfect_model`
59- ```julia
60- import ClimaCalibrate
61-
62- # Generate observational data and load interface
63- experiment_dir = dirname(Base.active_project())
64- include(joinpath(experiment_dir, "generate_data.jl"))
65- include(joinpath(experiment_dir, "observation_map.jl"))
66- include(joinpath(experiment_dir, "model_interface.jl"))
# Module setup for `DerechoBackend`: one shell command per entry, joined with
# newlines and terminated by a trailing newline.
function module_load_string(::Type{DerechoBackend})
    commands = (
        # `\$` keeps the dollar literal so bash, not Julia, expands MODULEPATH.
        "export MODULEPATH=\"/glade/campaign/univ/ucit0011/ClimaModules-Derecho:\$MODULEPATH\"",
        "module purge",
        "module load climacommon",
        "module list",
    )
    return join(commands, '\n') * "\n"
end
6760
68- # Initialize and run the calibration
69- eki = ClimaCalibrate.calibrate(experiment_dir)
70- ```
71- """
"""
    calibrate(config::ExperimentConfig; ekp_kwargs...)

Run a calibration on the backend chosen by `get_backend()`.

The config and every keyword argument are forwarded unchanged to the
backend-specific `calibrate` method.
"""
function calibrate(config::ExperimentConfig; ekp_kwargs...)
    backend = get_backend()
    return calibrate(backend, config; ekp_kwargs...)
end
7463
@@ -86,9 +75,8 @@ function calibrate(
8675 config:: ExperimentConfig ;
8776 ekp_kwargs... ,
8877)
89- initialize(config; ekp_kwargs... )
9078 (; n_iterations, ensemble_size) = config
91- eki = nothing
79+ eki = initialize(config; ekp_kwargs ... )
9280 for i in 0 : (n_iterations - 1 )
9381 @info " Running iteration $i "
9482 for m in 1 : ensemble_size
@@ -103,75 +91,80 @@ function calibrate(
10391end
10492
"""
    calibrate(::Type{<:HPCBackend}, config::ExperimentConfig; kwargs...)
    calibrate(::Type{<:HPCBackend}, experiment_dir; kwargs...)

Run a full calibration, scheduling the forward model runs through the job
scheduler of the given HPC backend.

Takes either an ExperimentConfig or an experiment folder.

Available HPC backends: CaltechHPCBackend, ClimaGPUBackend, DerechoBackend.
`JuliaBackend` is not an `HPCBackend` and does not dispatch to these methods.

# Keyword Arguments
- `experiment_dir`: Directory containing experiment configurations.
- `model_interface`: Path to the model interface file.
- `hpc_kwargs`: Dictionary of resource arguments, passed to the job scheduler.
  Required; there is no default.
- `verbose::Bool`: Enable verbose logging.

# Usage
Open julia: `julia --project=experiments/surface_fluxes_perfect_model`
```julia
using ClimaCalibrate

experiment_dir = joinpath(pkgdir(ClimaCalibrate), "experiments", "surface_fluxes_perfect_model")
model_interface = joinpath(experiment_dir, "model_interface.jl")

# Generate observational data and load interface
include(joinpath(experiment_dir, "generate_data.jl"))
include(joinpath(experiment_dir, "observation_map.jl"))
include(model_interface)

hpc_kwargs = kwargs(time = 3)
backend = get_backend()
eki = calibrate(backend, experiment_dir; model_interface, hpc_kwargs);
```
"""
function calibrate(
    b::Type{<:HPCBackend},
    experiment_dir::AbstractString;
    hpc_kwargs,
    ekp_kwargs...,
)
    # NOTE(review): the positional `experiment_dir` is only used to build the
    # ExperimentConfig. It is not forwarded as the `experiment_dir` keyword, so
    # the config method falls back to its own default unless the caller also
    # passes that keyword. Confirm this is intended.
    return calibrate(
        b,
        ExperimentConfig(experiment_dir);
        hpc_kwargs,
        ekp_kwargs...,
    )
end
144136
145137function calibrate(
146- b:: Type{<:SlurmBackend } ,
138+ b:: Type{<:HPCBackend } ,
147139 config:: ExperimentConfig ;
148140 experiment_dir = dirname(Base. active_project()),
149141 model_interface = abspath(
150142 joinpath(experiment_dir, " .." , " .." , " model_interface.jl" ),
151143 ),
152144 verbose = false ,
153- slurm_kwargs = Dict(:time_limit => 45 , :ntasks => 1 ),
145+ reruns = 1 ,
146+ hpc_kwargs,
154147 ekp_kwargs... ,
155148)
156149 # ExperimentConfig is created from a YAML file within the experiment_dir
157150 (; n_iterations, output_dir, ensemble_size) = config
158151 @info " Initializing calibration" n_iterations ensemble_size output_dir
159- initialize(config; ekp_kwargs... )
160152
161- eki = nothing
153+ eki = initialize(config; ekp_kwargs ... )
162154 module_load_str = module_load_string(b)
163155 for iter in 0 : (n_iterations - 1 )
164156 @info " Iteration $iter "
165157 jobids = map(1 : ensemble_size) do member
166158 @info " Running ensemble member $member "
167- sbatch_model_run(
159+ model_run(
160+ b,
168161 iter,
169162 member,
170163 output_dir,
171164 experiment_dir,
172165 model_interface,
173166 module_load_str;
174- slurm_kwargs ,
167+ hpc_kwargs ,
175168 )
176169 end
177170
@@ -182,14 +175,69 @@ function calibrate(
182175 experiment_dir,
183176 model_interface,
184177 module_load_str;
185- slurm_kwargs ,
178+ hpc_kwargs ,
186179 verbose,
180+ reruns,
187181 )
188- report_iteration_status(statuses, output_dir, iter)
189182 @info " Completed iteration $iter , updating ensemble"
190183 G_ensemble = observation_map(iter)
191184 save_G_ensemble(config, iter, G_ensemble)
192185 eki = update_ensemble(config, iter)
193186 end
194187 return eki
195188end
189+
# Dispatch on backend type to unify `calibrate` for all HPCBackends
# Scheduler interfaces should not depend on backend struct
"""
    model_run(backend, iter, member, output_dir, experiment_dir, model_interface, module_load_str; hpc_kwargs)

Construct and execute a command to run a single forward model on a given job scheduler.

Dispatches on `backend` to run [`slurm_model_run`](@ref) or [`pbs_model_run`](@ref).
The backend is used for dispatch only; every other argument is forwarded
unchanged and the scheduler function's result is returned as-is.

# Arguments
- `backend`: Backend type. Subtypes of `SlurmBackend` use `slurm_model_run`;
  `DerechoBackend` uses `pbs_model_run`.
- `iter`: Iteration number
- `member`: Member number
- `output_dir`: Calibration experiment output directory
- `experiment_dir`: Directory containing the experiment's Project.toml
- `model_interface`: File containing the model interface
- `module_load_str`: Commands which load the necessary modules

# Keyword Arguments
- `hpc_kwargs`: Dictionary containing the resources for the job. Easily generated using [`kwargs`](@ref).
"""
function model_run(
    b::Type{<:SlurmBackend},
    iter,
    member,
    output_dir,
    experiment_dir,
    model_interface,
    module_load_str;
    hpc_kwargs,
)
    return slurm_model_run(
        iter,
        member,
        output_dir,
        experiment_dir,
        model_interface,
        module_load_str;
        hpc_kwargs,
    )
end
# `DerechoBackend` method of `model_run`: the backend argument only selects this
# method; all remaining arguments are handed to `pbs_model_run` unchanged.
function model_run(
    b::Type{DerechoBackend},
    iter,
    member,
    output_dir,
    experiment_dir,
    model_interface,
    module_load_str;
    hpc_kwargs,
)
    return pbs_model_run(
        iter,
        member,
        output_dir,
        experiment_dir,
        model_interface,
        module_load_str;
        hpc_kwargs,
    )
end
0 commit comments