|
1 | 1 | using Distributed |
2 | 2 | using Logging |
3 | 3 |
|
4 | | -export SlurmManager, PBSManager, default_worker_pool, set_worker_loggers |
| 4 | +export add_workers, |
| 5 | + SlurmManager, PBSManager, default_worker_pool, set_worker_loggers |
5 | 6 |
|
6 | 7 | # Set the time limit for the Julia worker to be contacted by the main process, default = "60.0s" |
7 | 8 | # https://docs.julialang.org/en/v1/manual/environment-variables/#JULIA_WORKER_TIMEOUT |
@@ -444,3 +445,103 @@ function set_worker_loggers(workers = workers()) |
444 | 445 | end |
445 | 446 | end |
446 | 447 | end |
| 448 | + |
| 449 | + |
"""
    is_pbs_available()

Return `true` when the `qstat`, `pbsnodes`, and `qsub` executables can all be
located with `Sys.which`, and `false` otherwise.
"""
function is_pbs_available()
    # Probe every command; a single missing executable makes the result false.
    found = map(("qstat", "pbsnodes", "qsub")) do cmd
        Sys.which(cmd) !== nothing
    end
    return all(found)
end
| 457 | + |
| 458 | + |
"""
    is_slurm_available()

Return `true` when the `sinfo`, `srun`, and `sbatch` executables can all be
located with `Sys.which`, and `false` otherwise.
"""
function is_slurm_available()
    # Probe every command; a single missing executable makes the result false.
    found = map(("sinfo", "srun", "sbatch")) do cmd
        Sys.which(cmd) !== nothing
    end
    return all(found)
end
| 466 | + |
"""
    is_cluster_environment()

Return `true` if either the PBS or the SLURM command-line tools are detected
(see `is_pbs_available` and `is_slurm_available`). PBS is checked first.
"""
is_cluster_environment() = is_pbs_available() || is_slurm_available()
| 470 | + |
# Walltime requested for worker jobs when the caller does not supply one.
# NOTE(review): the unit is whatever `format_slurm_time` / `format_pbs_time`
# expect (presumably minutes) — confirm against those helpers.
const DEFAULT_WALLTIME = 60

"""
    default_cpu_kwargs(manager)

Default keyword arguments for launching a CPU worker through `manager`.

The scheduler-specific defaults request one CPU and `DEFAULT_WALLTIME`; the
result of `backend_worker_kwargs(get_backend())` is merged in last, so its
entries take precedence on key clashes.
"""
function default_cpu_kwargs(::SlurmManager)
    scheduler_defaults = (;
        cpus_per_task = 1,
        time = format_slurm_time(DEFAULT_WALLTIME),
    )
    return merge(scheduler_defaults, backend_worker_kwargs(get_backend()))
end

function default_cpu_kwargs(::PBSManager)
    scheduler_defaults = (;
        l_select = "ncpus=1",
        l_walltime = format_pbs_time(DEFAULT_WALLTIME),
    )
    return merge(scheduler_defaults, backend_worker_kwargs(get_backend()))
end
| 483 | + |
"""
    default_gpu_kwargs(manager)

Default keyword arguments for launching a GPU worker through `manager`.

The scheduler-specific defaults request one GPU, four CPUs and
`DEFAULT_WALLTIME`; the result of `backend_worker_kwargs(get_backend())` is
merged in last, so its entries take precedence on key clashes.
"""
function default_gpu_kwargs(::SlurmManager)
    scheduler_defaults = (;
        gpus_per_task = 1,
        cpus_per_task = 4,
        time = format_slurm_time(DEFAULT_WALLTIME),
    )
    return merge(scheduler_defaults, backend_worker_kwargs(get_backend()))
end

function default_gpu_kwargs(::PBSManager)
    scheduler_defaults = (;
        l_select = "ngpus=1:ncpus=4",
        l_walltime = format_pbs_time(DEFAULT_WALLTIME),
    )
    return merge(scheduler_defaults, backend_worker_kwargs(get_backend()))
end
| 495 | + |
"""
    get_manager(cluster = :auto, nworkers = 1)

Construct the cluster manager used to launch `nworkers` workers.

# Arguments
- `cluster`: `:slurm` or `:pbs` to force a scheduler, or `:auto` to pick SLURM
  if its commands are detected and PBS otherwise (SLURM is checked first).
- `nworkers`: worker count forwarded to the manager constructor.

# Returns
A `SlurmManager` or a `PBSManager`.

# Throws
An `ErrorException` when
- `cluster == :auto` but neither scheduler is detected,
- `cluster == :local` (local processing has no cluster manager), or
- `cluster` is any other value.
"""
function get_manager(cluster = :auto, nworkers = 1)
    if cluster == :slurm || (cluster == :auto && is_slurm_available())
        return SlurmManager(nworkers)
    elseif cluster == :pbs || (cluster == :auto && is_pbs_available())
        return PBSManager(nworkers)
    elseif cluster == :auto
        # :auto is a valid option; the failure is that detection found nothing.
        error(
            "No supported scheduler detected for cluster = :auto: neither SLURM nor PBS commands were found on PATH",
        )
    elseif cluster == :local
        # :local is valid for `add_workers`, but there is no manager to build.
        error(
            "cluster = :local does not use a cluster manager; use `addprocs` for local workers",
        )
    else
        error(
            "Unknown cluster type: $cluster. Valid options are :auto, :pbs, :slurm, or :local",
        )
    end
end
| 507 | + |
"""
    add_workers(
        nworkers;
        device = :gpu,
        cluster = :auto,
        kwargs...
    )

Add `nworkers` worker processes to the current Julia session, automatically detecting and configuring for the available computing environment.

# Arguments
- `nworkers::Int`: The number of worker processes to add.
- `device::Symbol = :gpu`: The target compute device type, either `:gpu` (1 GPU, 4 CPU cores) or `:cpu` (1 CPU core).
  Only affects the resources requested from a scheduler; it has no effect in local mode.
- `cluster::Symbol = :auto`: The cluster management system to use. Options:
    * `:auto`: Auto-detect available cluster environment (SLURM, PBS, or local)
    * `:slurm`: Force use of SLURM scheduler
    * `:pbs`: Force use of PBS scheduler
    * `:local`: Force use of local processing (standard `addprocs`)
- `kwargs`: When a scheduler is used, these are merged over the device defaults
  (user values take precedence) and passed to `addprocs`. In local mode they are ignored.

# Returns
The value returned by `addprocs` (the identifiers of the new workers).

# Throws
- `ArgumentError` if `device` is neither `:gpu` nor `:cpu`.
- An error from `get_manager` if `cluster` is not a recognised option.
"""
function add_workers(nworkers::Int; device = :gpu, cluster = :auto, kwargs...)
    # Reject typos such as `:GPU` early instead of silently requesting CPU resources.
    device in (:gpu, :cpu) || throw(
        ArgumentError("Unknown device: $(repr(device)). Valid options are :gpu or :cpu"),
    )

    if cluster == :local || (cluster == :auto && !is_cluster_environment())
        # Use standard addprocs for local computation. Scheduler-oriented kwargs are
        # deliberately not forwarded: local `addprocs` rejects unknown keywords.
        @info "Using local processing mode, adding $nworkers worker$(nworkers == 1 ? "" : "s")"
        return addprocs(nworkers)
    end

    # Select the manager based on environment or explicit selection
    manager = get_manager(cluster, nworkers)
    @info "Using $(nameof(typeof(manager))) to add $nworkers workers"

    default_kwargs =
        device == :gpu ? default_gpu_kwargs(manager) : default_cpu_kwargs(manager)

    # Merge the default kwargs with the user-provided kwargs, user kwargs take precedence
    merged_kwargs = merge(default_kwargs, kwargs)

    return addprocs(manager; merged_kwargs...)
end
0 commit comments