
Commit 059655e

Add debug scripts for ESMF/MPI concurrency error on E3SM Unified 1.11.0rc6 (#934)
1 parent dcd5719 commit 059655e

File tree

9 files changed: +257 -0 lines changed

.vscode/e3sm_diags.code-workspace

Lines changed: 16 additions & 0 deletions
@@ -64,6 +64,22 @@
        "env": {
          "PYTHONPATH": "${workspaceFolder}"
        }
      },
      {
        "name": "Python: Remote Attach",
        "type": "debugpy",
        "request": "attach",
        "justMyCode": false,
        "connect": {
          "host": "localhost",
          "port": 5678
        },
        "pathMappings": [
          {
            "localRoot": "${workspaceFolder}",
            "remoteRoot": "/path/to/your/code/on/remote"
          }
        ]
      }
    ]
  }
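
The attach configuration above expects a debugpy server already listening on the compute node, reached through the SSH tunnel on port 5678. A minimal sketch of the remote side, mirroring the debugpy calls used in the scripts further down:

import debugpy

# Listen on all interfaces so the tunnel (ssh -4 -L 5678:localhost:5678) can
# forward the local VS Code client to this process.
debugpy.listen(("0.0.0.0", 5678))
print("Waiting for debugger attach...")
debugpy.wait_for_client()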
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
# srun -N 1 -t 01:00:00 --pty bash
# source /lcrc/soft/climate/e3sm-unified/test_e3sm_unified_1.11.0rc7_chrysalis.sh
# ssh -4 -L 5678:localhost:5678 user@compute-node
# Run and Debug View -> Python: Remote Attach -> Attach to Remote
#
# Compute env (Conda): source /lcrc/soft/climate/e3sm-unified/base/etc/profile.d/conda.sh && conda activate e3sm_unified_1.11.0rc7_chrysalis
# Compute env (Spack): source /lcrc/soft/climate/e3sm-unified/test_e3sm_unified_1.11.0rc7_chrysalis.sh

import os

# ESMF_MPIRUN: disable the use of mpirun by setting it to "no".
os.environ["ESMF_MPIRUN"] = "no"
# ESMF_COMM: set the communication method to mpiuni to disable MPI.
os.environ["ESMF_COMM"] = "mpiuni"
import esmpy as ESMF

# Initialize the ESMF Manager. -- still crashes
ESMF.Manager(debug=True)

# Finalize ESMF. -- still crashes
ESMF.ESMP_Finalize()
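
To check whether these environment variables actually keep MPI out of the picture, the MPI state can be probed with mpi4py (which other scripts in this commit already use). A minimal sketch, assuming mpi4py and esmpy come from the same e3sm_unified environment; whether ESMF.Manager() touches MPI at all under ESMF_COMM=mpiuni is exactly what is being probed:

import os

os.environ["ESMF_MPIRUN"] = "no"
os.environ["ESMF_COMM"] = "mpiuni"

import mpi4py

# Keep mpi4py from initializing MPI itself, so Is_initialized() reflects
# only what esmpy/ESMF does (same rc flags as the mpi4py script below).
mpi4py.rc.initialize = False
mpi4py.rc.finalize = False
from mpi4py import MPI

print("MPI initialized before importing esmpy:", MPI.Is_initialized())

import esmpy as ESMF

ESMF.Manager(debug=True)
print("MPI initialized after ESMF.Manager():", MPI.Is_initialized())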
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
# srun -N 1 -t 01:00:00 --pty bash
# source /lcrc/soft/climate/e3sm-unified/test_e3sm_unified_1.11.0rc7_chrysalis.sh
# ssh -4 -L 5678:localhost:5678 user@compute-node
# Run and Debug View -> Python: Remote Attach -> Attach to Remote
#
# Compute env (Conda): source /lcrc/soft/climate/e3sm-unified/base/etc/profile.d/conda.sh && conda activate e3sm_unified_1.11.0rc7_chrysalis
# Compute env (Spack): source /lcrc/soft/climate/e3sm-unified/test_e3sm_unified_1.11.0rc7_chrysalis.sh


import dask
import esmpy as ESMF
import numpy as np
from dask import bag
from xesmf.backend import Grid

# Initialize ESMF with MPI only once.
ESMF.Manager(debug=True)


def xesmf_code(_):
    Grid(
        np.array((576, 361)),
        staggerloc=ESMF.StaggerLoc.CENTER,
        coord_sys=ESMF.CoordSys.SPH_DEG,
        num_peri_dims=None,
    )


config = {"scheduler": "processes", "multiprocessing.context": "fork"}

with dask.config.set(config):
    print("Running with 2 workers and 'fork'")
    print("*" * 50)
    try:
        b = bag.from_sequence(range(2))
        results = b.map(xesmf_code).compute(num_workers=2)
    except Exception as e:
        print(f"{type(e).__name__}: Message: {str(e)}")
    else:
        print("Done")

# Finalize ESMF with MPI.
ESMF.ESMP_Finalize()
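
For comparison, a sketch of the same workload on Dask's threaded scheduler. Threads stay inside the parent process, so the MPI/ESMF state initialized above is not copied into forked children; whether ESMF tolerates concurrent Grid creation from threads is an assumption to verify, so treat this purely as a diagnostic variant:

import dask
import esmpy as ESMF
import numpy as np
from dask import bag
from xesmf.backend import Grid

ESMF.Manager(debug=True)


def xesmf_code(_):
    Grid(
        np.array((576, 361)),
        staggerloc=ESMF.StaggerLoc.CENTER,
        coord_sys=ESMF.CoordSys.SPH_DEG,
        num_peri_dims=None,
    )


# "threads" keeps every worker in this process, so no child process re-enters
# MPI initialization; ESMF thread-safety is the open question here.
with dask.config.set(scheduler="threads"):
    bag.from_sequence(range(2)).map(xesmf_code).compute(num_workers=2)

ESMF.ESMP_Finalize()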
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
# 1. srun -N 1 -t 01:00:00 --pty bash
# 2. source /lcrc/soft/climate/e3sm-unified/test_e3sm_unified_1.11.0rc6_chrysalis.sh
# 3. ssh -4 -L 5678:localhost:5678 user@compute-node
# 4. Run and Debug View -> Python: Remote Attach -> Attach to Remote

import esmpy as ESMF
import numpy as np
from dask.distributed import Client
from xesmf.backend import Grid

ESMF.Manager(debug=True, mpi=False)


import debugpy

debugpy.listen(("0.0.0.0", 5678))
print("Waiting for debugger attach...")
debugpy.wait_for_client()


def xesmf_code(_):
    Grid(
        np.array((576, 361)),
        staggerloc=ESMF.StaggerLoc.CENTER,
        coord_sys=ESMF.CoordSys.SPH_DEG,
        num_peri_dims=None,
    )


# Open MPI has detected that this process has attempted to initialize
# MPI (via MPI_INIT or MPI_INIT_THREAD) more than once. This is
# erroneous.
if __name__ == "__main__":
    run_workers = [2]  # Example worker counts
    client = Client(processes=False, n_workers=2)

    for num_workers in run_workers:
        print(f"\nRunning with {num_workers} workers using Dask distributed")
        print("*" * 50)
        try:
            futures = client.map(xesmf_code, range(num_workers))
            results = client.gather(futures)
        except Exception as e:
            print(f"{type(e).__name__}: Message: {str(e)}")
        else:
            print("Done")

    client.close()
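
To see where the Dask workers actually live and whether they share the parent's MPI state, Client.run can execute a small probe on every worker and return results keyed by worker address. A sketch, assuming mpi4py is importable in the same environment; worker_state is a made-up helper name:

import os

from dask.distributed import Client
from mpi4py import MPI


def worker_state():
    # Report each worker's process ID and whether MPI is initialized there.
    return {"pid": os.getpid(), "mpi_initialized": MPI.Is_initialized()}


if __name__ == "__main__":
    # processes=False keeps workers as threads in this process, matching the
    # script above; with processes=True each worker is a separate process
    # that imports MPI on its own.
    client = Client(processes=False, n_workers=2)
    print(client.run(worker_state))
    client.close()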
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
import dask
import esmpy as ESMF
import numpy as np
from dask import bag
from xesmf.backend import Grid
import multiprocessing

# Explicitly initialize ESMF in the main process.
ESMF.Manager(debug=True)


def xesmf_code(_):
    Grid(
        np.array((576, 361)),
        staggerloc=ESMF.StaggerLoc.CENTER,
        coord_sys=ESMF.CoordSys.SPH_DEG,
        num_peri_dims=None,
    )


if __name__ == '__main__':
    multiprocessing.set_start_method('forkserver', force=True)  # Avoid 'fork'

    config = {"scheduler": "processes", "multiprocessing.context": "forkserver"}
    with dask.config.set(config):
        print("Running with 2 workers and 'forkserver'")
        print("*" * 50)
        try:
            b = bag.from_sequence(range(2))
            results = b.map(xesmf_code).compute(num_workers=2)
        except Exception as e:
            print(f"{type(e).__name__}: Message: {str(e)}")
        else:
            print("Done")
Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
from mpi4py import MPI

# Initialize MPI
if not MPI.Is_initialized():
    print("MPI is not initialized. Initializing.")
    MPI.Init()
else:
    print("MPI is already initialized.")

# Perform your MPI operations here
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

print(f"MPI process with rank {rank} out of {size} is running.")

# Finalize MPI
if not MPI.Is_finalized():
    print("MPI is not finalized. Finalizing.")
    MPI.Finalize()
else:
    print("MPI is already finalized.")
Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
"""
# Need to do this before importing MPI.
import mpi4py
mpi4py.rc.initialize = False  # do not initialize MPI automatically
mpi4py.rc.finalize = False  # do not finalize MPI automatically
"""

# import mpi4py
# mpi4py.rc.initialize = False  # do not initialize MPI automatically
# mpi4py.rc.finalize = False  # do not finalize MPI automatically

# PMI2_Init failed to intialize. Return code: 14
from mpi4py import MPI
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
# srun -N 1 -t 01:00:00 --pty bash
# source /lcrc/soft/climate/e3sm-unified/test_e3sm_unified_1.11.0rc8_chrysalis.sh
# ssh -4 -L 5678:localhost:5678 user@compute-node
# Run and Debug View -> Python: Remote Attach -> Attach to Remote
#
# Compute env (Conda): source /lcrc/soft/climate/e3sm-unified/base/etc/profile.d/conda.sh && conda activate e3sm_unified_1.11.0rc8_chrysalis
# Compute env (Spack): source /lcrc/soft/climate/e3sm-unified/test_e3sm_unified_1.11.0rc8_chrysalis.sh

import os
import sys

import numpy

from e3sm_diags.parameter.core_parameter import CoreParameter
from e3sm_diags.run import runner

# import debugpy
# debugpy.listen(("0.0.0.0", 5678))
# print("Waiting for debugger attach...")
# debugpy.wait_for_client()

short_name = 'v2.LR.historical_0201'
test_ts = '/lcrc/group/e3sm/ac.forsyth2/zppy_weekly_comprehensive_v2_output/test_pr651_both_commits_20250117/v2.LR.historical_0201/post/atm/180x360_aave/ts/monthly/2yr'

param = CoreParameter()

# Model
param.test_data_path = '/lcrc/group/e3sm/ac.forsyth2/zppy_weekly_comprehensive_v2_output/test_pr651_both_commits_20250117/v2.LR.historical_0201/post/atm/180x360_aave/clim/2yr'
param.test_name = 'v2.LR.historical_0201'
param.short_test_name = short_name

# Ref

# Obs
param.reference_data_path = '/lcrc/group/e3sm/diagnostics/observations/Atm/climatology/'

# Output dir
param.results_dir = 'model_vs_obs_1982-1983'

# Additional settings
param.run_type = 'model_vs_obs'
param.diff_title = 'Model - Observations'
param.multiprocessing = True
param.num_workers = 8
# param.fail_on_incomplete = True
params = [param]

# Run
cfg_path = "auxiliary_tools/cdat_regression_testing/933-esmf-mpi/v2_run.cfg"
sys.argv.extend(["--diags", cfg_path])

runner.sets_to_run = ['lat_lon']
runner.run_diags(params)
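
To isolate whether the ESMF/MPI failure is tied to multiprocessing at all, the same run can be repeated serially. A sketch that reuses the paths and cfg from the script above and only changes the worker settings; the '_serial' results_dir suffix is made up to keep the output separate:

import sys

from e3sm_diags.parameter.core_parameter import CoreParameter
from e3sm_diags.run import runner

param = CoreParameter()
param.test_data_path = '/lcrc/group/e3sm/ac.forsyth2/zppy_weekly_comprehensive_v2_output/test_pr651_both_commits_20250117/v2.LR.historical_0201/post/atm/180x360_aave/clim/2yr'
param.test_name = 'v2.LR.historical_0201'
param.short_test_name = 'v2.LR.historical_0201'
param.reference_data_path = '/lcrc/group/e3sm/diagnostics/observations/Atm/climatology/'
param.results_dir = 'model_vs_obs_1982-1983_serial'  # hypothetical output dir
param.run_type = 'model_vs_obs'
param.diff_title = 'Model - Observations'

# Run the lat_lon set serially: with multiprocessing disabled no worker pool
# is used, so any remaining ESMF/MPI failure is not caused by concurrent
# workers.
param.multiprocessing = False
param.num_workers = 1

sys.argv.extend(["--diags", "auxiliary_tools/cdat_regression_testing/933-esmf-mpi/v2_run.cfg"])
runner.sets_to_run = ['lat_lon']
runner.run_diags([param])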
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
[#]
sets = ["lat_lon"]
case_id = "MERRA2"
variables = ["TREFMXAV"]
regions = ["global"]
ref_name = "MERRA2"
reference_name = "MERRA2 Reanalysis"
seasons = ["ANN"]
contour_levels = [-35, -30, -25, -20, -15, -10, -5, 0, 5, 10, 15, 20, 25, 30, 35, 40]
diff_levels = [-12, -8, -4, -2, -1, -0.5, 0.5, 1, 2, 4, 8, 12]
