Skip to content

Commit 8b73ad3

Browse files
[production/RRFS.v1] fix RRFS/REFS restart reproducibility and DEBUG crash issues for RRFSv1 operational implementation (#2925)
* Update .gitmodules
* Fix rrfs (conus13km) decomp and restart tests
* update .gitmodules
* Update FV3
* update for rrfsens restart bitwise reproducibility
* update FV3
* revert FV3
* update FV3
* REFS ensemble restart fix (#43)
* update for rrfsens restart bitwise reproducibility
---------
* update FV3 and stochastic_physics; fix DEBUG crash for REFS
* update FV3
* update FV3 to fix C3 compiling error
* bl_date update
* WCOSS2 RRFS Tests Pass
* add hercules RT log: passed
* remove unused conf file
* revert ufsatm and stochastic_physics back to community
* update ufsatm and stochastic_physics
---------
1 parent 970fa85 commit 8b73ad3

17 files changed

+682
-625
lines changed

FV3

Submodule FV3 updated from 81e6d10 to f861e49

tests/bl_date.conf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
export BL_DATE=20250123
1+
export BL_DATE=20251017

tests/default_vars.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1402,8 +1402,8 @@ export SHAL_CNV=.false.
14021402
export DO_SAT_ADJ=.false.
14031403
export DO_DEEP=.false.
14041404
export CCPP_SUITE='FV3_HRRR'
1405-
export INPES=12
1406-
export JNPES=12
1405+
export INPES=10
1406+
export JNPES=8
14071407
export NPX=397
14081408
export NPY=233
14091409
export NPZ=65

tests/fv3_conf/fv3_qsub.IN_acorn

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,16 @@
55
#PBS -N @[JBNME]
66
#PBS -A @[ACCNR]
77
#PBS -q @[QUEUE]
8-
#PBS -l place=vscatter,select=@[NODES]:ncpus=@[TPN]:mpiprocs=@[TPN]:mem=500G
8+
#PBS -l place=vscatter,select=@[NODES]:ncpus=@[NCPUS]:mpiprocs=@[TPN]:ompthreads=@[THRD]:mem=500G
99
#PBS -l place=excl
1010
#PBS -l walltime=00:@[WLCLK]:00
1111

1212
set -eux
13-
echo -n " $( date +%s )," > job_timestamp.txt
1413

1514
cd $PBS_O_WORKDIR
1615

16+
echo -n " $( date +%s )," > job_timestamp.txt
17+
1718
set +x
1819
module use $PWD/modulefiles
1920
module load modules.fv3
@@ -30,7 +31,7 @@ export ESMF_RUNTIME_COMPLIANCECHECK=OFF:depth=4
3031
export ESMF_RUNTIME_PROFILE=ON
3132
export ESMF_RUNTIME_PROFILE_OUTPUT="SUMMARY"
3233

33-
mpiexec -n @[TASKS] -ppn @[TPN] -depth @[THRD] ./fv3.exe
34+
mpiexec -n @[TASKS] -ppn @[TPN] --cpu-bind core -depth @[THRD] ./fv3.exe
3435

3536
echo "Model ended: " `date`
3637
echo -n " $( date +%s )," >> job_timestamp.txt

tests/fv3_conf/fv3_qsub.IN_wcoss2

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,16 @@
55
#PBS -N @[JBNME]
66
#PBS -A @[ACCNR]
77
#PBS -q @[QUEUE]
8-
#PBS -l place=vscatter,select=@[NODES]:ncpus=@[TPN]:mpiprocs=@[TPN]:mem=500G
8+
#PBS -l place=vscatter,select=@[NODES]:ncpus=@[NCPUS]:mpiprocs=@[TPN]:ompthreads=@[THRD]:mem=500G
99
#PBS -l place=excl
1010
#PBS -l walltime=00:@[WLCLK]:00
1111

1212
set -eux
13-
echo -n " $( date +%s )," > job_timestamp.txt
1413

1514
cd $PBS_O_WORKDIR
1615

16+
echo -n " $( date +%s )," > job_timestamp.txt
17+
1718
set +x
1819
module use $PWD/modulefiles
1920
module load modules.fv3
@@ -30,7 +31,7 @@ export ESMF_RUNTIME_COMPLIANCECHECK=OFF:depth=4
3031
export ESMF_RUNTIME_PROFILE=ON
3132
export ESMF_RUNTIME_PROFILE_OUTPUT="SUMMARY"
3233

33-
mpiexec -n @[TASKS] -ppn @[TPN] -depth @[THRD] ./fv3.exe
34+
mpiexec -n @[TASKS] -ppn @[TPN] --cpu-bind core -depth @[THRD] ./fv3.exe
3435

3536
echo "Model ended: " `date`
3637
echo -n " $( date +%s )," >> job_timestamp.txt

tests/logs/RegressionTests_hercules.log

Lines changed: 317 additions & 287 deletions
Large diffs are not rendered by default.

tests/logs/RegressionTests_wcoss2.log

Lines changed: 315 additions & 285 deletions
Large diffs are not rendered by default.

tests/rt.conf

Lines changed: 9 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -199,12 +199,9 @@ RUN | hrrr_control_restart_dyn32_phy32 | - noaacloud
199199

200200
COMPILE | rrfs_dyn32_phy32_faster | intel | -DAPP=ATM -DFASTER=ON -DCCPP_SUITES=FV3_HRRR -D32BIT=ON -DCCPP_32BIT=ON | - noaacloud | fv3 |
201201
RUN | conus13km_control | - noaacloud | baseline |
202-
RUN | conus13km_2threads | - noaacloud | | conus13km_control
203-
RUN | conus13km_restart_mismatch | - noaacloud | baseline | conus13km_control
204-
205-
# Expected to fail:
206-
# RUN | conus13km_restart | - noaacloud | | conus13km_control
207-
# RUN | conus13km_decomp | - noaacloud | | conus13km_control
202+
RUN | conus13km_2threads | - noaacloud | |
203+
RUN | conus13km_restart | - noaacloud | | conus13km_control
204+
RUN | conus13km_decomp | - noaacloud | |
208205

209206
COMPILE | rrfs_dyn64_phy32 | intel | -DAPP=ATM -DCCPP_SUITES=FV3_RAP,FV3_HRRR -DCCPP_32BIT=ON | - noaacloud | fv3 |
210207
RUN | rap_control_dyn64_phy32 | - noaacloud | baseline |
@@ -213,13 +210,10 @@ COMPILE | rrfs_dyn32_phy32_debug | intel | -DAPP=ATM -DCCPP_SUITES=FV3_RAP,FV3_H
213210
RUN | rap_control_debug_dyn32_phy32 | - noaacloud | baseline |
214211
RUN | hrrr_control_debug_dyn32_phy32 | - noaacloud | baseline |
215212
RUN | conus13km_debug | - noaacloud | baseline |
216-
RUN | conus13km_debug_qr | - derecho noaacloud | |
217-
RUN | conus13km_debug_2threads | - derecho noaacloud | |
213+
RUN | conus13km_debug_qr | - noaacloud | |
214+
RUN | conus13km_debug_2threads | - noaacloud | |
218215
RUN | conus13km_radar_tten_debug | - noaacloud | baseline |
219216

220-
# Expected to fail:
221-
# RUN | conus13km_debug_decomp | - noaacloud | |
222-
223217
COMPILE | rrfs_dyn64_phy32_debug | intel | -DAPP=ATM -DCCPP_SUITES=FV3_RAP,FV3_HRRR -DCCPP_32BIT=ON -DDEBUG=ON | - noaacloud | fv3 |
224218
RUN | rap_control_dyn64_phy32_debug | - noaacloud | baseline |
225219

@@ -371,12 +365,9 @@ RUN | hrrr_control_decomp_dyn32_phy32 | + hera hercules
371365
RUN | rap_restart_dyn32_phy32 | + hera hercules | | rap_control_dyn32_phy32
372366
RUN | hrrr_control_restart_dyn32_phy32 | + hera hercules | | hrrr_control_dyn32_phy32
373367
RUN | conus13km_control | + hera hercules | baseline |
374-
RUN | conus13km_2threads | + hera hercules | | conus13km_control
375-
RUN | conus13km_restart_mismatch | + hera hercules | baseline | conus13km_control
376-
377-
# Expected to fail:
378-
# RUN | conus13km_restart | + hera hercules | | conus13km_control
379-
# RUN | conus13km_decomp | + hera hercules | | conus13km_control
368+
RUN | conus13km_2threads | + hera hercules | |
369+
RUN | conus13km_decomp | + hera hercules | |
370+
RUN | conus13km_restart | + hera hercules | | conus13km_control
380371

381372
COMPILE | atm_dyn64_phy32 | gnu | -DAPP=ATM -DCCPP_32BIT=ON | + hera hercules | fv3 |
382373
RUN | rap_control_dyn64_phy32 | + hera hercules | baseline |
@@ -387,11 +378,9 @@ RUN | hrrr_control_debug_dyn32_phy32 | + hera hercules
387378
RUN | conus13km_debug | + hera hercules | baseline |
388379
RUN | conus13km_debug_qr | + hera hercules | |
389380
RUN | conus13km_debug_2threads | + hera hercules | |
381+
RUN | conus13km_debug_decomp | + hera hercules | |
390382
RUN | conus13km_radar_tten_debug | + hera hercules | baseline |
391383

392-
# Expected to fail:
393-
# RUN | conus13km_debug_decomp | + hera derecho hercules | |
394-
395384
COMPILE | atm_dyn64_phy32_debug | gnu | -DAPP=ATM -DCCPP_32BIT=ON -DDEBUG=ON | + hera hercules | fv3 |
396385
RUN | rap_control_dyn64_phy32_debug | + hera hercules | baseline |
397386

tests/rt.conf_rrfs

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -86,12 +86,9 @@ RUN | hrrr_control_restart_dyn32_phy32 | - noaacloud
8686

8787
COMPILE | rrfs_dyn32_phy32_faster | intel | -DAPP=ATM -DFASTER=ON -DCCPP_SUITES=FV3_HRRR -D32BIT=ON -DCCPP_32BIT=ON -DENABLE_PARALLELRESTART=NO -DENABLE_RRFS_WAR=NO | - noaacloud | fv3 |
8888
RUN | conus13km_control | - noaacloud | baseline |
89-
RUN | conus13km_2threads | - noaacloud | | conus13km_control
90-
RUN | conus13km_restart_mismatch | - noaacloud | baseline | conus13km_control
91-
92-
# Expected to fail:
93-
# RUN | conus13km_restart | - noaacloud | | conus13km_control
94-
# RUN | conus13km_decomp | - noaacloud | | conus13km_control
89+
RUN | conus13km_2threads | - noaacloud | |
90+
RUN | conus13km_decomp | - noaacloud | |
91+
RUN | conus13km_restart | - noaacloud | | conus13km_control
9592

9693
COMPILE | rrfs_dyn64_phy32 | intel | -DAPP=ATM -DCCPP_SUITES=FV3_RAP,FV3_HRRR -DCCPP_32BIT=ON -DENABLE_PARALLELRESTART=NO -DENABLE_RRFS_WAR=NO | - noaacloud | fv3 |
9794
RUN | rap_control_dyn64_phy32 | - noaacloud | baseline |
@@ -102,11 +99,9 @@ RUN | hrrr_control_debug_dyn32_phy32 | - noaacloud
10299
RUN | conus13km_debug | - noaacloud | baseline |
103100
RUN | conus13km_debug_qr | - derecho noaacloud | |
104101
RUN | conus13km_debug_2threads | - derecho noaacloud | |
102+
RUN | conus13km_debug_decomp | - noaacloud | |
105103
RUN | conus13km_radar_tten_debug | - noaacloud | baseline |
106104

107-
# Expected to fail:
108-
# RUN | conus13km_debug_decomp | - noaacloud | |
109-
110105
COMPILE | rrfs_dyn64_phy32_debug | intel | -DAPP=ATM -DCCPP_SUITES=FV3_RAP,FV3_HRRR -DCCPP_32BIT=ON -DDEBUG=ON -DENABLE_PARALLELRESTART=NO -DENABLE_RRFS_WAR=NO | - noaacloud | fv3 |
111106
RUN | rap_control_dyn64_phy32_debug | - noaacloud | baseline |
112107

0 commit comments

Comments
 (0)