Skip to content

Commit b1efc60

Browse files
authored
Merge pull request E3SM-Project#3077 from E3SM-Project/bartgol/eamxx/ghci-snl-cuda
EAMxx: add support for ghci-snl-cuda in standalone testing
2 parents d291456 + 5da88db commit b1efc60

File tree

8 files changed

+90
-63
lines changed

8 files changed

+90
-63
lines changed

.github/actions/test-all-scream/action.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ runs:
9595
if: always()
9696
uses: actions/upload-artifact@v4
9797
with:
98-
name: log-files-${{ inputs.build_type }}
98+
name: log-files-${{ inputs.build_type }}-${{ inputs.machine }}
9999
path: |
100100
components/eamxx/ctest-build/*/Testing/Temporary/Last*.log
101101
components/eamxx/ctest-build/*/ctest_resource_file.json

.github/workflows/eamxx-scripts-tests.yml

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@ on:
88
paths:
99
- components/eamxx/scripts/**
1010
- components/eamxx/cime_config/*.py
11-
pull_request_review:
12-
types: [submitted]
1311

1412
# Manual run for debug purposes only
1513
workflow_dispatch:
@@ -19,13 +17,10 @@ on:
1917
- cron: '0 7 * * *' # Runs at 7 AM UTC, which is midnight MT during Standard Time
2018

2119
concurrency:
22-
# Two runs are in the same group if:
23-
# - they have the same trigger
24-
# - if trigger=pull_request/pull_request_review, the PR number must match
25-
# - if trigger=workflow_dispatch/schedule: no concurrency
26-
group: ${{ github.workflow }}-${{ github.event_name }}-${{
27-
(github.event_name == 'pull_request' || github.event_name == 'pull_request_review') && github.event.pull_request.number || github.run_id
28-
}}
20+
# Two runs are in the same group if they are testing the same git ref
21+
# - if trigger=pull_request, the ref is refs/pull/<PR_NUMBER>/merge
22+
# - for other triggers, the ref is the branch tested
23+
group: ${{ github.workflow }}-${{ github.ref }}
2924
cancel-in-progress: true
3025

3126
jobs:

.github/workflows/eamxx-standalone-testing.yml

Lines changed: 48 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@ on:
1111
- components/eam/src/physics/p3/scream/**
1212
- components/eam/src/physics/cam/**
1313
- .github/workflows/eamxx-standalone-testing.yml
14-
pull_request_review:
15-
types: [submitted]
1614

1715
# Manual run is used to bless
1816
workflow_dispatch:
@@ -23,6 +21,7 @@ on:
2321
type: choice
2422
options:
2523
- gcc-openmp
24+
- gcc-cuda
2625
bless:
2726
description: 'Generate baselines'
2827
required: true
@@ -33,13 +32,10 @@ on:
3332
- cron: '0 7 * * *' # Runs at 7 AM UTC, which is midnight MT during Standard Time
3433

3534
concurrency:
36-
# Two runs are in the same group if:
37-
# - they have the same trigger
38-
# - if trigger=pull_request/pull_request_review, the PR number must match
39-
# - if trigger=workflow_dispatch/schedule: no concurrency
40-
group: ${{ github.workflow }}-${{ github.event_name }}-${{
41-
(github.event_name == 'pull_request' || github.event_name == 'pull_request_review') && github.event.pull_request.number || github.run_id
42-
}}
35+
# Two runs are in the same group if they are testing the same git ref
36+
# - if trigger=pull_request, the ref is refs/pull/<PR_NUMBER>/merge
37+
# - for other triggers, the ref is the branch tested
38+
group: ${{ github.workflow }}-${{ github.ref }}
4339
cancel-in-progress: true
4440

4541
jobs:
@@ -86,30 +82,46 @@ jobs:
8682
generate: ${{ env.generate }}
8783
submit: ${{ env.submit }}
8884
cmake-configs: Kokkos_ENABLE_OPENMP=ON
89-
# cuda:
90-
# # Disable until the CUDA container is up and running. When CUDA container is available, remove
91-
# # this line and uncomment the next if
92-
# if: false
93-
# # Runs always for pull_request. For workflow_dispatch, user must request this machine
94-
# # if: ${{ github.event_name == 'pull_request' || contains(github.event.inputs.jobs_to_run, 'openmp-gcc') }}
95-
# runs-on: [self-hosted, cuda]
96-
# strategy:
97-
# fail-fast: false
98-
# matrix:
99-
# build_type: [sp, dbg, fpe, opt]
100-
# name: cuda-${{ matrix.build_type }}
101-
# steps:
102-
# - name: Show action trigger
103-
# uses: ./.github/actions/print-workflow-trigger
104-
# - name: Check out the repository
105-
# uses: actions/checkout@v4
106-
# with:
107-
# persist-credentials: false
108-
# show-progress: false
109-
# submodules: recursive
110-
# - name: Run tests
111-
# uses: ./.github/actions/test-all-scream
112-
# with:
113-
# build_type: ${{ matrix.build_type }}
114-
# machine: ghci-snl-cuda
115-
# run_type: at-run
85+
gcc-cuda:
86+
runs-on: [self-hosted, ghci-snl-cuda, cuda, gcc]
87+
strategy:
88+
fail-fast: false
89+
matrix:
90+
build_type: [sp, dbg, opt]
91+
if: ${{ !(github.event_name == 'workflow_dispatch' && github.event.inputs.jobs_list != 'gcc-cuda') }}
92+
name: gcc-cuda / ${{ matrix.build_type }}
93+
steps:
94+
- name: Check out the repository
95+
uses: actions/checkout@v4
96+
with:
97+
persist-credentials: false
98+
show-progress: false
99+
submodules: recursive
100+
- name: Show action trigger
101+
uses: ./.github/actions/show-workflow-trigger
102+
- name: Check for skip labels
103+
if: ${{ github.event_name == 'pull_request' || github.event_name == 'pull_request_review' }}
104+
uses: ./.github/actions/check-skip-labels
105+
with:
106+
skip_labels: 'AT: skip gcc,AT: skip cuda,AT: skip eamxx-sa,AT: skip eamxx-all'
107+
token: ${{ secrets.GITHUB_TOKEN }}
108+
pr_number: ${{ github.event.pull_request.number }}
109+
- name: Set test-all inputs based on event specs
110+
run: |
111+
echo "submit=false" >> $GITHUB_ENV
112+
echo "generate=false" >> $GITHUB_ENV
113+
if [ "${{ github.event_name }}" == "schedule" ]; then
114+
echo "submit=true" >> $GITHUB_ENV
115+
elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
116+
if [ "${{ inputs.bless }}" == "true" ]; then
117+
echo "generate=true" >> $GITHUB_ENV
118+
fi
119+
fi
120+
- name: Run tests
121+
uses: ./.github/actions/test-all-scream
122+
with:
123+
build_type: ${{ matrix.build_type }}
124+
machine: ghci-snl-cuda
125+
generate: ${{ env.generate }}
126+
submit: ${{ env.submit }}
127+
cmake-configs: Kokkos_ARCH_VOLTA70=ON;CMAKE_CUDA_ARCHITECTURES=70

.github/workflows/eamxx-v1-testing.yml

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@ on:
1111
- components/eam/src/physics/p3/scream/**
1212
- components/eam/src/physics/cam/**
1313
- .github/workflows/eamxx-v1-testing.yml
14-
pull_request_review:
15-
types: [submitted]
1614

1715
# Manual run is used to bless
1816
workflow_dispatch:
@@ -29,13 +27,10 @@ on:
2927
type: boolean
3028

3129
concurrency:
32-
# Two runs are in the same group if:
33-
# - they have the same trigger
34-
# - if trigger=pull_request/pull_request_review: the PR number must match
35-
# - if trigger=workflow_dispatch: no concurrency
36-
group: ${{ github.workflow }}-${{ github.event_name }}-${{
37-
(github.event_name == 'pull_request' || github.event_name == 'pull_request_review') && github.event.pull_request.number || github.run_id
38-
}}
30+
# Two runs are in the same group if they are testing the same git ref
31+
# - if trigger=pull_request, the ref is refs/pull/<PR_NUMBER>/merge
32+
# - for other triggers, the ref is the branch tested
33+
group: ${{ github.workflow }}-${{ github.ref }}
3934
cancel-in-progress: true
4035

4136
jobs:
@@ -54,10 +49,6 @@ jobs:
5449
short_name: SMS_D_Ln5.ne4pg2_oQU480.F2010-SCREAMv1-MPASSI.scream-mam4xx-all_mam4xx_procs
5550
fail-fast: false
5651
name: cpu-gcc / ${{ matrix.test.short_name }}
57-
# Run this workflow if:
58-
# - workflow_dispatch: user requested this job.
59-
# - schedule: always
60-
# - pull_request/pull_request_review: matching skip label is NOT found
6152
if: ${{ !(github.event_name == 'workflow_dispatch' && github.event.inputs.jobs_list != 'cpu-gcc') }}
6253
steps:
6354
- name: Check out the repository
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Common settings for our ghci images
2+
include(${CMAKE_CURRENT_LIST_DIR}/ghci-snl.cmake)
3+
4+
# Set SCREAM_MACHINE
5+
set(SCREAM_MACHINE ghci-snl-cuda CACHE STRING "")
6+
7+
# Enable CUDA in kokkos
8+
set (EKAT_MACH_FILES_PATH ${CMAKE_CURRENT_LIST_DIR}/../../../../externals/ekat/cmake/machine-files)
9+
include (${EKAT_MACH_FILES_PATH}/kokkos/cuda.cmake)
10+
11+
set(EKAT_MPI_NP_FLAG "-n" CACHE STRING "The mpirun flag for designating the total number of ranks")
12+
13+
# TODO: rebuild cuda image with cuda-aware MPI, so we can set this to ON
14+
option(SCREAM_MPI_ON_DEVICE "Whether to use device pointers for MPI calls" OFF)

components/eamxx/scripts/machines_specs.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,16 @@ def setup(cls):
215215
super().setup_base("ghci-snl-cpu")
216216
cls.baselines_dir = "/projects/e3sm/baselines/scream/ghci-snl-cpu"
217217

218+
###############################################################################
219+
class GHCISNLCuda(Machine):
220+
###############################################################################
221+
concrete = True
222+
@classmethod
223+
def setup(cls):
224+
super().setup_base(name="ghci-snl-cuda",num_bld_res=16,num_run_res=1)
225+
cls.baselines_dir = "/projects/e3sm/baselines/scream/ghci-snl-cuda"
226+
cls.gpu_arch = "cuda"
227+
218228
###############################################################################
219229
class Lassen(Machine):
220230
###############################################################################

components/eamxx/src/diagnostics/tests/wind_speed_tests.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,11 @@ TEST_CASE("wind_speed")
6262
register_diagnostics();
6363

6464
constexpr int ntests = 5;
65+
#ifdef NDEBUG
66+
constexpr int ulp_tol = 1;
67+
#else
68+
constexpr int ulp_tol = 0;
69+
#endif
6570
for (int itest=0; itest<ntests; ++itest) {
6671
// Randomize wind
6772
randomize(uv,engine,pdf);
@@ -87,7 +92,7 @@ TEST_CASE("wind_speed")
8792
for (int ilev=0; ilev<nlevs; ++ilev) {
8893
const auto u = uv_h (icol,0,ilev);
8994
const auto v = uv_h (icol,1,ilev);
90-
REQUIRE (ws_h(icol,ilev) == std::sqrt(u*u+v*v));
95+
REQUIRE_THAT (ws_h(icol,ilev), Catch::Matchers::WithinULP(std::sqrt(u*u+v*v),ulp_tol));
9196
}
9297
}
9398
}

components/eamxx/src/physics/rrtmgp/tests/generate_baseline.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,15 +33,15 @@ int main (int argc, char** argv) {
3333
auto logger = std::make_shared<logger_t>("",LogLevel::info,comm);
3434

3535
// Get filenames from command line
36-
if (argc != 3) {
36+
if (argc < 3) {
3737
std::string msg = "Missing required inputs. Usage:\n";
3838
msg += argv[0];
3939
msg += " inputfile baseline\n";
4040
logger->error(msg);
4141
return 1;
4242
}
43-
std::string inputfile(argv[argc-2]);
44-
std::string baseline(argv[argc-1]);
43+
std::string inputfile(argv[1]);
44+
std::string baseline(argv[2]);
4545

4646
// Initialize yakl
4747
yakl::init();

0 commit comments

Comments
 (0)