Skip to content

Commit ff41a39

Browse files
authored
Merge pull request #260 from sjsprecious/update_casper_gpu
Add MI300A support on Casper.
2 parents 028bbd7 + f0d0a0c commit ff41a39

File tree

4 files changed: +85 -20 lines changed

machines/casper/config_batch.xml

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,9 +4,13 @@
44
<submit_args>
55
<argument> -l gpu_type=$GPU_TYPE </argument>
66
</submit_args>
7-
<directives queue="casper" gpu_enabled="true">
7+
<directives queue="casper" gpu_enabled="true" gpu_type="!mi300a">
88
<directive default="/bin/bash" > -S {{ shell }} </directive>
9-
<directive> -l select={{ num_nodes }}:ncpus={{ max_tasks_per_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}:mem={{ mem_per_node }}GB:ngpus={{ ngpus_per_node }}:mps=1 </directive>
9+
<directive> -l select={{ num_nodes }}:ncpus={{ max_cputasks_per_gpu_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}:mem={{ mem_per_node }}GB:ngpus={{ ngpus_per_node }}:mps=1 </directive>
10+
</directives>
11+
<directives queue="casper" gpu_enabled="true" gpu_type="mi300a">
12+
<directive default="/bin/bash" > -S {{ shell }} </directive>
13+
<directive> -l select={{ num_nodes }}:ncpus={{ max_cputasks_per_gpu_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}:mem={{ mem_per_node }}GB:ngpus={{ ngpus_per_node }} </directive>
1014
</directives>
1115
<directives queue="casper" gpu_enabled="false">
1216
<directive default="/bin/bash" > -S {{ shell }} </directive>

machines/casper/config_machines.xml

Lines changed: 34 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
<machine MACH="casper">
2-
<DESC>NCAR GPU platform, os is Linux, 36 pes/node, batch system is pbs</DESC>
2+
<DESC>NCAR GPU platform, os is Linux, heterogeneous platform (AMD or Intel CPUs + NVIDIA or AMD GPUs), batch system is pbs</DESC>
33
<OS>LINUX</OS>
44
<COMPILERS>intel,nvhpc,gnu</COMPILERS>
55
<MPILIBS>openmpi</MPILIBS>
@@ -13,7 +13,7 @@
1313
<BATCH_SYSTEM>pbs</BATCH_SYSTEM>
1414
<SUPPORTED_BY>ASAP/CISL</SUPPORTED_BY>
1515
<MAX_TASKS_PER_NODE>62</MAX_TASKS_PER_NODE> <!-- different for various CPU nodes; check NHUG documentation for details -->
16-
<MEM_PER_TASK>10</MEM_PER_TASK>
16+
<MEM_PER_TASK>100</MEM_PER_TASK>
1717
<MAX_MEM_PER_NODE>690</MAX_MEM_PER_NODE> <!-- different for various CPU/GPU nodes; check NHUG documentation for details -->
1818
<MAX_GPUS_PER_NODE>4</MAX_GPUS_PER_NODE> <!-- different for various GPU nodes; check NHUG documentation for details -->
1919
<MAX_MPITASKS_PER_NODE>62</MAX_MPITASKS_PER_NODE> <!-- different for various CPU nodes; check NHUG documentation for details -->
@@ -54,14 +54,28 @@
5454
</modules>
5555
<modules mpilib="openmpi">
5656
<command name="load">openmpi/5.0.6</command>
57-
<command name="load">netcdf-mpi/4.9.2</command>
57+
<command name="load">netcdf-mpi/4.9.3</command>
5858
<command name="load">parallel-netcdf/1.14.0</command>
59-
</modules>
60-
<modules>
61-
<command name="load">parallelio/2.6.4</command>
62-
<command name="load">esmf/8.8.0</command>
59+
<command name="load">parallelio/2.6.6</command>
60+
<command name="load">esmf/8.8.1</command>
6361
<command name="load">ncarcompilers/1.0.0</command>
6462
</modules>
63+
<!-- specific settings for MI300A module environment -->
64+
<modules mpilib="openmpi" gpu_type="mi300a">
65+
<command name="load">ncarenv/25.08-rocm</command>
66+
<command name="reset"/>
67+
<command name="load">cmake</command>
68+
<command name="load">rocthrust/6.3.3</command>
69+
<command name="load">comgr/6.3.3</command>
70+
<command name="unload">hdf5</command>
71+
<command name="unload">netcdf</command>
72+
<command name="load">openblas/0.3.30</command>
73+
<command name="load">netcdf-mpi/4.9.3</command>
74+
<command name="load">parallel-netcdf/1.14.1</command>
75+
<command name="load">parallelio/2.6.6</command>
76+
<command name="load">esmf-mpi/8.9.0</command>
77+
<command name="load">ncarcompilers/1.1.0</command>
78+
</modules>
6579
</module_system>
6680
<environment_variables>
6781
<env name="OMP_STACKSIZE">256M</env>
@@ -71,9 +85,21 @@
7185
<env name="ESMF_RUNTIME_PROFILE">ON</env>
7286
<env name="ESMF_RUNTIME_PROFILE_OUTPUT">SUMMARY</env>
7387
</environment_variables>
74-
<environment_variables gpu_type="!none">
88+
<environment_variables gpu_type="v100">
89+
<env name="NCAR_LIBS_CUDA">-lcuda -lcudart</env>
90+
</environment_variables>
91+
<environment_variables gpu_type="a100">
7592
<env name="NCAR_LIBS_CUDA">-lcuda -lcudart</env>
7693
</environment_variables>
94+
<environment_variables gpu_type="h100">
95+
<env name="NCAR_LIBS_CUDA">-lcuda -lcudart</env>
96+
</environment_variables>
97+
<environment_variables compiler="nvhpc">
98+
<env name="NVCC_WRAPPER_DEFAULT_COMPILER">nvc++</env>
99+
</environment_variables>
100+
<environment_variables compiler="aocc">
101+
<env name="NVCC_WRAPPER_DEFAULT_COMPILER">hipcc</env>
102+
</environment_variables>
77103
<resource_limits>
78104
<resource name="RLIMIT_STACK">-1</resource>
79105
</resource_limits>

machines/casper/gnu_casper.cmake

Lines changed: 44 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,14 +1,41 @@
11
if (COMP_NAME STREQUAL gptl)
22
string(APPEND CPPDEFS " -DHAVE_NANOTIME -DBIT64 -DHAVE_SLASHPROC -DHAVE_GETTIMEOFDAY")
33
endif()
4-
string(APPEND CMAKE_C_FLAGS_RELEASE " -O2 -g")
5-
string(APPEND CMAKE_Fortran_FLAGS_RELEASE " -O2 -g")
6-
set(MPICC "mpicc")
7-
set(MPICXX "mpicxx")
8-
set(MPIFC "mpif90")
9-
set(SCC "gcc")
10-
set(SCXX "g++")
11-
set(SFC "gfortran")
4+
5+
if (NOT DEBUG)
6+
if (GPU_TYPE STREQUAL none OR GPU_TYPE STREQUAL "mi300a") # Large-memory HTC nodes (AMD EPYC 9554P CPU) or MI300A GPU
7+
string(APPEND CFLAGS " -march=znver4 -mtune=znver4")
8+
string(APPEND CXXFLAGS " -march=znver4 -mtune=znver4")
9+
string(APPEND FFLAGS " -march=znver4 -mtune=znver4 -fstack-arrays")
10+
string(APPEND LDFLAGS " -march=znver4 -mtune=znver4")
11+
elseif(GPU_TYPE STREQUAL a100) # AMD EPYC Milan 7763 CPU
12+
string(APPEND CFLAGS " -march=znver3 -mtune=znver3")
13+
string(APPEND CXXFLAGS " -march=znver3 -mtune=znver3")
14+
string(APPEND FFLAGS " -march=znver3 -mtune=znver3 -fstack-arrays")
15+
string(APPEND LDFLAGS " -march=znver3 -mtune=znver3")
16+
elseif(GPU_TYPE STREQUAL h100) # Intel Xeon Gold 6430 CPU
17+
string(APPEND CFLAGS " -march=sapphirerapids -mtune=sapphirerapids")
18+
string(APPEND CXXFLAGS " -march=sapphirerapids -mtune=sapphirerapids")
19+
string(APPEND FFLAGS " -march=sapphirerapids -mtune=sapphirerapids -fstack-arrays")
20+
string(APPEND LDFLAGS " -march=sapphirerapids -mtune=sapphirerapids")
21+
else() # V100 GPU nodes or small-memory HTC nodes
22+
string(APPEND CFLAGS " -march=cascadelake -mtune=cascadelake")
23+
string(APPEND CXXFLAGS " -march=cascadelake -mtune=cascadelake")
24+
string(APPEND FFLAGS " -march=cascadelake -mtune=cascadelake -fstack-arrays")
25+
string(APPEND LDFLAGS " -march=cascadelake -mtune=cascadelake")
26+
endif()
27+
endif()
28+
29+
if (GPU_TYPE STREQUAL "mi300a")
30+
string(APPEND SLIBS " -lopenblas") # -llapack -lblas
31+
endif()
32+
if (MPILIB STREQUAL mpi-serial)
33+
string(APPEND SLIBS " -ldl")
34+
endif()
35+
string(APPEND SLIBS " -L${NETCDF_PATH}/lib -lnetcdf -lnetcdff")
36+
message("GPU_TYPE is ${GPU_TYPE}")
37+
message("OPENACC_GPU_OFFLOAD is ${OPENACC_GPU_OFFLOAD}")
38+
message("OPENMP_GPU_OFFLOAD is ${OPENMP_GPU_OFFLOAD}")
1239

1340
if (USE_KOKKOS)
1441
# Generic settings that are used regardless of architecture or Kokkos backend
@@ -17,14 +44,19 @@ if (USE_KOKKOS)
1744
string(APPEND CPPDEFS " -DGPU -DTHRUST_IGNORE_CUB_VERSION_CHECK -DHOMMEXX_ENABLE_GPU")
1845
string(APPEND KOKKOS_OPTIONS " -DKokkos_ENABLE_SERIAL=ON -DKokkos_ENABLE_OPENMP=OFF -DKokkos_ENABLE_AGGRESSIVE_VECTORIZATION=OFF")
1946
if (GPU_TYPE STREQUAL v100)
20-
string(APPEND KOKKOS_OPTIONS " -DKOKKOS_ARCH_VOLTA70=ON -DKokkos_ENABLE_CUDA=ON -DKokkos_ENABLE_CUDA_LAMBDA=ON -DKokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC=OFF")
47+
string(APPEND KOKKOS_OPTIONS " -DKokkos_ARCH_VOLTA70=ON -DKokkos_ENABLE_CUDA=ON -DKokkos_ENABLE_CUDA_LAMBDA=ON -DKokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC=OFF")
2148
string(APPEND CXXFLAGS " -extended-lambda -Wext-lambda-captures-this -std=c++17 -arch=sm_70")
2249
elseif(GPU_TYPE STREQUAL a100)
2350
string(APPEND KOKKOS_OPTIONS " -DKokkos_ARCH_AMPERE80=ON -DKokkos_ENABLE_CUDA=ON -DKokkos_ENABLE_CUDA_LAMBDA=ON -DKokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC=OFF")
2451
string(APPEND CXXFLAGS " -extended-lambda -Wext-lambda-captures-this -std=c++17 -arch=sm_80")
2552
elseif(GPU_TYPE STREQUAL h100)
2653
string(APPEND KOKKOS_OPTIONS " -DKokkos_ARCH_HOPPER90=ON -DKokkos_ENABLE_CUDA=ON -DKokkos_ENABLE_CUDA_LAMBDA=ON -DKokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC=OFF")
2754
string(APPEND CXXFLAGS " -extended-lambda -Wext-lambda-captures-this -std=c++17 -arch=sm_90")
55+
elseif(GPU_TYPE STREQUAL mi300a)
56+
string(APPEND KOKKOS_OPTIONS " -DKokkos_ENABLE_HIP=ON -DKokkos_ARCH_ZEN4=ON -DAMDGPU_TARGETS=GFX942 -DKokkos_ARCH_AMD_GFX942=ON") # Currently theta-l_kokkos dycore does not handle APU correctly (i.e., -DKokkos_ARCH_AMD_GFX942_APU=ON)
57+
string(APPEND CXXFLAGS " -D__HIP_PLATFORM_AMD__ -std=c++17 -Wno-mismatched-tags --offload-arch=gfx942 -munsafe-fp-atomics -fno-gpu-rdc -x hip -I$ENV{NCAR_INC_OPENMPI}")
58+
set(SCXX "hipcc")
59+
set(MPICXX ${SCXX})
2860
else()
2961
message(FATAL_ERROR "GPU_TYPE ${GPU_TYPE} not supported")
3062
endif()
@@ -42,4 +74,7 @@ if (USE_KOKKOS)
4274
set(CMAKE_Fortran_FLAGS "-fallow-argument-mismatch" CACHE STRING "" FORCE) # only works with gnu v10 and above
4375
endif()
4476
string(APPEND LDFLAGS " -lstdc++ -lkokkoscontainers -lkokkoscore -lkokkossimd ")
77+
if (GPU_TYPE STREQUAL "mi300a")
78+
string(APPEND LDFLAGS " -lhiprtc -lamdhip64 ")
79+
endif()
4580
endif()

machines/casper/nvhpc_casper.cmake

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@ if (USE_KOKKOS)
3333
string(APPEND CPPDEFS " -DGPU -DTHRUST_IGNORE_CUB_VERSION_CHECK -DHOMMEXX_ENABLE_GPU")
3434
string(APPEND KOKKOS_OPTIONS " -DKokkos_ENABLE_SERIAL=ON -DKokkos_ENABLE_OPENMP=OFF -DKokkos_ENABLE_AGGRESSIVE_VECTORIZATION=OFF")
3535
if (GPU_TYPE STREQUAL v100)
36-
string(APPEND KOKKOS_OPTIONS " -DKOKKOS_ARCH_VOLTA70=ON -DKokkos_ENABLE_CUDA=ON -DKokkos_ENABLE_CUDA_LAMBDA=ON -DKokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC=OFF")
36+
string(APPEND KOKKOS_OPTIONS " -DKokkos_ARCH_VOLTA70=ON -DKokkos_ENABLE_CUDA=ON -DKokkos_ENABLE_CUDA_LAMBDA=ON -DKokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC=OFF")
3737
string(APPEND CXXFLAGS " -extended-lambda -Wext-lambda-captures-this -std=c++17 -arch=sm_70")
3838
elseif(GPU_TYPE STREQUAL a100)
3939
string(APPEND KOKKOS_OPTIONS " -DKokkos_ARCH_AMPERE80=ON -DKokkos_ENABLE_CUDA=ON -DKokkos_ENABLE_CUDA_LAMBDA=ON -DKokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC=OFF")

0 commit comments

Comments
 (0)