Skip to content
Merged
36 changes: 36 additions & 0 deletions doc/ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,39 @@
===============================================================

Tag name: cam6_
Originator(s): johnmauff, pel
Date: April 28, 2025
One-line Summary:
Github PR URL: https://github.com/ESCOMP/CAM/pull/

Purpose of changes (include the issue number and title text for each relevant GitHub issue):
Excessive data movement in extend_panel_interpolate (CSLAM): https://github.com/ESCOMP/CAM/issues/1316

The subroutine extend_panel_interpolate is written in a way that causes the compiler to generate more data movement than necessary.
This excess data movement aggravates a computational load imbalance in the CSLAM advection. Although the load imbalance caused by
the special treatment of panels at the corners of the cubed sphere cannot be eliminated, the cost of this subroutine can be
reduced by restructuring how it is written.

Describe any changes made to build system: N/A

Describe any changes made to the namelist: N/A

List any changes to the defaults for the boundary datasets: N/A

Describe any substantial timing or memory changes: N/A

Code reviewed by:

List all files eliminated: N/A

List all files added and what they do: N/A

List all existing files that have been modified, and describe the changes:

....

Summarize any changes to answers:
No answer changes; all results are bit-for-bit (b4b) identical.

===============================================================

Expand Down
57 changes: 29 additions & 28 deletions src/dynamics/se/dycore/fvm_consistent_se_cslam.F90
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#define FVM_TIMERS .FALSE.
module fvm_consistent_se_cslam
use shr_kind_mod, only: r8=>shr_kind_r8
use dimensions_mod, only: nc, nhe, nlev, ntrac, np, nhr, nhc, ngpc, ns, nht
Expand Down Expand Up @@ -107,7 +108,7 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
endif

kblk = kmax-kmin+1
!call t_startf('fvm:before_Qnhc')
if(FVM_TIMERS) call t_startf('fvm:before_Qnhc')
do ie=nets,nete
do k=kmin,kmax
elem(ie)%sub_elem_mass_flux(:,:,:,k) = dt_fvm*elem(ie)%sub_elem_mass_flux(:,:,:,k)*fvm(ie)%dp_ref_inverse(k)
Expand All @@ -120,11 +121,11 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
call ghostpack(ghostbufQnhc,fvm(ie)%c(1-nhc:nc+nhc,1-nhc:nc+nhc,kmin:kmax,q),kblk,kptr,ie)
enddo
end do
!call t_stopf('fvm:before_Qnhc')
!call t_startf('fvm:ghost_exchange:Qnhc')
if(FVM_TIMERS) call t_stopf('fvm:before_Qnhc')
if(FVM_TIMERS) call t_startf('fvm:ghost_exchange:Qnhc')
call ghost_exchange(hybridnew,ghostbufQnhc,location='ghostbufQnhc')
!call t_stopf('fvm:ghost_exchange:Qnhc')
!call t_startf('fvm:orthogonal_swept_areas')
if(FVM_TIMERS) call t_stopf('fvm:ghost_exchange:Qnhc')
if(FVM_TIMERS) call t_startf('fvm:orthogonal_swept_areas')
do ie=nets,nete
do k=kmin,kmax
fvm(ie)%se_flux (1:nc,1:nc,:,k) = elem(ie)%sub_elem_mass_flux(:,:,:,k)
Expand Down Expand Up @@ -152,14 +153,14 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
end do
enddo

!call t_stopf('fvm:orthogonal_swept_areas')
if(FVM_TIMERS) call t_stopf('fvm:orthogonal_swept_areas')
do ie=nets,nete
! Intel compiler version 2023.0.0 on derecho had significant slowdown on subroutine interface without
! these pointers.
fcube => fvm(ie)%c(:,:,:,:)
spherecentroid => fvm(ie)%spherecentroid(:,1-nhe:nc+nhe,1-nhe:nc+nhe)
do k=kmin,kmax
!call t_startf('FVM:tracers_reconstruct')
if(FVM_TIMERS) call t_startf('FVM:tracers_reconstruct')
call reconstruction(fcube,nlev,k,&
ctracer(:,:,:,:),irecons_tracer,llimiter,ntrac,&
nc,nhe,nhr,nhc,nht,ns,nhr+(nhe-1),&
Expand All @@ -170,10 +171,10 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
fvm(ie)%rot_matrix,fvm(ie)%centroid_stretch,&
fvm(ie)%vertex_recons_weights,fvm(ie)%vtx_cart,&
irecons_tracer_lev(k))
!call t_stopf('FVM:tracers_reconstruct')
!call t_startf('fvm:swept_flux')
if(FVM_TIMERS) call t_stopf('FVM:tracers_reconstruct')
if(FVM_TIMERS) call t_startf('fvm:swept_flux')
call swept_flux(elem(ie),fvm(ie),k,ctracer,irecons_tracer_lev(k),gsweights,gspts)
!call t_stopf('fvm:swept_flux')
if(FVM_TIMERS) call t_stopf('fvm:swept_flux')
end do
end do
!
Expand All @@ -193,7 +194,7 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
!
!
if (large_Courant_incr) then
!call t_startf('fvm:fill_halo_fvm:large_Courant')
if(FVM_TIMERS) call t_startf('fvm:fill_halo_fvm:large_Courant')
!if (kmin_jet<kmin.or.kmax_jet>kmax) then
! call endrun('ERROR: kmax_jet must be .le. kmax passed to run_consistent_se_cslam')
!end if
Expand All @@ -203,19 +204,19 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
kmax_jet_local = min(kmax_jet,kmax)
klev = kmax_jet-kmin_jet+1
call fill_halo_fvm(ghostbufQ1,elem,fvm,hybridnew,nets,nete,1,kmin_jet_local,kmax_jet_local,klev,active=ActiveJetThread)
!call t_stopf('fvm:fill_halo_fvm:large_Courant')
!call t_startf('fvm:large_Courant_number_increment')
if(FVM_TIMERS) call t_stopf('fvm:fill_halo_fvm:large_Courant')
if(FVM_TIMERS) call t_startf('fvm:large_Courant_number_increment')
if(ActiveJetThread) then
do k=kmin_jet_local,kmax_jet_local !1,nlev
do ie=nets,nete
call large_courant_number_increment(fvm(ie),k)
end do
end do
endif
!call t_stopf('fvm:large_Courant_number_increment')
if(FVM_TIMERS) call t_stopf('fvm:large_Courant_number_increment')
end if

!call t_startf('fvm:end_of_reconstruct_subroutine')
if(FVM_TIMERS) call t_startf('fvm:end_of_reconstruct_subroutine')
do k=kmin,kmax
!
! convert to mixing ratio
Expand Down Expand Up @@ -251,7 +252,7 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
elem(ie)%sub_elem_mass_flux(:,:,:,k)=0
end do
end do
!call t_stopf('fvm:end_of_reconstruct_subroutine')
if(FVM_TIMERS) call t_stopf('fvm:end_of_reconstruct_subroutine')
!$OMP END PARALLEL
call omp_set_nested(.false.)
end subroutine run_consistent_se_cslam
Expand Down Expand Up @@ -281,7 +282,7 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt
REAL(KIND=r8), dimension(2,8) :: x_start, dgam_vec
REAL(KIND=r8) :: gamma_max, displ_first_guess

REAL(KIND=r8) :: flux,flux_tracer(ntrac)
REAL(KIND=r8) :: flux,flux_tracer(ntrac),w

REAL(KIND=r8), dimension(num_area) :: dp_area

Expand All @@ -306,7 +307,6 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt
!
! prepare for air/tracer update
!
! dp = fvm%dp_fvm(1-nhe:nc+nhe,1-nhe:nc+nhe,ilev)
dp = fvm%dp_fvm(1-nhc:nc+nhc,1-nhc:nc+nhc,ilev)
fvm%dp_fvm(1:nc,1:nc,ilev) = fvm%dp_fvm(1:nc,1:nc,ilev)*fvm%area_sphere
do itr=1,ntrac
Expand Down Expand Up @@ -538,14 +538,14 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt
!
! iterate to get flux area
!
!call t_startf('fvm:swept_area:get_gamma')
if(FVM_TIMERS) call t_startf('fvm:swept_area:get_gamma')
do iarea=1,num_area
dp_area(iarea) = dp(idx(1,iarea,i,j,iside),idx(2,iarea,i,j,iside))
end do
call get_flux_segments_area_iterate(x,x_static,dx_static,dx,x_start,dgam_vec,num_seg,num_seg_static,&
num_seg_max,num_area,dp_area,flowcase,gamma,mass_flux_se(i,j,iside),0.0_r8,gamma_max, &
gsweights,gspts,ilev)
!call t_stopf('fvm:swept_area:get_gamma')
if(FVM_TIMERS) call t_stopf('fvm:swept_area:get_gamma')
!
! pack segments for high-order weights computation
!
Expand All @@ -560,27 +560,28 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt
!
! compute higher-order weights
!
!call t_startf('fvm:swept_area:get_high_order_w')
if(FVM_TIMERS) call t_startf('fvm:swept_area:get_high_order_w')
call get_high_order_weights_over_areas(x,dx,num_seg,num_seg_max,num_area,weights,ngpc,&
gsweights, gspts,irecons_tracer)
!call t_stopf('fvm:swept_area:get_high_order_w')
if(FVM_TIMERS) call t_stopf('fvm:swept_area:get_high_order_w')
!
!**************************************************
!
! remap air and tracers
!
!**************************************************
!
!call t_startf('fvm:swept_area:remap')
if(FVM_TIMERS) call t_startf('fvm:swept_area:remap')
flux=0.0_r8; flux_tracer=0.0_r8
do iarea=1,num_area
if (num_seg(iarea)>0) then
ii=idx(1,iarea,i,j,iside); jj=idx(2,iarea,i,j,iside)
flux=flux+weights(1,iarea)*dp(ii,jj)
do itr=1,ntrac
do iw=1,irecons_tracer_actual
flux_tracer(itr) = flux_tracer(itr)+weights(iw,iarea)*ctracer(iw,ii,jj,itr)
end do
do iw=1,irecons_tracer_actual
w = weights(iw,iarea)
do itr=1,ntrac
flux_tracer(itr) = flux_tracer(itr)+w*ctracer(iw,ii,jj,itr)
end do
end do
end if
end do
Expand Down Expand Up @@ -614,7 +615,7 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt
fvm%dp_fvm(i-1,j,ilev ) = fvm%dp_fvm(i-1,j,ilev )+flux
fvm% c(i-1,j,ilev,1:ntrac) = fvm% c(i-1,j,ilev,1:ntrac)+flux_tracer(1:ntrac)
end if
!call t_stopf('fvm:swept_area:remap')
if(FVM_TIMERS) call t_stopf('fvm:swept_area:remap')
end if
end do
end do
Expand Down
Loading