Skip to content

Commit 593076d

Browse files
committed
Allow ESolnManager to work with SP2 FG
This commit updates the Main_MPI code that was added in previous commits to allow the new MPI code associated with the ESolnManager to work with SP2 FG. To do this, it updates the DataResp and the PQMult to not send their MPI send or receives on the MPI_COMM_WORLD but on either comm_world or comm_leader depending on the para_method. For both the DATARESP and the JQMULT, it performs both by sending the work to the leader tasks. Because these tasks are rather simple and fast, the leader tasks do not send the worker tasks anything. When running with FG on equal processor counts with or without -P gives bit-for-bit result. It appears that the running with FG with different processor counts does not give bit-for-bit results, this may be an issue before this task. Running without FG at different processor counts gives the same results, but there are difference when running with and without FG. I'll open and investigate that issue further.
1 parent 61c01a4 commit 593076d

1 file changed

Lines changed: 45 additions & 21 deletions

File tree

f90/MPI/Main_MPI.f90

Lines changed: 45 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -680,7 +680,7 @@ Subroutine Master_job_fwdPred(sigma,d1,eAll,comm,trial)
680680

681681
! Compute the model Responces
682682
if (EsMgr_save_in_file) then
683-
call Master_job_DataResp(nTx, sigma, d1, trial_lcl)
683+
call Master_job_DataResp(nTx, sigma, d1, trial_lcl, comm_leader)
684684
else
685685
do iTx=1,nTx
686686
do i = 1,d1%d(iTx)%nDt
@@ -706,34 +706,44 @@ end subroutine Master_job_fwdPred
706706

707707
!----------------------------------------------------------------------------
708708
!########################## Master_job_DataResp ############################
709-
subroutine Master_job_DataResp(nTx, sigma, d, trial)
709+
subroutine Master_job_DataResp(nTx, sigma, d, trial, comm)
710710

711711
implicit none
712712

713713
integer, intent(in) :: nTx
714714
type (modelParam_t), intent(in) :: sigma
715715
type (dataVectorMTX_t), intent(inout) :: d
716716
logical, intent(in), optional :: trial
717+
integer, intent(in), optional :: comm
717718

718719
character (len=*), parameter :: JOB_NAME = "DATARESP"
719720

720721
integer :: dest, nTasks, remainder, iTx
721722
integer :: iTx_min, iTx_max, i, j, k
722723
logical :: trial_lcl
724+
integer :: comm_current
723725

724726
if (present(trial)) then
725727
trial_lcl = trial
726728
else
727729
trial_lcl = .false.
728730
endif
729731

732+
if (para_method.eq.0) then
733+
comm_current = comm_world
734+
else
735+
comm_current = comm_leader
736+
end if
737+
738+
modem_ctx % comm_current = comm
739+
730740
call create_worker_job_task_place_holder
731741

732-
nTasks = nTx / number_of_workers
733-
remainder = modulo(nTx, number_of_Workers)
742+
nTasks = nTx / size_leader
743+
remainder = modulo(nTx, size_leader)
734744
iTx_max = 0
735745

736-
do dest = 1, number_of_workers
746+
do dest = 1, size_leader
737747
iTx_min = iTx_max + 1
738748
iTx_max = iTx_min + nTasks - 1
739749

@@ -749,15 +759,15 @@ subroutine Master_job_DataResp(nTx, sigma, d, trial)
749759
worker_job_task % trial = trial_lcl
750760

751761
call Pack_worker_job_task
752-
call MPI_Send(worker_job_package, Nbytes, MPI_PACKED, dest, FROM_MASTER, MPI_COMM_WORLD, ierr)
762+
call MPI_Send(worker_job_package, Nbytes, MPI_PACKED, dest, FROM_MASTER, comm_current, ierr)
753763
write(ioMPI, '(a10,a16,i5,a8,i5,a11,i5)') trim(job_name), ': Send Per from ', iTx_min, ' to', iTx_max, ' to ', dest
754764
end if
755765
end do
756766

757-
remainder = modulo(nTx, number_of_workers)
767+
remainder = modulo(nTx, size_leader)
758768
iTx_max = 0
759769

760-
do dest = 1, number_of_workers
770+
do dest = 1, size_leader
761771
iTx_min = iTx_max + 1
762772
iTx_max = iTx_min + nTasks - 1
763773

@@ -768,11 +778,9 @@ subroutine Master_job_DataResp(nTx, sigma, d, trial)
768778

769779
if (iTx_max >= iTx_min) then
770780
call create_data_vec_place_holder(d, start_iTx=iTx_min, end_iTx=iTx_max)
771-
call MPI_Recv(data_para_vec, Nbytes, MPI_PACKED, dest, FROM_WORKER, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
781+
call MPI_Recv(data_para_vec, Nbytes, MPI_PACKED, dest, FROM_WORKER, comm_current, MPI_STATUS_IGNORE, ierr)
772782
call UnPack_data_para_vec(d, start_iTx=iTx_min, end_iTx=iTx_max)
773783
end if
774-
775-
776784
end do
777785

778786
end subroutine Master_job_DataResp
@@ -1176,6 +1184,7 @@ subroutine Master_job_PQMult(nTx, sigma, dsigma, use_starting_guess)
11761184
integer :: dest, nTasks, remainder, iTx
11771185
integer :: iTx_min, iTx_max, i, j, k
11781186
logical :: flag
1187+
integer :: comm_current
11791188

11801189
logical, dimension(number_of_workers) :: task_is_working
11811190
logical, dimension(nTx) :: transmitters_processing, transmitters_done
@@ -1188,6 +1197,14 @@ subroutine Master_job_PQMult(nTx, sigma, dsigma, use_starting_guess)
11881197
use_starting_guess_lcl = .false.
11891198
end if
11901199

1200+
if (para_method.eq.0) then
1201+
comm_current = comm_world
1202+
else
1203+
comm_current = comm_leader
1204+
end if
1205+
1206+
modem_ctx % comm_current = comm_current
1207+
11911208
sending = .true.
11921209

11931210
task_is_working(:) = .false.
@@ -1204,7 +1221,7 @@ subroutine Master_job_PQMult(nTx, sigma, dsigma, use_starting_guess)
12041221
cycle
12051222
end if
12061223

1207-
do dest = 1, number_of_workers
1224+
do dest = 1, size_leader
12081225
if (task_is_working(dest)) then
12091226
cycle
12101227
end if
@@ -1217,7 +1234,7 @@ subroutine Master_job_PQMult(nTx, sigma, dsigma, use_starting_guess)
12171234

12181235
call create_worker_job_task_place_holder
12191236
call Pack_worker_job_task
1220-
call MPI_Send(worker_job_package, Nbytes, MPI_PACKED, dest, FROM_MASTER, MPI_COMM_WORLD, ierr)
1237+
call MPI_Send(worker_job_package, Nbytes, MPI_PACKED, dest, FROM_MASTER, comm_current, ierr)
12211238
transmitters_processing(iTx) = .true.
12221239
task_is_working(dest) = .true.
12231240
exit
@@ -1226,11 +1243,11 @@ subroutine Master_job_PQMult(nTx, sigma, dsigma, use_starting_guess)
12261243

12271244
! Recv any jobs
12281245
! See if anyone is sending us a message...
1229-
call MPI_Iprobe(MPI_ANY_SOURCE, FROM_WORKER, MPI_COMM_WORLD, flag, MPI_STATUS_IGNORE, ierr)
1246+
call MPI_Iprobe(MPI_ANY_SOURCE, FROM_WORKER, comm_current, flag, MPI_STATUS_IGNORE, ierr)
12301247

12311248
if (flag) then ! Someone is sending us a message
12321249
call create_worker_job_task_place_holder
1233-
call MPI_Recv(worker_job_package, Nbytes, MPI_PACKED, MPI_ANY_SOURCE, FROM_WORKER, MPI_COMM_WORLD, STATUS, ierr)
1250+
call MPI_Recv(worker_job_package, Nbytes, MPI_PACKED, MPI_ANY_SOURCE, FROM_WORKER, comm_current, STATUS, ierr)
12341251
call Unpack_worker_job_task
12351252

12361253
dest = worker_job_task % taskid
@@ -1240,7 +1257,7 @@ subroutine Master_job_PQMult(nTx, sigma, dsigma, use_starting_guess)
12401257
call zero(dsigma_recv(iTx))
12411258

12421259
call create_model_param_place_holder(dsigma_recv(iTx))
1243-
call MPI_Recv(sigma_para_vec, Nbytes, MPI_PACKED, dest, FROM_WORKER, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
1260+
call MPI_Recv(sigma_para_vec, Nbytes, MPI_PACKED, dest, FROM_WORKER, comm_current, MPI_STATUS_IGNORE, ierr)
12441261
call Unpack_model_para_values(dsigma_recv(iTx))
12451262

12461263
transmitters_done(iTx) = .true.
@@ -1775,6 +1792,7 @@ subroutine Master_job_Distribute_Taskes(job_name,nTx,sigma,eAll_out, &
17751792
trial_lcl = .false.
17761793
end if
17771794

1795+
17781796
call get_nPol_MPI(eAll_out%solns(1))
17791797
if (rank_local.eq.-1) then ! first run!
17801798
! run initial regroup -- note this requires the comm to be
@@ -1788,6 +1806,7 @@ subroutine Master_job_Distribute_Taskes(job_name,nTx,sigma,eAll_out, &
17881806
endif
17891807
call MPI_COMM_SIZE( comm_current, size_current, ierr )
17901808
who = 0
1809+
17911810
worker_job_task%what_to_do=trim(job_name)
17921811
worker_job_task%trial=trial_lcl
17931812
call count_number_of_messages_to_RECV(eAll_out)
@@ -1864,6 +1883,7 @@ subroutine Master_job_Distribute_Taskes(job_name,nTx,sigma,eAll_out, &
18641883
which_per=worker_job_task%per_index
18651884
which_pol=worker_job_task%pol_index
18661885

1886+
modem_ctx % comm_current = comm_current
18671887
call EsMgr_get(eAll_out % solns(which_per), which_pol, 1, from=who)
18681888

18691889
write(ioMPI,'(a10,a16,i5,a8,i5,a11,i5)')trim(job_name) , &
@@ -1950,6 +1970,7 @@ subroutine Master_job_Distribute_Taskes(job_name,nTx,sigma,eAll_out, &
19501970

19511971
end subroutine Master_job_Distribute_Taskes
19521972

1973+
19531974
!################## find next job -- from back or front ##################
19541975
Subroutine find_next_job(nTx,total_jobs,counter,fromhead, eAll_out, &
19551976
& per_index,pol_index)
@@ -2054,8 +2075,6 @@ Subroutine Worker_job(sigma,d)
20542075
type (modelParam_t) :: dsigma_temp, dsigma_send, Qcomb
20552076
logical :: trial
20562077

2057-
2058-
20592078
! time
20602079
DOUBLE PRECISION :: time_passed, now, just
20612080
DOUBLE PRECISION, pointer,dimension(:) :: time_buff
@@ -2127,9 +2146,14 @@ Subroutine Worker_job(sigma,d)
21272146
! '; several TX = ',worker_job_task%several_Tx,']'
21282147

21292148
if (trim(worker_job_task%what_to_do) .eq. 'FORWARD') then
2149+
21302150
! forward modelling
21312151
per_index=worker_job_task%per_index
21322152
pol_index=worker_job_task%pol_index
2153+
2154+
modem_ctx % comm_current = comm_current
2155+
modem_ctx % rank_current = rank_current
2156+
21332157
if ((size_local.gt.1).and.(para_method.gt.0).and. &
21342158
& (rank_local.eq.0)) then
21352159
! group leader passing the command to workers
@@ -2195,7 +2219,6 @@ Subroutine Worker_job(sigma,d)
21952219

21962220
start_iTx = worker_job_task % per_index
21972221
end_iTx = worker_job_task % pol_index
2198-
21992222
worker_job_task % taskid = taskid
22002223

22012224
call zero_solnvector(e0)
@@ -2234,7 +2257,7 @@ Subroutine Worker_job(sigma,d)
22342257

22352258
call create_data_vec_place_holder(d, start_iTx=start_iTx, end_iTx=end_iTx)
22362259
call Pack_data_para_vec(d, start_iTx=start_iTx, end_iTx=end_iTx)
2237-
call MPI_Send(data_para_vec, NBytes, MPI_PACKED, 0, FROM_WORKER, MPI_COMM_WORLD, ierr)
2260+
call MPI_Send(data_para_vec, NBytes, MPI_PACKED, 0, FROM_WORKER, comm_current, ierr)
22382261
deallocate(data_para_vec)
22392262
data_para_vec => null()
22402263

@@ -2286,7 +2309,7 @@ Subroutine Worker_job(sigma,d)
22862309

22872310
call create_model_param_place_holder(dsigma_temp)
22882311
call pack_model_para_values(dsigma_temp)
2289-
call MPI_Send(sigma_para_vec, NBytes, MPI_PACKED, 0, FROM_WORKER, MPI_COMM_WORLD, ierr)
2312+
call MPI_Send(sigma_para_vec, NBytes, MPI_PACKED, 0, FROM_WORKER, comm_current, ierr)
22902313
call deall(QComb)
22912314
call deall(dsigma_temp)
22922315
call deall(dsigma_send)
@@ -2764,6 +2787,7 @@ Subroutine Worker_job(sigma,d)
27642787
call Pack_worker_job_task
27652788
call MPI_SEND(worker_job_package,Nbytes, MPI_PACKED,0, &
27662789
& FROM_WORKER, comm_current, ierr)
2790+
27672791
elseif (trim(worker_job_task%what_to_do) .eq. 'REGROUP') then
27682792
! calculate the time between two regroup events
27692793
if ((size_local.gt.1).and.(para_method.gt.0).and. &

0 commit comments

Comments
 (0)