Open
Description
Having the following code (tests matrix multiplication speed in stupid way):
program matrix_multiply
use omp_lib
implicit none
real(8), allocatable :: A(:,:), B(:,:), C(:,:)
integer(4) :: i, j, k, nbfcn4, df4
integer(8) :: nbfcn, df
integer :: ierr, num_threads
integer(8) :: start_time, end_time, elapsed_time
integer(8) :: count,count_rate,clock_rate
real(8) :: seconds_elapsed, peak, actual
real(8) :: ts1, ts2
nbfcn4 = 635
df4 = 8645
nbfcn = nbfcn4
df = df4
allocate(A(nbfcn,nbfcn), B(df,nbfcn), C(nbfcn,df), source = 0d0)
call random_seed()
call random_number(A)
call random_number(B)
call dgemm('t','t',nbfcn,df,nbfcn,1.d0,A,nbfcn,B,df,0.d0,C,nbfcn)
call SYSTEM_CLOCK(count=start_time, count_rate=clock_rate)
call cpu_time(ts1)
do i = 1, 100
call dgemm('t','t',nbfcn,df,nbfcn,1.d0,A,nbfcn,B,df,0.d0,C,nbfcn)
end do
call SYSTEM_CLOCK(count=end_time)
call cpu_time(ts2)
elapsed_time = end_time - start_time
seconds_elapsed = real(elapsed_time) / real(clock_rate)
peak = seconds_elapsed * omp_get_max_threads() * 2.45_8 * 1e9_8 * 16._8
actual = real(2 * nbfcn*nbfcn * df, kind=8) * 100
write(*,"(F6.2,A,F6.2,' ',I0)") seconds_elapsed, "s ", actual / peak * 100._8, omp_get_max_threads()
seconds_elapsed = ts2 - ts1
peak = seconds_elapsed * omp_get_max_threads() * 2.45_8 * 1e9_8 * 16._8
actual = real(2 * nbfcn*nbfcn * df, kind=8) * 100
write(*,"(F6.2,A,F6.2,' ',I0)") seconds_elapsed, "s ", actual / peak * 100._8, omp_get_max_threads()
! time * cores * freq * ops
end program matrix_multiply
For current flang, it gives me the same number for CPU_TIME calls and for SYSTEM_CLOCK calls.
Example output:
LD_LIBRARY_PATH=/home/lokalgi/test/llvm-install/lib:D_LIBRARY_PATH OMP_NUM_THREADS=8 numactl -C 24-31 ./MM2.libblis-mt.orig.f18
23.23s 9.57 8
23.23s 9.57 8
At the same time, GCC produces different timings:
OMP_NUM_THREADS=8 numactl -C 24-31 ./MM2.libblis-mt.orig.gcc
2.14s 103.76 8
17.04s 13.05 8
Expected
- SYSTEM_CLOCK takes system time (not happens)
- CPU_TIME takes total CPU time (works now)