Skip to content

update_u flop calculation #67

Description

@mneilly

I'm working through the code for su3_rhmd_hisq and calculating flops. I'm currently looking at the main loop in update_u().

FORALLSITES_OMP(i,s,private(dir,link,temp1,temp2,htemp)) {
for(dir=XUP; dir <=TUP; dir++){
uncompress_anti_hermitian( &(s->mom[dir]) , &htemp );
link = &(s->link[dir]);
mult_su3_nn(&htemp,link,&temp1);
scalar_mult_add_su3_matrix(link,&temp1,t8,&temp2);
mult_su3_nn(&htemp,&temp2,&temp1);
scalar_mult_add_su3_matrix(link,&temp1,t7,&temp2);
mult_su3_nn(&htemp,&temp2,&temp1);
scalar_mult_add_su3_matrix(link,&temp1,t6,&temp2);
mult_su3_nn(&htemp,&temp2,&temp1);
scalar_mult_add_su3_matrix(link,&temp1,t5,&temp2);
mult_su3_nn(&htemp,&temp2,&temp1);
scalar_mult_add_su3_matrix(link,&temp1,t4,&temp2);
mult_su3_nn(&htemp,&temp2,&temp1);
scalar_mult_add_su3_matrix(link,&temp1,t3,&temp2);
mult_su3_nn(&htemp,&temp2,&temp1);
scalar_mult_add_su3_matrix(link,&temp1,t2,&temp2);
mult_su3_nn(&htemp,&temp2,&temp1);
scalar_mult_add_su3_matrix(link,&temp1,eps ,&temp2);
su3mat_copy(&temp2,link);
}
} END_LOOP_OMP

It appears that the reported Mflops/second is based on a flop count of 5616.

node0_printf("LINK_UPDATE: time = %e mflops = %e\n",
dtime, (double)(5616.0*volume/(1.0e6*dtime*numnodes())) );

If I calculate flops myself I get 7488 not 5616.

For a complex 3x3 matrix multiply I'm expecting:

flops = 6*n^3 + 2*n^2*(n-1) = 6*3^3 + 2*3^2*(3-1) = 198

and for the scalar multiply add I'm expecting:

flops = 2 * 2*n^2 = 2 * 2*3^2 = 36

Since matmul and scalar mul plus add occur 8 times each and there are 4 directions:

flops = 4 * 8 * (198 + 36) = 7488

5616 happens to be exactly 3/4 of 7488.

Saving temps and looking at the preprocessed output seems to confirm my count:

For the matmuls there are 36 multiplies and 30 add/sub in each of 3 loops so 198 flops.

int iii;
for (iii = 0; iii < 3; iii++)
{
    Real c0r, c0i, c1r, c1i, c2r, c2i;
    Real aikr, aiki;
    aikr = (((fsu3_matrix *)(&htemp))->e[iii][0].real);
    aiki = (((fsu3_matrix *)(&htemp))->e[iii][0].imag);
    c0r = aikr * (((fsu3_matrix *)(link))->e[0][0].real);
    c0i = aikr * (((fsu3_matrix *)(link))->e[0][0].imag);
    c1r = aikr * (((fsu3_matrix *)(link))->e[0][1].real);
    c1i = aikr * (((fsu3_matrix *)(link))->e[0][1].imag);
    c2r = aikr * (((fsu3_matrix *)(link))->e[0][2].real);
    c2i = aikr * (((fsu3_matrix *)(link))->e[0][2].imag);
    c0r -= aiki * (((fsu3_matrix *)(link))->e[0][0].imag);
    c0i += aiki * (((fsu3_matrix *)(link))->e[0][0].real);
    c1r -= aiki * (((fsu3_matrix *)(link))->e[0][1].imag);
    c1i += aiki * (((fsu3_matrix *)(link))->e[0][1].real);
    c2r -= aiki * (((fsu3_matrix *)(link))->e[0][2].imag);
    c2i += aiki * (((fsu3_matrix *)(link))->e[0][2].real);
    aikr = (((fsu3_matrix *)(&htemp))->e[iii][1].real);
    aiki = (((fsu3_matrix *)(&htemp))->e[iii][1].imag);
    c0r += aikr * (((fsu3_matrix *)(link))->e[1][0].real);
    c0i += aikr * (((fsu3_matrix *)(link))->e[1][0].imag);
    c1r += aikr * (((fsu3_matrix *)(link))->e[1][1].real);
    c1i += aikr * (((fsu3_matrix *)(link))->e[1][1].imag);
    c2r += aikr * (((fsu3_matrix *)(link))->e[1][2].real);
    c2i += aikr * (((fsu3_matrix *)(link))->e[1][2].imag);
    c0r -= aiki * (((fsu3_matrix *)(link))->e[1][0].imag);
    c0i += aiki * (((fsu3_matrix *)(link))->e[1][0].real);
    c1r -= aiki * (((fsu3_matrix *)(link))->e[1][1].imag);
    c1i += aiki * (((fsu3_matrix *)(link))->e[1][1].real);
    c2r -= aiki * (((fsu3_matrix *)(link))->e[1][2].imag);
    c2i += aiki * (((fsu3_matrix *)(link))->e[1][2].real);
    aikr = (((fsu3_matrix *)(&htemp))->e[iii][2].real);
    aiki = (((fsu3_matrix *)(&htemp))->e[iii][2].imag);
    c0r += aikr * (((fsu3_matrix *)(link))->e[2][0].real);
    c0i += aikr * (((fsu3_matrix *)(link))->e[2][0].imag);
    c1r += aikr * (((fsu3_matrix *)(link))->e[2][1].real);
    c1i += aikr * (((fsu3_matrix *)(link))->e[2][1].imag);
    c2r += aikr * (((fsu3_matrix *)(link))->e[2][2].real);
    c2i += aikr * (((fsu3_matrix *)(link))->e[2][2].imag);
    c0r -= aiki * (((fsu3_matrix *)(link))->e[2][0].imag);
    c0i += aiki * (((fsu3_matrix *)(link))->e[2][0].real);
    c1r -= aiki * (((fsu3_matrix *)(link))->e[2][1].imag);
    c1i += aiki * (((fsu3_matrix *)(link))->e[2][1].real);
    c2r -= aiki * (((fsu3_matrix *)(link))->e[2][2].imag);
    c2i += aiki * (((fsu3_matrix *)(link))->e[2][2].real);
    (((fsu3_matrix *)(&temp1))->e[iii][0].real) = c0r;
    (((fsu3_matrix *)(&temp1))->e[iii][0].imag) = c0i;
    (((fsu3_matrix *)(&temp1))->e[iii][1].real) = c1r;
    (((fsu3_matrix *)(&temp1))->e[iii][1].imag) = c1i;
    (((fsu3_matrix *)(&temp1))->e[iii][2].real) = c2r;
    (((fsu3_matrix *)(&temp1))->e[iii][2].imag) = c2i;
}

And for the scalar multiply adds there are 6 adds and 6 multiples in each of 3 loop iterations for 36 flops.

Real _temp = t8;
{
    register fsu3_matrix *aaa, *bbb, *ccc;
    register Real sss;
    register int iii;
    aaa = (link);
    bbb = (&temp1);
    ccc = (&temp2);
    sss = (_temp);
    for (iii = 0; iii < 3; iii++)
    {
        (ccc)->e[iii][0].real = (aaa)->e[iii][0].real + (sss) * (bbb)->e[iii][0].real;
        (ccc)->e[iii][0].imag = (aaa)->e[iii][0].imag + (sss) * (bbb)->e[iii][0].imag;
        (ccc)->e[iii][1].real = (aaa)->e[iii][1].real + (sss) * (bbb)->e[iii][1].real;
        (ccc)->e[iii][1].imag = (aaa)->e[iii][1].imag + (sss) * (bbb)->e[iii][1].imag;
        (ccc)->e[iii][2].real = (aaa)->e[iii][2].real + (sss) * (bbb)->e[iii][2].real;
        (ccc)->e[iii][2].imag = (aaa)->e[iii][2].imag + (sss) * (bbb)->e[iii][2].imag;
    }
};

I'm wondering what accounts for the lower count of 5616...

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions