Skip to content

Fma not optimized for wasm relaxed-simd #121311

Open
@kzhsw

Description

@kzhsw

Env: Compiler explorer (https://godbolt.org/z/Yxc6e8sd4)
Version: WebAssembly clang (trunk)

Code:

Code

#include <wasm_simd128.h>

/*
 * Variant 1: multiply-add written with the wasm_simd128.h intrinsics.
 *
 * Loads one 128-bit vector from each of a, b and c, computes a * b with
 * wasm_f32x4_mul, adds c with wasm_f32x4_add, and stores the result to dest.
 * In this report the compiler emits separate f32x4.mul and f32x4.add
 * instructions for this function (see the "Actual" listing).
 */
void fma_inst(float * a, float * b, float * c, float * dest)  {
    v128_t va, vb, vc;
    va = wasm_v128_load(a);
    vb = wasm_v128_load(b);
    vc = wasm_v128_load(c);
    /* Multiply and add are kept as two separate intrinsic calls on purpose:
       the report asks for them to be fused by the optimizer. */
    va = wasm_f32x4_mul(va, vb);
    va = wasm_f32x4_add(va, vc);
    wasm_v128_store(dest, va);
}

/*
 * Variant 2: multiply-add written with the C vector operators.
 *
 * Same load / compute / store sequence as fma_inst, but the arithmetic is
 * expressed as (va * vb) + vc on __f32x4 values instead of intrinsic calls.
 * In this report it compiles to the same unfused f32x4.mul + f32x4.add pair
 * (see the "Actual" listing).
 */
void fma_operator(float * a, float * b, float * c, float * dest)  {
    /* NOTE(review): the locals are __f32x4 while the load/store intrinsics are
       the same ones used with v128_t in fma_inst, so this presumably relies on
       the compiler accepting the vector-type conversion -- confirm. */
    __f32x4 va, vb, vc;
    va = wasm_v128_load(a);
    vb = wasm_v128_load(b);
    vc = wasm_v128_load(c);
    va = (va * vb) + vc;
    wasm_v128_store(dest, va);
}

/*
 * Variant 3: multiply-add written with __builtin_elementwise_fma.
 *
 * Loads a, b and c as in the other variants and passes all three vectors to
 * __builtin_elementwise_fma, then stores the result to dest.
 * In this report the builtin is not lowered to a vector instruction: the
 * "Actual" listing shows each of the four lanes being extracted and passed to
 * a separate call to fmaf.
 */
void fma_buildin(float * a, float * b, float * c, float * dest)  {
    __f32x4 va, vb, vc;
    va = wasm_v128_load(a);
    vb = wasm_v128_load(b);
    vc = wasm_v128_load(c);
    va = __builtin_elementwise_fma(va, vb, vc);
    wasm_v128_store(dest, va);
}

/*
 * Reference variant: multiply-add written with the explicit relaxed-simd
 * builtin.
 *
 * Loads a, b and c as in the other variants and calls
 * __builtin_wasm_relaxed_madd_f32x4(va, vb, vc), then stores the result to
 * dest. The "Expected" listing shows this compiling to a single
 * f32x4.relaxed_madd instruction, which is the code the report wants the
 * other three variants to produce as well.
 */
void fma_expected(float * a, float * b, float * c, float * dest)  {
    __f32x4 va, vb, vc;
    va = wasm_v128_load(a);
    vb = wasm_v128_load(b);
    vc = wasm_v128_load(c);
    va = __builtin_wasm_relaxed_madd_f32x4(va, vb, vc);
    wasm_v128_store(dest, va);
}

Flags: -O3 -msimd128 -mrelaxed-simd -ffast-math

Expected:
All implementations should be optimized to the same code as the one using the intrinsic:

fma_expected:
        local.get       3
        local.get       0
        v128.load       0:p2align=0
        local.get       1
        v128.load       0:p2align=0
        local.get       2
        v128.load       0:p2align=0
        f32x4.relaxed_madd
        v128.store      0:p2align=0
        end_function

Actual:

fma_inst:
        local.get       3
        local.get       1
        v128.load       0:p2align=0
        local.get       0
        v128.load       0:p2align=0
        f32x4.mul
        local.get       2
        v128.load       0:p2align=0
        f32x4.add
        v128.store      0:p2align=0
        end_function

fma_operator:
        local.get       3
        local.get       1
        v128.load       0:p2align=0
        local.get       0
        v128.load       0:p2align=0
        f32x4.mul
        local.get       2
        v128.load       0:p2align=0
        f32x4.add
        v128.store      0:p2align=0
        end_function

fma_buildin:
        local.get       3
        local.get       0
        v128.load       0:p2align=0
        local.tee       4
        f32x4.extract_lane      0
        local.get       1
        v128.load       0:p2align=0
        local.tee       5
        f32x4.extract_lane      0
        local.get       2
        v128.load       0:p2align=0
        local.tee       6
        f32x4.extract_lane      0
        call    fmaf
        f32x4.splat
        local.get       4
        f32x4.extract_lane      1
        local.get       5
        f32x4.extract_lane      1
        local.get       6
        f32x4.extract_lane      1
        call    fmaf
        f32x4.replace_lane      1
        local.get       4
        f32x4.extract_lane      2
        local.get       5
        f32x4.extract_lane      2
        local.get       6
        f32x4.extract_lane      2
        call    fmaf
        f32x4.replace_lane      2
        local.get       4
        f32x4.extract_lane      3
        local.get       5
        f32x4.extract_lane      3
        local.get       6
        f32x4.extract_lane      3
        call    fmaf
        f32x4.replace_lane      3
        v128.store      0:p2align=0
        end_function

Other info:
The same optimizations are applied on x86 (https://godbolt.org/z/jYKMEq4rM) and on arm64 (https://godbolt.org/z/z5z4fdd7M).

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions