Open
Description
Env: Compiler explorer (https://godbolt.org/z/Yxc6e8sd4)
Version: WebAssembly clang (trunk)
Code:
#include <wasm_simd128.h>
/*
 * Variant 1: multiply-add written with the wasm_simd128.h intrinsics.
 *
 * Loads one 128-bit vector from each of a, b and c, computes a * b + c as a
 * separate wasm_f32x4_mul followed by wasm_f32x4_add, and stores the result
 * to dest.
 *
 * In the report's "Actual" output this stays as f32x4.mul + f32x4.add.
 */
void fma_inst(float * a, float * b, float * c, float * dest) {
v128_t va, vb, vc;
va = wasm_v128_load(a);
vb = wasm_v128_load(b);
vc = wasm_v128_load(c);
// Product is written back into va, then the addend is added in a second step.
va = wasm_f32x4_mul(va, vb);
va = wasm_f32x4_add(va, vc);
wasm_v128_store(dest, va);
}
/*
 * Variant 2: multiply-add written with the vector-extension operators.
 *
 * Same loads and store as fma_inst, but the locals are __f32x4 so that the
 * expression (va * vb) + vc can be written directly with * and +.
 *
 * In the report's "Actual" output this also stays as f32x4.mul + f32x4.add.
 */
void fma_operator(float * a, float * b, float * c, float * dest) {
__f32x4 va, vb, vc;
// The result of wasm_v128_load is assigned straight to a __f32x4 local.
va = wasm_v128_load(a);
vb = wasm_v128_load(b);
vc = wasm_v128_load(c);
va = (va * vb) + vc;
wasm_v128_store(dest, va);
}
/*
 * Variant 3: multiply-add written with __builtin_elementwise_fma.
 *
 * Same loads and store as the other variants; the three __f32x4 operands are
 * passed to the builtin in a single call.
 *
 * In the report's "Actual" output this is the worst case: each lane is
 * extracted, passed to a scalar fmaf call, and re-inserted (four calls total).
 */
void fma_buildin(float * a, float * b, float * c, float * dest) {
__f32x4 va, vb, vc;
va = wasm_v128_load(a);
vb = wasm_v128_load(b);
vc = wasm_v128_load(c);
va = __builtin_elementwise_fma(va, vb, vc);
wasm_v128_store(dest, va);
}
/*
 * Reference variant: multiply-add written with the relaxed-SIMD builtin.
 *
 * Same loads and store as the other variants, with the arithmetic done by
 * __builtin_wasm_relaxed_madd_f32x4(va, vb, vc).
 *
 * The report's "Expected" listing shows this compiling to a single
 * f32x4.relaxed_madd, which is the code the reporter wants for all variants.
 */
void fma_expected(float * a, float * b, float * c, float * dest) {
__f32x4 va, vb, vc;
va = wasm_v128_load(a);
vb = wasm_v128_load(b);
vc = wasm_v128_load(c);
va = __builtin_wasm_relaxed_madd_f32x4(va, vb, vc);
wasm_v128_store(dest, va);
}
Flags: -O3 -msimd128 -mrelaxed-simd -ffast-math
Expected:
All four implementations should compile to the same code as the explicit intrinsic version (fma_expected), i.e. a single f32x4.relaxed_madd:
fma_expected:
local.get 3
local.get 0
v128.load 0:p2align=0
local.get 1
v128.load 0:p2align=0
local.get 2
v128.load 0:p2align=0
f32x4.relaxed_madd
v128.store 0:p2align=0
end_function
Actual:
fma_inst:
local.get 3
local.get 1
v128.load 0:p2align=0
local.get 0
v128.load 0:p2align=0
f32x4.mul
local.get 2
v128.load 0:p2align=0
f32x4.add
v128.store 0:p2align=0
end_function
fma_operator:
local.get 3
local.get 1
v128.load 0:p2align=0
local.get 0
v128.load 0:p2align=0
f32x4.mul
local.get 2
v128.load 0:p2align=0
f32x4.add
v128.store 0:p2align=0
end_function
fma_buildin:
local.get 3
local.get 0
v128.load 0:p2align=0
local.tee 4
f32x4.extract_lane 0
local.get 1
v128.load 0:p2align=0
local.tee 5
f32x4.extract_lane 0
local.get 2
v128.load 0:p2align=0
local.tee 6
f32x4.extract_lane 0
call fmaf
f32x4.splat
local.get 4
f32x4.extract_lane 1
local.get 5
f32x4.extract_lane 1
local.get 6
f32x4.extract_lane 1
call fmaf
f32x4.replace_lane 1
local.get 4
f32x4.extract_lane 2
local.get 5
f32x4.extract_lane 2
local.get 6
f32x4.extract_lane 2
call fmaf
f32x4.replace_lane 2
local.get 4
f32x4.extract_lane 3
local.get 5
f32x4.extract_lane 3
local.get 6
f32x4.extract_lane 3
call fmaf
f32x4.replace_lane 3
v128.store 0:p2align=0
end_function
Other info:
The same optimizations apply on x86 (https://godbolt.org/z/jYKMEq4rM) and arm64 (https://godbolt.org/z/z5z4fdd7M).