Open
Description
Currently, when we vectorize code in C#, we switch to raw pointers (either managed or unmanaged) and unsafe APIs, which results in a lack of safety/bounds checks. A good example is a function that calculates a sum using SIMD; today we would write it like this:
ref int r = ref MemoryMarshal.GetReference(span);
for (int i = 0; i < span.Length - Vector128<int>.Count; i += Vector128<int>.Count)
{
vSum += Vector128.LoadUnsafe(ref r, (nuint)i);
}
It would be nice to move away from this practice towards fully safe APIs:
-ref int r = ref MemoryMarshal.GetReference(span);
for (int i = 0; i < span.Length - Vector128<int>.Count; i += Vector128<int>.Count)
{
- vSum += Vector128.LoadUnsafe(ref r, (nuint)i);
+ vSum += Vector128.Create(span[i..]);
}
Unfortunately, the JIT is not able to eliminate the safety checks produced by Slice, despite those being redundant in this case. This makes the safer version up to 2x slower; see EgorBot/runtime-utils#226.
Full C# implementation, in which we want to have zero redundant bounds checks:
// Returns the sum of all elements of `span`: whole vectors are accumulated first,
// then the remaining elements are added one at a time.
public static int Sum(ReadOnlySpan<int> span)
{
Vector128<int> vSum = default;
// Main loop: accumulate Vector128<int>.Count elements per iteration.
// The condition uses `<` (not `<=`), so span[i..] always holds more than
// Vector128<int>.Count elements inside the loop. When span.Length is an exact
// multiple of Vector128<int>.Count, the last full vector is left to the
// scalar loop below.
int i;
for (i = 0; i < span.Length - Vector128<int>.Count; i += Vector128<int>.Count)
vSum += Vector128.Create(span[i..]);
// Horizontal sum: collapse the vector accumulator into a single int.
int sum = Vector128.Sum(vSum);
// Trailing elements: `i` continues from where the main loop stopped.
for (; i < span.Length; i++)
sum += span[i];
return sum;
}
Current codegen
; Method Bench:Sum(System.ReadOnlySpan`1[int]):int (FullOpts)
; Review annotations (instructions are unchanged). Register roles as read from the code:
;   rcx  = presumably the address of the span argument ([rcx] = data reference, [rcx+0x08] = Length)
;   eax  = i             edx = span.Length
;   r8d  = Length - 4 during the main loop, then the scalar running sum
;   xmm0 = vSum
; The three checks the surrounding text calls redundant are marked "check #1..#3".
G_M000_IG01: ;; offset=0x0000
push rbx ; rbx is used as scratch below and restored in IG09
sub rsp, 32 ; reserve 32 bytes of stack (released in IG09)
G_M000_IG02: ;; offset=0x0005
vxorps xmm0, xmm0, xmm0 ; vSum = 0
xor eax, eax ; i = 0
mov edx, dword ptr [rcx+0x08] ; edx = span.Length
lea r8d, [rdx-0x04] ; r8d = Length - 4 (Vector128<int>.Count)
test r8d, r8d
jle SHORT G_M000_IG05 ; skip the main loop when Length - 4 <= 0
G_M000_IG03: ;; offset=0x0017
mov r10d, edx ; r10 = zero-extended Length, the limit used by check #1
align [6 bytes for IG04]
G_M000_IG04: ;; offset=0x0020
mov r9d, edx
sub r9d, eax ; r9d = Length - i, i.e. the length of span[i..]
mov r11d, eax ; r11 = zero-extended i
mov ebx, r9d
add rbx, r11 ; rbx = (Length - i) + i, computed in 64 bits
cmp rbx, r10
ja SHORT G_M000_IG11 ; check #1 (slice range): throw if start + length exceeds Length
mov rbx, bword ptr [rcx] ; data reference is reloaded on every iteration
lea r11, bword ptr [rbx+4*r11] ; r11 = address of span[i]
cmp r9d, 4
jl SHORT G_M000_IG10 ; check #2: throw if span[i..] has fewer than 4 elements (presumably from Vector128.Create)
vpaddd xmm0, xmm0, xmmword ptr [r11] ; vSum += the four ints starting at span[i]
add eax, 4 ; i += 4
cmp eax, r8d
jl SHORT G_M000_IG04 ; loop while i < Length - 4 (signed compare)
G_M000_IG05: ;; offset=0x004E
vpsrldq xmm1, xmm0, 8 ; horizontal sum: bring the upper 8 bytes down...
vpaddd xmm0, xmm1, xmm0 ; ...and add them to the lower lanes
vpsrldq xmm1, xmm0, 4 ; shift by one more 4-byte lane...
vpaddd xmm0, xmm1, xmm0 ; ...so the low lane holds the total of all four lanes
vmovd r8d, xmm0 ; r8d = sum
cmp eax, edx
jge SHORT G_M000_IG08 ; no trailing elements when i >= Length
G_M000_IG06: ;; offset=0x0069
mov rbx, bword ptr [rcx] ; rbx = data reference, loaded once for the trailing loop
align [0 bytes for IG07]
G_M000_IG07: ;; offset=0x006C
cmp eax, edx
jae SHORT G_M000_IG12 ; check #3: bounds check for span[i] (unsigned i >= Length fails)
mov ecx, eax ; rcx is reused as the index; the span address is no longer needed
add r8d, dword ptr [rbx+4*rcx] ; sum += span[i]
inc eax ; i++
cmp eax, edx
jl SHORT G_M000_IG07 ; loop while i < Length (signed compare)
G_M000_IG08: ;; offset=0x007C
mov eax, r8d ; return value = sum
G_M000_IG09: ;; offset=0x007F
add rsp, 32
pop rbx
ret
G_M000_IG10: ;; offset=0x0085
; Cold path for check #2.
mov ecx, 6 ; integer argument for the throw helper
call [System.ThrowHelper:ThrowArgumentOutOfRangeException(int)]
int3
G_M000_IG11: ;; offset=0x0091
; Cold path for check #1.
call [System.ThrowHelper:ThrowArgumentOutOfRangeException()]
int3
G_M000_IG12: ;; offset=0x0098
; Cold path for check #3.
call CORINFO_HELP_RNGCHKFAIL
int3
; Total bytes of code: 158
There are other loop shapes typically used with SIMD.