Skip to content

Can slicing a Memory<T> be made faster by the JIT by eliding the call to write barrier #12414

Open
@ahsonkhan

Description

@ahsonkhan

Slicing a Memory<T> does not modify the underlying object it points to; it only changes the index and length. However, slicing does create a new Memory<T> value, and hence the object field has to be stored again.

This makes memory slicing slower, since the JIT injects a call to the write barrier for that store. It also means methods like the one below are no longer leaf methods, and hence a stack frame gets added as well.

/// <summary>
/// Adds <paramref name="count"/> to the running <c>_buffered</c> total and moves
/// <c>_buffer</c> forward by the same amount by replacing it with a slice of itself.
/// </summary>
/// <param name="count">
/// Amount to advance by. Asserted to be non-negative and small enough that
/// <c>_buffered + count</c> does not exceed <see cref="int.MaxValue"/>.
/// </param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void Advance(int count)
{
    // Written as "_buffered <= int.MaxValue - count" so the overflow check itself cannot overflow.
    Debug.Assert(count >= 0 && _buffered <= int.MaxValue - count);
    _buffered += count;

    // Unsafe, do not do this.
    // (Kept for reference: the reinterpret-cast variant that was used to measure slicing without
    // the write barrier. It treats _buffer as a Memory<byte>.)
    //UnsafeMemory<byte> temp = Unsafe.As<Memory<byte>, UnsafeMemory<byte>>(ref _buffer).Slice(count);
    //_buffer = Unsafe.As<UnsafeMemory<byte>, Memory<byte>>(ref temp);

    // This field store is the one the issue is about: the sliced value is written back to _buffer,
    // which includes re-storing its object reference.
    _buffer = _buffer.Slice(count);
}

Can the JIT elide the call to the write barrier here? Are there concerns with the GC moving the object?

From @AndyAyersMS:

Though perhaps it doesn't matter as the pointer the jit has and the pointer the struct has should have the same value whether or not GC happens in between

Disassembly:
https://www.diffchecker.com/zbiwhYXM
Note the call here:
call coreclr!coreclr_shutdown_2+0xc910

I believe this is CoreCLR!JIT_WriteBarrier

00007ff9`65e76820 System.Text.Json.Perf_MemSlice.MemSlice()
00007ff9`65e76820 57              push    rdi
00007ff9`65e76821 56              push    rsi
00007ff9`65e76822 53              push    rbx
00007ff9`65e76823 4883ec20        sub     rsp,20h
00007ff9`65e76827 488bf2          mov     rsi,rdx
00007ff9`65e7682a 3909            cmp     dword ptr [rcx],ecx
00007ff9`65e7682c 4883c110        add     rcx,10h
00007ff9`65e76830 8b790c          mov     edi,dword ptr [rcx+0Ch]
00007ff9`65e76833 83ff01          cmp     edi,1
00007ff9`65e76836 7262            jb      00007ff9`65e7689a
00007ff9`65e76838 488b11          mov     rdx,qword ptr [rcx]
00007ff9`65e7683b 8b5908          mov     ebx,dword ptr [rcx+8]
00007ff9`65e7683e ffc3            inc     ebx
00007ff9`65e76840 ffcf            dec     edi
00007ff9`65e76842 83ff01          cmp     edi,1
00007ff9`65e76845 725e            jb      00007ff9`65e768a5
00007ff9`65e76847 ffc3            inc     ebx
00007ff9`65e76849 ffcf            dec     edi
00007ff9`65e7684b 83ff01          cmp     edi,1
00007ff9`65e7684e 7260            jb      00007ff9`65e768b0
00007ff9`65e76850 ffc3            inc     ebx
00007ff9`65e76852 ffcf            dec     edi
00007ff9`65e76854 83ff01          cmp     edi,1
00007ff9`65e76857 7262            jb      00007ff9`65e768bb
00007ff9`65e76859 ffc3            inc     ebx
00007ff9`65e7685b ffcf            dec     edi
00007ff9`65e7685d 83ff01          cmp     edi,1
00007ff9`65e76860 7264            jb      00007ff9`65e768c6
00007ff9`65e76862 ffc3            inc     ebx
00007ff9`65e76864 ffcf            dec     edi
00007ff9`65e76866 83ff01          cmp     edi,1
00007ff9`65e76869 7266            jb      00007ff9`65e768d1
00007ff9`65e7686b ffc3            inc     ebx
00007ff9`65e7686d ffcf            dec     edi
00007ff9`65e7686f 83ff01          cmp     edi,1
00007ff9`65e76872 7268            jb      00007ff9`65e768dc
00007ff9`65e76874 ffc3            inc     ebx
00007ff9`65e76876 ffcf            dec     edi
00007ff9`65e76878 83ff01          cmp     edi,1
00007ff9`65e7687b 726a            jb      00007ff9`65e768e7
00007ff9`65e7687d ffc3            inc     ebx
00007ff9`65e7687f ffcf            dec     edi
00007ff9`65e76881 488bce          mov     rcx,rsi
00007ff9`65e76884 e887dd8e5f      call    coreclr!coreclr_shutdown_2+0xc910 (00007ff9`c5764610)
00007ff9`65e76889 895e08          mov     dword ptr [rsi+8],ebx
00007ff9`65e7688c 897e0c          mov     dword ptr [rsi+0Ch],edi
00007ff9`65e7688f 488bc6          mov     rax,rsi
00007ff9`65e76892 4883c420        add     rsp,20h
00007ff9`65e76896 5b              pop     rbx
00007ff9`65e76897 5e              pop     rsi
00007ff9`65e76898 5f              pop     rdi
00007ff9`65e76899 c3              ret
00007ff9`65e56af0 System.Text.Json.Perf_MemSlice.MemUnsafeSlice()
00007ff9`65e56af0 4883ec28        sub     rsp,28h
00007ff9`65e56af4 90              nop
00007ff9`65e56af5 3909            cmp     dword ptr [rcx],ecx
00007ff9`65e56af7 4883c110        add     rcx,10h
00007ff9`65e56afb 8b410c          mov     eax,dword ptr [rcx+0Ch]
00007ff9`65e56afe 83f801          cmp     eax,1
00007ff9`65e56b01 725a            jb      00007ff9`65e56b5d
00007ff9`65e56b03 4c8b01          mov     r8,qword ptr [rcx]
00007ff9`65e56b06 8b4908          mov     ecx,dword ptr [rcx+8]
00007ff9`65e56b09 ffc1            inc     ecx
00007ff9`65e56b0b ffc8            dec     eax
00007ff9`65e56b0d 83f801          cmp     eax,1
00007ff9`65e56b10 7251            jb      00007ff9`65e56b63
00007ff9`65e56b12 ffc1            inc     ecx
00007ff9`65e56b14 ffc8            dec     eax
00007ff9`65e56b16 83f801          cmp     eax,1
00007ff9`65e56b19 724e            jb      00007ff9`65e56b69
00007ff9`65e56b1b ffc1            inc     ecx
00007ff9`65e56b1d ffc8            dec     eax
00007ff9`65e56b1f 83f801          cmp     eax,1
00007ff9`65e56b22 724b            jb      00007ff9`65e56b6f
00007ff9`65e56b24 ffc1            inc     ecx
00007ff9`65e56b26 ffc8            dec     eax
00007ff9`65e56b28 83f801          cmp     eax,1
00007ff9`65e56b2b 7248            jb      00007ff9`65e56b75
00007ff9`65e56b2d ffc1            inc     ecx
00007ff9`65e56b2f ffc8            dec     eax
00007ff9`65e56b31 83f801          cmp     eax,1
00007ff9`65e56b34 7245            jb      00007ff9`65e56b7b
00007ff9`65e56b36 ffc1            inc     ecx
00007ff9`65e56b38 ffc8            dec     eax
00007ff9`65e56b3a 83f801          cmp     eax,1
00007ff9`65e56b3d 7242            jb      00007ff9`65e56b81
00007ff9`65e56b3f ffc1            inc     ecx
00007ff9`65e56b41 ffc8            dec     eax
00007ff9`65e56b43 83f801          cmp     eax,1
00007ff9`65e56b46 723f            jb      00007ff9`65e56b87
00007ff9`65e56b48 ffc1            inc     ecx
00007ff9`65e56b4a ffc8            dec     eax
00007ff9`65e56b4c 4c8902          mov     qword ptr [rdx],r8
00007ff9`65e56b4f 894a08          mov     dword ptr [rdx+8],ecx
00007ff9`65e56b52 89420c          mov     dword ptr [rdx+0Ch],eax
00007ff9`65e56b55 488bc2          mov     rax,rdx
00007ff9`65e56b58 4883c428        add     rsp,28h
00007ff9`65e56b5c c3              ret
00007ff9`65e46af0 System.Text.Json.Perf_MemSlice.SpanSlice()
00007ff9`65e46af0 4883ec28        sub     rsp,28h
00007ff9`65e46af4 90              nop
00007ff9`65e46af5 488b4108        mov     rax,qword ptr [rcx+8]
00007ff9`65e46af9 4885c0          test    rax,rax
00007ff9`65e46afc 7465            je      00007ff9`65e46b63
00007ff9`65e46afe 8b4808          mov     ecx,dword ptr [rax+8]
00007ff9`65e46b01 83f901          cmp     ecx,1
00007ff9`65e46b04 7263            jb      00007ff9`65e46b69
00007ff9`65e46b06 4883c010        add     rax,10h
00007ff9`65e46b0a ffc9            dec     ecx
00007ff9`65e46b0c 48ffc0          inc     rax
00007ff9`65e46b0f 83f901          cmp     ecx,1
00007ff9`65e46b12 725b            jb      00007ff9`65e46b6f
00007ff9`65e46b14 ffc9            dec     ecx
00007ff9`65e46b16 48ffc0          inc     rax
00007ff9`65e46b19 83f901          cmp     ecx,1
00007ff9`65e46b1c 7257            jb      00007ff9`65e46b75
00007ff9`65e46b1e ffc9            dec     ecx
00007ff9`65e46b20 48ffc0          inc     rax
00007ff9`65e46b23 83f901          cmp     ecx,1
00007ff9`65e46b26 7253            jb      00007ff9`65e46b7b
00007ff9`65e46b28 ffc9            dec     ecx
00007ff9`65e46b2a 48ffc0          inc     rax
00007ff9`65e46b2d 83f901          cmp     ecx,1
00007ff9`65e46b30 724f            jb      00007ff9`65e46b81
00007ff9`65e46b32 ffc9            dec     ecx
00007ff9`65e46b34 48ffc0          inc     rax
00007ff9`65e46b37 83f901          cmp     ecx,1
00007ff9`65e46b3a 724b            jb      00007ff9`65e46b87
00007ff9`65e46b3c ffc9            dec     ecx
00007ff9`65e46b3e 48ffc0          inc     rax
00007ff9`65e46b41 83f901          cmp     ecx,1
00007ff9`65e46b44 7247            jb      00007ff9`65e46b8d
00007ff9`65e46b46 ffc9            dec     ecx
00007ff9`65e46b48 48ffc0          inc     rax
00007ff9`65e46b4b 83f901          cmp     ecx,1
00007ff9`65e46b4e 7243            jb      00007ff9`65e46b93
00007ff9`65e46b50 ffc9            dec     ecx
00007ff9`65e46b52 48ffc0          inc     rax
00007ff9`65e46b55 488902          mov     qword ptr [rdx],rax
00007ff9`65e46b58 894a08          mov     dword ptr [rdx+8],ecx
00007ff9`65e46b5b 488bc2          mov     rax,rdx
00007ff9`65e46b5e 4883c428        add     rsp,28h
00007ff9`65e46b62 c3              ret

Benchmark:

    /// <summary>
    /// Micro-benchmarks comparing eight consecutive one-element slices performed three ways:
    /// on <see cref="Memory{T}"/>, on <c>UnsafeMemory&lt;byte&gt;</c> (a reinterpreted view of the
    /// same <see cref="Memory{T}"/> field), and on <see cref="Span{T}"/>.
    /// </summary>
    /// <remarks>
    /// The slices are written out one by one rather than in a loop; the disassembly shown in the
    /// issue corresponds to these exact statement sequences.
    /// NOTE(review): the [GlobalSetup]/[Benchmark] attributes look like BenchmarkDotNet — confirm.
    /// </remarks>
    public class Perf_MemSlice
    {
        // Backing storage shared by all three benchmarks.
        byte[] _array;
        // Memory<byte> over the whole of _array; the source for the two Memory-based benchmarks.
        Memory<byte> _memory;

        /// <summary>
        /// Allocates the 128-byte backing array and wraps all of it in <c>_memory</c>.
        /// </summary>
        [GlobalSetup]
        public void Setup()
        {
            _array = new byte[128];
            _memory = _array;
        }

        /// <summary>
        /// Baseline: slices <c>_memory</c> by one element eight times using
        /// <see cref="Memory{T}.Slice(int)"/>.
        /// </summary>
        /// <returns>The memory remaining after eight one-element slices (120 of the 128 bytes).</returns>
        [Benchmark]
        public Memory<byte> MemSlice()
        {
            Memory<byte> memory = _memory.Slice(1);
            memory = memory.Slice(1);
            memory = memory.Slice(1);
            memory = memory.Slice(1);

            memory = memory.Slice(1);
            memory = memory.Slice(1);
            memory = memory.Slice(1);
            memory = memory.Slice(1);
            return memory;
        }

        /// <summary>
        /// Same eight slices as <see cref="MemSlice"/>, but performed through
        /// <c>UnsafeMemory&lt;byte&gt;</c>: the <c>_memory</c> field is reinterpreted in place,
        /// sliced, and the result is reinterpreted back to <see cref="Memory{T}"/>.
        /// </summary>
        /// <returns>The final slice, reinterpreted as a <see cref="Memory{T}"/> of bytes.</returns>
        [Benchmark]
        public Memory<byte> MemUnsafeSlice()
        {
            // Reinterprets the field by reference; this relies on UnsafeMemory<byte> having the
            // same field layout as Memory<byte>.
            UnsafeMemory<byte> memory = Unsafe.As<Memory<byte>, UnsafeMemory<byte>>(ref _memory).Slice(1);
            memory = memory.Slice(1);
            memory = memory.Slice(1);
            memory = memory.Slice(1);

            memory = memory.Slice(1);
            memory = memory.Slice(1);
            memory = memory.Slice(1);
            memory = memory.Slice(1);
            return Unsafe.As<UnsafeMemory<byte>, Memory<byte>>(ref memory);
        }

        /// <summary>
        /// Span comparison: takes a span over <c>_array</c> starting at index 1, then slices it by
        /// one element seven more times (eight one-element advances in total).
        /// </summary>
        /// <returns>The span remaining after the eight advances (120 of the 128 bytes).</returns>
        [Benchmark]
        public Span<byte> SpanSlice()
        {
            Span<byte> span = _array.AsSpan(1);
            span = span.Slice(1);
            span = span.Slice(1);
            span = span.Slice(1);

            span = span.Slice(1);
            span = span.Slice(1);
            span = span.Slice(1);
            span = span.Slice(1);
            return span;
        }
    }

    // Unsafe hack to measure what perf we can get if we don't have the write barrier.
    // Courtesy of Levi
    //
    // Holds an object/index/length triple like the one being sliced in the benchmark, except the
    // object is kept as an IntPtr rather than an object reference. The benchmark obtains instances
    // by reinterpreting a Memory<byte> with Unsafe.As, so the field order and sizes here must
    // line up with Memory<T>'s.
    // NOTE(review): an IntPtr field is not a GC-tracked reference, so this is for measurement
    // only and must not be used in real code.
    public struct UnsafeMemory<T>
    {
        // Raw bits of the object reference taken from the reinterpreted Memory<T>.
        private readonly IntPtr _object;
        // Start offset of this view.
        private readonly int _index;
        // Number of elements in this view.
        private readonly int _length;

        /// <summary>
        /// Returns a new view that begins <paramref name="start"/> elements further in and is
        /// correspondingly shorter. The object field is copied unchanged.
        /// </summary>
        /// <param name="start">Number of elements to skip; must be in the range [0, length].</param>
        /// <returns>The sliced view.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="start"/> is negative or greater than the current length.
        /// </exception>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public UnsafeMemory<T> Slice(int start)
        {
            // A single unsigned comparison rejects both negative values (which become large
            // unsigned numbers) and values past the end.
            if ((uint)start > (uint)_length)
            {
                ThrowArgumentOutOfRangeException();
            }

            // It is expected for _index + start to be negative if the memory is already pre-pinned.
            return new UnsafeMemory<T>(_object, _index + start, _length - start);
        }

        /// <summary>
        /// Stores the three fields as given, with no checks.
        /// </summary>
        /// <param name="obj">Value for the object field.</param>
        /// <param name="start">Value for the index field.</param>
        /// <param name="length">Value for the length field.</param>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal UnsafeMemory(IntPtr obj, int start, int length)
        {
            // No validation performed in release builds; caller must provide any necessary validation.

            _object = obj;
            _index = start;
            _length = length;
        }

        /// <summary>
        /// Throw helper: the <c>throw</c> lives in its own method so that <see cref="Slice"/>
        /// only contains a call on its failure path.
        /// </summary>
        /// <exception cref="ArgumentOutOfRangeException">Always thrown.</exception>
        internal static void ThrowArgumentOutOfRangeException()
        {
            throw new ArgumentOutOfRangeException();
        }
    }

image

cc @AndyAyersMS, @CarolEidt, @davidfowl, @jkotas

category:cq
theme:barriers
skill-level:expert
cost:large

Metadata

Metadata

Assignees

No one assigned

    Labels

    JitUntriaged (CLR JIT issues needing additional triage), area-CodeGen-coreclr (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI), tenet-performance (Performance related issue)

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions