Skip to content

Move reftracer call from _Py_Dealloc to non-inlined function call #130706

Open
@colesbury

Description

@colesbury

Even with PGO and LTO, the reftracer call in _Py_Dealloc causes a bunch of extra register spills on x86-64, especially in the free threading build:

cpython/Objects/object.c

Lines 3003 to 3004 in b545450

_PyReftracerTrack(op, PyRefTracer_DESTROY);
(*dealloc)(op);

The free threading build calls _Py_MergeZeroLocalRefcount(), into which the compiler inlines _Py_Dealloc:

Dump of assembler code for function _Py_MergeZeroLocalRefcount:
   0x00000000001a8e50 <+0>:     push   r14
   0x00000000001a8e52 <+2>:     push   rbx
   0x00000000001a8e53 <+3>:     push   rax
   0x00000000001a8e54 <+4>:     mov    rcx,QWORD PTR [rdi+0x10]
   0x00000000001a8e58 <+8>:     test   rcx,rcx
   0x00000000001a8e5b <+11>:    jne    0x1a8e7a <_Py_MergeZeroLocalRefcount+42>
   0x00000000001a8e5d <+13>:    mov    rax,QWORD PTR [rdi+0x18]
   0x00000000001a8e61 <+17>:    mov    rcx,QWORD PTR [rax+0x40]
   0x00000000001a8e65 <+21>:    mov    rax,QWORD PTR [rip+0x476324]        # 0x61f190 <_PyRuntime+10384>
   0x00000000001a8e6c <+28>:    test   rax,rax
   0x00000000001a8e6f <+31>:    jne    0x1a8ead <_Py_MergeZeroLocalRefcount+93>
   0x00000000001a8e71 <+33>:    add    rsp,0x8
   0x00000000001a8e75 <+37>:    pop    rbx
   0x00000000001a8e76 <+38>:    pop    r14
   0x00000000001a8e78 <+40>:    jmp    rcx  # The fast path ends here with the jump to `tp_dealloc()`
   0x00000000001a8e7a <+42>:    mov    QWORD PTR [rdi],0x0
   0x00000000001a8e81 <+49>:    mov    rdx,rcx
   0x00000000001a8e84 <+52>:    or     rdx,0x3
   0x00000000001a8e88 <+56>:    mov    rax,rcx
   0x00000000001a8e8b <+59>:    lock cmpxchg QWORD PTR [rdi+0x10],rdx
   0x00000000001a8e91 <+65>:    je     0x1a8e9f <_Py_MergeZeroLocalRefcount+79>
   0x00000000001a8e93 <+67>:    mov    rcx,rax
   0x00000000001a8e96 <+70>:    mov    rdx,rax
   0x00000000001a8e99 <+73>:    or     rdx,0x3
   0x00000000001a8e9d <+77>:    jmp    0x1a8e8b <_Py_MergeZeroLocalRefcount+59>
   0x00000000001a8e9f <+79>:    cmp    rcx,0x3
   0x00000000001a8ea3 <+83>:    jbe    0x1a8e5d <_Py_MergeZeroLocalRefcount+13>
   0x00000000001a8ea5 <+85>:    add    rsp,0x8
   0x00000000001a8ea9 <+89>:    pop    rbx
   0x00000000001a8eaa <+90>:    pop    r14
   0x00000000001a8eac <+92>:    ret
   0x00000000001a8ead <+93>:    mov    rdx,QWORD PTR [rip+0x4762e4]        # 0x61f198 <_PyRuntime+10392>
   0x00000000001a8eb4 <+100>:   mov    rbx,rdi
   0x00000000001a8eb7 <+103>:   mov    esi,0x1
   0x00000000001a8ebc <+108>:   mov    r14,rcx
   0x00000000001a8ebf <+111>:   call   rax
   0x00000000001a8ec1 <+113>:   mov    rcx,r14
   0x00000000001a8ec4 <+116>:   mov    rdi,rbx
   0x00000000001a8ec7 <+119>:   jmp    0x1a8e71 <_Py_MergeZeroLocalRefcount+33>

(This is with ./configure -C --with-tail-call-interp --enable-optimizations --disable-gil --with-lto=thin)

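Note the three registers pushed to the stack at entry: r14 and rbx are saved only so that the slow path at +93 can keep rcx (the dealloc pointer) and rdi (the object) alive across the reftracer call, and rax is pushed purely as padding to keep the stack 16-byte aligned for that call. The fast path pays for all three pushes, plus the matching stack restore, even when the reftracer pointer is NULL.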

A slight refactoring of _Py_Dealloc avoids the three push instructions and the matching stack restore (`add rsp,0x8` plus two pops): if the reftracer is active, the code instead jumps to the non-inlined function dealloc_with_reftracer.

Dump of assembler code for function _Py_MergeZeroLocalRefcount:
   0x00000000001a7ec0 <+0>:     mov    rcx,QWORD PTR [rdi+0x10]
   0x00000000001a7ec4 <+4>:     test   rcx,rcx
   0x00000000001a7ec7 <+7>:     jne    0x1a7ede <_Py_MergeZeroLocalRefcount+30>
   0x00000000001a7ec9 <+9>:     cmp    QWORD PTR [rip+0x46b2bf],0x0        # 0x613190 <_PyRuntime+10384>
   0x00000000001a7ed1 <+17>:    jne    0xa8ca0 <dealloc_with_reftracer>
   0x00000000001a7ed7 <+23>:    mov    rax,QWORD PTR [rdi+0x18]
   0x00000000001a7edb <+27>:    jmp    QWORD PTR [rax+0x40]  # The fast path ends here with the jump to `tp_dealloc()`
   0x00000000001a7ede <+30>:    mov    QWORD PTR [rdi],0x0
   0x00000000001a7ee5 <+37>:    mov    rdx,rcx
   0x00000000001a7ee8 <+40>:    or     rdx,0x3
   0x00000000001a7eec <+44>:    mov    rax,rcx
   0x00000000001a7eef <+47>:    lock cmpxchg QWORD PTR [rdi+0x10],rdx
   0x00000000001a7ef5 <+53>:    je     0x1a7f03 <_Py_MergeZeroLocalRefcount+67>
   0x00000000001a7ef7 <+55>:    mov    rcx,rax
   0x00000000001a7efa <+58>:    mov    rdx,rax
   0x00000000001a7efd <+61>:    or     rdx,0x3
   0x00000000001a7f01 <+65>:    jmp    0x1a7eef <_Py_MergeZeroLocalRefcount+47>
   0x00000000001a7f03 <+67>:    cmp    rcx,0x3
   0x00000000001a7f07 <+71>:    jbe    0x1a7f0a <_Py_MergeZeroLocalRefcount+74>
   0x00000000001a7f09 <+73>:    ret
   0x00000000001a7f0a <+74>:    cmp    QWORD PTR [rip+0x46b27e],0x0        # 0x613190 <_PyRuntime+10384>
   0x00000000001a7f12 <+82>:    jne    0xa8ca0 <dealloc_with_reftracer>
   0x00000000001a7f18 <+88>:    mov    rax,QWORD PTR [rdi+0x18]
   0x00000000001a7f1c <+92>:    jmp    QWORD PTR [rax+0x40]

Metadata

Metadata

Assignees

No one assigned

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions