Even with PGO and LTO, the reftracer call in _Py_Dealloc causes several extra register spills on x86-64, especially in the free-threading build:
Lines 3003 to 3004 in b545450
The free-threading build calls _Py_MergeZeroLocalRefcount(), which in turn inlines the call to _Py_Dealloc:
Dump of assembler code for function _Py_MergeZeroLocalRefcount:
0x00000000001a8e50 <+0>: push r14
0x00000000001a8e52 <+2>: push rbx
0x00000000001a8e53 <+3>: push rax
0x00000000001a8e54 <+4>: mov rcx,QWORD PTR [rdi+0x10]
0x00000000001a8e58 <+8>: test rcx,rcx
0x00000000001a8e5b <+11>: jne 0x1a8e7a <_Py_MergeZeroLocalRefcount+42>
0x00000000001a8e5d <+13>: mov rax,QWORD PTR [rdi+0x18]
0x00000000001a8e61 <+17>: mov rcx,QWORD PTR [rax+0x40]
0x00000000001a8e65 <+21>: mov rax,QWORD PTR [rip+0x476324] # 0x61f190 <_PyRuntime+10384>
0x00000000001a8e6c <+28>: test rax,rax
0x00000000001a8e6f <+31>: jne 0x1a8ead <_Py_MergeZeroLocalRefcount+93>
0x00000000001a8e71 <+33>: add rsp,0x8
0x00000000001a8e75 <+37>: pop rbx
0x00000000001a8e76 <+38>: pop r14
0x00000000001a8e78 <+40>: jmp rcx # the fast path ends here with the jump to tp_dealloc
0x00000000001a8e7a <+42>: mov QWORD PTR [rdi],0x0
0x00000000001a8e81 <+49>: mov rdx,rcx
0x00000000001a8e84 <+52>: or rdx,0x3
0x00000000001a8e88 <+56>: mov rax,rcx
0x00000000001a8e8b <+59>: lock cmpxchg QWORD PTR [rdi+0x10],rdx
0x00000000001a8e91 <+65>: je 0x1a8e9f <_Py_MergeZeroLocalRefcount+79>
0x00000000001a8e93 <+67>: mov rcx,rax
0x00000000001a8e96 <+70>: mov rdx,rax
0x00000000001a8e99 <+73>: or rdx,0x3
0x00000000001a8e9d <+77>: jmp 0x1a8e8b <_Py_MergeZeroLocalRefcount+59>
0x00000000001a8e9f <+79>: cmp rcx,0x3
0x00000000001a8ea3 <+83>: jbe 0x1a8e5d <_Py_MergeZeroLocalRefcount+13>
0x00000000001a8ea5 <+85>: add rsp,0x8
0x00000000001a8ea9 <+89>: pop rbx
0x00000000001a8eaa <+90>: pop r14
0x00000000001a8eac <+92>: ret
0x00000000001a8ead <+93>: mov rdx,QWORD PTR [rip+0x4762e4] # 0x61f198 <_PyRuntime+10392>
0x00000000001a8eb4 <+100>: mov rbx,rdi
0x00000000001a8eb7 <+103>: mov esi,0x1
0x00000000001a8ebc <+108>: mov r14,rcx
0x00000000001a8ebf <+111>: call rax
0x00000000001a8ec1 <+113>: mov rcx,r14
0x00000000001a8ec4 <+116>: mov rdi,rbx
0x00000000001a8ec7 <+119>: jmp 0x1a8e71 <_Py_MergeZeroLocalRefcount+33>
(This is with `./configure -C --with-tail-call-interp --enable-optimizations --disable-gil --with-lto=thin`.)
Note the three registers pushed to the stack at entry.
A slight refactoring of _Py_Dealloc avoids the three push instructions at entry and the matching stack restore (an `add rsp,0x8` and two pop instructions) on the fast path: if a reftracer is active, the code instead jumps to the non-inlined function dealloc_with_reftracer:
Dump of assembler code for function _Py_MergeZeroLocalRefcount:
0x00000000001a7ec0 <+0>: mov rcx,QWORD PTR [rdi+0x10]
0x00000000001a7ec4 <+4>: test rcx,rcx
0x00000000001a7ec7 <+7>: jne 0x1a7ede <_Py_MergeZeroLocalRefcount+30>
0x00000000001a7ec9 <+9>: cmp QWORD PTR [rip+0x46b2bf],0x0 # 0x613190 <_PyRuntime+10384>
0x00000000001a7ed1 <+17>: jne 0xa8ca0 <dealloc_with_reftracer>
0x00000000001a7ed7 <+23>: mov rax,QWORD PTR [rdi+0x18]
0x00000000001a7edb <+27>: jmp QWORD PTR [rax+0x40] # The fast path ends here with the jump to `tp_dealloc()`
0x00000000001a7ede <+30>: mov QWORD PTR [rdi],0x0
0x00000000001a7ee5 <+37>: mov rdx,rcx
0x00000000001a7ee8 <+40>: or rdx,0x3
0x00000000001a7eec <+44>: mov rax,rcx
0x00000000001a7eef <+47>: lock cmpxchg QWORD PTR [rdi+0x10],rdx
0x00000000001a7ef5 <+53>: je 0x1a7f03 <_Py_MergeZeroLocalRefcount+67>
0x00000000001a7ef7 <+55>: mov rcx,rax
0x00000000001a7efa <+58>: mov rdx,rax
0x00000000001a7efd <+61>: or rdx,0x3
0x00000000001a7f01 <+65>: jmp 0x1a7eef <_Py_MergeZeroLocalRefcount+47>
0x00000000001a7f03 <+67>: cmp rcx,0x3
0x00000000001a7f07 <+71>: jbe 0x1a7f0a <_Py_MergeZeroLocalRefcount+74>
0x00000000001a7f09 <+73>: ret
0x00000000001a7f0a <+74>: cmp QWORD PTR [rip+0x46b27e],0x0 # 0x613190 <_PyRuntime+10384>
0x00000000001a7f12 <+82>: jne 0xa8ca0 <dealloc_with_reftracer>
0x00000000001a7f18 <+88>: mov rax,QWORD PTR [rdi+0x18]
0x00000000001a7f1c <+92>: jmp QWORD PTR [rax+0x40]