
Revamp the generation of runtime division checks on ARM64 #111543

Open · wants to merge 14 commits into main

Conversation

@snickolls-arm (Contributor)

Fixes #64795

This patch introduces a new compilation phase that passes over the GenTrees looking for GT_DIV/GT_UDIV nodes on integral types and morphs the code to introduce the necessary conformance checks (overflow/divide-by-zero) early in the compilation pipeline. Currently these checks are added during the Emit phase, which means no optimizations run on the introduced code.

The aim is to allow the compiler to make decisions about code placement and instruction selection for these checks. For example, on ARM64 this lets certain scenarios choose the cbz instruction over a cmp/beq pair, which can lead to more compact code. It also allows some of the comparisons in the checks to be hoisted out of loops.

dotnet-issue-labeler bot added the area-CodeGen-coreclr label on Jan 17, 2025
dotnet-policy-service bot added the community-contribution label on Jan 17, 2025
@snickolls-arm (Contributor, Author)

@kunalspathak @a74nh

This is WIP. I've taken a different approach to adding new nodes, instead adding a pass that modifies the HIR.

The pass runs through all of the code in the function looking for GT_DIV/GT_UDIV nodes. On ARM64 we need to run this after morph so that we catch any GT_DIV nodes that might have been introduced by conversions such as MOD to SUB-MUL-DIV. If the pass encounters a GT_DIV node, it uses fgSplitBlockBeforeTree to ensure any side effects of the tree run before the runtime check. It then adds the runtime checks to the graph just after these side effects, but before the actual division occurs.
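
As a rough sketch of the shape such a pass could take (not the actual patch; only fgSplitBlockBeforeTree is taken from the description above, the iteration helpers are existing JIT idioms, and the phase name is made up):

// Hypothetical skeleton of the pass described above; illustrative only.
PhaseStatus Compiler::fgExpandDivisionChecks()
{
    for (BasicBlock* block : Blocks())
    {
        for (Statement* stmt : block->Statements())
        {
            for (GenTree* node : stmt->TreeList())
            {
                if (node->OperIs(GT_DIV, GT_UDIV) && varTypeIsIntegral(node->TypeGet()))
                {
                    // Split via fgSplitBlockBeforeTree so earlier side effects of
                    // the tree run first, then insert the divide-by-zero/overflow
                    // checks before the division itself.
                }
            }
        }
    }
    return PhaseStatus::MODIFIED_EVERYTHING;
}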

The added HIR looks like this for the signed overflow check, for example. It tests (dividend < 0 && divisor == -1), which should throw an overflow exception.

------------ BB06 [0005] [???..???) -> BB07(0.01),BB05(0.99) (cond), preds={BB02} succs={BB05,BB07}

***** BB06 [0005]
STMT00007 ( ??? ... ??? )
               [000032] -----------                         *  JTRUE     void
               [000030] J----------                         \--*  EQ        int
               [000028] -----------                            +--*  AND       int
               [000025] -----------                            |  +--*  EQ        int
               [000022] -----------                            |  |  +--*  LCL_VAR   int    V01 arg1
               [000024] -----------                            |  |  \--*  CNS_INT   int    -1
               [000027] -----------                            |  \--*  LT        int
               [000023] -----------                            |     +--*  LCL_VAR   int    V03 loc0
               [000026] -----------                            |     \--*  CNS_INT   int    0
               [000029] -----------                            \--*  CNS_INT   int    1

------------ BB07 [0006] [???..???) (throw), preds={BB06} succs={}

***** BB07 [0006]
STMT00006 ( ??? ... ??? )
               [000031] --CXG------                         *  CALL help void   CORINFO_HELP_OVERFLOW

Here's the example @kunalspathak mentioned in #64795:

// See https://aka.ms/new-console-template for more information
using System;

namespace MyApp
{
    internal class Program
    {
        public static int issue2(int x, int y, int z)
        {
            int result = x;
            for (int i = 0; i < z; i++)
            {
                // result = x % y; <-- this hoists properly because both dividend and divisor are invariant.
                result = result % y;
            }
            return result;
        }

        static void Main(string[] args)
        {
            var rand = new Random(1234);
            Console.WriteLine(issue2(rand.Next(), rand.Next(), rand.Next()));
        }
    }
}

Before the change:

; Total bytes of code 80, prolog size 8, PerfScore 81.00, instruction count 24, allocated bytes for code 80 (MethodHash=3a9665a0) for method MyApp.Program:issue2(int,int,int):int (FullOpts)
; ============================================================

*************** After end code gen, before unwindEmit()
G_M39519_IG01:        ; func=00, offs=0x000000, size=0x0008, bbWeight=1, PerfScore 1.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, nogc <-- Prolog IG

IN0015: 000000      stp     fp, lr, [sp, #-0x10]!
IN0016: 000004      mov     fp, sp

G_M39519_IG02:        ; offs=0x000008, size=0x0008, bbWeight=1, PerfScore 1.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, BB01 [0000], byref, isz

IN0001: 000008      cmp     w2, #0
IN0002: 00000C      ble     G_M39519_IG06

G_M39519_IG03:        ; offs=0x000010, size=0x0000, bbWeight=0.25, PerfScore 0.00, gcrefRegs=0000 {}, byrefRegs=0000 {}, BB02 [0005], byref, isz

IN0003: 000010      align   [0 bytes for IG04]
IN0004: 000010      align   [0 bytes]
IN0005: 000010      align   [0 bytes]
IN0006: 000010      align   [0 bytes]

G_M39519_IG04:        ; offs=0x000010, size=0x0018, bbWeight=4, PerfScore 18.00, gcrefRegs=0000 {}, byrefRegs=0000 {}, BB03 [0001], byref, isz

IN0007: 000010      cmp     w1, #0
IN0008: 000014      beq     G_M39519_IG07
IN0009: 000018      cmn     w1, #1
IN000a: 00001C      bne     G_M39519_IG05
IN000b: 000020      cmp     w0, #1
IN000c: 000024      bvs     G_M39519_IG08

G_M39519_IG05:        ; offs=0x000028, size=0x0010, bbWeight=4, PerfScore 58.00, gcrefRegs=0000 {}, byrefRegs=0000 {}, loop=IG04, BB03 [0001], byref, isz

IN000d: 000028      sdiv    w3, w0, w1
IN000e: 00002C      msub    w0, w3, w1, w0
IN000f: 000030      sub     w2, w2, #1
IN0010: 000034      cbnz    w2, G_M39519_IG04

G_M39519_IG06:        ; offs=0x000038, size=0x0008, bbWeight=1, PerfScore 2.00, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, epilog, nogc

IN0017: 000038      ldp     fp, lr, [sp], #0x10
IN0018: 00003C      ret     lr

G_M39519_IG07:        ; offs=0x000040, size=0x0008, bbWeight=0, PerfScore 0.00, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0000 {}, BB06 [0007], gcvars, byref

IN0011: 000040      bl      CORINFO_HELP_THROWDIVZERO
IN0012: 000044      brk     #0

G_M39519_IG08:        ; offs=0x000048, size=0x0008, bbWeight=0, PerfScore 0.00, gcrefRegs=0000 {}, byrefRegs=0000 {}, BB07 [0008], byref

IN0013: 000048      bl      CORINFO_HELP_OVERFLOW
IN0014: 00004C      brk     #0

After the change:

; Total bytes of code 84, prolog size 8, PerfScore 79.25, instruction count 25, allocated bytes for code 84 (MethodHash=3a9665a0) for method MyApp.Program:issue2(int,int,int):int (FullOpts)
; ============================================================

*************** After end code gen, before unwindEmit()
G_M39519_IG01:        ; func=00, offs=0x000000, size=0x0008, bbWeight=1, PerfScore 1.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, nogc <-- Prolog IG

IN0016: 000000      stp     fp, lr, [sp, #-0x10]!
IN0017: 000004      mov     fp, sp

G_M39519_IG02:        ; offs=0x000008, size=0x0008, bbWeight=1, PerfScore 1.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, BB01 [0000], byref, isz

IN0001: 000008      cmp     w2, #0
IN0002: 00000C      ble     G_M39519_IG05

G_M39519_IG03:        ; offs=0x000010, size=0x0008, bbWeight=0.25, PerfScore 0.25, gcrefRegs=0000 {}, byrefRegs=0000 {}, BB02 [0011], byref, isz

IN0003: 000010      cmn     w1, #1
IN0004: 000014      cset    x3, eq
IN0005: 000018      align   [0 bytes for IG04]
IN0006: 000018      align   [0 bytes]
IN0007: 000018      align   [0 bytes]
IN0008: 000018      align   [0 bytes]

G_M39519_IG04:        ; offs=0x000018, size=0x0024, bbWeight=4, PerfScore 74.00, gcrefRegs=0000 {}, byrefRegs=0000 {}, loop=IG04, BB03 [0001], BB04 [0004], BB05 [0007], byref, isz

IN0009: 000018      lsr     w4, w0, #31
IN000a: 00001C      and     w4, w3, w4
IN000b: 000020      cmp     w4, #1
IN000c: 000024      beq     G_M39519_IG07
IN000d: 000028      cbz     w1, G_M39519_IG06
IN000e: 00002C      sdiv    w4, w0, w1
IN000f: 000030      msub    w0, w4, w1, w0
IN0010: 000034      sub     w2, w2, #1
IN0011: 000038      cbnz    w2, G_M39519_IG04

G_M39519_IG05:        ; offs=0x00003C, size=0x0008, bbWeight=1, PerfScore 2.00, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, epilog, nogc

IN0018: 00003C      ldp     fp, lr, [sp], #0x10
IN0019: 000040      ret     lr

G_M39519_IG06:        ; offs=0x000044, size=0x0008, bbWeight=0, PerfScore 0.00, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0000 {}, BB07 [0009], gcvars, byref

IN0012: 000044      bl      CORINFO_HELP_THROWDIVZERO
IN0013: 000048      brk     #0

G_M39519_IG07:        ; offs=0x00004C, size=0x0008, bbWeight=0, PerfScore 0.00, gcrefRegs=0000 {}, byrefRegs=0000 {}, BB06 [0006], byref

IN0014: 00004C      bl      CORINFO_HELP_OVERFLOW
IN0015: 000050      brk     #0

The main difference is at label IG04: rather than a fixed sequence of compare-and-branch instructions chosen at the emit stage, the compiler has decided to build a logical expression for the overflow check and emit a cbz for the divide-by-zero check. The loop hoisting optimization has decided that the test for (divisor == -1) can be performed outside the loop to save an instruction inside it; this is computed in IG03. Building a logical expression instead of a branch sequence has also allowed the compiler to perform these checks with two compare-and-branches instead of three.

The approach works well when:
• The trees containing GT_DIV don't have many side effects, as these have to be split out and the splitting can cause spilling, especially in MinOpts.
• GT_DIV occurs in a loop, as some of the expression tree for the check can now be hoisted outside the loop.
• There are a lot of GT_DIV nodes in a function, as the compiler now seems to choose cbz more often than cmp/beq.

It seems to have an adverse effect in MinOpts, though, because splitting the tree often spills and no optimization passes run to clean up these spills.

At the moment I haven't focused on the efficiency of the pass itself, but I believe it could be improved. I could borrow the recursive traversals from the earlier morph phase to build a work-list of the places where checks need to be added, as sketched below; the pass could then be linear over a pre-built list of nodes rather than a search in a loop. I would just have to be careful to update the locations of the nodes after any trees are split, but I think this should be possible.
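
A minimal sketch of the collection step, assuming the JIT's existing pre-order walk machinery (ArrayStack, fgWalkData); the CollectDivs/DivSearchData names are hypothetical:

// Gather every integral GT_DIV/GT_UDIV into a work-list in one walk; the
// list can then be processed linearly, splitting trees as needed.
struct DivSearchData
{
    ArrayStack<GenTree*>* divNodes;
};

static Compiler::fgWalkResult CollectDivs(GenTree** pTree, Compiler::fgWalkData* data)
{
    GenTree* node = *pTree;
    if (node->OperIs(GT_DIV, GT_UDIV) && varTypeIsIntegral(node->TypeGet()))
    {
        ((DivSearchData*)data->pCallbackData)->divNodes->Push(node);
    }
    return Compiler::WALK_CONTINUE;
}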

I've also had to make a temporary fix for a problem with the tree-splitting code, where it wasn't correctly updating the node flags after splitting out side effects. After splitting the tree I traverse it post-order to update all of the flags; there might be a more efficient way of doing this.

@snickolls-arm (Contributor, Author)

I think the build is failing in Release mode due to the use of GenTree::gtTreeID, so I'll need to look into having access to this, or some similar identifier, in all modes, as it is part of the algorithm.

@kunalspathak (Member)

can you also eliminate the regressions?

[image: regression diffs]

@jakobbotsch (Member)

I think the build is failing in Release mode due to the use of GenTree::gtTreeID, so I'll need to look into having access to this, or some similar identifier, in all modes, as it is part of the algorithm.

What do you need this for? Increasing the size of GenTree is hard to justify. I do not think this transformation qualifies. Most likely you have other options.

@snickolls-arm (Contributor, Author)

What do you need this for? Increasing the size of GenTree is hard to justify. I do not think this transformation qualifies. Most likely you have other options.

I need a way of uniquely identifying a GenTree node so I can record that I've already visited it and added the runtime checks. Is it possible to use the address of the node? There would need to be a guarantee that the address is unique within the function compilation.

can you also eliminate the regressions?

I think many of the regressions are caused by spilling in gtSplitTree, but there are also some individual regressions that I'd need to look into case-by-case.

My options for continuing this are:

  1. Look into making this transform produce the same code as before in MinOpts.
  2. Make this pass run only for FullOpts and revert to existing code for MinOpts.

I think it's sensible to let the compiler see all of this added code early in the pipeline, but this might not make sense in the tiering model, so option 2 could be a good compromise. I'll need to look into individual regression cases for both options regardless. I would be grateful for any opinions on this and on the approach in general.

impCloneExpr(divisor, &divisorCopy, CHECK_SPILL_NONE, nullptr DEBUGARG("cloned for runtime check"));
impCloneExpr(dividend, &dividendCopy, CHECK_SPILL_NONE, nullptr DEBUGARG("cloned for runtime check"));

// (dividend < 0 && divisor == -1)
Member:

Shouldn't this be dividend == MinValue and divisor == -1?

Contributor Author:

Yes, I misinterpreted the exception case and this likely explains some of the issues I'm seeing. Thanks.

code.block = divBlock;
code.stmt = divBlock->firstStmt();
code.tree = tree;
}
Member:

Seems like something here should be setting GTF_DIV_MOD_NO_BY_ZERO and GTF_DIV_MOD_NO_OVERFLOW on the DIV node.

Contributor Author:

I'm relying on these flags being set during the morph stage, e.g. in fgMorphSmpOp (morph.cpp:8584). When I call GenTree::OperExceptions I'm implicitly checking these flags too. As I'm not doing any further processing of the operand types, I don't think I can make that decision in this pass, unless I'm missing something?

@jakobbotsch (Member)

I need a way of uniquely identifying a GenTree node so I can record that I've already visited it and added the runtime checks. Is it possible to use the address of the node? There would need to be a guarantee that the address is unique within the function compilation.

Yes, the address of nodes can be used, see e.g. NodeInternalRegisters for a hash table keyed on GenTree addresses (the JIT does not have a set type, so you could use a bool-valued hash table).

However, most likely there is no need for any form of "visited" check at all; instead you can shape your pass so that it visits all IR exactly once. See e.g. the various helper expansions in helperexpansion.cpp; those are shaped so that they visit all IR once while allowing for expansion of internal nodes into control flow.
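
For reference, a bool-valued table keyed on node addresses might look like this (a sketch following the NodeInternalRegisters pattern mentioned above; the memory kind and names are illustrative):

// A "set" of visited nodes, emulated with a bool-valued JitHashTable.
typedef JitHashTable<GenTree*, JitPtrKeyFuncs<GenTree>, bool> GenTreeBoolMap;

GenTreeBoolMap visited(getAllocator(CMK_Generic));

bool seen = false;
if (!visited.Lookup(tree, &seen))
{
    // ... expand the runtime checks for this division ...
    visited.Set(tree, true); // record that the node has been handled
}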

I think many of the regressions are caused by spilling in gtSplitTree, but there are also some individual regressions that I'd need to look into case-by-case.

My options for continuing this are:

  1. Look into making this transform produce the same code as before in MinOpts.
  2. Make this pass run only for FullOpts and revert to existing code for MinOpts.

I think it's sensible to let the compiler see all of this added code early in the pipeline, but this might not make sense in the tiering model, so option 2 could be a good compromise. I'll need to look into individual regression cases for both options regardless. I would be grateful for any opinions on this and on the approach in general.

I agree that (2) would be most reasonable.

You may want to experiment with some alternative, simpler and cheaper ways of accomplishing what this pass is doing. One thing that comes to mind is expanding the checks as QMARKs during import. That is, instead of creating a GT_DIV/GT_MOD node, you would create a shape like

QMARK(dividend == MinValue & divisor == -1, CALL CORINFO_HELP_OVERFLOW, QMARK(divisor == 0, CALL CORINFO_HELP_THROWDIVZERO, DIV dividend, divisor))

(marking the division node with GTF_DIV_MOD_NO_OVERFLOW | GTF_DIV_MOD_NO_BY_ZERO).
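
A minimal sketch of how that shape could be built with the tree-construction helpers that appear later in this thread (gtNewQmarkNode/gtNewColonNode/gtNewHelperCallNode/gtNewOperNode); the operand clones, the 32-bit types, and the variable names are all assumptions:

// Assumes a 32-bit division; dividendUse/divisorUse0/divisorUse1 are separate
// clones of the operands, and divNode is the plain DIV node.
divNode->gtFlags |= GTF_DIV_MOD_NO_OVERFLOW | GTF_DIV_MOD_NO_BY_ZERO;

// Inner QMARK: (dividend == MinValue && divisor == -1) throws overflow.
GenTree* ovfCond =
    gtNewOperNode(GT_AND, TYP_INT,
                  gtNewOperNode(GT_EQ, TYP_INT, dividendUse, gtNewIconNode(INT32_MIN, TYP_INT)),
                  gtNewOperNode(GT_EQ, TYP_INT, divisorUse0, gtNewIconNode(-1, TYP_INT)));
GenTree* result =
    gtNewQmarkNode(resultType, gtNewOperNode(GT_EQ, TYP_INT, ovfCond, gtNewIconNode(1, TYP_INT)),
                   gtNewColonNode(resultType, gtNewHelperCallNode(CORINFO_HELP_OVERFLOW, resultType), divNode));

// Outer QMARK: (divisor == 0) throws divide-by-zero.
result =
    gtNewQmarkNode(resultType, gtNewOperNode(GT_EQ, TYP_INT, divisorUse1, gtNewIconNode(0, TYP_INT)),
                   gtNewColonNode(resultType, gtNewHelperCallNode(CORINFO_HELP_THROWDIVZERO, resultType), result));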

Fixes dotnet#64795

This patch wraps GT_DIV/GT_UDIV nodes on integral types with GT_QMARK
trees that contain the necessary conformance checks (overflow/divide-by-zero)
when compiling with FullOpts enabled. Currently these checks are added
during the Emit phase; that remains the case for MinOpts.

The aim is to allow the compiler to make decisions about code placement
and instruction selection for these checks. For example, on ARM64 this
lets certain scenarios choose the cbz instruction over cmp/beq,
which can lead to more compact code. It also allows some of the comparisons
in the checks to be hoisted out of loops.
@snickolls-arm (Contributor, Author) commented Jan 24, 2025

Hi @jakobbotsch,

Thanks for the help. I've updated the pull request now with the implementation introducing QMARK nodes at import. I think this is a much cleaner solution.

It's still not fully clear from the diffs exactly how much of the impact comes from the cbz instruction and how much from loop hoisting. I've just opened #111797; once this is rebased on that, it should reduce the number of diffs and show some examples of hoisting. If there aren't any examples of hoisting, this might not show any positive impact until functions that perform DIV are inlined into user code.

It's much clearer now where the spills are occurring. Quite a few are related to trying to clone a RETURN_EXPR tree in the divisor/dividend, as assertions fired for this when I replaced impCloneExpr with gtCloneExpr. So to reduce regressions, I would have to allow cloning of inline trees.

I'll also need to think about a way of making sure the MOD nodes are affected too, because currently this only affects DIV nodes, and MOD nodes don't get morphed to SUB/MUL/DIV until later on.

@kunalspathak (Member) left a comment:

Added some comments. Let's merge #111797 and make this change Arm64-only; I can take a look after that. Also, we don't expect to see MinOpts diffs, so please double-check why they are appearing.

@@ -4681,6 +4681,8 @@ class Compiler

GenTree* impThrowIfNull(GenTreeCall* call);

void impImportDivision(bool isSigned);
Member:

can you add this under #ifdef TARGET_ARM64?

@@ -7292,12 +7292,9 @@ void Compiler::impImportBlockCode(BasicBlock* block)
// Other binary math operations

case CEE_DIV:
oper = GT_DIV;
Member:

same here, just do it for #ifdef TARGET_ARM64.

@@ -13823,3 +13820,137 @@ methodPointerInfo* Compiler::impAllocateMethodPointerInfo(const CORINFO_RESOLVED
memory->m_tokenConstraint = tokenConstrained;
return memory;
}

void Compiler::impImportDivision(bool isSigned)
Member:

please add the method docs.

@snickolls-arm (Contributor, Author) commented Feb 5, 2025

There shouldn't be any MinOpts diffs in this latest version (4b58921), as there aren't any in my local run, but if they are still there after the CI run I'll take another look.

// That said, as of now it *is* a large node, so we'll do this with an assert rather
// than an "if".
assert(GenTree::s_gtNodeSizes[GT_CALL] == TREE_NODE_SZ_LARGE);
GenTree* divNode = new (this, GT_CALL) GenTreeOp(oper, resultType, dividend, divisor DEBUGARG(/*largeNode*/ true));
Member:

Can all of this logic be written so that we delegate to the standard div logic unless the QMARK handling logic actually kicks in? E.g. this weird GT_CALL sized node does not need to be allocated like this for your integral-only cases.

Contributor Author:

I've tried removing this logic, but it hits assertions in Lowering where some code tries to change the node type to GT_CAST and can't because the nodes are different sizes. I think it would be best to keep this logic for all DIV nodes for now; a later cleanup could remove any implicit assumptions about the size of DIV nodes.

Member:

Can you point me to the code in lowering that is changing an integer division to a cast?

Contributor Author:

See here on current main branch:

divMod->ChangeOper(GT_CAST);

The assertion fires in SetOper, so I'm not sure if all of the failures I'm seeing are from this exact code path.

Member:

I see, that should probably be cleaned up.

In any case, this code should still be refactored so that it falls back to the existing code for the cases it is not trying to handle differently. But it is OK to keep this code allocating a large node (it should use gtNewLargeOperNode instead, though; the comment above it is wrong, since these divisions do not get transformed into helper calls).
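
Concretely, the allocation could use the helper named above in place of the GT_CALL-sized placement new (a sketch reusing the variable names from the earlier snippet):

// Allocates a large node without implying a helper-call transformation.
GenTree* divNode = gtNewLargeOperNode(oper, resultType, dividend, divisor);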

{
// Spill the divisor, as (divisor == 0) is always checked.
GenTree* divisorCopy = nullptr;
impCloneExpr(divisor, &divisorCopy, CHECK_SPILL_NONE, nullptr DEBUGARG("divisor used in runtime checks"));
Member:

This looks wrong -- it reorders the evaluation of the divisor with the dividend and other things on the IL stack. I would suggest you avoid popping the dividend and divisor until after this point by using impStackTop above. Then you can pop the divisor, clone it with impCloneExpr(divisor, CHECK_SPILL_ALL, ...), and afterwards pop the dividend.

Note that this function returns a tree that is one of the copies of the divisor, so you should use it instead of calling gtClone below.

Contributor Author:

Just to clarify, is the problem that the cloning of the expressions is in the wrong order? If I make sure the dividend check is added first and order my clones correctly, that will ensure the dividend is always cloned first if it needs to be. I'm not familiar with the import stage, so I can't see how calling impPopStack affects the evaluation order.

Member:

The importer works as follows: it keeps the current list of basic block statements (impStmtList + impLastStmt) and it keeps values representing each IL stack entry (accessed with e.g. impPopStack() and impStackTop). At all times, there is an invariant that the current basic block will evaluate first the basic block statements and then the IL evaluation stack from bottom to the top. That is the defined evaluation order.

If you ever pop a value from the stack, then the side effects of that value need to be evaluated after everything that is in the list of basic block statements and after everything that was below it on the IL stack. You are responsible for ensuring that this ends up being the evaluation order. The importer provides some tools to ensure that this happens, like the notion of spilling: when you want to append a side effect to the basic block list, spilling takes care to check interference between the side effect and things that are currently on the IL stack. When it detects interference, it spills what is on the IL stack by appending its side effect (and everything before it) to the basic block list of statements.

To clone an expression impCloneExpr sometimes may need to store it into a temporary value and append that store to the statement list. By passing CHECK_SPILL_NONE here you are telling it that it is ok to evaluate the value before everything currently on the IL stack. That's not correct; it will mean that side effects in the divisor will be reordered to happen before things that were below it on the IL stack.

If you pass CHECK_SPILL_ALL you still have the problem that the divisor may end up executing before the dividend, which is why I suggested the stepwise pop/cloning process above.
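
Put together, the suggested order looks roughly like this (a sketch; impStackTop/impPopStack/impCloneExpr are the importer APIs discussed above, the variable names are illustrative):

// Peek while both operands are still on the IL stack, so nothing is reordered.
GenTree* divisor  = impStackTop().val;  // top of stack
GenTree* dividend = impStackTop(1).val; // just below it

// Pop and clone the divisor first. CHECK_SPILL_ALL lets impCloneExpr spill
// interfering side effects (including the dividend, still on the stack) to
// the statement list before the clone's store is appended.
divisor              = impPopStack().val;
GenTree* divisorCopy = nullptr;
GenTree* divisorUse =
    impCloneExpr(divisor, &divisorCopy, CHECK_SPILL_ALL, nullptr DEBUGARG("divisor used in runtime checks"));

// Only now pop the dividend; its side effects stay ordered before the divisor's.
dividend = impPopStack().val;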

Contributor Author:

Thanks for the explanation, hopefully the latest change now reflects this properly.

@snickolls-arm (Contributor, Author)

The implementation is now regressing #63905; I think I can see why.

***** BB02 [0001]
STMT00002 ( 0x00C[E-] ... 0x01A )
               [000032] DACXG------                         *  STORE_LCL_VAR int    V04 tmp1         
               [000031] --CXG------                         \--*  QMARK     int   
               [000028] -----------    if                      +--*  EQ        int   
               [000026] -----------                            |  +--*  CNS_INT   int    0
               [000027] -----------                            |  \--*  CNS_INT   int    0
               [000030] --CXG------    if                      \--*  COLON     int   
               [000025] --CXG------ else                          +--*  QMARK     int   
               [000022] -----------    if                         |  +--*  EQ        int   
               [000020] -----------                               |  |  +--*  AND       int   
               [000017] -----------                               |  |  |  +--*  EQ        int   
               [000015] -----------                               |  |  |  |  +--*  CNS_INT   int    0
               [000016] -----------                               |  |  |  |  \--*  CNS_INT   int    -1
               [000019] -----------                               |  |  |  \--*  EQ        int   
               [000014] -----------                               |  |  |     +--*  LCL_VAR   int    V01 loc1         
               [000018] -----------                               |  |  |     \--*  CNS_INT   int    -0x80000000
               [000021] -----------                               |  |  \--*  CNS_INT   int    1
               [000024] --CXG------    if                         |  \--*  COLON     int   
               [000013] ----------- else                          |     +--*  DIV       int   
               [000011] -----------                               |     |  +--*  LCL_VAR   int    V01 loc1         
               [000012] -----------                               |     |  \--*  CNS_INT   int    0
               [000023] --CXG------ then                          |     \--*  CALL help int    CORINFO_HELP_OVERFLOW
               [000029] --CXG------ then                          \--*  CALL help int    CORINFO_HELP_THROWDIVZERO

***** BB02 [0001]
STMT00003 ( ??? ... ??? )
               [000034] --CXG------                         *  CALL      void   Program:Foo(System.Object,int)
               [000010] n--XG------ arg0                    +--*  IND       ref   
               [000009] ---X-------                         |  \--*  FIELD_ADDR byref  Program+C:Field
               [000008] -----------                         |     \--*  LCL_VAR   ref    V00 loc0         
               [000033] ----------- arg1                    \--*  LCL_VAR   int    V04 tmp1         

The snippet comes from this IL produced by the regression test:

IL_000d  7b 01 00 00 04    ldfld        0x4000001
IL_0012  07                ldloc.1     
IL_0013  16                ldc.i4.0    
IL_0014  5b                div         
IL_0015  28 04 00 00 06    call         0x6000004

The spilling required for the divide-by-zero check has caused it to be evaluated before the null check on the indirection to the struct field. I would need to refactor the code to make LDFLD spill its side effects on import as well to get the correct evaluation order. This could also be an issue for any other opcodes that throw or have other side effects near a division node; they just haven't been flagged by tests.

@jakobbotsch (Member)

(quoting the previous comment in full)

I still see CHECK_SPILL_NONE in your code that should be CHECK_SPILL_ALL.

impImportDivisionWithChecks(oper, type, op1, op2);
break;
}
#endif
@jakobbotsch (Member) commented Feb 24, 2025:

I think what you had before looked better. It should be shaped something like this:

case CEE_DIV:
#ifdef TARGET_ARM64
    if (opts.OptimizationEnabled() && impImportDivisionWithChecks(oper, type, op1, op2))
    {
      break;
    }
#endif
    oper = GT_DIV;
    goto MATH_MAYBE_CALL_NO_OVF;

Then just have impImportDivisionWithChecks return false for the cases it does not handle.

Contributor Author:

To do this I would have to evaluate type, op1, and op2 in this block, which might make it look repetitive again, especially if copied across CEE_DIV/CEE_DIV_UN. It now uses the original behavior as a fallback, so in my opinion it makes sense to try to merge this with the original logic.

Member:

I'm not sure I follow. I prefer a solution where the changes are restricted to be inside the new function you introduce. I don't see what duplication we would have.

Contributor Author:

I mean that this block (from the label MATH_OP2_FLAGS) needs to be evaluated before I have access to those parameters to the function:

op2 = impStackTop().val;
op1 = impStackTop(1).val;

/* Can't do arithmetic with references */
assertImp(genActualType(op1->TypeGet()) != TYP_REF && genActualType(op2->TypeGet()) != TYP_REF);

// Change both to TYP_I_IMPL (impBashVarAddrsToI won't change if its a true byref, only
// if it is in the stack)
impBashVarAddrsToI(op1, op2);

type = impGetByRefResultType(oper, uns, &op1, &op2);

So I could refactor the function again to not take these as arguments, but that would mean copying this block into the function.

@jakobbotsch (Member) commented Feb 25, 2025:

Division is not supported with byrefs, so this block is unnecessary to have in your function.

Contributor Author:

I tried removing this, but removing impGetByRefResultType loses some important side effects, and I think that caused the recent set of failures: it inserts upcasts for integers and processes native int types. I've renamed it to impProcessResultType, since it doesn't just handle byrefs, and added it back to my function.

Comment on lines 13937 to 13938
divNode->AsOp()->gtOp1 = impCloneExpr(dividend, &dividendCopy, CHECK_SPILL_NONE,
nullptr DEBUGARG("dividend used in runtime checks"));
Member:

This CHECK_SPILL_NONE should be CHECK_SPILL_ALL. Similar below.

@snickolls-arm marked this pull request as ready for review March 5, 2025 16:17
@snickolls-arm (Contributor, Author)

This seems to be passing now, with some positive PerfScore improvements in the diffs. I've made sure checks are also added this way for modulo operations, as they are morphed to divisions on ARM64.

There are some opportunities for improvement that @a74nh and I have noticed in the diffs:

Some compilations contain duplicated throw code where more than one check occurs in the function. This can increase code size for some functions but shouldn't have much impact on performance.

-G_M20943_IG07:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
+G_M20943_IG08:        ; bbWeight=0.12, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0000 {}, gcvars, byref
             bl      CORINFO_HELP_OVERFLOW
             brk     #0
-						;; size=8 bbWeight=0 PerfScore 0.00
+						;; size=8 bbWeight=0.12 PerfScore 0.25
+G_M20943_IG09:        ; bbWeight=0.25, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
+            bl      CORINFO_HELP_THROWDIVZERO
+            brk     #0
+						;; size=8 bbWeight=0.25 PerfScore 0.50
+G_M20943_IG10:        ; bbWeight=0.12, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
+            bl      CORINFO_HELP_OVERFLOW
+            brk     #0
+						;; size=8 bbWeight=0.12 PerfScore 0.25
+G_M20943_IG11:        ; bbWeight=0.25, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
+            bl      CORINFO_HELP_THROWDIVZERO
+            brk     #0
+	

Another thing to note is that these throw blocks are now being assigned block weights; previously they were marked as run rarely. It's not clear to me whether this will have any impact.

We've also noticed that some checks now prefer CCMP over consecutive branches; see the sequence below. This should be a good thing, as it reduces branch density. The sequence could be improved further by removing the unnecessary cset.

+            ccmp    x0, x2, 0, eq
+            cset    x2, eq
+            cmp     w2, #1
+            beq     G_M30534_IG06

@jakobbotsch (Member)

I'm curious whether you compared this with the approach suggested by @SingleAccretion in #64795 (comment).
That approach should lend itself to actually hoisting the full if (divisor == 0) { throw exception } check out of the loop. The approach in this PR does not; the hoisting happening here covers only the divisor == 0 computation, which is materialized into a bool that is still checked inside the loop.

@snickolls-arm (Contributor, Author)

I didn't manage to get that working initially, mostly down to needing more experience across the code base, but trying it could follow on from this patch.

The problem with hoisting conditional logic isn't unique to code added by the compiler, though. For example, this code doesn't get optimized in that way either:

using System;
using System.Runtime;
using System.Runtime.CompilerServices;

class Program
{
    [MethodImpl(MethodImplOptions.NoInlining)]
    private static int example(int x, int y, int z)
    {
        int result = x;
        for (int i = 0; i < z; i++)
        {
            if (y == 0)
            {
                throw new DivideByZeroException();
            }
        }
        return result;
    }

    static void Main()
    {
        System.Console.WriteLine(example(1, 3, 100000));
    }
}

Could this be improved by implementing loop unswitching, which would hoist the condition and duplicate the loop to produce:

if (y == 0)
{
    for (int i = 0; i < z; i++)
    {
        throw new DivideByZeroException();
    }
}
else
{
    for (int i = 0; i < z; i++)
    {
    }
}

Then the true loop is optimized to a single throw statement, and the false loop is optimized away.

On ARM64 I'm seeing this loop block generated at the moment:

G_M16100_IG04:
IN0007: 000014      cbz     w1, G_M16100_IG07    // target throws DivideByZeroException()
IN0008: 000018      sub     w2, w2, #1
IN0009: 00001C      cbnz    w2, G_M16100_IG04

which could be simplified to just the first cbz, removing the loop entirely.

Has this been discussed before? A quick search shows this issue: #65342

@jakobbotsch (Member)

The problem with hoisting conditional logic isn't unique to code added by the compiler, though.
Could this be improved by implementing loop unswitching, which would hoist the condition and duplicate the loop?

Yes, the hoisting implementation in the JIT is notoriously poor, so you kind of have to compensate for that. Introducing GT_CKZERO should fit in with how loop hoisting currently works.
Indeed, loop unswitching is on the list of optimizations that we want to support at some point.

I didn't manage to get that working initially, mostly down to needing more experience across the code base, but trying it could follow on from this patch.

I think the other approach is incompatible with this PR, so we would need to revert this PR to pursue it. That's why I would be curious to see an experiment with that approach instead.
CKZERO will look very similar to CKFINITE, so it should be somewhat straightforward to try out.

(Wrt. the "dividend == MinValue && divisor == -1" check: I would suggest just leaving that check up to the div node for now instead of materializing it explicitly, given that the dividend is not expected to be invariant.)

Comment on lines +14031 to +14041
result =
gtNewQmarkNode(resultType, condition,
gtNewColonNode(resultType, gtNewHelperCallNode(CORINFO_HELP_OVERFLOW, resultType), result));
}

divisor = gtClone(divisor, false);
result =
gtNewQmarkNode(resultType,
// (divisor == 0)
gtNewOperNode(GT_EQ, TYP_INT, divisor, gtNewIconNode(0, genActualType(divisor))),
gtNewColonNode(resultType, gtNewHelperCallNode(CORINFO_HELP_THROWDIVZERO, resultType), result));
Member:

gtThenLikelihood should be set for these qmark nodes to indicate that we never expect the exceptions to be thrown. I do not think the current perfscore diffs show the right picture because this is missing.

@SingleAccretion (Contributor)

CKZERO will look very similar to CKFINITE, so it should be somewhat straightforward to try out.

Notably, since #64795 (comment) I've realized that it should probably be like NULLCHECK, since that would need less special handling. So the final tree would look like:

COMMA
  CKZERO
  DIV/MOD

; For comparison, the initial idea:
DIV/MOD
  dividend
  CKZERO
    divisor

Where DIV/MOD has the "no DivByZero" flag, and both CKZERO and DIV/MOD have GTF_ORDER_SIDEEFF.

DIV/MOD would retain their current semantics; this transformation could be entirely optional and done e.g. in morph, like INDEX_ADDR expansion.
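
A hypothetical construction of that shape (GT_CKZERO does not exist yet, so the oper, its TYP_VOID type, and the expansion point are all assumptions; the flags are the ones named above):

// divNode is the existing DIV/MOD; divisorUse is a separate use of the divisor.
GenTree* check = gtNewOperNode(GT_CKZERO, TYP_VOID, divisorUse); // assumed new oper
check->gtFlags |= GTF_ORDER_SIDEEFF;

divNode->gtFlags |= GTF_DIV_MOD_NO_BY_ZERO; // the DIV no longer checks for zero itself
divNode->gtFlags |= GTF_ORDER_SIDEEFF;      // but must stay ordered after the CKZERO

GenTree* comma = gtNewOperNode(GT_COMMA, divNode->TypeGet(), check, divNode);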

@jakobbotsch (Member)

That will work too, though I am unsure how well hoisting is going to handle it.
It will need a case added to

bool IsNodeHoistable(GenTree* node)
{
    // TODO-CQ: This is a more restrictive version of a check that optIsCSEcandidate already does - it allows
    // a struct typed node if a class handle can be recovered from it.
    if (node->TypeGet() == TYP_STRUCT)
    {
        return false;
    }
    else if (node->OperIs(GT_NULLCHECK))
    {
        // If a null-check is for `this` object, it is safe to
        // hoist it out of the loop. Assertionprop will get rid
        // of left over nullchecks present inside the loop. Also,
        // since NULLCHECK has no value, it will never be CSE,
        // hence this check is not present in optIsCSEcandidate().
        return true;
    }
    else if ((node->gtFlags & GTF_ORDER_SIDEEFF) != 0)
    {
        // If a node has an order side effect, we can't hoist it at all: we don't know what the order
        // dependence actually is. For example, assertion prop might have determined a node can't throw
        // an exception, and eliminated the GTF_EXCEPT flag, replacing it with GTF_ORDER_SIDEEFF. We
        // can't hoist because we might then hoist above the expression that led assertion prop to make
        // that decision. This can happen in JitOptRepeat, where hoisting can follow assertion prop.
        return false;
    }

    // Tree must be a suitable CSE candidate for us to be able to hoist it.
    return m_compiler->optIsCSEcandidate(node);
}

and then furthermore some assertion prop work will be needed to generate assertions from CKZERO and eliminate it based on them. That will also largely follow the handling for GT_NULLCHECK, however.

@snickolls-arm are you up for giving it a go?

@snickolls-arm (Contributor, Author)

Yes, I'll have a go at this. I'll open another draft PR when it's close to completion and we can compare it with this one.
