dotnet · fuad1502 · May 5, 2025
diff --git a/src/coreclr/jit/emitriscv64.cpp b/src/coreclr/jit/emitriscv64.cpp
@@ -1320,26 +1320,8 @@ void emitter::emitLoadImmediate(emitAttr size, regNumber reg, ssize_t imm)
         return;
     }
 
-    /* The following algorithm works based on the following equation:
-     * `imm = high32 + offset1` OR `imm = high32 - offset2`
-     *
-     * high32 will be loaded with `lui + addiw`, while offset
-     * will be loaded with `slli + addi` in 11-bits chunks
-     *
-     * First, determine at which position to partition imm into high32 and offset,
-     * so that it yields the least instruction.
-     * Where high32 = imm[y:x] and imm[63:y] are all zeroes or all ones.
-     *
-     * From the above equation, the value of offset1 & offset2 are:
-     * -> offset1 = imm[x-1:0]
-     * -> offset2 = ~(imm[x-1:0] - 1)
-     * The smaller offset should yield the least instruction. (is this correct?) */
-
-    // STEP 1: Determine x & y
-
-    int x;
     int y;
-    if (((uint64_t)imm >> 63) & 0b1)
+    if ((imm >> 63) & 0b1)
     {
         // last one position from MSB
         y = 63 - BitOperations::LeadingZeroCount((uint64_t)~imm) + 1;
@@ -1349,18 +1331,6 @@ void emitter::emitLoadImmediate(emitAttr size, regNumber reg, ssize_t imm)
         // last zero position from MSB
         y = 63 - BitOperations::LeadingZeroCount((uint64_t)imm) + 1;
     }
-    if (imm & 0b1)
-    {
-        // first zero position from LSB
-        x = BitOperations::TrailingZeroCount((uint64_t)~imm);
-    }
-    else
-    {
-        // first one position from LSB
-        x = BitOperations::TrailingZeroCount((uint64_t)imm);
-    }
-
-    // STEP 2: Determine whether to utilize SRLI or not.
 
     /* SRLI can be utilized when the input has the following pattern:
      *
@@ -1399,188 +1369,118 @@ void emitter::emitLoadImmediate(emitAttr size, regNumber reg, ssize_t imm)
         insCountLimit = absMaxInsCount;
     }
 
-    bool     utilizeSRLI     = false;
-    int      srliShiftAmount = 0;
-    uint64_t originalImm     = imm;
-    bool     cond1           = (y - x) > 31;
-    if ((((uint64_t)imm >> 63) & 0b1) == 0 && cond1)
-    {
-        srliShiftAmount  = BitOperations::LeadingZeroCount((uint64_t)imm);
-        uint64_t tempImm = (uint64_t)imm << srliShiftAmount;
-        int      m       = BitOperations::LeadingZeroCount(~tempImm);
-        int      b       = 64 - m;
-        int      a       = BitOperations::TrailingZeroCount(tempImm);
-        bool     cond2   = (b - a) < 32;
-        bool     cond3   = ((y - x) - (b - a)) >= 11;
-        if (cond2 || cond3)
+    int     srliShiftAmount = 0;
+    ssize_t originalImm     = imm;
+    if ((((imm >> 63) & 0b1) == 0) && (y > 31))
+    {
+        int     leadingZeroCount = BitOperations::LeadingZeroCount((uint64_t)imm);
+        ssize_t shiftedImm       = imm << leadingZeroCount;
+        int     m                = BitOperations::LeadingZeroCount((uint64_t)~shiftedImm);
+        if (m > 11)
         {
-            imm         = tempImm;
-            y           = b;
-            x           = a;
-            utilizeSRLI = true;
+            srliShiftAmount = leadingZeroCount;
+            imm             = shiftedImm;
+            y               = 64 - m;
             insCountLimit -= 1;
         }
     }
 
-    assert(y >= x);
-    assert((1 <= y) && (y <= 63));
-    assert((1 <= x) && (x <= 63));
-
     if (y < 32)
     {
         y = 31;
-        x = 0;
-    }
-    else if ((y - x) < 31)
-    {
-        y = x + 31;
-    }
-    else
-    {
-        x = y - 31;
-    }
-
-    uint32_t high32 = ((int64_t)imm >> x) & WordMask(32);
-
-    // STEP 3: Determine whether to use high32 + offset1 or high32 - offset2
-
-    /* TODO: Instead of using subtract / add mode, assume that we're always adding
-     * 12-bit chunks. However, if we encounter such 12-bit chunk with MSB == 1,
-     * add 1 to the previous chunk, and add the 12-bit chunk as is, which
-     * essentially does a subtraction. It will generate the least instruction to
-     * load offset.
-     * See the following discussion:
-     * https://github.com/dotnet/runtime/pull/113250#discussion_r1987576070 */
-
-    uint32_t offset1        = imm & WordMask((uint8_t)x);
-    uint32_t offset2        = (~(offset1 - 1)) & WordMask((uint8_t)x);
-    uint32_t offset         = offset1;
-    bool     isSubtractMode = false;
-
-    if ((high32 == 0x7FFFFFFF) && (y != 63))
-    {
-        /* Handle corner case: we cannot do subtract mode if high32 == 0x7FFFFFFF
-         * Since adding 1 to it will change the sign bit. Instead, shift x and y
-         * to the left by one. */
-        int      newX       = x + 1;
-        uint32_t newOffset1 = imm & WordMask((uint8_t)newX);
-        uint32_t newOffset2 = (~(newOffset1 - 1)) & WordMask((uint8_t)newX);
-        if (newOffset2 < offset1)
-        {
-            x              = newX;
-            high32         = ((int64_t)imm >> x) & WordMask(32);
-            offset2        = newOffset2;
-            isSubtractMode = true;
-        }
-    }
-    else if (offset2 < offset1)
-    {
-        isSubtractMode = true;
-    }
-
-    if (isSubtractMode)
-    {
-        offset = offset2;
-        high32 = (high32 + 1) & WordMask(32);
     }
 
     assert(absMaxInsCount >= 2);
     int         numberOfInstructions = 0;
     instruction ins[absMaxInsCount];
     int32_t     values[absMaxInsCount];
 
-    // STEP 4: Generate instructions to load high32
+    int32_t high32  = (imm >> (y - 31));
+    int32_t upper20 = high32 >> 12;
+    int32_t lower12 = (high32 << 20) >> 20;
 
-    uint32_t upper    = (high32 >> 12) & WordMask(20);
-    uint32_t lower    = high32 & WordMask(12);
-    int      lowerMsb = (lower >> 11) & 0b1;
-    if (lowerMsb == 1)
-    {
-        upper += 1;
-        upper &= WordMask(20);
-    }
-    if (upper != 0)
+    int     numRemainderBits = y - 31;
+    int32_t remainder        = (imm << (32 - numRemainderBits)) & WordMask(32);
+
+    if (!((upper20 == 0) || (upper20 == -1 && lower12 < 0)))
     {
         ins[numberOfInstructions]    = INS_lui;
-        values[numberOfInstructions] = ((upper >> 19) & 0b1) ? (upper + 0xFFF00000) : upper;
+        values[numberOfInstructions] = upper20;
         numberOfInstructions += 1;
     }
-    if (lower != 0)
+
+    bool defer_addiw = (lower12 == -1 && remainder < 0);
+
+    if (!((lower12 == 0) || defer_addiw))
     {
         ins[numberOfInstructions]    = INS_addiw;
-        values[numberOfInstructions] = lower;
+        values[numberOfInstructions] = lower12;
         numberOfInstructions += 1;
+        if (lower12 < 0)
+        {
+            values[numberOfInstructions - 2] += 1;
+            // Sign-extend the 20-bit immediate value, since the sign bit might change.
+            values[numberOfInstructions - 2] = (values[numberOfInstructions - 2] << (32 - 20)) >> (32 - 20);
+        }
     }
 
-    // STEP 5: Generate instructions to load offset in 11-bits chunks
-
-    int chunkLsbPos = (x < 11) ? 0 : (x - 11);
-    int shift       = (x < 11) ? x : 11;
-    int chunkMask   = (x < 11) ? WordMask((uint8_t)x) : WordMask(11);
-    while (true)
+    while (numRemainderBits > 0)
     {
-        uint32_t chunk = (offset >> chunkLsbPos) & chunkMask;
+        int32_t shift;
+        int     remainderMsb = (remainder >> 31) & 0b1;
+        if (remainderMsb == 0)
+        {
+            shift = 11 + BitOperations::LeadingZeroCount((uint32_t)remainder);
+        }
+        else
+        {
+            shift = 11 + BitOperations::LeadingZeroCount((uint32_t)~remainder);
+        }
+        shift         = (numRemainderBits < shift) ? numRemainderBits : shift;
+        int32_t chunk = (remainder >> (32 - shift));
 
-        if (chunk != 0)
+        if (numRemainderBits < 12 && !defer_addiw)
         {
-            /* We could move our 11 bit chunk window to the right for as many as the
-             * leading zeros.*/
-            int leadingZerosOn11BitsChunk = 11 - (32 - BitOperations::LeadingZeroCount(chunk));
-            if (leadingZerosOn11BitsChunk > 0)
-            {
-                int maxAdditionalShift =
-                    (chunkLsbPos < leadingZerosOn11BitsChunk) ? chunkLsbPos : leadingZerosOn11BitsChunk;
-                chunkLsbPos -= maxAdditionalShift;
-                shift += maxAdditionalShift;
-                chunk = (offset >> chunkLsbPos) & chunkMask;
-            }
+            chunk &= WordMask(numRemainderBits);
+        }
+
+        numberOfInstructions += 1;
+        if (numberOfInstructions > insCountLimit)
+        {
+            break;
+        }
+        ins[numberOfInstructions - 1]    = INS_slli;
+        values[numberOfInstructions - 1] = shift;
 
-            numberOfInstructions += 2;
+        if (chunk != 0)
+        {
+            numberOfInstructions += 1;
             if (numberOfInstructions > insCountLimit)
             {
                 break;
             }
-            ins[numberOfInstructions - 2]    = INS_slli;
-            values[numberOfInstructions - 2] = shift;
-            if (isSubtractMode)
-            {
-                ins[numberOfInstructions - 1]    = INS_addi;
-                values[numberOfInstructions - 1] = -(int32_t)chunk;
-            }
-            else
-            {
-                ins[numberOfInstructions - 1]    = INS_addi;
-                values[numberOfInstructions - 1] = chunk;
-            }
-            shift = 0;
+            ins[numberOfInstructions - 1]    = INS_addi;
+            values[numberOfInstructions - 1] = chunk;
         }
-        if (chunkLsbPos == 0)
-        {
-            break;
-        }
-        shift += (chunkLsbPos < 11) ? chunkLsbPos : 11;
-        chunkMask = (chunkLsbPos < 11) ? (chunkMask >> (11 - chunkLsbPos)) : WordMask(11);
-        chunkLsbPos -= (chunkLsbPos < 11) ? chunkLsbPos : 11;
-    }
-    if (shift > 0)
-    {
-        numberOfInstructions += 1;
-        if (numberOfInstructions <= insCountLimit)
+
+        if (chunk >> 11)
         {
-            ins[numberOfInstructions - 1]    = INS_slli;
-            values[numberOfInstructions - 1] = shift;
+            values[numberOfInstructions - 3] += 1;
+            // Sign-extend the 12-bit immediate value, since the sign bit might change.
+            values[numberOfInstructions - 3] = (values[numberOfInstructions - 3] << (32 - 12)) >> (32 - 12);
         }
-    }
 
-    // STEP 6: Determine whether to use emitDataConst or emit generated instructions
+        remainder = remainder << shift;
+        numRemainderBits -= shift;
+    }
 
     if (numberOfInstructions <= insCountLimit)
     {
         instrDescLoadImm* id = static_cast<instrDescLoadImm*>(emitNewInstrLoadImm(size, originalImm));
         id->idReg1(reg);
         memcpy(id->ins, ins, sizeof(instruction) * numberOfInstructions);
         memcpy(id->values, values, sizeof(int32_t) * numberOfInstructions);
-        if (utilizeSRLI)
+        if (srliShiftAmount != 0)
         {
             numberOfInstructions += 1;
             assert(numberOfInstructions < absMaxInsCount);