
Commit 5cfcb01

jacobly0 authored and andrewrk committed
llvm: convert @divFloor and @mod to forms llvm will recognize
On x86_64, the `@divFloor` change is a strict improvement, and the `@mod` change adds one zero-latency instruction. In return, once we upgrade to LLVM 20, when the optimizer discovers one of these operations has a power-of-two constant rhs, it will be able to optimize the entire operation into an `ashr` or `and`, respectively.

|                 | #I | CPL |  CPT |
| old `@divFloor` |  8 |  15 | .143 |
| new `@divFloor` |  7 |  15 | .148 |
| old `@mod`      |  9 |  17 | .134 |
| new `@mod`      | 10 |  17 | .138 |

(rip llvm scheduler)
1 parent b074fb7 commit 5cfcb01
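
As an illustration of the rewrite LLVM 20 can then perform, here is a minimal Zig sketch of the arithmetic identity (assuming an `i32` operand and a constant power-of-two divisor; this is not compiler output): floor division by a power of two is an arithmetic shift right, and `@mod` by a power of two keeps only the low bits.

```zig
const std = @import("std");

test "power-of-two divisor identities" {
    const x: i32 = -1234;
    // @divFloor rounds toward negative infinity, exactly like an
    // arithmetic shift right (LLVM `ashr`) when the divisor is 2^3.
    try std.testing.expectEqual(@divFloor(x, 8), x >> 3);
    // @mod with a positive divisor yields a result in [0, 8), which for a
    // power of two is just the low bits of the two's complement value
    // (LLVM `and`).
    try std.testing.expectEqual(@mod(x, 8), x & 7);
}
```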

1 file changed (+54 −19 lines)

src/codegen/llvm.zig
@@ -8571,19 +8571,37 @@ pub const FuncGen = struct {
         }
         if (scalar_ty.isSignedInt(zcu)) {
             const inst_llvm_ty = try o.lowerType(inst_ty);
-            const bit_size_minus_one = try o.builder.splatValue(inst_llvm_ty, try o.builder.intConst(
+
+            const ExpectedContents = [std.math.big.int.calcTwosCompLimbCount(256)]std.math.big.Limb;
+            var stack align(@max(
+                @alignOf(std.heap.StackFallbackAllocator(0)),
+                @alignOf(ExpectedContents),
+            )) = std.heap.stackFallback(@sizeOf(ExpectedContents), self.gpa);
+            const allocator = stack.get();
+
+            const scalar_bits = inst_llvm_ty.scalarBits(&o.builder);
+            var smin_big_int: std.math.big.int.Mutable = .{
+                .limbs = try allocator.alloc(
+                    std.math.big.Limb,
+                    std.math.big.int.calcTwosCompLimbCount(scalar_bits),
+                ),
+                .len = undefined,
+                .positive = undefined,
+            };
+            defer allocator.free(smin_big_int.limbs);
+            smin_big_int.setTwosCompIntLimit(.min, .signed, scalar_bits);
+            const smin = try o.builder.splatValue(inst_llvm_ty, try o.builder.bigIntConst(
                 inst_llvm_ty.scalarType(&o.builder),
-                inst_llvm_ty.scalarBits(&o.builder) - 1,
+                smin_big_int.toConst(),
             ));
 
-            const div = try self.wip.bin(.sdiv, lhs, rhs, "");
-            const rem = try self.wip.bin(.srem, lhs, rhs, "");
-            const div_sign = try self.wip.bin(.xor, lhs, rhs, "");
-            const div_sign_mask = try self.wip.bin(.ashr, div_sign, bit_size_minus_one, "");
-            const zero = try o.builder.zeroInitValue(inst_llvm_ty);
-            const rem_nonzero = try self.wip.icmp(.ne, rem, zero, "");
-            const correction = try self.wip.select(.normal, rem_nonzero, div_sign_mask, zero, "");
-            return self.wip.bin(.@"add nsw", div, correction, "");
+            const div = try self.wip.bin(.sdiv, lhs, rhs, "divFloor.div");
+            const rem = try self.wip.bin(.srem, lhs, rhs, "divFloor.rem");
+            const rhs_sign = try self.wip.bin(.@"and", rhs, smin, "divFloor.rhs_sign");
+            const rem_xor_rhs_sign = try self.wip.bin(.xor, rem, rhs_sign, "divFloor.rem_xor_rhs_sign");
+            const need_correction = try self.wip.icmp(.ugt, rem_xor_rhs_sign, smin, "divFloor.need_correction");
+            const correction = try self.wip.cast(.sext, need_correction, inst_llvm_ty, "divFloor.correction");
+            return self.wip.bin(.@"add nsw", div, correction, "divFloor");
         }
         return self.wip.bin(.udiv, lhs, rhs, "");
     }
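
For reference, the correction logic this hunk emits can be modeled in plain Zig for an `i32` scalar. This is a sketch with hypothetical names mirroring the LLVM values above, not the generated vector IR: the truncating quotient needs a `-1` correction exactly when the remainder is nonzero and disagrees in sign with `rhs`, and the single unsigned compare against `smin` tests both conditions at once.

```zig
const std = @import("std");

// Hypothetical scalar model (i32) of the new signed @divFloor lowering;
// the names mirror the LLVM values built in the hunk above.
fn divFloorModel(lhs: i32, rhs: i32) i32 {
    const smin: u32 = 0x8000_0000; // bit pattern of minInt(i32): the sign-bit mask
    const div = @divTrunc(lhs, rhs); // sdiv
    const rem = lhs - rhs * div; // srem (remainder of truncating division)
    const rhs_sign = @as(u32, @bitCast(rhs)) & smin; // and
    const rem_xor_rhs_sign = @as(u32, @bitCast(rem)) ^ rhs_sign; // xor
    // Unsigned `> smin` holds exactly when rem != 0 and rem, rhs have
    // opposite sign bits -- the only case where truncation rounded up.
    const need_correction = rem_xor_rhs_sign > smin; // icmp ugt
    const correction: i32 = if (need_correction) -1 else 0; // sext of the i1
    return div + correction; // add nsw
}

test "divFloorModel matches @divFloor" {
    try std.testing.expectEqual(@divFloor(@as(i32, -7), 2), divFloorModel(-7, 2));
    try std.testing.expectEqual(@divFloor(@as(i32, 7), -2), divFloorModel(7, -2));
    try std.testing.expectEqual(@divFloor(@as(i32, -8), 2), divFloorModel(-8, 2));
}
```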
@@ -8642,19 +8660,36 @@ pub const FuncGen = struct {
             return self.wip.select(fast, ltz, c, a, "");
         }
         if (scalar_ty.isSignedInt(zcu)) {
-            const bit_size_minus_one = try o.builder.splatValue(inst_llvm_ty, try o.builder.intConst(
+            const ExpectedContents = [std.math.big.int.calcTwosCompLimbCount(256)]std.math.big.Limb;
+            var stack align(@max(
+                @alignOf(std.heap.StackFallbackAllocator(0)),
+                @alignOf(ExpectedContents),
+            )) = std.heap.stackFallback(@sizeOf(ExpectedContents), self.gpa);
+            const allocator = stack.get();
+
+            const scalar_bits = inst_llvm_ty.scalarBits(&o.builder);
+            var smin_big_int: std.math.big.int.Mutable = .{
+                .limbs = try allocator.alloc(
+                    std.math.big.Limb,
+                    std.math.big.int.calcTwosCompLimbCount(scalar_bits),
+                ),
+                .len = undefined,
+                .positive = undefined,
+            };
+            defer allocator.free(smin_big_int.limbs);
+            smin_big_int.setTwosCompIntLimit(.min, .signed, scalar_bits);
+            const smin = try o.builder.splatValue(inst_llvm_ty, try o.builder.bigIntConst(
                 inst_llvm_ty.scalarType(&o.builder),
-                inst_llvm_ty.scalarBits(&o.builder) - 1,
+                smin_big_int.toConst(),
             ));
 
-            const rem = try self.wip.bin(.srem, lhs, rhs, "");
-            const div_sign = try self.wip.bin(.xor, lhs, rhs, "");
-            const div_sign_mask = try self.wip.bin(.ashr, div_sign, bit_size_minus_one, "");
-            const rhs_masked = try self.wip.bin(.@"and", rhs, div_sign_mask, "");
+            const rem = try self.wip.bin(.srem, lhs, rhs, "mod.rem");
+            const rhs_sign = try self.wip.bin(.@"and", rhs, smin, "mod.rhs_sign");
+            const rem_xor_rhs_sign = try self.wip.bin(.xor, rem, rhs_sign, "mod.rem_xor_rhs_sign");
+            const need_correction = try self.wip.icmp(.ugt, rem_xor_rhs_sign, smin, "mod.need_correction");
             const zero = try o.builder.zeroInitValue(inst_llvm_ty);
-            const rem_nonzero = try self.wip.icmp(.ne, rem, zero, "");
-            const correction = try self.wip.select(.normal, rem_nonzero, rhs_masked, zero, "");
-            return self.wip.bin(.@"add nsw", rem, correction, "");
+            const correction = try self.wip.select(.normal, need_correction, rhs, zero, "mod.correction");
+            return self.wip.bin(.@"add nsw", correction, rem, "mod");
         }
         return self.wip.bin(.urem, lhs, rhs, "");
     }
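
The `@mod` hunk reuses the same predicate; a companion sketch under the same assumptions (plain Zig, `i32`, hypothetical names) shows why adding `rhs` to the `srem` result when the predicate holds produces the floored modulus, whose sign follows `rhs`.

```zig
const std = @import("std");

// Hypothetical scalar model (i32) of the new signed @mod lowering;
// same predicate as the @divFloor model, different correction.
fn modModel(lhs: i32, rhs: i32) i32 {
    const smin: u32 = 0x8000_0000; // sign-bit mask
    const rem = lhs - rhs * @divTrunc(lhs, rhs); // srem
    const rhs_sign = @as(u32, @bitCast(rhs)) & smin; // and
    const rem_xor_rhs_sign = @as(u32, @bitCast(rem)) ^ rhs_sign; // xor
    const need_correction = rem_xor_rhs_sign > smin; // icmp ugt
    // A nonzero rem with the wrong sign is moved into rhs's half-open
    // range by adding one full rhs; otherwise rem is already the answer.
    const correction: i32 = if (need_correction) rhs else 0; // select
    return correction + rem; // add nsw
}

test "modModel follows the sign of rhs" {
    try std.testing.expectEqual(@as(i32, 1), modModel(-7, 2));
    try std.testing.expectEqual(@as(i32, -1), modModel(7, -2));
    try std.testing.expectEqual(@as(i32, 0), modModel(-8, 2));
}
```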
