Finish removing the BigInts from * for FD{Int128}! #94

Open — wants to merge 33 commits into base: master (changes shown from 14 of 33 commits)
a1c1711
Use Int256 to avoid BigInt in FD operations.
NHDaly Jun 12, 2024
7756238
Further reduce BigInts by skipping a `rem()` in iseven
NHDaly Jun 12, 2024
78e45dc
Fix ambiguity in _widemul(Int256, UInt256)
NHDaly Jun 12, 2024
879c602
Bump patch version number
NHDaly Jun 12, 2024
a245651
Add compat for BitIntegers
NHDaly Jun 12, 2024
4bd8c45
Finish removing the BigInts from * for FD{Int128}!
NHDaly Jun 12, 2024
4e53f3d
Support older versions of julia
NHDaly Jun 13, 2024
dfd41b1
Comments
NHDaly Jun 13, 2024
efee91b
Disable fldmod-by-const tests on older julia
NHDaly Jun 13, 2024
a03d754
Fix one other case of iseven allocating a BigInt
NHDaly Jun 13, 2024
4ed8ebf
Apply this optimization to FD{Int64} as well.
NHDaly Jun 13, 2024
3f39b8a
Adjust to run for all integer types!
NHDaly Jun 13, 2024
20c66f2
Clarify the `_unsigned(x)` methods with comments
NHDaly Jun 13, 2024
f2958ba
Apply suggestions from code review
NHDaly Jun 13, 2024
0019bb0
Update src/FixedPointDecimals.jl
NHDaly Jun 17, 2024
4a53703
Add extensive tests for multiplication correctness, to cover the new …
NHDaly Jun 19, 2024
27a3e0f
Named testsets to make failures easier to identify
NHDaly Jun 19, 2024
e4cb73b
Fix off-by-one error in rounding truncation in calculate_inverse_coeff()
NHDaly Jun 19, 2024
73f6547
Add some comments and requires
NHDaly Jun 19, 2024
4e9fdd6
Copy/pasted definition for unsigned numbers straight from the book
NHDaly Jun 19, 2024
188933d
Have `magicgu` support arbitrary integer sizes
NHDaly Jun 20, 2024
f6d375c
Use the formulas from Hacker's delight for both signed and unsigned Ints
Drvi Jun 21, 2024
b79c873
.
Drvi Jun 25, 2024
4f4d17a
.
Drvi Jun 26, 2024
eaeaddf
Restrict back to just Int128 & Int256 for custom div_by_const
NHDaly Jul 10, 2024
a2dcf56
It turns out that in newer versions of julia, you should call fldmod,…
NHDaly Jul 10, 2024
ef578e9
Reorganize the functions to be top-down
NHDaly Jul 10, 2024
8df68b5
More thorough tests for flmdod_by_const
NHDaly Jul 10, 2024
66e1ecb
Merge branch 'master' into nhd-Int128-fastmul-noallocs
NHDaly Aug 12, 2024
fd096ac
Bump patch version number
NHDaly Aug 12, 2024
e147f5c
Add _widemul unit test
NHDaly Aug 13, 2024
f830c3c
Change "unreachable" comment to an `@assert false`
NHDaly Aug 13, 2024
71ba82e
add test comment
NHDaly Sep 10, 2024
4 changes: 3 additions & 1 deletion Project.toml
@@ -1,13 +1,15 @@
name = "FixedPointDecimals"
uuid = "fb4d412d-6eee-574d-9565-ede6634db7b0"
authors = ["Fengyang Wang <[email protected]>", "Curtis Vogt <[email protected]>"]
version = "0.5.2"
version = "0.5.3"

[deps]
BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1"
Parsers = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"

[compat]
Parsers = "2.7"
BitIntegers = "0.3.1"
julia = "1.6"

[extras]
63 changes: 46 additions & 17 deletions src/FixedPointDecimals.jl
@@ -35,8 +35,23 @@ export checked_abs, checked_add, checked_cld, checked_div, checked_fld,
checked_mod, checked_mul, checked_neg, checked_rem, checked_sub

using Base: decompose, BitInteger

using BitIntegers: BitIntegers, Int256, Int512, UInt256, UInt512
import Parsers

# The effects system in newer versions of Julia is necessary for the fldmod_by_const
# optimization to take effect without adverse performance impacts.
# For older versions of Julia, we fall back to the plain fldmod implementation, which is
# already fast for all FixedDecimals of size <= 64 bits.
if isdefined(Base, Symbol("@assume_effects"))
include("fldmod-by-const.jl")
else
# We force-inline this, to allow the fldmod operation to be specialized for a constant
# second argument (replacing divide-by-power-of-ten instructions with the cheaper
# multiply-by-inverse-coefficient). Sadly this doesn't work for Int128.
@inline fldmod_by_const(x,y) = (fld(x,y), mod(x,y))
end

# floats that support fma and are roughly IEEE-like
const FMAFloat = Union{Float16, Float32, Float64, BigFloat}

@@ -118,6 +133,23 @@ function __init__()
return
end

# Custom widemul implementation to avoid the cost of widening to BigInt.
# FD{Int128} operations should widen to 256 bits internally, rather than to a BigInt.
const BitInteger128 = Union{Int128, UInt128}
_widemul(x, y) = _widen(x) * _widen(y)
_widemul(x::Signed,y::Unsigned) = _widen(x) * signed(_widen(y))
_widemul(x::Unsigned,y::Signed) = signed(_widen(x)) * _widen(y)

# Custom widen implementation to avoid the cost of widening to BigInt.
# FD{Int128} operations should widen to 256 bits internally, rather than to a BigInt.
_widen(::Type{Int128}) = Int256
_widen(::Type{UInt128}) = UInt256
_widen(::Type{Int256}) = BitIntegers.Int512
_widen(::Type{UInt256}) = BitIntegers.UInt512
_widen(t::Type) = widen(t)
_widen(x::T) where {T} = (_widen(T))(x)
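The mixed-signedness methods above can be sketched with Base's own `widen` standing in for `_widen` (which additionally maps Int128 to Int256, and so on); `widemul_demo` is a hypothetical name used only for illustration:

```julia
# Sketch of the mixed-sign widening multiply, using Base `widen` in place of the
# package's `_widen` (which additionally covers the Int256/Int512 cases).
widemul_demo(x, y) = widen(x) * widen(y)
# Mixed signedness: widen first, then reinterpret the unsigned side as signed,
# so the extra high bits absorb the conversion and the product is computed in
# the signed domain.
widemul_demo(x::Signed, y::Unsigned) = widen(x) * signed(widen(y))
widemul_demo(x::Unsigned, y::Signed) = signed(widen(x)) * widen(y)

widemul_demo(Int64(-3), UInt64(5))  # -15, as an Int128
```

Widening before the sign reinterpretation is what makes the mixed case safe: `signed(UInt64(x))` could overflow, but `signed(UInt128(x))` for a `UInt64` input cannot.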


(::Type{T})(x::Real) where {T <: FD} = convert(T, x)

floattype(::Type{<:FD{T}}) where {T<:Union{Int8, UInt8, Int16, UInt16}} = Float32
@@ -157,15 +189,18 @@ function _round_to_nearest(quotient::T,
divisor::T,
::RoundingMode{M}=RoundNearest) where {T <: Integer, M}
halfdivisor = divisor >> 1
if iseven(divisor) && remainder == halfdivisor
# PERF Note: Only need the last bit to check iseven, and default iseven(Int256)
# allocates, so we truncate first.
_iseven(x) = (x & 0x1) == 0
if _iseven(divisor) && remainder == halfdivisor
# `:NearestTiesAway` will tie away from zero, e.g. -8.5 ->
# -9. `:NearestTiesUp` will always ties towards positive
# infinity. `:Nearest` will tie towards the nearest even
# integer.
if M == :NearestTiesAway
ifelse(quotient < zero(quotient), quotient, quotient + one(quotient))
elseif M == :Nearest
ifelse(iseven(quotient), quotient, quotient + one(quotient))
ifelse(_iseven(quotient), quotient, quotient + one(quotient))
else
quotient + one(quotient)
end
@@ -177,18 +212,12 @@
end
_round_to_nearest(q, r, d, m=RoundNearest) = _round_to_nearest(promote(q, r, d)..., m)
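The ties-to-even branch above can be seen on a concrete scaled product; this standalone sketch (hypothetical `round_nearest_even` name) mirrors the logic for the common `r >= 0` case produced by `fldmod`:

```julia
# Round x/d to the nearest integer given q, r = fldmod(x, d), tying to even.
function round_nearest_even(q::T, r::T, d::T) where {T<:Integer}
    half = d >> 1
    if iseven(d) && r == half
        iseven(q) ? q : q + one(T)   # exact tie: pick the even quotient
    elseif r > half
        q + one(T)
    else
        q
    end
end

# 1.5 * 1.5 at f = 1: internal ints 15 * 15 = 225, coefficient 10.
round_nearest_even(fldmod(15 * 15, 10)..., 10)   # 22, i.e. 2.25 rounds down to 2.2
```

Because `fldmod` always yields `r` in `[0, d)`, the same comparison against `half` handles negative products too: `-2.25` decomposes as `(-23, 5)` and the tie resolves to the even quotient `-22`.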

# In many of our calls to fldmod, `y` is a constant (the coefficient, 10^f). However, since
# `fldmod` is sometimes not being inlined, that constant information is not available to the
# optimizer. We need an inlined version of fldmod so that the compiler can replace expensive
# divide-by-power-of-ten instructions with the cheaper multiply-by-inverse-coefficient.
@inline fldmodinline(x,y) = (fld(x,y), mod(x,y))

# multiplication rounds to nearest even representation
# TODO: can we use floating point to speed this up? after we build a
# correctness test suite.
function Base.:*(x::FD{T, f}, y::FD{T, f}) where {T, f}
powt = coefficient(FD{T, f})
quotient, remainder = fldmodinline(widemul(x.i, y.i), powt)
quotient, remainder = fldmod_by_const(_widemul(x.i, y.i), powt)
reinterpret(FD{T, f}, _round_to_nearest(quotient, remainder, powt))
end

@@ -215,12 +244,12 @@ function Base.round(x::FD{T, f},
RoundingMode{:NearestTiesUp},
RoundingMode{:NearestTiesAway}}=RoundNearest) where {T, f}
powt = coefficient(FD{T, f})
quotient, remainder = fldmodinline(x.i, powt)
quotient, remainder = fldmod_by_const(x.i, powt)
FD{T, f}(_round_to_nearest(quotient, remainder, powt, m))
end
function Base.ceil(x::FD{T, f}) where {T, f}
powt = coefficient(FD{T, f})
quotient, remainder = fldmodinline(x.i, powt)
quotient, remainder = fldmod_by_const(x.i, powt)
if remainder > 0
FD{T, f}(quotient + one(quotient))
else
@@ -416,7 +445,7 @@ function Base.checked_sub(x::T, y::T) where {T<:FD}
end
function Base.checked_mul(x::FD{T,f}, y::FD{T,f}) where {T<:Integer,f}
powt = coefficient(FD{T, f})
quotient, remainder = fldmodinline(widemul(x.i, y.i), powt)
quotient, remainder = fldmod_by_const(_widemul(x.i, y.i), powt)
v = _round_to_nearest(quotient, remainder, powt)
typemin(T) <= v <= typemax(T) || Base.Checked.throw_overflowerr_binaryop(:*, x, y)
return reinterpret(FD{T, f}, T(v))
@@ -474,7 +503,7 @@ checked_rdiv(x::FD, y::FD) = checked_rdiv(promote(x, y)...)

function checked_rdiv(x::FD{T,f}, y::FD{T,f}) where {T<:Integer,f}
powt = coefficient(FD{T, f})
quotient, remainder = fldmod(widemul(x.i, powt), y.i)
quotient, remainder = fldmod(_widemul(x.i, powt), y.i)
v = _round_to_nearest(quotient, remainder, y.i)
typemin(T) <= v <= typemax(T) || Base.Checked.throw_overflowerr_binaryop(:/, x, y)
return reinterpret(FD{T, f}, v)
@@ -484,8 +513,8 @@ end
# FixedDecimal.
function checked_rdiv(x::Integer, y::FD{T, f}) where {T<:Integer, f}
powt = coefficient(FD{T, f})
powtsq = widemul(powt, powt)
quotient, remainder = fldmod(widemul(x, powtsq), y.i)
powtsq = _widemul(powt, powt)
quotient, remainder = fldmod(_widemul(x, powtsq), y.i)
v = _round_to_nearest(quotient, remainder, y.i)
typemin(T) <= v <= typemax(T) || Base.Checked.throw_overflowerr_binaryop(:/, x, y)
reinterpret(FD{T, f}, v)
@@ -722,7 +751,7 @@ NOTE: This function is expensive, since it contains a while-loop, but it is actu
This function does not have or depend on any side-effects.
"""
function max_exp10(::Type{T}) where {T <: Integer}
W = widen(T)
W = _widen(T)
type_max = W(typemax(T))

powt = one(W)
@@ -759,4 +788,4 @@ value(fd::FD) = fd.i
# for generic hashing
Base.decompose(fd::FD) = decompose(Rational(fd))

end
end # module
141 changes: 141 additions & 0 deletions src/fldmod-by-const.jl
@@ -0,0 +1,141 @@
# NOTE: Surprisingly, even though LLVM implements a version of this optimization on its own
# for smaller integer sizes (<=64-bits), using the code in this file produces faster
# multiplications for *all* types of integers. So we use our custom fldmod_by_const for all
# bit integer types.
# Before:
# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int32,3}(1.234))
# 84.959 μs (0 allocations: 0 bytes)
# FixedDecimal{Int32,3}(1700943.280)
#
# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int64,3}(1.234))
# 247.709 μs (0 allocations: 0 bytes)
# FixedDecimal{Int64,3}(4230510070790917.029)
#
# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int128,3}(1.234))
# 4.077 ms (160798 allocations: 3.22 MiB)
# FixedDecimal{Int128,3}(-66726338547984585007169386718143307.324)
#
# After:
# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int32,3}(1.234))
# 68.416 μs (0 allocations: 0 bytes)
# FixedDecimal{Int32,3}(1700943.280)
#
# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int64,3}(1.234))
# 106.125 μs (0 allocations: 0 bytes)
# FixedDecimal{Int64,3}(4230510070790917.029)
#
# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int128,3}(1.234))
# 204.125 μs (0 allocations: 0 bytes)
# FixedDecimal{Int128,3}(-66726338547984585007169386718143307.324)

"""
    ShouldUseCustomFldmodByConst(::Type{<:MyCustomIntType}) = true

A trait controlling opt-in to the custom `fldmod_by_const` implementation. To use it for a
given integer type, define this overload for that type.
You will also need to implement some parts of the interface below, including `_widen()`.
"""
ShouldUseCustomFldmodByConst(::Type{<:Base.BitInteger}) = true
ShouldUseCustomFldmodByConst(::Type{<:Union{Int256,UInt256}}) = true
ShouldUseCustomFldmodByConst(::Type) = false

@inline function fldmod_by_const(x, y)
if ShouldUseCustomFldmodByConst(typeof(x))
# For large Int types, LLVM doesn't optimize well, so we use a custom implementation
# of fldmod, which extends that optimization to those larger integer types.
d = fld_by_const(x, Val(y))
return d, manual_mod(promote(x, y, d)...)
else
        # For other integers, LLVM might be able to correctly optimize away the division, if
        # it knows it's dividing by a const. We cannot call `Base.fldmod` since it's not
        # inlined, so here we have explicitly inlined it instead.
return (fld(x,y), mod(x,y))
end
end

# Calculate fld(x,y) when y is a Val constant.
# The implementation for fld_by_const was lifted directly from Base.fld(x,y), except that
# it uses `div_by_const` instead of `div`.
fld_by_const(x::T, y::Val{C}) where {T<:Unsigned, C} = div_by_const(x, y)
function fld_by_const(x::T, y::Val{C}) where {T<:Signed, C}
d = div_by_const(x, y)
return d - (signbit(x ⊻ C) & (d * C != x))
end

# Calculate `mod(x,y)` after you've already acquired quotient, the result of `fld(x,y)`.
# REQUIRES:
# - `y != -1`
@inline function manual_mod(x::T, y::T, quotient::T) where T<:Integer
return x - quotient * y
end
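The identity behind `manual_mod` is `x == fld(x, y) * y + mod(x, y)`: once the quotient is known, the remainder costs one multiply and one subtract instead of a second division. A quick sketch of that identity (demo name, not the package API):

```julia
# mod(x, y), recovered from an already-computed quotient q = fld(x, y).
manual_mod_demo(x, y, q) = x - q * y

# Holds for either sign of x and y (excluding y == -1, as required above).
for (x, y) in ((7, 3), (-7, 3), (7, -3), (1234, 10))
    q = fld(x, y)
    @assert manual_mod_demo(x, y, q) == mod(x, y)
end
```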

# This function is based on the native code produced by the following:
# @code_native ((x)->div(x, 100))(Int64(2))
function div_by_const(x::T, ::Val{C}) where {T, C}
# These checks will be compiled away during specialization.
# While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these
# checks allow this function to work for any `C > 0`, in case that's useful in the
# future.
if C == 1
return x
elseif ispow2(C)
return div(x, C) # Will already do the right thing
elseif C <= 0
throw(DomainError("C must be > 0"))
end
# Calculate the magic number 2^N/C. Note that this is computed statically, not at
# runtime.
inverse_coeff, toshift = calculate_inverse_coeff(T, C)
    # Compute the upper half of widemul(x, 2^nbits(T)/C).
    # Keeping only the upper half divides by 2^nbits(T), cancelling the 2^nbits(T)
    # factor in the magic number, so the result equals x/C.
out = mul_hi(x, inverse_coeff)
# This condition will be compiled away during specialization.
if T <: Signed
# Because our magic number has a leading one (since we shift all-the-way left), the
# result is negative if it's Signed. We add x to give us the positive equivalent.
out += x
signshift = (nbits(x) - 1)
isnegative = T(out >>> signshift) # 1 if < 0 else 0 (Unsigned bitshift to read top bit)
end
    # Undo the bitshifts used to calculate the invcoeff magic number with maximum precision.
out = out >> toshift
if T <: Signed
out = out + isnegative
end
return T(out)
end
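For a concrete feel of the multiply-high trick, here is unsigned division by the constant 10 using the classic magic pair `(0xCCCCCCCCCCCCCCCD, 3)` — a standalone sketch with hypothetical names, not the package's own code path:

```julia
# High 64 bits of the full 128-bit product, as in mul_hi above.
mulhi_u64(x::UInt64, y::UInt64) = (widemul(x, y) >> 64) % UInt64

# x ÷ 10 computed as floor(x * ceil(2^67/10) / 2^67): multiply-high, then shift by 3.
div10(x::UInt64) = mulhi_u64(x, 0xCCCCCCCCCCCCCCCD) >> 3

div10(UInt64(12345))   # 1234
```

This particular pair is exact for every `UInt64` input (the rounding error in `ceil(2^67/10)` is small enough never to perturb the quotient), which is exactly the property `calculate_inverse_coeff` has to guarantee for each `(T, C)` it produces.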

Base.@assume_effects :foldable function calculate_inverse_coeff(::Type{T}, C) where {T}
# First, calculate 2^nbits(T)/C
# We shift away leading zeros to preserve the most precision when we use it to multiply
# in the next step. At the end, we will shift the final answer back to undo this
# operation (which is why we need to return `toshift`).
# Note, also, that we calculate invcoeff at double-precision so that the left-shift
# doesn't leave trailing zeros. We truncate to only the upper-half before returning.
UT = _unsigned(T)
invcoeff = typemax(_widen(UT)) ÷ C
toshift = leading_zeros(invcoeff)
invcoeff = invcoeff << toshift
# Now, truncate to only the upper half of invcoeff, after we've shifted. Instead of
# bitshifting, we round to maintain precision. (This is needed to prevent off-by-ones.)
# -- This is equivalent to `invcoeff = T(invcoeff >> sizeof(T))`, except rounded. --
invcoeff = _round_to_nearest(fldmod(invcoeff, typemax(UT))..., typemax(UT)) % T
return invcoeff, toshift
end
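The `(Int64, 100)` pair `(-6640827866535438581, 6)` asserted in the test file can be sanity-checked by plugging it into the signed multiply-high recipe from `div_by_const`; this standalone sketch (hypothetical name) reproduces truncating division by 100:

```julia
# Signed truncating division by 100 via the magic pair (M, s) = (-6640827866535438581, 6).
function sdiv100_demo(x::Int64)
    M = -6640827866535438581
    # Multiply-high; M's top bit is set (it is negative), so add x back, as the
    # Signed branch of div_by_const does.
    t = Int64(widemul(x, M) >> 64) + x
    # Arithmetic shift, then add 1 for negative inputs (the unsigned top-bit read).
    (t >> 6) + (t >>> 63)
end

sdiv100_demo(Int64(-250))   # -2
```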

function mul_hi(x::T, y::T) where T
xy = _widemul(x, y) # support Int256 -> Int512 (!!)
(xy >> nbits(T)) % T
end

# Annoyingly, Unsigned(T) isn't defined for BitIntegers types:
# https://github.com/rfourquet/BitIntegers.jl/pull/2
# Note: We do not need this for Int512, since we only widen to 512 _after_ calling
# _unsigned, above. This code is only for supporting the built-in integer types, which only
# go up to 128-bits (widened twice to 512). If a user wants to extend FixedDecimals for
# other integer types, they will need to add methods to either _unsigned or unsigned.
_unsigned(x) = unsigned(x)
_unsigned(::Type{Int256}) = UInt256
_unsigned(::Type{UInt256}) = UInt256

nbits(x) = sizeof(x) * 8
58 changes: 58 additions & 0 deletions test/fldmod-by-const_tests.jl
@@ -0,0 +1,58 @@
using Test
using FixedPointDecimals

@testset "calculate_inverse_coeff signed" begin
using FixedPointDecimals: calculate_inverse_coeff

# The correct magic number here comes from investigating the following native code
# produced on an m2 aarch64 macbook pro:
# @code_native ((x)->fld(x,100))(2)
# ...
# mov x8, #55051
# movk x8, #28835, lsl #16
# movk x8, #2621, lsl #32
# movk x8, #41943, lsl #48
# Where:
# julia> 55051 | 28835 << 16 | 2621 << 32 | 41943 << 48
# -6640827866535438581
@test calculate_inverse_coeff(Int64, 100) == (-6640827866535438581, 6)

# Same for the tests below:

# (LLVM's magic number is shifted one bit less, then they shift by 2, instead of 3,
# but the result is the same.)
@test calculate_inverse_coeff(Int64, 10) == (7378697629483820647 << 1, 3)

@test calculate_inverse_coeff(Int64, 1) == (1, 0)
end

@testset "calculate_inverse_coeff signed 4" begin
using FixedPointDecimals: calculate_inverse_coeff

# Same here, our magic number is shifted 2 bits more than LLVM's
@test calculate_inverse_coeff(UInt64, 100) == (0xa3d70a3d70a3d70b, 6)

@test calculate_inverse_coeff(UInt64, 1) == (UInt64(0x1), 0)
end

@testset "div_by_const" begin
vals = [2432, 100, 0x1, Int32(10000), typemax(Int64), typemax(Int16), 8, Int64(2)^32]
for a_base in vals
# Only test negative numbers on `a`, since div_by_const requires b > 0.
@testset for (a, b, f) in Iterators.product((a_base, -a_base), vals, (unsigned, signed))
a, b = promote(f(a), f(b))
@test FixedPointDecimals.div_by_const(a, Val(b)) == a ÷ b
end
end
end

@testset "fldmod_by_const" begin
vals = [2432, 100, 0x1, Int32(10000), typemax(Int64), typemax(Int16), 8, Int64(2)^32]
for a_base in vals
# Only test negative numbers on `a`, since fldmod_by_const requires b > 0.
@testset for (a, b, f) in Iterators.product((a_base, -a_base), vals, (unsigned, signed))
a, b = promote(f(a), f(b))
@test FixedPointDecimals.fldmod_by_const(a, b) == fldmod(a, b)
end
end
end
4 changes: 3 additions & 1 deletion test/runtests.jl
@@ -10,4 +10,6 @@ include(joinpath(pkg_path, "test", "utils.jl"))

@testset "FixedPointDecimals" begin
include("FixedDecimal.jl")
end # global testset
isdefined(Base, Symbol("@assume_effects")) && include("fldmod-by-const_tests.jl")
end
