diff --git a/Project.toml b/Project.toml index f120b6c..9bef680 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "FixedPointDecimals" uuid = "fb4d412d-6eee-574d-9565-ede6634db7b0" authors = ["Fengyang Wang ", "Curtis Vogt "] -version = "0.5.3" +version = "0.5.4" [deps] BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1" diff --git a/src/FixedPointDecimals.jl b/src/FixedPointDecimals.jl index 28b8611..38e45bb 100644 --- a/src/FixedPointDecimals.jl +++ b/src/FixedPointDecimals.jl @@ -36,9 +36,28 @@ export checked_abs, checked_add, checked_cld, checked_div, checked_fld, using Base: decompose, BitInteger -import BitIntegers # For 128-bit _widemul / _widen +using BitIntegers: BitIntegers, Int256, Int512, UInt256, UInt512 import Parsers +# The effects system in newer versions of Julia is necessary for the fldmod_by_const +# optimization to take affect without adverse performance impacts. +# For older versions of julia, we will fall back to the flmod implementation, which works +# very fast for all FixedDecimals of size <= 64 bits. +if isdefined(Base, Symbol("@assume_effects")) + include("fldmod-by-const.jl") +else + # We force-inline this, to allow the fldmod operation to be specialized for a constant + # second argument (converting divide-by-power-of-ten instructions with the cheaper + # multiply-by-inverse-coefficient.) Sadly this doesn't work for Int128. + if VERSION >= v"1.8.0-" + # Since julia 1.8+, the built-in fldmod optimizes for constant divisors, which is + # even better than computing the division twice. + @inline fldmod_by_const(x,y) = fldmod(x,y) + else + @inline fldmod_by_const(x,y) = (fld(x,y), mod(x,y)) + end +end + # floats that support fma and are roughly IEEE-like const FMAFloat = Union{Float16, Float32, Float64, BigFloat} @@ -129,8 +148,10 @@ _widemul(x::Unsigned,y::Signed) = signed(_widen(x)) * _widen(y) # Custom widen implementation to avoid the cost of widening to BigInt. # FD{Int128} operations should widen to 256 bits internally, rather than to a BigInt. -_widen(::Type{Int128}) = BitIntegers.Int256 -_widen(::Type{UInt128}) = BitIntegers.UInt256 +_widen(::Type{Int128}) = Int256 +_widen(::Type{UInt128}) = UInt256 +_widen(::Type{Int256}) = Int512 +_widen(::Type{UInt256}) = UInt512 _widen(t::Type) = widen(t) _widen(x::T) where {T} = (_widen(T))(x) @@ -176,7 +197,8 @@ function _round_to_nearest(quotient::T, halfdivisor = divisor >> 1 # PERF Note: Only need the last bit to check iseven, and default iseven(Int256) # allocates, so we truncate first. - if iseven((divisor % Int8)) && remainder == halfdivisor + _iseven(x) = (x & 0x1) == 0 + if _iseven(divisor) && remainder == halfdivisor # `:NearestTiesAway` will tie away from zero, e.g. -8.5 -> # -9. `:NearestTiesUp` will always ties towards positive # infinity. `:Nearest` will tie towards the nearest even @@ -184,7 +206,7 @@ function _round_to_nearest(quotient::T, if M == :NearestTiesAway ifelse(quotient < zero(quotient), quotient, quotient + one(quotient)) elseif M == :Nearest - ifelse(iseven(quotient), quotient, quotient + one(quotient)) + ifelse(_iseven(quotient), quotient, quotient + one(quotient)) else quotient + one(quotient) end @@ -196,18 +218,12 @@ function _round_to_nearest(quotient::T, end _round_to_nearest(q, r, d, m=RoundNearest) = _round_to_nearest(promote(q, r, d)..., m) -# In many of our calls to fldmod, `y` is a constant (the coefficient, 10^f). However, since -# `fldmod` is sometimes not being inlined, that constant information is not available to the -# optimizer. We need an inlined version of fldmod so that the compiler can replace expensive -# divide-by-power-of-ten instructions with the cheaper multiply-by-inverse-coefficient. -@inline fldmodinline(x,y) = (fld(x,y), mod(x,y)) - # multiplication rounds to nearest even representation # TODO: can we use floating point to speed this up? after we build a # correctness test suite. function Base.:*(x::FD{T, f}, y::FD{T, f}) where {T, f} powt = coefficient(FD{T, f}) - quotient, remainder = fldmodinline(_widemul(x.i, y.i), powt) + quotient, remainder = fldmod_by_const(_widemul(x.i, y.i), powt) reinterpret(FD{T, f}, _round_to_nearest(quotient, remainder, powt)) end @@ -234,12 +250,12 @@ function Base.round(x::FD{T, f}, RoundingMode{:NearestTiesUp}, RoundingMode{:NearestTiesAway}}=RoundNearest) where {T, f} powt = coefficient(FD{T, f}) - quotient, remainder = fldmodinline(x.i, powt) + quotient, remainder = fldmod_by_const(x.i, powt) FD{T, f}(_round_to_nearest(quotient, remainder, powt, m)) end function Base.ceil(x::FD{T, f}) where {T, f} powt = coefficient(FD{T, f}) - quotient, remainder = fldmodinline(x.i, powt) + quotient, remainder = fldmod_by_const(x.i, powt) if remainder > 0 FD{T, f}(quotient + one(quotient)) else @@ -435,7 +451,7 @@ function Base.checked_sub(x::T, y::T) where {T<:FD} end function Base.checked_mul(x::FD{T,f}, y::FD{T,f}) where {T<:Integer,f} powt = coefficient(FD{T, f}) - quotient, remainder = fldmodinline(_widemul(x.i, y.i), powt) + quotient, remainder = fldmod_by_const(_widemul(x.i, y.i), powt) v = _round_to_nearest(quotient, remainder, powt) typemin(T) <= v <= typemax(T) || Base.Checked.throw_overflowerr_binaryop(:*, x, y) return reinterpret(FD{T, f}, T(v)) diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl new file mode 100644 index 0000000..d36b4da --- /dev/null +++ b/src/fldmod-by-const.jl @@ -0,0 +1,131 @@ +# This file includes a manual implementation of the divide-by-constant optimization for +# non-power-of-2 divisors. LLVM automatically includes this optimization, but only for +# smaller integer sizes (<=64-bits on my 64-bit machine). +# +# NOTE: We use LLVM's built-in implementation for Int64 and smaller, to keep the code +# simpler (though the code we produce is identical). We apply this optimization to (U)Int128 +# and (U)Int256, which result from multiplying FD{Int64}s and FD{Int128}s. +# Before: +# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int32,3}(1.234)) +# 54.125 μs (0 allocations: 0 bytes) # (Unchanged) +# FixedDecimal{Int32,3}(1700943.280) +# +# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int64,3}(1.234)) +# 174.625 μs (0 allocations: 0 bytes) +# FixedDecimal{Int64,3}(4230510070790917.029) +# +# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int128,3}(1.234)) +# 2.119 ms (79986 allocations: 1.60 MiB) +# FixedDecimal{Int128,3}(-66726338547984585007169386718143307.324) +# +# After: +# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int32,3}(1.234)) +# 56.958 μs (0 allocations: 0 bytes) # (Unchanged) +# FixedDecimal{Int32,3}(1700943.280) +# +# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int64,3}(1.234)) +# 90.708 μs (0 allocations: 0 bytes) +# FixedDecimal{Int64,3}(4230510070790917.029) +# +# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int128,3}(1.234)) +# 180.167 μs (0 allocations: 0 bytes) +# FixedDecimal{Int128,3}(-66726338547984585007169386718143307.324) + +""" + ShouldUseCustomFldmodByConst(::Type{<:MyCustomIntType})) = true +A trait to control opt-in for the custom `fldmod_by_const` implementation. To use this for a +given integer type, you can define this overload for your integer type. +You will also need to implement some parts of the interface below, including _widen(). +""" +ShouldUseCustomFldmodByConst(::Type{<:Union{Int128,UInt128}}) = true # For FD{Int64} +ShouldUseCustomFldmodByConst(::Type{<:Union{Int256,UInt256}}) = true # For FD{Int128} +ShouldUseCustomFldmodByConst(::Type) = false + +@inline function fldmod_by_const(x, y) + if ShouldUseCustomFldmodByConst(typeof(x)) + # For large Int types, LLVM doesn't optimize well, so we use a custom implementation + # of fldmod, which extends that optimization to those larger integer types. + d = fld_by_const(x, Val(y)) + return d, (x - d * y) + else + # For other integers, LLVM might be able to correctly optimize away the division, if + # it knows it's dividing by a const. + # Since julia 1.8+, fldmod(x,y) automatically optimizes for constant divisors. + return fldmod(x, y) + end +end + +# Calculate fld(x,y) when y is a Val constant. +# The implementation for fld_by_const was lifted directly from Base.fld(x,y), except that +# it uses `div_by_const` instead of `div`. +fld_by_const(x::T, y::Val{C}) where {T<:Unsigned, C} = div_by_const(x, y) +function fld_by_const(x::T, y::Val{C}) where {T<:Signed, C} + d = div_by_const(x, y) + return d - (signbit(x ⊻ C) & (d * C != x)) +end + +function div_by_const(x::T, ::Val{C}) where {T, C} + # These checks will be compiled away during specialization. + # While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these + # checks allow this function to work for any `C > 0`, in case that's useful in the + # future. + if C == 1 + return x + elseif ispow2(C) + return div(x, C) # Will already do the right thing + elseif C <= 0 + throw(DomainError("C must be > 0")) + end + # Calculate the magic number and shift amount, based on Hacker's Delight, Chapter 10. + magic_number, shift = magicg(typemax(T), C) + + out = _widemul(promote(x, magic_number)...) + out >>= shift + # Adding one as implied by formula (1b) in Hacker's delight, Chapter 10-4. + return (out % T) + (x < zero(T)) +end + +# Unsigned magic number computation + shift by constant +# See Hacker's delight, equations (26) and (27) from Chapter 10-9. +# (See also the errata on https://web.archive.org/web/20190915025154/http://www.hackersdelight.org/) +# requires nmax >= divisor > 2. divisor must not be a power of 2. +Base.@assume_effects :foldable function magicg(nmax::Unsigned, divisor) + T = typeof(nmax) + W = _widen(T) + d = W(divisor) + + nc = div(W(nmax) + W(1), d) * d - W(1) # largest multiple of d <= nmax, minus 1 + nbits = 8sizeof(nmax) - leading_zeros(nmax) # most significant bit + # shift must be larger than int size because we want the high bits of the wide multiplication + for p in nbits-1:2nbits-1 + if W(2)^p > nc * (d - W(1) - rem(W(2)^p - W(1), d)) # (27) + m = div(W(2)^p + d - W(1) - rem(W(2)^p - W(1), d), d) # (26) + return (m, p) + end + end + @assert false """magicg bug: Unreachable reached. divisor=$divisor, nmax=$nmax. + Please report an issue to https://github.com/JuliaMath/FixedPointDecimals.jl + """ +end + +# See Hacker's delight, equations (5) and (6) from Chapter 10-4. +# (See also the errata on https://web.archive.org/web/20190915025154/http://www.hackersdelight.org/) +# requires nmax >= divisor > 2. divisor must not be a power of 2. +Base.@assume_effects :foldable function magicg(nmax::Signed, divisor) + T = typeof(nmax) + W = _widen(T) + d = W(divisor) + + nc = div(W(nmax) + W(1), d) * d - W(1) # largest multiple of d <= nmax, minus 1 + nbits = 8sizeof(nmax) - leading_zeros(nmax) # most significant bit + # shift must be larger than int size because we want the high bits of the wide multiplication + for p in nbits-1:2nbits-1 + if W(2)^p > nc * (d - rem(W(2)^p, d)) # (6) + m = div(W(2)^p + d - rem(W(2)^p, d), d) # (5) + return (m, p) + end + end + @assert false """magicg bug: Unreachable reached. divisor=$divisor, nmax=$nmax. + Please report an issue to https://github.com/JuliaMath/FixedPointDecimals.jl + """ +end diff --git a/test/FixedDecimal.jl b/test/FixedDecimal.jl index 18d87d7..41fcda8 100644 --- a/test/FixedDecimal.jl +++ b/test/FixedDecimal.jl @@ -1408,3 +1408,25 @@ end @test hash(FD2(1//10)) ≠ hash(0.1) end +@testset "_widemul" begin + _widemul = FixedPointDecimals._widemul + Int256, UInt256 = FixedPointDecimals.Int256, FixedPointDecimals.UInt256 + Int512, UInt512 = FixedPointDecimals.Int512, FixedPointDecimals.UInt512 + + @test _widemul(UInt8(3), UInt8(2)) === widemul(UInt8(3), UInt8(2)) === UInt16(6) + @test _widemul(Int8(3), UInt8(2)) === widemul(Int8(3), UInt8(2)) === Int16(6) + @test _widemul(UInt8(3), Int8(2)) === widemul(UInt8(3), Int8(2)) === Int16(6) + + @test _widemul(UInt16(3), UInt8(2)) === widemul(UInt16(3), UInt8(2)) === UInt32(6) + @test _widemul(UInt16(3), Int8(2)) === widemul(UInt16(3), Int8(2)) === Int32(6) + @test _widemul(Int16(3), UInt8(2)) === widemul(Int16(3), UInt8(2)) === Int32(6) + + # Custom widenings + @test _widemul(UInt128(3), UInt128(2)) === UInt256(6) + @test _widemul(Int128(3), UInt128(2)) === Int256(6) + @test _widemul(UInt128(3), Int128(2)) === Int256(6) + + @test _widemul(UInt256(3), UInt256(2)) === UInt512(6) + @test _widemul(Int256(3), UInt256(2)) === Int512(6) + @test _widemul(UInt256(3), Int256(2)) === Int512(6) +end diff --git a/test/fldmod-by-const_tests.jl b/test/fldmod-by-const_tests.jl new file mode 100644 index 0000000..42b04d0 --- /dev/null +++ b/test/fldmod-by-const_tests.jl @@ -0,0 +1,115 @@ +using Test +using FixedPointDecimals + +@testset "div_by_const" begin + vals = [2432, 100, 0x1, Int32(10000), typemax(Int64), typemax(Int16), 8, Int64(2)^32] + for a_base in vals + # Only test negative numbers on `a`, since div_by_const requires b > 0. + @testset for (a, b, f) in Iterators.product((a_base, -a_base), vals, (unsigned, signed)) + a, b = promote(f(a), f(b)) + @test FixedPointDecimals.div_by_const(a, Val(b)) == a ÷ b + end + end +end + +@testset "fldmod_by_const" begin + vals = [2432, 100, 0x1, Int32(10000), typemax(Int64), typemax(Int16), 8, Int64(2)^32] + for a_base in vals + # Only test negative numbers on `a`, since fldmod_by_const requires b > 0. + @testset for (a, b, f) in Iterators.product((a_base, -a_base), vals, (unsigned, signed)) + a, b = promote(f(a), f(b)) + @test FixedPointDecimals.fldmod_by_const(a, b) == fldmod(a, b) + end + end +end + +# We don't actually use fldmod_by_const with 8-bit ints, but they're useful because +# we can exhaustively test every possible combination, to increase our confidence in +# the implementation. +@testset "flmdod_by_const - exhaustive 8-bit" begin + @testset for T in (Int8, UInt8) + @testset for x in typemin(T) : typemax(T) + @testset for y in typemin(T) : typemax(T) + y == 0 && continue + y == -1 && continue + @testset "fldmod($x, $y)" begin + @test fldmod(x, y) == FixedPointDecimals.fldmod_by_const(x, y) + end + end + end + end +end + +# 64-bit FD multiplication uses the custom fldmod_by_const implementation. +# Test a few various different cases to try to ensure it works correctly. +@testset "fixed decimal multiplication - 64-bit" begin + @testset for P in (0,1,2,3,4) + @testset for T in (Int64, UInt64) + FD = FixedDecimal{T,P} + + function test_multiplies_correctly(fd, x) + big = FixedDecimal{BigInt, P}(fd) + big_mul = big * x + # This might overflow: ... + mul = fd * x + @testset "$fd * $x" begin + # ... so we truncate big to the same size + @test big_mul.i % T == mul.i % T + end + end + num_tests = 2<<11 + # Add one to avoid powers-of-2 to get hopefully better tests + epsilon = (typemax(FD) ÷ num_tests) + eps(FD) + @testset for v in typemin(FD) : epsilon : typemax(FD) + test_multiplies_correctly(v, typemin(T)) + test_multiplies_correctly(v, -1) + test_multiplies_correctly(v, -eps(FD)) + test_multiplies_correctly(v, 0) + test_multiplies_correctly(v, eps(FD)) + test_multiplies_correctly(v, 1) + test_multiplies_correctly(v, 2) + test_multiplies_correctly(v, 3) + test_multiplies_correctly(v, typemax(T)) + end + end + end +end + +@testset "fixed decimal multiplication - 128-bit" begin + @testset for P in 0:37 + @testset for T in (Int128, UInt128) + FD = FixedDecimal{T,P} + + function test_multiplies_correctly(fd, x) + big = FixedDecimal{BigInt, P}(fd) + big_mul = big * x + # This might overflow: ... + mul = fd * x + @testset "$fd * $x" begin + # ... so we truncate big to the same size + @test big_mul.i % T == mul.i % T + end + end + vals = FD[ + typemin(FD), typemax(FD), + typemin(FD) + eps(FD), typemax(FD) - eps(FD), + 0.0, eps(FD), 0.1, 1.0, 2.0, + typemax(FD) ÷ 2, + ] + if T <: Signed + append!(vals, vals.*-1) + end + @testset for v in vals + test_multiplies_correctly(v, typemin(T)) + test_multiplies_correctly(v, -1) + test_multiplies_correctly(v, -eps(FD)) + test_multiplies_correctly(v, 0) + test_multiplies_correctly(v, eps(FD)) + test_multiplies_correctly(v, 1) + test_multiplies_correctly(v, 2) + test_multiplies_correctly(v, 3) + test_multiplies_correctly(v, typemax(T)) + end + end + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 81f9cea..b5fadb6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,4 +10,6 @@ include(joinpath(pkg_path, "test", "utils.jl")) @testset "FixedPointDecimals" begin include("FixedDecimal.jl") -end # global testset + isdefined(Base, Symbol("@assume_effects")) && include("fldmod-by-const_tests.jl") +end +