From 4d8460fee2228be2458f282113209aa0ac266ac8 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Tue, 14 Feb 2023 20:48:18 +1300 Subject: [PATCH 1/5] fix a doc typo --- src/mlj/regressors.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlj/regressors.jl b/src/mlj/regressors.jl index 335fd70..e821beb 100644 --- a/src/mlj/regressors.jl +++ b/src/mlj/regressors.jl @@ -104,7 +104,7 @@ See also [`ElasticNetRegressor`](@ref). "whether to scale the penalty with the number of observations." scale_penalty_with_samples::Bool = true """any instance of `MLJLinearModels.Analytical`. Use `Analytical()` for - Cholesky and `CG()=Analytical(iteration=true)` for conjugate-gradient. + Cholesky and `CG()=Analytical(iterative=true)` for conjugate-gradient. If `solver = nothing` (default) then `Analytical()` is used. """ solver::Option{Solver} = nothing end From c7757180444cee87cd12384dd386f83411241bb2 Mon Sep 17 00:00:00 2001 From: tlienart Date: Tue, 23 May 2023 11:27:23 +0200 Subject: [PATCH 2/5] fixes following discussion around #147 --- .github/workflows/ci.yml | 2 + Project.toml | 2 +- src/fit/solvers.jl | 21 +++++--- src/loss-penalty/robust.jl | 69 ++++++++++++++++++------- test/Project.toml | 1 + test/benchmarks/elementary_functions.jl | 2 +- test/benchmarks/logistic-multinomial.jl | 2 +- test/benchmarks/ridge-lasso.jl | 2 +- test/benchmarks/robust.jl | 29 +++++++++++ test/fit/quantile.jl | 25 ++++++--- test/loss-penalty/robust.jl | 2 +- test/runtests.jl | 7 ++- 12 files changed, 126 insertions(+), 38 deletions(-) create mode 100644 test/benchmarks/robust.jl diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e5ea73c..8d918c1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -41,6 +41,8 @@ jobs: ${{ runner.os }}- - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 + env: + RUN_COMPARISONS: "false" - uses: julia-actions/julia-processcoverage@v1 - uses: codecov/codecov-action@v1 with: diff --git a/Project.toml b/Project.toml index 9b54ecf..dbae3c8 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MLJLinearModels" uuid = "6ee0df7b-362f-4a72-a706-9e79364fb692" authors = ["Thibaut Lienart "] -version = "0.9.1" +version = "0.9.2" [deps] DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" diff --git a/src/fit/solvers.jl b/src/fit/solvers.jl index b9ef8c2..93655a3 100644 --- a/src/fit/solvers.jl +++ b/src/fit/solvers.jl @@ -46,11 +46,14 @@ Newton solver. This is a full Hessian solver and should be avoided for `newton_options` are the [options of Newton's method](https://julianlsolvers.github.io/Optim.jl/stable/#algo/newton/) ## Example + ```julia using MLJLinearModels, Optim -solver = MLJLinearModels.Newton(optim_options = Optim.Options(time_limit = 20), - newton_options = (linesearch = Optim.LineSearches.HagerZhang()),)) +solver = MLJLinearModels.Newton( + optim_options = Optim.Options(time_limit = 20), + newton_options = (linesearch = Optim.LineSearches.HagerZhang()),) +) ``` """ @with_kw struct Newton{O,S} <: Solver @@ -70,13 +73,15 @@ generally be preferred for larger scale cases. 
`newtoncg_options` are the [options of Krylov Trust Region method](https://github.com/JuliaNLSolvers/Optim.jl/blob/master/src/multivariate/solvers/second_order/krylov_trust_region.jl) ## Example + ```julia using MLJLinearModels, Optim -solver = MLJLinearModels.Newton(optim_options = Optim.Options(time_limit = 20), - newtoncg_options = (eta = 0.2,)) +solver = MLJLinearModels.NewtonCG( + optim_options = Optim.Options(time_limit = 20), + newtoncg_options = (eta = 0.2,) +) ``` - """ @with_kw struct NewtonCG{O,S} <: Solver optim_options::O = Optim.Options(f_tol=1e-4) @@ -95,8 +100,10 @@ LBFGS quasi-Newton solver. See [the wikipedia entry](https://en.wikipedia.org/wi ```julia using MLJLinearModels, Optim -solver = MLJLinearModels.Newton(optim_options = Optim.Options(time_limit = 20), - lbfgs_options = (linesearch = Optim.LineSearches.HagerZhang()),)) +solver = MLJLinearModels.LBFGS( + optim_options = Optim.Options(time_limit = 20), + lbfgs_options = (linesearch = Optim.LineSearches.HagerZhang()),) +) ``` """ @with_kw struct LBFGS{O,S} <: Solver diff --git a/src/loss-penalty/robust.jl b/src/loss-penalty/robust.jl index bae7449..65c76c6 100644 --- a/src/loss-penalty/robust.jl +++ b/src/loss-penalty/robust.jl @@ -3,20 +3,37 @@ export RobustLoss, BisquareRho, Bisquare, LogisticRho, Logistic, FairRho, Fair, TalwarRho, Talwar, QuantileRho, Quantile +#= +In the non-penalised case: + + β⋆ = arg min ∑ ρ(yᵢ - ⟨xᵢ, β⟩) + +where ρ is a weighing function such as, for instance, the pinball loss for +the quantile regression. + +It is useful to define the following quantities: + + ψ(r) = ρ'(r) (first derivative) + ϕ(r) = ψ'(r) (second derivative) + ω(r) = ψ(r)/r (weighing function used for IWLS), a threshold can be passed + to clip weights + +Some refs: +- https://josephsalmon.eu/enseignement/UW/STAT593/QuantileRegression.pdf +=# + abstract type RobustRho end -abstract type RobustRho1P{δ} <: RobustRho end # one parameter +# robust rho with only one parameter +abstract type RobustRho1P{δ} <: RobustRho end struct RobustLoss{ρ} <: AtomicLoss where ρ <: RobustRho rho::ρ end -(rl::RobustLoss)(x::AVR, y::AVR) = rl(x .- y) -(rl::RobustLoss)(r::AVR) = rl.rho(r) +(rl::RobustLoss)(Xβ::AVR, y::AVR) = rl(y .- Xβ) +(rl::RobustLoss)(r::AVR) = rl.rho(r) -# ψ(r) = ρ'(r) (first derivative) -# ω(r) = ψ(r)/r (weighing function) a thresh can be passed to clip weights -# ϕ(r) = ψ'(r) (second derivative) """ $TYPEDEF @@ -24,6 +41,8 @@ $TYPEDEF Huber weighing of the residuals corresponding to ``ρ(z) = z²/2`` if `|z|≤δ` and `ρ(z)=δ(|z|-δ/2)` otherwise. + +Note: symmetric weighing. """ struct HuberRho{δ} <: RobustRho1P{δ} HuberRho(δ::Real=1.0; delta::Real=δ) = new{delta}() @@ -33,7 +52,7 @@ Huber(δ::Real=1.0; delta::Real=δ) = HuberRho(delta) (::HuberRho{δ})(r::AVR) where δ = begin ar = abs.(r) w = ar .<= δ - return sum( r.^2/2 .* w .+ δ .* (ar .- δ/2) .* .!w ) + return sum( @. ifelse(w, r^2/2, δ * (ar - δ/2) ) ) end ψ(::Type{HuberRho{δ}} ) where δ = (r, w) -> r * w + δ * sign(r) * (1.0 - w) @@ -47,6 +66,8 @@ $TYPEDEF Andrews weighing of the residuals corresponding to ``ρ(z) = -cos(πz/δ)/(π/δ)²`` if `|z|≤δ` and `ρ(δ)` otherwise. + +Note: symmetric weighing. """ struct AndrewsRho{δ} <: RobustRho1P{δ} AndrewsRho(δ::Real=1.0; delta::Real=δ) = new{delta}() @@ -58,7 +79,7 @@ Andrews(δ::Real=1.0; delta::Real=δ) = AndrewsRho(delta) w = ar .<= δ c = π/δ κ = (δ/π)^2 - return sum( -cos.(c .* r) .* κ .* w .+ κ .* .!w ) + return sum( @. 
ifelse(w, -cos(c * r) * κ, κ) ) end # Note, sinc(x) = sin(πx)/πx, well defined everywhere @@ -74,6 +95,8 @@ $TYPEDEF Bisquare weighing of the residuals corresponding to ``ρ(z) = δ²/6 (1-(1-(z/δ)²)³)`` if `|z|≤δ` and `δ²/6` otherwise. + +Note: symmetric weighing. """ struct BisquareRho{δ} <: RobustRho1P{δ} BisquareRho(δ::Real=1.0; delta::Real=δ) = new{delta}() @@ -84,7 +107,7 @@ Bisquare(δ::Real=1.0; delta::Real=δ) = BisquareRho(delta) ar = abs.(r) w = ar .<= δ κ = δ^2/6 - return sum( κ * (1.0 .- (1.0 .- (r ./ δ).^2).^3) .* w + κ .* .!w ) + return sum( @. ifelse(w, κ * (1 - (1 - (r / δ)^2)^3), κ) ) end ψ(::Type{BisquareRho{δ}} ) where δ = (r, w) -> w * r * (1.0 - (r / δ)^2)^2 @@ -97,6 +120,8 @@ $TYPEDEF Logistic weighing of the residuals corresponding to ``ρ(z) = δ² log(cosh(z/δ))`` + +Note: symmetric weighing. """ struct LogisticRho{δ} <: RobustRho1P{δ} LogisticRho(δ::Real=1.0; delta::Real=δ) = new{delta}() @@ -104,7 +129,7 @@ end Logistic(δ::Real=1.0; delta::Real=δ) = LogisticRho(delta) (::LogisticRho{δ})(r::AVR) where δ = begin - return sum( δ^2 .* log.(cosh.(r ./ δ)) ) + return sum( @. δ^2 * log(cosh(r / δ)) ) end # similar to sinc, to avoid NaNs if tanh(0)/0 (lim is 1.0) @@ -121,6 +146,8 @@ $TYPEDEF Fair weighing of the residuals corresponding to ``ρ(z) = δ² (|z|/δ - log(1+|z|/δ))`` + +Note: symmetric weighing. """ struct FairRho{δ} <: RobustRho1P{δ} FairRho(δ::Real=1.0; delta::Real=δ) = new{delta}() @@ -128,8 +155,8 @@ end Fair(δ::Real=1.0; delta::Real=δ) = FairRho(delta) (::FairRho{δ})(r::AVR) where δ = begin - sr = abs.(r) ./ δ - return sum( δ^2 .* (sr .- log1p.(sr)) ) + sr = @. abs(r) / δ + return sum( @. δ^2 * (sr - log1p(sr)) ) end ψ(::Type{FairRho{δ}} ) where δ = (r, _) -> δ * r / (abs(r) + δ) @@ -143,6 +170,8 @@ $TYPEDEF Talwar weighing of the residuals corresponding to ``ρ(z) = z²/2`` if `|z|≤δ` and `ρ(z)=ρ(δ)` otherwise. + +Note: symmetric weighing. """ struct TalwarRho{δ} <: RobustRho1P{δ} TalwarRho(δ::Real=1.0; delta::Real=δ) = new{delta}() @@ -150,8 +179,8 @@ end Talwar(δ::Real=1.0; delta::Real=δ) = TalwarRho(delta) (::TalwarRho{δ})(r::AVR) where δ = begin - w = abs.(r) .<= δ - return sum( r.^2 ./ 2 .* w .+ δ^2/2 .* .!w) + w = @. abs(r) <= δ + return sum( @. ifelse(w, r^2 / 2, δ^2/2) ) end ψ(::Type{TalwarRho{δ}} ) where δ = (r, w) -> w * r @@ -164,7 +193,11 @@ $TYPEDEF Quantile regression weighing of the residuals corresponding to -``ρ(z) = z(δ - 1(z<0))`` +``ρ(z) = -z(δ - 1(z>=0))`` + +Note: asymetric weighing, the "-" sign is because similar libraries like +quantreg for instance define the residual as `y-Xθ` while we do the opposite +(out of convenience for gradients etc). """ struct QuantileRho{δ} <: RobustRho1P{δ} QuantileRho(δ::Real=1.0; delta::Real=δ) = new{delta}() @@ -173,9 +206,9 @@ end Quantile(δ::Real=1.0; delta::Real=δ) = QuantileRho(delta) (::QuantileRho{δ})(r::AVR) where δ = begin - return sum( r .* (δ .- (r .<= 0.0)) ) + return sum( @. 
-r * (δ - (r >= 0)) ) end -ψ(::Type{QuantileRho{δ}} ) where δ = (r, _) -> (δ - (r <= 0.0)) -ω(::Type{QuantileRho{δ}}, τ) where δ = (r, _) -> (δ - (r <= 0.0)) / clip(r, τ) +ψ(::Type{QuantileRho{δ}} ) where δ = (r, _) -> ((r >= 0.0) - δ) +ω(::Type{QuantileRho{δ}}, τ) where δ = (r, _) -> ((r >= 0.0) - δ) / clip(-r, τ) ϕ(::Type{QuantileRho{δ}} ) where δ = (_, _) -> error("Newton(CG) not available for Quantile Reg.") diff --git a/test/Project.toml b/test/Project.toml index 3190290..30cbf32 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,4 +1,5 @@ [deps] +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" diff --git a/test/benchmarks/elementary_functions.jl b/test/benchmarks/elementary_functions.jl index c069c00..18ac322 100644 --- a/test/benchmarks/elementary_functions.jl +++ b/test/benchmarks/elementary_functions.jl @@ -1,6 +1,6 @@ using MLJLinearModels using BenchmarkTools, Random, LinearAlgebra -DO_COMPARISONS = false; include("../testutils.jl") +include("../testutils.jl") n, p = 50_000, 500 ((X, y, θ), (X1, y1, θ1)) = generate_continuous(n, p; seed=512, sparse=0.5) diff --git a/test/benchmarks/logistic-multinomial.jl b/test/benchmarks/logistic-multinomial.jl index 16b3e9e..0554fde 100644 --- a/test/benchmarks/logistic-multinomial.jl +++ b/test/benchmarks/logistic-multinomial.jl @@ -1,6 +1,6 @@ using MLJLinearModels using BenchmarkTools, Random, LinearAlgebra -DO_COMPARISONS = false; include("../testutils.jl") +include("../testutils.jl") n, p = 50_000, 500 ((X, y, θ), (X1, y1, θ1)) = generate_binary(n, p; seed=525) diff --git a/test/benchmarks/ridge-lasso.jl b/test/benchmarks/ridge-lasso.jl index 2e50c43..46d9c40 100644 --- a/test/benchmarks/ridge-lasso.jl +++ b/test/benchmarks/ridge-lasso.jl @@ -1,6 +1,6 @@ using MLJLinearModels, StableRNGs using BenchmarkTools, Random, LinearAlgebra -DO_COMPARISONS = false; include("../testutils.jl") +include("../testutils.jl") n, p = 50_000, 500 ((X, y, θ), (X1, y1, θ1)) = generate_continuous(n, p; seed=512, sparse=0.5) diff --git a/test/benchmarks/robust.jl b/test/benchmarks/robust.jl new file mode 100644 index 0000000..d5a8e1c --- /dev/null +++ b/test/benchmarks/robust.jl @@ -0,0 +1,29 @@ +# Follow up from issue #147 comparing quantreg more specifically. 
+ +if DO_COMPARISONS + @testset "Comp-QR-147" begin + using CSV, DataFrames + + dataset = CSV.read(download("http://freakonometrics.free.fr/rent98_00.txt"), DataFrame) + tau = 0.3 + + y = Vector(dataset[!,:rent_euro]) + X = Matrix(dataset[!,[:area, :yearc]]) + X1 = hcat(X[:,1], X[:, 2], ones(size(X, 1))) + + qr = QuantileRegression(tau; penalty=:none) + obj = objective(qr, X, y) + + θ_lbfgs = fit(qr, X, y) + @test isapprox(obj(θ_lbfgs), 226_639, rtol=1e-4) + + # in this case QR with BR method does better + θ_qr_br = rcopy(getproperty(QUANTREG, :rq_fit_br)(X1, y; tau=tau))[:coefficients] + @test isapprox(obj(θ_qr_br), 207_551, rtol=1e-4) + + # lasso doesn't + θ_qr_lasso = rcopy(getproperty(QUANTREG, :rq_fit_lasso)(X1, y; tau=tau))[:coefficients] + obj(θ_qr_lasso) # 229_172 + @test 228_000 ≤ obj(θ_qr_lasso) ≤ 231_000 + end +end diff --git a/test/fit/quantile.jl b/test/fit/quantile.jl index f1e6105..e4ce57a 100644 --- a/test/fit/quantile.jl +++ b/test/fit/quantile.jl @@ -46,16 +46,27 @@ y1a = outlify(y1, 0.1) θ_ls = fit(LinearRegression(), X, y1a) θ_lbfgs = fit(rr, X, y1a, solver=LBFGS()) θ_iwls = fit(rr, X, y1a, solver=IWLSCG()) - θ_qr_br = rcopy(QUANTREG.rq_fit_br(X1, y1a))[:coefficients] - θ_qr_fnb = rcopy(QUANTREG.rq_fit_fnb(X1, y1a))[:coefficients] + θ_qr_br = rcopy(QUANTREG.rq_fit_br(X1, y1a, tau=δ))[:coefficients] + θ_qr_fnb = rcopy(QUANTREG.rq_fit_fnb(X1, y1a, tau=δ))[:coefficients] # NOTE: we take θ_qr_br as reference point @test isapprox(J(θ_ls), 505.45286, rtol=1e-5) @test J(θ_qr_br) ≈ 409.570777 # <- ref value # Their IP algorithm essentially gives the same answer @test (J(θ_qr_fnb) - J(θ_qr_br)) ≤ 1e-10 # Our algorithms are close enough - @test isapprox(J(θ_lbfgs), 409.57154, rtol=1e-5) + @test isapprox(J(θ_lbfgs), 409.57608, rtol=1e-5) @test isapprox(J(θ_iwls), 409.59, rtol=1e-4) + + # Let's try this again but with a δ different from 0.5 + δ = 0.75 + rr = QuantileRegression(δ, lambda=0) + J = objective(rr, X, y1a) + + θ_lbfgs = fit(rr, X, y1a, solver=LBFGS()) + θ_qr_br = rcopy(QUANTREG.rq_fit_br(X1, y1a, tau=δ))[:coefficients] + + @test isapprox(J(θ_qr_br), 404.6161, rtol=1e-4) + @test isapprox(J(θ_lbfgs), 404.6195, rtol=1e-4) end end @@ -63,7 +74,7 @@ end ## With Sparsity penalty ## ########################### -n, p = 500, 100 +Jn, p = 500, 100 ((X, y, θ), (X1, y1, θ1)) = generate_continuous(n, p; seed=51112, sparse=0.1) # pepper with outliers y1a = outlify(y1, 0.1) @@ -90,9 +101,9 @@ y1a = outlify(y1, 0.1) θ_ls = X1 \ y1a θ_fista = fit(rr, X, y1a, solver=FISTA()) θ_ista = fit(rr, X, y1a, solver=ISTA()) - θ_qr_lasso = rcopy(QUANTREG.rq_fit_lasso(X1, y1a))[:coefficients] + θ_qr_lasso = rcopy(QUANTREG.rq_fit_lasso(X1, y1a, lambda=λ))[:coefficients] @test isapprox(J(θ_ls), 888.3748, rtol=1e-5) - @test isapprox(J(θ_qr_lasso), 425.5, rtol=1e-3) + @test isapprox(J(θ_qr_lasso), 425.2, rtol=1e-3) # Our algorithms are close enough @test isapprox(J(θ_fista), 425.0526, rtol=1e-5) @test isapprox(J(θ_ista), 425.4113, rtol=1e-5) @@ -101,6 +112,6 @@ y1a = outlify(y1, 0.1) @test nnz(θ_fista) == 88 @test nnz(θ_ista) == 82 # in this case fista is best - @test J(θ_fista) < J(θ_ista) < J(θ_qr_lasso) + @test J(θ_fista) < J(θ_qr_lasso) < J(θ_ista) end end diff --git a/test/loss-penalty/robust.jl b/test/loss-penalty/robust.jl index 45e98ed..0abfbfa 100644 --- a/test/loss-penalty/robust.jl +++ b/test/loss-penalty/robust.jl @@ -5,7 +5,7 @@ y = randn(rng, 10) r = x .- y @testset "Robust Loss" begin - δ = 0.5 + δ = 0.75 rlδ = RobustLoss(Huber(δ)) @test rlδ isa RobustLoss{HuberRho{δ}} @test rlδ(r) == 
rlδ(x, y) == sum(ifelse(abs(rᵢ)≤δ, rᵢ^2/2, δ*(abs(rᵢ)-δ/2)) for rᵢ in r) diff --git a/test/runtests.jl b/test/runtests.jl index 3d7099c..73cba12 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,7 +3,8 @@ using Random, StableRNGs, DataFrames, ForwardDiff import Optim import MLJ, MLJBase -DO_COMPARISONS = false; include("testutils.jl") +DO_COMPARISONS = true +include("testutils.jl") m("UTILS"); include("utils.jl") @@ -31,3 +32,7 @@ m("MLJ", false); begin mm("fit-predict"); include("interface/fitpredict.jl") mm("extras"); include("interface/extras.jl") end + +m("MISC", false); begin + mm("benchmarks"); include("benchmarks/robust.jl") +end From 0b48318e246fea0f3cfa6322427d732500a5b839 Mon Sep 17 00:00:00 2001 From: tlienart Date: Wed, 24 May 2023 08:36:01 +0200 Subject: [PATCH 3/5] small adjustments + typo fixes --- test/Project.toml | 1 + test/benchmarks/robust.jl | 4 ++-- test/fit/quantile.jl | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/test/Project.toml b/test/Project.toml index 30cbf32..ac1e430 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -2,6 +2,7 @@ CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" +Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" diff --git a/test/benchmarks/robust.jl b/test/benchmarks/robust.jl index d5a8e1c..6fcd739 100644 --- a/test/benchmarks/robust.jl +++ b/test/benchmarks/robust.jl @@ -2,9 +2,9 @@ if DO_COMPARISONS @testset "Comp-QR-147" begin - using CSV, DataFrames + using CSV, DataFrames, Downloads - dataset = CSV.read(download("http://freakonometrics.free.fr/rent98_00.txt"), DataFrame) + dataset = CSV.read(Downloads.download("http://freakonometrics.free.fr/rent98_00.txt"), DataFrame) tau = 0.3 y = Vector(dataset[!,:rent_euro]) diff --git a/test/fit/quantile.jl b/test/fit/quantile.jl index e4ce57a..d99eeb0 100644 --- a/test/fit/quantile.jl +++ b/test/fit/quantile.jl @@ -74,7 +74,7 @@ end ## With Sparsity penalty ## ########################### -Jn, p = 500, 100 +n, p = 500, 100 ((X, y, θ), (X1, y1, θ1)) = generate_continuous(n, p; seed=51112, sparse=0.1) # pepper with outliers y1a = outlify(y1, 0.1) From 14eb46619bfa0983ff5d154393b268158e107767 Mon Sep 17 00:00:00 2001 From: adienes <51664769+adienes@users.noreply.github.com> Date: Mon, 2 Oct 2023 17:09:12 -0400 Subject: [PATCH 4/5] first pass at Gramian training for OLS (#146) * proof of concept * AbstractMatrix -> AVR * cleaner impl * endline * fix error type * construct kernels if not passed in * add test case for implicit gram construction * last endline * check for isempty instead of iszero --- src/fit/default.jl | 19 ++++++++++++++----- src/fit/proxgrad.jl | 17 +++++++++++++---- src/fit/solvers.jl | 1 + src/glr/d_l2loss.jl | 9 +++++++++ src/glr/utils.jl | 5 ++++- src/mlj/classifiers.jl | 4 ++-- src/utils.jl | 23 +++++++++++++++++++++++ test/fit/ols-ridge-lasso-elnet.jl | 15 +++++++++++++++ 8 files changed, 81 insertions(+), 12 deletions(-) diff --git a/src/fit/default.jl b/src/fit/default.jl index 5bd5575..459853c 100644 --- a/src/fit/default.jl +++ b/src/fit/default.jl @@ -33,13 +33,22 @@ $SIGNATURES Fit a generalised linear regression model using an appropriate solver based on the loss and penalty of the model. A method can, in some cases, be specified. 
""" -function fit(glr::GLR, X::AbstractMatrix{<:Real}, y::AVR; +function fit(glr::GLR, X::AbstractMatrix{<:Real}, y::AVR; data=nothing, solver::Solver=_solver(glr, size(X))) - check_nrows(X, y) - n, p = size(X) - c = getc(glr, y) - return _fit(glr, solver, X, y, scratch(n, p, c, i=glr.fit_intercept)) + if hasproperty(solver, :gram) && solver.gram + # interpret X,y as X'X, X'y + data = verify_or_construct_gramian(glr, X, y, data) + p = size(data.XX, 2) + return _fit(glr, solver, data.XX, data.Xy, (; dims=(data.n, p, 0))) + else + check_nrows(X, y) + n, p = size(X) + c = getc(glr, y) + return _fit(glr, solver, X, y, scratch(n, p, c, i=glr.fit_intercept)) + end end +fit(glr::GLR; kwargs...) = fit(glr, zeros((0,0)), zeros((0,)); kwargs...) + function scratch(n, p, c=0; i=false) p_ = p + Int(i) diff --git a/src/fit/proxgrad.jl b/src/fit/proxgrad.jl index bebd1f0..1a04fe1 100644 --- a/src/fit/proxgrad.jl +++ b/src/fit/proxgrad.jl @@ -3,7 +3,7 @@ # Assumption: loss has gradient; penalty has prox e.g.: Lasso # J(θ) = f(θ) + r(θ) where f is smooth function _fit(glr::GLR, solver::ProxGrad, X, y, scratch) - _,p,c = npc(scratch) + n,p,c = npc(scratch) c > 0 && (p *= c) # vector caches + eval cache θ = zeros(p) # θ_k @@ -19,9 +19,18 @@ function _fit(glr::GLR, solver::ProxGrad, X, y, scratch) η = 1.0 # stepsize (1/L) acc = ifelse(solver.accel, 1.0, 0.0) # if 0, no extrapolation (ISTA) # functions - _f = smooth_objective(glr, X, y; c=c) - _fg! = smooth_fg!(glr, X, y, scratch) - _prox! = prox!(glr, size(X, 1)) + _f = if solver.gram + smooth_gram_objective(glr, X, y, n) + else + smooth_objective(glr, X, y; c=c) + end + + _fg! = if solver.gram + smooth_gram_fg!(glr, X, y, n) + else + smooth_fg!(glr, X, y, scratch) + end + _prox! = prox!(glr, n) bt_cond = θ̂ -> _f(θ̂) > fθ̄ + dot(θ̂ .- θ̄, ∇fθ̄) + sum(abs2.(θ̂ .- θ̄)) / (2η) # loop-related diff --git a/src/fit/solvers.jl b/src/fit/solvers.jl index 93655a3..aaeccb2 100644 --- a/src/fit/solvers.jl +++ b/src/fit/solvers.jl @@ -133,6 +133,7 @@ Proximal Gradient solver for non-smooth objective functions. tol::Float64 = 1e-4 # tol relative change of θ i.e. norm(θ-θ_)/norm(θ) max_inner::Int = 100 # β^max_inner should be > 1e-10 beta::Float64 = 0.8 # in (0, 1); shrinkage in the backtracking step + gram::Bool = false # use precomputed Gramian for lsq where possible end FISTA(; kwa...) = ProxGrad(;accel = true, kwa...) diff --git a/src/glr/d_l2loss.jl b/src/glr/d_l2loss.jl index 3160a9a..1184c9f 100644 --- a/src/glr/d_l2loss.jl +++ b/src/glr/d_l2loss.jl @@ -72,3 +72,12 @@ function smooth_fg!(glr::GLR{L2Loss,<:ENR}, X, y, scratch) return glr.loss(r) + get_l2(glr.penalty)(view_θ(glr, θ)) end end + +function smooth_gram_fg!(glr::GLR{L2Loss,<:ENR}, XX, Xy, n) + λ = get_penalty_scale_l2(glr, n) + (g, θ) -> begin + _g = XX * θ .- Xy + g .= _g .+ λ .* θ + return θ'*_g + get_l2(glr.penalty)(view_θ(glr, θ)) + end +end diff --git a/src/glr/utils.jl b/src/glr/utils.jl index 026f0ff..102da1d 100644 --- a/src/glr/utils.jl +++ b/src/glr/utils.jl @@ -1,4 +1,4 @@ -export objective, smooth_objective +export objective, smooth_objective, smooth_gram_objective # NOTE: RobustLoss are not always everywhere smooth but "smooth-enough". const SmoothLoss = Union{L2Loss, LogisticLoss, MultinomialLoss, RobustLoss} @@ -37,6 +37,9 @@ Return the smooth part of the objective function of a GLR. """ smooth_objective(glr::GLR{<:SmoothLoss,<:ENR}, n) = glr.loss + get_l2(glr.penalty) * ifelse(glr.scale_penalty_with_samples, n, 1.) 
+smooth_gram_objective(glr::GLR{<:SmoothLoss,<:ENR}, XX, Xy, n) = + θ -> (θ'*XX*θ)/2 - (θ'*Xy) + (get_l2(glr.penalty) * ifelse(glr.scale_penalty_with_samples, n, 1.))(θ) + smooth_objective(::GLR) = @error "Case not implemented yet." """ diff --git a/src/mlj/classifiers.jl b/src/mlj/classifiers.jl index 75ffa31..8b04a87 100644 --- a/src/mlj/classifiers.jl +++ b/src/mlj/classifiers.jl @@ -65,7 +65,7 @@ See also [`MultinomialClassifier`](@ref). """some instance of `MLJLinearModels.S` where `S` is one of: `LBFGS`, `Newton`, `NewtonCG`, `ProxGrad`; but subject to the following restrictions: - - If `penalty = :l2`, `ProxGrad` is disallowed. Otherwise, `ProxyGrad` is the only + - If `penalty = :l2`, `ProxGrad` is disallowed. Otherwise, `ProxGrad` is the only option. - Unless `scitype(y) <: Finite{2}` (binary target) `Newton` is disallowed. @@ -142,7 +142,7 @@ See also [`LogisticClassifier`](@ref). """some instance of `MLJLinearModels.S` where `S` is one of: `LBFGS`, `NewtonCG`, `ProxGrad`; but subject to the following restrictions: - - If `penalty = :l2`, `ProxGrad` is disallowed. Otherwise, `ProxyGrad` is the only + - If `penalty = :l2`, `ProxGrad` is disallowed. Otherwise, `ProxGrad` is the only option. - Unless `scitype(y) <: Finite{2}` (binary target) `Newton` is disallowed. diff --git a/src/utils.jl b/src/utils.jl index 4e6d952..36b93ba 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -9,6 +9,29 @@ function check_nrows(X::AbstractMatrix, y::AbstractVecOrMat)::Nothing throw(DimensionMismatch("`X` and `y` must have the same number of rows.")) end +function verify_or_construct_gramian(glr, X, y, data) + check_nrows(X, y) + isnothing(data) && return (; XX = X'X, Xy = X'y, n = length(y)) + + !all(hasproperty.(Ref(data), (:XX, :Xy, :n))) && throw(ArgumentError("data must contain XX, Xy, n")) + size(data.XX, 1) != size(data.Xy, 1) && throw(DimensionMismatch("`XX` and Xy` must have the same number of rows.")) + !issymmetric(data.XX) && throw(ArgumentError("Input `XX` must be symmetric")) + + c = getc(glr, data.Xy) + !iszero(c) && throw(ArgumentError("Categorical loss not supported with Gramian kernel")) + glr.fit_intercept && throw(ArgumentError("Intercept not supported with Gramian kernel")) + + if any(!isempty, (X, y)) + all(( + isapprox(X'X, data.XX; rtol=1e-5), + isapprox(X'y, data.Xy; rtol=1e-5), + length(y) == data.n + )) || throw(ArgumentError("Inputs `X` and `y` do not match inputs `XX` and `Xy`.")) + end + + return data +end + """ $SIGNATURES diff --git a/test/fit/ols-ridge-lasso-elnet.jl b/test/fit/ols-ridge-lasso-elnet.jl index 2a99168..3c9a1c6 100644 --- a/test/fit/ols-ridge-lasso-elnet.jl +++ b/test/fit/ols-ridge-lasso-elnet.jl @@ -146,3 +146,18 @@ end @test nnz(θ_sk) == 8 end end + +@testset "gramian" begin + λ = 0.1 + γ = 0.1 + enr = ElasticNetRegression(λ, γ; fit_intercept=false, + scale_penalty_with_samples=false) + XX = X'X + Xy = X'y + n = size(X, 1) + θ_fista = fit(enr, X, y; solver=FISTA(max_iter=5000)) + θ_gram_explicit = fit(enr; data=(; XX, Xy, n), solver=FISTA(max_iter=5000, gram=true)) + θ_gram_implicit = fit(enr, X, y; solver=FISTA(max_iter=5000, gram=true)) + @test isapprox(θ_fista, θ_gram_explicit, rtol=1e-5) + @test isapprox(θ_gram_explicit, θ_gram_implicit; rtol=1e-5) +end From 88b5c5f267b972b3c6dcbc9e1bb344deb88351de Mon Sep 17 00:00:00 2001 From: Thibaut Lienart Date: Mon, 2 Oct 2023 23:10:05 +0200 Subject: [PATCH 5/5] Prepare minor release with gramian training --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml 
b/Project.toml
index dbae3c8..b4773b6 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "MLJLinearModels"
 uuid = "6ee0df7b-362f-4a72-a706-9e79364fb692"
 authors = ["Thibaut Lienart "]
-version = "0.9.2"
+version = "0.10.0"
 
 [deps]
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
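
A minimal usage sketch of the Gramian training path introduced in PATCH 4/5, modelled on the `gramian` testset added to `test/fit/ols-ridge-lasso-elnet.jl`. The random data and hyperparameter values below are illustrative assumptions, not part of the patches; as the new `verify_or_construct_gramian` check enforces, this path currently requires `fit_intercept=false` and a non-categorical loss.

```julia
# Sketch only: exercising the Gramian ("sufficient statistics") fitting path
# added in PATCH 4/5. Data and hyperparameters are made up for illustration.
using MLJLinearModels

X, y = randn(500, 10), randn(500)
enr  = ElasticNetRegression(0.1, 0.1; fit_intercept=false,
                            scale_penalty_with_samples=false)

# Precompute X'X and X'y once and pass them to `fit` directly ...
data = (; XX = X'X, Xy = X'y, n = size(X, 1))
θ_gram = fit(enr; data=data, solver=FISTA(max_iter=5000, gram=true))

# ... or let `fit` build the same quantities internally from X and y.
θ_full = fit(enr, X, y; solver=FISTA(max_iter=5000, gram=true))
```

Once `XX`, `Xy` and `n` are available, refitting (for example with a different penalty strength) never has to touch the full `n × p` design matrix again, which is the practical payoff of the `gram=true` solver option.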