From 4d8460fee2228be2458f282113209aa0ac266ac8 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Tue, 14 Feb 2023 20:48:18 +1300 Subject: [PATCH 1/5] fix a doc typo --- src/mlj/regressors.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlj/regressors.jl b/src/mlj/regressors.jl index 335fd70..e821beb 100644 --- a/src/mlj/regressors.jl +++ b/src/mlj/regressors.jl @@ -104,7 +104,7 @@ See also [`ElasticNetRegressor`](@ref). "whether to scale the penalty with the number of observations." scale_penalty_with_samples::Bool = true """any instance of `MLJLinearModels.Analytical`. Use `Analytical()` for - Cholesky and `CG()=Analytical(iteration=true)` for conjugate-gradient. + Cholesky and `CG()=Analytical(iterative=true)` for conjugate-gradient. If `solver = nothing` (default) then `Analytical()` is used. """ solver::Option{Solver} = nothing end From c7757180444cee87cd12384dd386f83411241bb2 Mon Sep 17 00:00:00 2001 From: tlienart Date: Tue, 23 May 2023 11:27:23 +0200 Subject: [PATCH 2/5] fixes following discussion around #147 --- .github/workflows/ci.yml | 2 + Project.toml | 2 +- src/fit/solvers.jl | 21 +++++--- src/loss-penalty/robust.jl | 69 ++++++++++++++++++------- test/Project.toml | 1 + test/benchmarks/elementary_functions.jl | 2 +- test/benchmarks/logistic-multinomial.jl | 2 +- test/benchmarks/ridge-lasso.jl | 2 +- test/benchmarks/robust.jl | 29 +++++++++++ test/fit/quantile.jl | 25 ++++++--- test/loss-penalty/robust.jl | 2 +- test/runtests.jl | 7 ++- 12 files changed, 126 insertions(+), 38 deletions(-) create mode 100644 test/benchmarks/robust.jl diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e5ea73c..8d918c1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -41,6 +41,8 @@ jobs: ${{ runner.os }}- - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 + env: + RUN_COMPARISONS: "false" - uses: julia-actions/julia-processcoverage@v1 - uses: codecov/codecov-action@v1 with: diff --git a/Project.toml b/Project.toml index 9b54ecf..dbae3c8 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MLJLinearModels" uuid = "6ee0df7b-362f-4a72-a706-9e79364fb692" authors = ["Thibaut Lienart "] -version = "0.9.1" +version = "0.9.2" [deps] DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" diff --git a/src/fit/solvers.jl b/src/fit/solvers.jl index b9ef8c2..93655a3 100644 --- a/src/fit/solvers.jl +++ b/src/fit/solvers.jl @@ -46,11 +46,14 @@ Newton solver. This is a full Hessian solver and should be avoided for `newton_options` are the [options of Newton's method](https://julianlsolvers.github.io/Optim.jl/stable/#algo/newton/) ## Example + ```julia using MLJLinearModels, Optim -solver = MLJLinearModels.Newton(optim_options = Optim.Options(time_limit = 20), - newton_options = (linesearch = Optim.LineSearches.HagerZhang()),)) +solver = MLJLinearModels.Newton( + optim_options = Optim.Options(time_limit = 20), + newton_options = (linesearch = Optim.LineSearches.HagerZhang()),) +) ``` """ @with_kw struct Newton{O,S} <: Solver @@ -70,13 +73,15 @@ generally be preferred for larger scale cases. 
`newtoncg_options` are the [options of Krylov Trust Region method](https://github.com/JuliaNLSolvers/Optim.jl/blob/master/src/multivariate/solvers/second_order/krylov_trust_region.jl) ## Example + ```julia using MLJLinearModels, Optim -solver = MLJLinearModels.Newton(optim_options = Optim.Options(time_limit = 20), - newtoncg_options = (eta = 0.2,)) +solver = MLJLinearModels.NewtonCG( + optim_options = Optim.Options(time_limit = 20), + newtoncg_options = (eta = 0.2,) +) ``` - """ @with_kw struct NewtonCG{O,S} <: Solver optim_options::O = Optim.Options(f_tol=1e-4) @@ -95,8 +100,10 @@ LBFGS quasi-Newton solver. See [the wikipedia entry](https://en.wikipedia.org/wi ```julia using MLJLinearModels, Optim -solver = MLJLinearModels.Newton(optim_options = Optim.Options(time_limit = 20), - lbfgs_options = (linesearch = Optim.LineSearches.HagerZhang()),)) +solver = MLJLinearModels.LBFGS( + optim_options = Optim.Options(time_limit = 20), + lbfgs_options = (linesearch = Optim.LineSearches.HagerZhang()),) +) ``` """ @with_kw struct LBFGS{O,S} <: Solver diff --git a/src/loss-penalty/robust.jl b/src/loss-penalty/robust.jl index bae7449..65c76c6 100644 --- a/src/loss-penalty/robust.jl +++ b/src/loss-penalty/robust.jl @@ -3,20 +3,37 @@ export RobustLoss, BisquareRho, Bisquare, LogisticRho, Logistic, FairRho, Fair, TalwarRho, Talwar, QuantileRho, Quantile +#= +In the non-penalised case: + + β⋆ = arg min ∑ ρ(yᵢ - ⟨xᵢ, β⟩) + +where ρ is a weighing function such as, for instance, the pinball loss for +the quantile regression. + +It is useful to define the following quantities: + + ψ(r) = ρ'(r) (first derivative) + ϕ(r) = ψ'(r) (second derivative) + ω(r) = ψ(r)/r (weighing function used for IWLS), a threshold can be passed + to clip weights + +Some refs: +- https://josephsalmon.eu/enseignement/UW/STAT593/QuantileRegression.pdf +=# + abstract type RobustRho end -abstract type RobustRho1P{δ} <: RobustRho end # one parameter +# robust rho with only one parameter +abstract type RobustRho1P{δ} <: RobustRho end struct RobustLoss{ρ} <: AtomicLoss where ρ <: RobustRho rho::ρ end -(rl::RobustLoss)(x::AVR, y::AVR) = rl(x .- y) -(rl::RobustLoss)(r::AVR) = rl.rho(r) +(rl::RobustLoss)(Xβ::AVR, y::AVR) = rl(y .- Xβ) +(rl::RobustLoss)(r::AVR) = rl.rho(r) -# ψ(r) = ρ'(r) (first derivative) -# ω(r) = ψ(r)/r (weighing function) a thresh can be passed to clip weights -# ϕ(r) = ψ'(r) (second derivative) """ $TYPEDEF @@ -24,6 +41,8 @@ $TYPEDEF Huber weighing of the residuals corresponding to ``ρ(z) = z²/2`` if `|z|≤δ` and `ρ(z)=δ(|z|-δ/2)` otherwise. + +Note: symmetric weighing. """ struct HuberRho{δ} <: RobustRho1P{δ} HuberRho(δ::Real=1.0; delta::Real=δ) = new{delta}() @@ -33,7 +52,7 @@ Huber(δ::Real=1.0; delta::Real=δ) = HuberRho(delta) (::HuberRho{δ})(r::AVR) where δ = begin ar = abs.(r) w = ar .<= δ - return sum( r.^2/2 .* w .+ δ .* (ar .- δ/2) .* .!w ) + return sum( @. ifelse(w, r^2/2, δ * (ar - δ/2) ) ) end ψ(::Type{HuberRho{δ}} ) where δ = (r, w) -> r * w + δ * sign(r) * (1.0 - w) @@ -47,6 +66,8 @@ $TYPEDEF Andrews weighing of the residuals corresponding to ``ρ(z) = -cos(πz/δ)/(π/δ)²`` if `|z|≤δ` and `ρ(δ)` otherwise. + +Note: symmetric weighing. """ struct AndrewsRho{δ} <: RobustRho1P{δ} AndrewsRho(δ::Real=1.0; delta::Real=δ) = new{delta}() @@ -58,7 +79,7 @@ Andrews(δ::Real=1.0; delta::Real=δ) = AndrewsRho(delta) w = ar .<= δ c = π/δ κ = (δ/π)^2 - return sum( -cos.(c .* r) .* κ .* w .+ κ .* .!w ) + return sum( @. 
ifelse(w, -cos(c * r) * κ, κ) ) end # Note, sinc(x) = sin(πx)/πx, well defined everywhere @@ -74,6 +95,8 @@ $TYPEDEF Bisquare weighing of the residuals corresponding to ``ρ(z) = δ²/6 (1-(1-(z/δ)²)³)`` if `|z|≤δ` and `δ²/6` otherwise. + +Note: symmetric weighing. """ struct BisquareRho{δ} <: RobustRho1P{δ} BisquareRho(δ::Real=1.0; delta::Real=δ) = new{delta}() @@ -84,7 +107,7 @@ Bisquare(δ::Real=1.0; delta::Real=δ) = BisquareRho(delta) ar = abs.(r) w = ar .<= δ κ = δ^2/6 - return sum( κ * (1.0 .- (1.0 .- (r ./ δ).^2).^3) .* w + κ .* .!w ) + return sum( @. ifelse(w, κ * (1 - (1 - (r / δ)^2)^3), κ) ) end ψ(::Type{BisquareRho{δ}} ) where δ = (r, w) -> w * r * (1.0 - (r / δ)^2)^2 @@ -97,6 +120,8 @@ $TYPEDEF Logistic weighing of the residuals corresponding to ``ρ(z) = δ² log(cosh(z/δ))`` + +Note: symmetric weighing. """ struct LogisticRho{δ} <: RobustRho1P{δ} LogisticRho(δ::Real=1.0; delta::Real=δ) = new{delta}() @@ -104,7 +129,7 @@ end Logistic(δ::Real=1.0; delta::Real=δ) = LogisticRho(delta) (::LogisticRho{δ})(r::AVR) where δ = begin - return sum( δ^2 .* log.(cosh.(r ./ δ)) ) + return sum( @. δ^2 * log(cosh(r / δ)) ) end # similar to sinc, to avoid NaNs if tanh(0)/0 (lim is 1.0) @@ -121,6 +146,8 @@ $TYPEDEF Fair weighing of the residuals corresponding to ``ρ(z) = δ² (|z|/δ - log(1+|z|/δ))`` + +Note: symmetric weighing. """ struct FairRho{δ} <: RobustRho1P{δ} FairRho(δ::Real=1.0; delta::Real=δ) = new{delta}() @@ -128,8 +155,8 @@ end Fair(δ::Real=1.0; delta::Real=δ) = FairRho(delta) (::FairRho{δ})(r::AVR) where δ = begin - sr = abs.(r) ./ δ - return sum( δ^2 .* (sr .- log1p.(sr)) ) + sr = @. abs(r) / δ + return sum( @. δ^2 * (sr - log1p(sr)) ) end ψ(::Type{FairRho{δ}} ) where δ = (r, _) -> δ * r / (abs(r) + δ) @@ -143,6 +170,8 @@ $TYPEDEF Talwar weighing of the residuals corresponding to ``ρ(z) = z²/2`` if `|z|≤δ` and `ρ(z)=ρ(δ)` otherwise. + +Note: symmetric weighing. """ struct TalwarRho{δ} <: RobustRho1P{δ} TalwarRho(δ::Real=1.0; delta::Real=δ) = new{delta}() @@ -150,8 +179,8 @@ end Talwar(δ::Real=1.0; delta::Real=δ) = TalwarRho(delta) (::TalwarRho{δ})(r::AVR) where δ = begin - w = abs.(r) .<= δ - return sum( r.^2 ./ 2 .* w .+ δ^2/2 .* .!w) + w = @. abs(r) <= δ + return sum( @. ifelse(w, r^2 / 2, δ^2/2) ) end ψ(::Type{TalwarRho{δ}} ) where δ = (r, w) -> w * r @@ -164,7 +193,11 @@ $TYPEDEF Quantile regression weighing of the residuals corresponding to -``ρ(z) = z(δ - 1(z<0))`` +``ρ(z) = -z(δ - 1(z>=0))`` + +Note: asymetric weighing, the "-" sign is because similar libraries like +quantreg for instance define the residual as `y-Xθ` while we do the opposite +(out of convenience for gradients etc). """ struct QuantileRho{δ} <: RobustRho1P{δ} QuantileRho(δ::Real=1.0; delta::Real=δ) = new{delta}() @@ -173,9 +206,9 @@ end Quantile(δ::Real=1.0; delta::Real=δ) = QuantileRho(delta) (::QuantileRho{δ})(r::AVR) where δ = begin - return sum( r .* (δ .- (r .<= 0.0)) ) + return sum( @. 
-r * (δ - (r >= 0)) ) end -ψ(::Type{QuantileRho{δ}} ) where δ = (r, _) -> (δ - (r <= 0.0)) -ω(::Type{QuantileRho{δ}}, τ) where δ = (r, _) -> (δ - (r <= 0.0)) / clip(r, τ) +ψ(::Type{QuantileRho{δ}} ) where δ = (r, _) -> ((r >= 0.0) - δ) +ω(::Type{QuantileRho{δ}}, τ) where δ = (r, _) -> ((r >= 0.0) - δ) / clip(-r, τ) ϕ(::Type{QuantileRho{δ}} ) where δ = (_, _) -> error("Newton(CG) not available for Quantile Reg.") diff --git a/test/Project.toml b/test/Project.toml index 3190290..30cbf32 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,4 +1,5 @@ [deps] +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" diff --git a/test/benchmarks/elementary_functions.jl b/test/benchmarks/elementary_functions.jl index c069c00..18ac322 100644 --- a/test/benchmarks/elementary_functions.jl +++ b/test/benchmarks/elementary_functions.jl @@ -1,6 +1,6 @@ using MLJLinearModels using BenchmarkTools, Random, LinearAlgebra -DO_COMPARISONS = false; include("../testutils.jl") +include("../testutils.jl") n, p = 50_000, 500 ((X, y, θ), (X1, y1, θ1)) = generate_continuous(n, p; seed=512, sparse=0.5) diff --git a/test/benchmarks/logistic-multinomial.jl b/test/benchmarks/logistic-multinomial.jl index 16b3e9e..0554fde 100644 --- a/test/benchmarks/logistic-multinomial.jl +++ b/test/benchmarks/logistic-multinomial.jl @@ -1,6 +1,6 @@ using MLJLinearModels using BenchmarkTools, Random, LinearAlgebra -DO_COMPARISONS = false; include("../testutils.jl") +include("../testutils.jl") n, p = 50_000, 500 ((X, y, θ), (X1, y1, θ1)) = generate_binary(n, p; seed=525) diff --git a/test/benchmarks/ridge-lasso.jl b/test/benchmarks/ridge-lasso.jl index 2e50c43..46d9c40 100644 --- a/test/benchmarks/ridge-lasso.jl +++ b/test/benchmarks/ridge-lasso.jl @@ -1,6 +1,6 @@ using MLJLinearModels, StableRNGs using BenchmarkTools, Random, LinearAlgebra -DO_COMPARISONS = false; include("../testutils.jl") +include("../testutils.jl") n, p = 50_000, 500 ((X, y, θ), (X1, y1, θ1)) = generate_continuous(n, p; seed=512, sparse=0.5) diff --git a/test/benchmarks/robust.jl b/test/benchmarks/robust.jl new file mode 100644 index 0000000..d5a8e1c --- /dev/null +++ b/test/benchmarks/robust.jl @@ -0,0 +1,29 @@ +# Follow up from issue #147 comparing quantreg more specifically. 
+ +if DO_COMPARISONS + @testset "Comp-QR-147" begin + using CSV, DataFrames + + dataset = CSV.read(download("http://freakonometrics.free.fr/rent98_00.txt"), DataFrame) + tau = 0.3 + + y = Vector(dataset[!,:rent_euro]) + X = Matrix(dataset[!,[:area, :yearc]]) + X1 = hcat(X[:,1], X[:, 2], ones(size(X, 1))) + + qr = QuantileRegression(tau; penalty=:none) + obj = objective(qr, X, y) + + θ_lbfgs = fit(qr, X, y) + @test isapprox(obj(θ_lbfgs), 226_639, rtol=1e-4) + + # in this case QR with BR method does better + θ_qr_br = rcopy(getproperty(QUANTREG, :rq_fit_br)(X1, y; tau=tau))[:coefficients] + @test isapprox(obj(θ_qr_br), 207_551, rtol=1e-4) + + # lasso doesn't + θ_qr_lasso = rcopy(getproperty(QUANTREG, :rq_fit_lasso)(X1, y; tau=tau))[:coefficients] + obj(θ_qr_lasso) # 229_172 + @test 228_000 ≤ obj(θ_qr_lasso) ≤ 231_000 + end +end diff --git a/test/fit/quantile.jl b/test/fit/quantile.jl index f1e6105..e4ce57a 100644 --- a/test/fit/quantile.jl +++ b/test/fit/quantile.jl @@ -46,16 +46,27 @@ y1a = outlify(y1, 0.1) θ_ls = fit(LinearRegression(), X, y1a) θ_lbfgs = fit(rr, X, y1a, solver=LBFGS()) θ_iwls = fit(rr, X, y1a, solver=IWLSCG()) - θ_qr_br = rcopy(QUANTREG.rq_fit_br(X1, y1a))[:coefficients] - θ_qr_fnb = rcopy(QUANTREG.rq_fit_fnb(X1, y1a))[:coefficients] + θ_qr_br = rcopy(QUANTREG.rq_fit_br(X1, y1a, tau=δ))[:coefficients] + θ_qr_fnb = rcopy(QUANTREG.rq_fit_fnb(X1, y1a, tau=δ))[:coefficients] # NOTE: we take θ_qr_br as reference point @test isapprox(J(θ_ls), 505.45286, rtol=1e-5) @test J(θ_qr_br) ≈ 409.570777 # <- ref value # Their IP algorithm essentially gives the same answer @test (J(θ_qr_fnb) - J(θ_qr_br)) ≤ 1e-10 # Our algorithms are close enough - @test isapprox(J(θ_lbfgs), 409.57154, rtol=1e-5) + @test isapprox(J(θ_lbfgs), 409.57608, rtol=1e-5) @test isapprox(J(θ_iwls), 409.59, rtol=1e-4) + + # Let's try this again but with a δ different from 0.5 + δ = 0.75 + rr = QuantileRegression(δ, lambda=0) + J = objective(rr, X, y1a) + + θ_lbfgs = fit(rr, X, y1a, solver=LBFGS()) + θ_qr_br = rcopy(QUANTREG.rq_fit_br(X1, y1a, tau=δ))[:coefficients] + + @test isapprox(J(θ_qr_br), 404.6161, rtol=1e-4) + @test isapprox(J(θ_lbfgs), 404.6195, rtol=1e-4) end end @@ -63,7 +74,7 @@ end ## With Sparsity penalty ## ########################### -n, p = 500, 100 +Jn, p = 500, 100 ((X, y, θ), (X1, y1, θ1)) = generate_continuous(n, p; seed=51112, sparse=0.1) # pepper with outliers y1a = outlify(y1, 0.1) @@ -90,9 +101,9 @@ y1a = outlify(y1, 0.1) θ_ls = X1 \ y1a θ_fista = fit(rr, X, y1a, solver=FISTA()) θ_ista = fit(rr, X, y1a, solver=ISTA()) - θ_qr_lasso = rcopy(QUANTREG.rq_fit_lasso(X1, y1a))[:coefficients] + θ_qr_lasso = rcopy(QUANTREG.rq_fit_lasso(X1, y1a, lambda=λ))[:coefficients] @test isapprox(J(θ_ls), 888.3748, rtol=1e-5) - @test isapprox(J(θ_qr_lasso), 425.5, rtol=1e-3) + @test isapprox(J(θ_qr_lasso), 425.2, rtol=1e-3) # Our algorithms are close enough @test isapprox(J(θ_fista), 425.0526, rtol=1e-5) @test isapprox(J(θ_ista), 425.4113, rtol=1e-5) @@ -101,6 +112,6 @@ y1a = outlify(y1, 0.1) @test nnz(θ_fista) == 88 @test nnz(θ_ista) == 82 # in this case fista is best - @test J(θ_fista) < J(θ_ista) < J(θ_qr_lasso) + @test J(θ_fista) < J(θ_qr_lasso) < J(θ_ista) end end diff --git a/test/loss-penalty/robust.jl b/test/loss-penalty/robust.jl index 45e98ed..0abfbfa 100644 --- a/test/loss-penalty/robust.jl +++ b/test/loss-penalty/robust.jl @@ -5,7 +5,7 @@ y = randn(rng, 10) r = x .- y @testset "Robust Loss" begin - δ = 0.5 + δ = 0.75 rlδ = RobustLoss(Huber(δ)) @test rlδ isa RobustLoss{HuberRho{δ}} @test rlδ(r) == 
rlδ(x, y) == sum(ifelse(abs(rᵢ)≤δ, rᵢ^2/2, δ*(abs(rᵢ)-δ/2)) for rᵢ in r) diff --git a/test/runtests.jl b/test/runtests.jl index 3d7099c..73cba12 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,7 +3,8 @@ using Random, StableRNGs, DataFrames, ForwardDiff import Optim import MLJ, MLJBase -DO_COMPARISONS = false; include("testutils.jl") +DO_COMPARISONS = true +include("testutils.jl") m("UTILS"); include("utils.jl") @@ -31,3 +32,7 @@ m("MLJ", false); begin mm("fit-predict"); include("interface/fitpredict.jl") mm("extras"); include("interface/extras.jl") end + +m("MISC", false); begin + mm("benchmarks"); include("benchmarks/robust.jl") +end From 0b48318e246fea0f3cfa6322427d732500a5b839 Mon Sep 17 00:00:00 2001 From: tlienart Date: Wed, 24 May 2023 08:36:01 +0200 Subject: [PATCH 3/5] small adjustments + typo fixes --- test/Project.toml | 1 + test/benchmarks/robust.jl | 4 ++-- test/fit/quantile.jl | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/test/Project.toml b/test/Project.toml index 30cbf32..ac1e430 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -2,6 +2,7 @@ CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" +Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" diff --git a/test/benchmarks/robust.jl b/test/benchmarks/robust.jl index d5a8e1c..6fcd739 100644 --- a/test/benchmarks/robust.jl +++ b/test/benchmarks/robust.jl @@ -2,9 +2,9 @@ if DO_COMPARISONS @testset "Comp-QR-147" begin - using CSV, DataFrames + using CSV, DataFrames, Downloads - dataset = CSV.read(download("http://freakonometrics.free.fr/rent98_00.txt"), DataFrame) + dataset = CSV.read(Downloads.download("http://freakonometrics.free.fr/rent98_00.txt"), DataFrame) tau = 0.3 y = Vector(dataset[!,:rent_euro]) diff --git a/test/fit/quantile.jl b/test/fit/quantile.jl index e4ce57a..d99eeb0 100644 --- a/test/fit/quantile.jl +++ b/test/fit/quantile.jl @@ -74,7 +74,7 @@ end ## With Sparsity penalty ## ########################### -Jn, p = 500, 100 +n, p = 500, 100 ((X, y, θ), (X1, y1, θ1)) = generate_continuous(n, p; seed=51112, sparse=0.1) # pepper with outliers y1a = outlify(y1, 0.1) From 14eb46619bfa0983ff5d154393b268158e107767 Mon Sep 17 00:00:00 2001 From: adienes <51664769+adienes@users.noreply.github.com> Date: Mon, 2 Oct 2023 17:09:12 -0400 Subject: [PATCH 4/5] first pass at Gramian training for OLS (#146) * proof of concept * AbstractMatrix -> AVR * cleaner impl * endline * fix error type * construct kernels if not passed in * add test case for implicit gram construction * last endline * check for isempty instead of iszero --- src/fit/default.jl | 19 ++++++++++++++----- src/fit/proxgrad.jl | 17 +++++++++++++---- src/fit/solvers.jl | 1 + src/glr/d_l2loss.jl | 9 +++++++++ src/glr/utils.jl | 5 ++++- src/mlj/classifiers.jl | 4 ++-- src/utils.jl | 23 +++++++++++++++++++++++ test/fit/ols-ridge-lasso-elnet.jl | 15 +++++++++++++++ 8 files changed, 81 insertions(+), 12 deletions(-) diff --git a/src/fit/default.jl b/src/fit/default.jl index 5bd5575..459853c 100644 --- a/src/fit/default.jl +++ b/src/fit/default.jl @@ -33,13 +33,22 @@ $SIGNATURES Fit a generalised linear regression model using an appropriate solver based on the loss and penalty of the model. A method can, in some cases, be specified. 
""" -function fit(glr::GLR, X::AbstractMatrix{<:Real}, y::AVR; +function fit(glr::GLR, X::AbstractMatrix{<:Real}, y::AVR; data=nothing, solver::Solver=_solver(glr, size(X))) - check_nrows(X, y) - n, p = size(X) - c = getc(glr, y) - return _fit(glr, solver, X, y, scratch(n, p, c, i=glr.fit_intercept)) + if hasproperty(solver, :gram) && solver.gram + # interpret X,y as X'X, X'y + data = verify_or_construct_gramian(glr, X, y, data) + p = size(data.XX, 2) + return _fit(glr, solver, data.XX, data.Xy, (; dims=(data.n, p, 0))) + else + check_nrows(X, y) + n, p = size(X) + c = getc(glr, y) + return _fit(glr, solver, X, y, scratch(n, p, c, i=glr.fit_intercept)) + end end +fit(glr::GLR; kwargs...) = fit(glr, zeros((0,0)), zeros((0,)); kwargs...) + function scratch(n, p, c=0; i=false) p_ = p + Int(i) diff --git a/src/fit/proxgrad.jl b/src/fit/proxgrad.jl index bebd1f0..1a04fe1 100644 --- a/src/fit/proxgrad.jl +++ b/src/fit/proxgrad.jl @@ -3,7 +3,7 @@ # Assumption: loss has gradient; penalty has prox e.g.: Lasso # J(θ) = f(θ) + r(θ) where f is smooth function _fit(glr::GLR, solver::ProxGrad, X, y, scratch) - _,p,c = npc(scratch) + n,p,c = npc(scratch) c > 0 && (p *= c) # vector caches + eval cache θ = zeros(p) # θ_k @@ -19,9 +19,18 @@ function _fit(glr::GLR, solver::ProxGrad, X, y, scratch) η = 1.0 # stepsize (1/L) acc = ifelse(solver.accel, 1.0, 0.0) # if 0, no extrapolation (ISTA) # functions - _f = smooth_objective(glr, X, y; c=c) - _fg! = smooth_fg!(glr, X, y, scratch) - _prox! = prox!(glr, size(X, 1)) + _f = if solver.gram + smooth_gram_objective(glr, X, y, n) + else + smooth_objective(glr, X, y; c=c) + end + + _fg! = if solver.gram + smooth_gram_fg!(glr, X, y, n) + else + smooth_fg!(glr, X, y, scratch) + end + _prox! = prox!(glr, n) bt_cond = θ̂ -> _f(θ̂) > fθ̄ + dot(θ̂ .- θ̄, ∇fθ̄) + sum(abs2.(θ̂ .- θ̄)) / (2η) # loop-related diff --git a/src/fit/solvers.jl b/src/fit/solvers.jl index 93655a3..aaeccb2 100644 --- a/src/fit/solvers.jl +++ b/src/fit/solvers.jl @@ -133,6 +133,7 @@ Proximal Gradient solver for non-smooth objective functions. tol::Float64 = 1e-4 # tol relative change of θ i.e. norm(θ-θ_)/norm(θ) max_inner::Int = 100 # β^max_inner should be > 1e-10 beta::Float64 = 0.8 # in (0, 1); shrinkage in the backtracking step + gram::Bool = false # use precomputed Gramian for lsq where possible end FISTA(; kwa...) = ProxGrad(;accel = true, kwa...) diff --git a/src/glr/d_l2loss.jl b/src/glr/d_l2loss.jl index 3160a9a..1184c9f 100644 --- a/src/glr/d_l2loss.jl +++ b/src/glr/d_l2loss.jl @@ -72,3 +72,12 @@ function smooth_fg!(glr::GLR{L2Loss,<:ENR}, X, y, scratch) return glr.loss(r) + get_l2(glr.penalty)(view_θ(glr, θ)) end end + +function smooth_gram_fg!(glr::GLR{L2Loss,<:ENR}, XX, Xy, n) + λ = get_penalty_scale_l2(glr, n) + (g, θ) -> begin + _g = XX * θ .- Xy + g .= _g .+ λ .* θ + return θ'*_g + get_l2(glr.penalty)(view_θ(glr, θ)) + end +end diff --git a/src/glr/utils.jl b/src/glr/utils.jl index 026f0ff..102da1d 100644 --- a/src/glr/utils.jl +++ b/src/glr/utils.jl @@ -1,4 +1,4 @@ -export objective, smooth_objective +export objective, smooth_objective, smooth_gram_objective # NOTE: RobustLoss are not always everywhere smooth but "smooth-enough". const SmoothLoss = Union{L2Loss, LogisticLoss, MultinomialLoss, RobustLoss} @@ -37,6 +37,9 @@ Return the smooth part of the objective function of a GLR. """ smooth_objective(glr::GLR{<:SmoothLoss,<:ENR}, n) = glr.loss + get_l2(glr.penalty) * ifelse(glr.scale_penalty_with_samples, n, 1.) 
+smooth_gram_objective(glr::GLR{<:SmoothLoss,<:ENR}, XX, Xy, n) = + θ -> (θ'*XX*θ)/2 - (θ'*Xy) + (get_l2(glr.penalty) * ifelse(glr.scale_penalty_with_samples, n, 1.))(θ) + smooth_objective(::GLR) = @error "Case not implemented yet." """ diff --git a/src/mlj/classifiers.jl b/src/mlj/classifiers.jl index 75ffa31..8b04a87 100644 --- a/src/mlj/classifiers.jl +++ b/src/mlj/classifiers.jl @@ -65,7 +65,7 @@ See also [`MultinomialClassifier`](@ref). """some instance of `MLJLinearModels.S` where `S` is one of: `LBFGS`, `Newton`, `NewtonCG`, `ProxGrad`; but subject to the following restrictions: - - If `penalty = :l2`, `ProxGrad` is disallowed. Otherwise, `ProxyGrad` is the only + - If `penalty = :l2`, `ProxGrad` is disallowed. Otherwise, `ProxGrad` is the only option. - Unless `scitype(y) <: Finite{2}` (binary target) `Newton` is disallowed. @@ -142,7 +142,7 @@ See also [`LogisticClassifier`](@ref). """some instance of `MLJLinearModels.S` where `S` is one of: `LBFGS`, `NewtonCG`, `ProxGrad`; but subject to the following restrictions: - - If `penalty = :l2`, `ProxGrad` is disallowed. Otherwise, `ProxyGrad` is the only + - If `penalty = :l2`, `ProxGrad` is disallowed. Otherwise, `ProxGrad` is the only option. - Unless `scitype(y) <: Finite{2}` (binary target) `Newton` is disallowed. diff --git a/src/utils.jl b/src/utils.jl index 4e6d952..36b93ba 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -9,6 +9,29 @@ function check_nrows(X::AbstractMatrix, y::AbstractVecOrMat)::Nothing throw(DimensionMismatch("`X` and `y` must have the same number of rows.")) end +function verify_or_construct_gramian(glr, X, y, data) + check_nrows(X, y) + isnothing(data) && return (; XX = X'X, Xy = X'y, n = length(y)) + + !all(hasproperty.(Ref(data), (:XX, :Xy, :n))) && throw(ArgumentError("data must contain XX, Xy, n")) + size(data.XX, 1) != size(data.Xy, 1) && throw(DimensionMismatch("`XX` and Xy` must have the same number of rows.")) + !issymmetric(data.XX) && throw(ArgumentError("Input `XX` must be symmetric")) + + c = getc(glr, data.Xy) + !iszero(c) && throw(ArgumentError("Categorical loss not supported with Gramian kernel")) + glr.fit_intercept && throw(ArgumentError("Intercept not supported with Gramian kernel")) + + if any(!isempty, (X, y)) + all(( + isapprox(X'X, data.XX; rtol=1e-5), + isapprox(X'y, data.Xy; rtol=1e-5), + length(y) == data.n + )) || throw(ArgumentError("Inputs `X` and `y` do not match inputs `XX` and `Xy`.")) + end + + return data +end + """ $SIGNATURES diff --git a/test/fit/ols-ridge-lasso-elnet.jl b/test/fit/ols-ridge-lasso-elnet.jl index 2a99168..3c9a1c6 100644 --- a/test/fit/ols-ridge-lasso-elnet.jl +++ b/test/fit/ols-ridge-lasso-elnet.jl @@ -146,3 +146,18 @@ end @test nnz(θ_sk) == 8 end end + +@testset "gramian" begin + λ = 0.1 + γ = 0.1 + enr = ElasticNetRegression(λ, γ; fit_intercept=false, + scale_penalty_with_samples=false) + XX = X'X + Xy = X'y + n = size(X, 1) + θ_fista = fit(enr, X, y; solver=FISTA(max_iter=5000)) + θ_gram_explicit = fit(enr; data=(; XX, Xy, n), solver=FISTA(max_iter=5000, gram=true)) + θ_gram_implicit = fit(enr, X, y; solver=FISTA(max_iter=5000, gram=true)) + @test isapprox(θ_fista, θ_gram_explicit, rtol=1e-5) + @test isapprox(θ_gram_explicit, θ_gram_implicit; rtol=1e-5) +end From 88b5c5f267b972b3c6dcbc9e1bb344deb88351de Mon Sep 17 00:00:00 2001 From: Thibaut Lienart Date: Mon, 2 Oct 2023 23:10:05 +0200 Subject: [PATCH 5/5] Prepare minor release with gramian training --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml 
b/Project.toml
index dbae3c8..b4773b6 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "MLJLinearModels"
 uuid = "6ee0df7b-362f-4a72-a706-9e79364fb692"
 authors = ["Thibaut Lienart "]
-version = "0.9.2"
+version = "0.10.0"
 
 [deps]
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
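
A minimal usage sketch of the Gramian training path introduced in PATCH 4/5, modelled on the `gramian` testset added to `test/fit/ols-ridge-lasso-elnet.jl`. The random data and hyperparameter values below are illustrative assumptions, not part of the patches; as the new `verify_or_construct_gramian` check enforces, this path currently requires `fit_intercept=false` and a non-categorical loss.

```julia
# Sketch only: exercising the Gramian ("sufficient statistics") fitting path
# added in PATCH 4/5. Data and hyperparameters are made up for illustration.
using MLJLinearModels

X, y = randn(500, 10), randn(500)
enr  = ElasticNetRegression(0.1, 0.1; fit_intercept=false,
                            scale_penalty_with_samples=false)

# Precompute X'X and X'y once and pass them to `fit` directly ...
data = (; XX = X'X, Xy = X'y, n = size(X, 1))
θ_gram = fit(enr; data=data, solver=FISTA(max_iter=5000, gram=true))

# ... or let `fit` build the same quantities internally from X and y.
θ_full = fit(enr, X, y; solver=FISTA(max_iter=5000, gram=true))
```

Once `XX`, `Xy` and `n` are available, refitting (for example with a different penalty strength) never has to touch the full `n × p` design matrix again, which is the practical payoff of the `gram=true` solver option.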