JuliaML
diff --git a/‎Project.toml
+2 b/‎Project.toml
+2
diff --git a/‎docs/src/transforms/builtin.md
+6 b/‎docs/src/transforms/builtin.md
+6
diff --git a/‎src/TableTransforms.jl
+2 b/‎src/TableTransforms.jl
+2
diff --git a/‎src/transforms.jl
+1 b/‎src/transforms.jl
+1
diff --git a/‎src/transforms/projectionpursuit.jl
+201 b/‎src/transforms/projectionpursuit.jl
+201
diff --git a/‎test/Project.toml
+2 b/‎test/Project.toml
+2
diff --git a/‎test/data/center.png
26.1 KB b/‎test/data/center.png
26.1 KB
diff --git a/‎test/data/eigenanalysis-1.png
17.9 KB b/‎test/data/eigenanalysis-1.png
17.9 KB
diff --git a/‎test/data/eigenanalysis-2.png
28 KB b/‎test/data/eigenanalysis-2.png
28 KB
diff --git a/‎test/data/projectionpursuit-1.png
258 KB b/‎test/data/projectionpursuit-1.png
258 KB
diff --git a/‎test/data/projectionpursuit-2.png
485 KB b/‎test/data/projectionpursuit-2.png
485 KB
diff --git a/‎test/data/projectionpursuit-3.png
96.1 KB b/‎test/data/projectionpursuit-3.png
96.1 KB
diff --git a/‎test/data/scale.png
26.9 KB b/‎test/data/scale.png
26.9 KB
diff --git a/‎test/data/zscore.png
15.2 KB b/‎test/data/zscore.png
15.2 KB
diff --git a/‎test/runtests.jl
+2-2 b/‎test/runtests.jl
+2-2
diff --git a/‎test/transforms.jl
+1 b/‎test/transforms.jl
+1
diff --git a/‎test/transforms/center.jl
+2-2 b/‎test/transforms/center.jl
+2-2
diff --git a/‎test/transforms/eigenanalysis.jl
+7-7 b/‎test/transforms/eigenanalysis.jl
+7-7
@@ -8,6 +8,7 @@ AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Optim = "429524aa-4258-5aef-a3af-852621145aeb"
 PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
@@ -21,6 +22,7 @@ TransformsBase = "28dd2a49-a57a-4bfb-84ca-1a49db9b96b8"
 AbstractTrees = "0.4"
 CategoricalArrays = "0.10"
 Distributions = "0.25"
+Optim = "1.7"
 PrettyTables = "1.3, 2"
 ScientificTypes = "2.3, 3.0"
 StatsBase = "0.33"
 
@@ -152,6 +152,12 @@ DRS
 SDS
 ```
 
+## ProjectionPursuit
+
+```@docs
+ProjectionPursuit
+```
+
 ## RowTable
 
 ```@docs
 
@@ -16,6 +16,7 @@ using PrettyTables
 using AbstractTrees
 using CategoricalArrays
 using Random
+using Optim: optimize, minimizer
 
 import Distributions: ContinuousUnivariateDistribution
 import Distributions: quantile, cdf
@@ -67,6 +68,7 @@ export
   Functional,
   EigenAnalysis,
   PCA, DRS, SDS,
+  ProjectionPursuit,
   RowTable,
   ColTable,
   →, ⊔
 
@@ -285,6 +285,7 @@ include("transforms/zscore.jl")
 include("transforms/quantile.jl")
 include("transforms/functional.jl")
 include("transforms/eigenanalysis.jl")
+include("transforms/projectionpursuit.jl")
 include("transforms/rowtable.jl")
 include("transforms/coltable.jl")
 include("transforms/parallel.jl")
@@ -0,0 +1,201 @@
+# ------------------------------------------------------------------
+# Licensed under the MIT License. See LICENSE in the project root.
+# ------------------------------------------------------------------
+
+"""
+    ProjectionPursuit(;tol=1e-6, maxiter=100, deg=5, perc=.9, n=100)
+
+The projection pursuit multivariate transform converts any multivariate distribution into
+the standard multivariate Gaussian distribution.
+
+This iterative algorithm repeatedly finds a direction of projection `α` that maximizes a score of
+non-Gaussianity known as the projection index `I(α)`. The samples projected along `α` are then
+transformed with the [`Quantile`](@ref) transform to remove the non-Gaussian structure. The
+other coordinates in the rotated orthonormal basis `Q = [α ...]` are left untouched.
+
+The basis `Q` is taken from the QR factorization of `α` completed with random columns. The
+random completion is redrawn while `norm(diag(R)) < tol`, where `R` is the triangular factor.
+The iterative process terminates whenever the projection index along every original axis is
+no larger than the `1 - perc` quantile of the indices computed on `n` randomly generated
+samples from the standard multivariate Gaussian distribution (i.e. the transformed samples
+are "more Gaussian" than a fraction `perc` of them), or when the number of iterations
+reaches a maximum `maxiter`.
+
+# Keywords
+
+* `tol`     - threshold on `norm(diag(R))` below which the random basis is redrawn
+* `maxiter` - maximum number of iterations
+* `deg`     - highest polynomial degree accumulated in the projection index
+* `perc`    - fraction (between 0 and 1) of Gaussian samples to outperform
+* `n`       - number of Gaussian samples used to estimate the stopping threshold
+
+# Examples
+
+```julia
+ProjectionPursuit()
+ProjectionPursuit(deg=10)
+ProjectionPursuit(perc=.85, n=50)
+ProjectionPursuit(tol=1e-4, maxiter=250, deg=5, perc=.95, n=100)
+```
+
+See [https://doi.org/10.2307/2289161](https://doi.org/10.2307/2289161) for 
+further details.
+"""
+struct ProjectionPursuit{T} <: StatelessFeatureTransform
+  tol::T
+  maxiter::Int
+  deg::Int
+  perc::T
+  n::Int
+end
+
+# keyword constructor: `tol` and `perc` share the floating-point type of `tol`,
+# so an integer tolerance (e.g. `tol=0`) no longer forces `perc` into an integer
+# type, which would throw an InexactError for the default `perc=.9`
+function ProjectionPursuit(;tol=1e-6, maxiter=100, deg=5, perc=.9, n=100)
+  T = float(typeof(tol))
+  ProjectionPursuit{T}(tol, maxiter, deg, perc, n)
+end
+
+# projection pursuit declares itself revertible
+function isrevertible(::Type{<:ProjectionPursuit})
+  true
+end
+
+# projects the rows of Z along α, maps each projection through the
+# standard normal CDF and rescales the result from [0,1] to [-1,1]
+function rscore(Z, α)
+  gaussian = Normal()
+  map(z -> 2 * cdf(gaussian, z) - 1, Z * α)
+end
+
+# projection index of the samples Z along the direction α, accumulated
+# over polynomial terms of degree 1 up to `transform.deg`
+function pindex(transform, Z, α)
+  r = rscore(Z, α)
+
+  # first-degree term, where the polynomial is r itself
+  total = (3/2) * mean(r)^2
+
+  # three-term recurrence for the higher degrees; the range 2:deg
+  # is empty when deg ≤ 1, so only the first term is kept in that case
+  prev, curr = ones(length(r)), r
+  for k in 2:transform.deg
+    next = (1/k) * ((2k-1) * r .* curr - (k-1) * prev)
+    prev, curr = curr, next
+    total += ((2k+1)/2) * (mean(curr))^2
+  end
+
+  total
+end
+
+# j-th vector of the canonical basis in ℝᵈ, as a vector of floats
+basis(d, j) = [k == j ? 1.0 : 0.0 for k in 1:d]
+
+# projection index along each vector of the canonical basis,
+# i.e. one score per column of Z
+function pbasis(transform, Z)
+  ncols = size(Z, 2)
+  map(j -> pindex(transform, Z, basis(ncols, j)), 1:ncols)
+end
+
+# per-axis quantile (at level 1 - perc) of the projection indices computed
+# on `transform.n` random N×q samples of the standard multivariate Gaussian
+function gaussquantiles(transform, N, q)
+  level = 1.0 - transform.perc
+
+  # one column of axis scores per simulated sample
+  scores = reduce(hcat, [pbasis(transform, randn(N, q)) for _ in 1:transform.n])
+
+  # quantile across simulations, separately for each axis
+  map(row -> quantile(row, level), eachrow(scores))
+end
+
+# initial guess for the direction of maximum projection index: the best
+# canonical axis, greedily improved with normalized sums and differences
+# of the current direction and each canonical axis
+function alphaguess(transform, Z)
+  q = size(Z, 2)
+
+  # objective function
+  score(v) = pindex(transform, Z, v)
+
+  # unit vector along v + s * e, assuming v and e have unit norm
+  bisector(v, s, e) = (1/√(2+2s*v⋅e)) * (v + s * e)
+
+  # best direction among the canonical axes
+  axdirs = [basis(q, k) for k in 1:q]
+  best, k = findmax(score, axdirs)
+  α = axdirs[k]
+
+  # try the diagonals between the current direction and every axis
+  for e in axdirs
+    plus  = bisector(α, +1, e)
+    minus = bisector(α, -1, e)
+    fplus = score(plus)
+    # α - e vanishes when α coincides with e, so that candidate is skipped
+    fminus = α⋅e != 1.0 ? score(minus) : 0.0
+    fcand, cand = fplus > fminus ? (fplus, plus) : (fminus, minus)
+    if fcand > best
+      α, best = cand, fcand
+    end
+  end
+
+  α
+end
+
+# refines the starting direction α₀ by minimizing the negative projection
+# index with `optimize`; candidates are normalized only inside the objective,
+# so the returned minimizer is not necessarily a unit vector
+function neldermead(transform, Z, α₀)
+  objective = α -> -pindex(transform, Z, α ./ norm(α))
+  result = optimize(objective, α₀)
+  minimizer(result)
+end
+
+# direction of maximum projection index: the guess from `alphaguess`
+# refined by `neldermead`
+alphamax(transform, Z) = neldermead(transform, Z, alphaguess(transform, Z))
+
+# Q factor of the QR factorization of [α R₀], where R₀ is a random q×(q-1)
+# matrix; R₀ is redrawn while the norm of the diagonal of the triangular
+# factor is below `tol`
+function orthobasis(α, tol)
+  q = length(α)
+  while true
+    Q, R = qr([α rand(q,q-1)])
+    norm(diag(R)) < tol || return Q
+  end
+end
+
+# removes the structure of the samples Z along the direction α and returns
+# the new samples together with the cache (Q, qcache) needed to revert the step
+function rmstructure(transform, Z, α)
+  # orthonormal basis used to rotate the samples
+  Q = orthobasis(α, transform.tol)
+
+  # apply Quantile to the first column of the rotated samples
+  rotated = Tables.table(Z * Q)
+  qtable, qcache = apply(Quantile(1), rotated)
+
+  # rotate back to the original axis-aligned features
+  Zout = Tables.matrix(qtable) * Q'
+
+  Zout, (Q, qcache)
+end
+
+# preprocessing pipeline: Quantile followed by EigenAnalysis(:VDV)
+function sphering()
+  Quantile() → EigenAnalysis(:VDV)
+end
+
+# Forward pass of projection pursuit. The table is first preprocessed with
+# `sphering()`, then pursuit steps are applied while the projection index along
+# any original axis exceeds the Gaussian threshold, for at most `maxiter` steps.
+# Returns the new table and the cache `(pcache, caches)` used by `revertfeat`.
+function applyfeat(transform::ProjectionPursuit, table, prep) 
+  # retrieve column names
+  cols = Tables.columns(table)
+  names = Tables.columnnames(cols)
+
+  # preprocess the data to approximately spherical shape
+  ptable, pcache = apply(sphering(), table)
+
+  # initialize scores and Gaussian quantiles
+  Z = Tables.matrix(ptable)
+  I = pbasis(transform, Z)
+  g = gaussquantiles(transform, size(Z)...) 
+
+  # one (Q, qcache) entry per pursuit step, in order of application
+  caches = []
+
+  # `iter` counts completed steps, hence the strict bound: the loop
+  # performs at most `maxiter` steps (`≤` would allow one extra step)
+  iter = 0
+  while any(I .> g) && iter < transform.maxiter
+    # choose direction with maximum projection index
+    α = alphamax(transform, Z)
+    
+    # remove non-Gaussian structure
+    Z, cache = rmstructure(transform, Z, α)
+    
+    # update the scores along original axes
+    I = pbasis(transform, Z)
+    
+    # store cache and continue
+    push!(caches, cache)
+    iter += 1
+  end
+
+  # rebuild a table with the original column names
+  𝒯 = (; zip(names, eachcol(Z))...)
+  newtable = 𝒯 |> Tables.materializer(table)
+  newtable, (pcache, caches)
+end
+
+# Reverts the forward pass: the cached pursuit steps are undone from the
+# last to the first, and the sphering preprocessing is undone at the end.
+function revertfeat(::ProjectionPursuit, newtable, fcache)
+  # column names of the table being reverted
+  names = Tables.columnnames(Tables.columns(newtable))
+
+  # cache of the sphering step and of each pursuit step
+  pcache, caches = fcache
+
+  # undo the pursuit steps, last one first
+  Z = Tables.matrix(newtable)
+  for (Q, qcache) in Iterators.reverse(caches)
+    rotated = Tables.table(Z * Q)
+    reverted = revert(Quantile(1), rotated, qcache)
+    Z = Tables.matrix(reverted) * Q'
+  end
+
+  # undo the sphering preprocessing
+  original = revert(sphering(), Tables.table(Z), pcache)
+  Z = Tables.matrix(original)
+
+  # rebuild a table with the column names of the input
+  𝒯 = (; zip(names, eachcol(Z))...)
+  𝒯 |> Tables.materializer(newtable)
+end
@@ -4,6 +4,7 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 GR = "28b8d3ca-fb5f-59d9-8090-bfdbd6d07a71"
 ImageIO = "82e4d734-157c-48bb-816b-45c225c6df19"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+PairPlots = "43a3c2be-4208-490b-832a-a21dcd55d7da"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 ReferenceTests = "324d217c-45ce-50fc-942e-d289b448e8cf"
@@ -16,4 +17,5 @@ TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9"
 
 [compat]
 GR = "=0.59.0"
+PairPlots = "=0.6.0"
 Plots = "=1.22.4"
@@ -9,12 +9,12 @@ using Statistics
 using Test, Random, Plots
 using ReferenceTests, ImageIO
 using StatsBase
+using PairPlots
 
 const TT = TableTransforms
 
 # set default configurations for plots
-gr(ms=1, mc=:black, aspectratio=:equal,
-   label=false, size=(600,400))
+gr(ms=1, mc=:black, label=false, size=(600,400))
 
 # workaround GR warnings
 ENV["GKSwstype"] = "100"
 
@@ -17,6 +17,7 @@ transformfiles = [
   "quantile.jl",
   "functional.jl",
   "eigenanalysis.jl",
+  "projectionpursuit.jl",
   "rowtable.jl",
   "coltable.jl",
   "sequential.jl",
 
@@ -12,8 +12,8 @@
 
   # visual tests    
   if visualtests
-    p₁ = scatter(t.x, t.y, label="Original")
-    p₂ = scatter(n.x, n.y, label="Center")
+    p₁ = scatter(t.x, t.y, label="Original", aspectratio=:equal)
+    p₂ = scatter(n.x, n.y, label="Center", aspectratio=:equal)
     p = plot(p₁, p₂, layout=(1,2))
 
     @test_reference joinpath(datadir, "center.png") p
 
@@ -53,13 +53,13 @@
 
   # visual tests    
   if visualtests
-    p₁ = scatter(t₁.x, t₁.y, label="Original")
-    p₂ = scatter(t₂.PC1, t₂.PC2, label="V")
-    p₃ = scatter(t₃.PC1, t₃.PC2, label="VD")
-    p₄ = scatter(t₄.PC1, t₄.PC2, label="VDV")
-    p₅ = scatter(t₅.PC1, t₅.PC2, label="PCA")
-    p₆ = scatter(t₆.PC1, t₆.PC2, label="DRS")
-    p₇ = scatter(t₇.PC1, t₇.PC2, label="SDS")
+    p₁ = scatter(t₁.x, t₁.y, label="Original", aspectratio=:equal)
+    p₂ = scatter(t₂.PC1, t₂.PC2, label="V", aspectratio=:equal)
+    p₃ = scatter(t₃.PC1, t₃.PC2, label="VD", aspectratio=:equal)
+    p₄ = scatter(t₄.PC1, t₄.PC2, label="VDV", aspectratio=:equal)
+    p₅ = scatter(t₅.PC1, t₅.PC2, label="PCA", aspectratio=:equal)
+    p₆ = scatter(t₆.PC1, t₆.PC2, label="DRS", aspectratio=:equal)
+    p₇ = scatter(t₇.PC1, t₇.PC2, label="SDS", aspectratio=:equal)
     p = plot(p₁, p₂, p₃, p₄, layout=(2,2))
     q = plot(p₂, p₃, p₄, p₅, p₆, p₇, layout=(2,3))