Commit 24cc037

Multivariate models now implement a diagonal covariance matrix.
1 parent 53c9dbc commit 24cc037

11 files changed: +75 -37 lines changed
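
The headline change swaps the full covariance parameterization of each multivariate mixture component for a per-dimension variance vector. As a rough illustration of the difference (hypothetical values; only Distributions.jl and LinearAlgebra are assumed):

```julia
using Distributions, LinearAlgebra

μ = [0.0, 2.0]      # a component mean predicted by the network
σ² = [0.25, 0.81]   # per-dimension variances, as predicted after this commit

# Diagonal covariance: output * mixtures variance parameters per observation
d_diag = MultivariateNormal(μ, Diagonal(σ²))

# Full covariance (the old approach): output^2 * mixtures parameters,
# kept positive definite via a masked Cholesky factor
Σ = [0.25 0.10; 0.10 0.81]
d_full = MultivariateNormal(μ, Σ)
```

A diagonal component cannot capture correlation between output dimensions on its own, but the mixture as a whole can still approximate correlated targets, and the construction is always positive definite with far fewer parameters.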

Project.toml

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 name = "MixtureDensityNetworks"
 uuid = "521d8788-cab4-41cb-a05a-da376f16ad79"
 authors = ["Joshua Billson"]
-version = "0.2.1"
+version = "0.2.2"
 
 [deps]
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"

README.md

Lines changed: 15 additions & 13 deletions

@@ -17,38 +17,39 @@ with the MLJ ecosystem. Below is an example demonstrating the use of this packag
 # Example (Native Interface)
 
 ```julia
-using MixtureDensityNetworks, Distributions, CairoMakie, Logging, TerminalLoggers
+using Flux, MixtureDensityNetworks, Distributions, CairoMakie, Logging, TerminalLoggers
 
 const n_samples = 1000
 const epochs = 1000
-const mixtures = 6
+const batchsize = 128
+const mixtures = 8
 const layers = [128, 128]
 
 function main()
     # Generate Data
     X, Y = generate_data(n_samples)
 
     # Create Model
-    machine = MixtureDensityNetworks.Machine(MDN(epochs=epochs, mixtures=mixtures, layers=layers))
+    model = MixtureDensityNetwork(1, 1, layers, mixtures)
 
     # Fit Model
-    report = with_logger(TerminalLogger()) do
-        fit!(machine, X, Y)
+    model, report = with_logger(TerminalLogger()) do
+        MixtureDensityNetworks.fit!(model, X, Y; epochs=epochs, opt=Flux.Adam(1e-3), batchsize=batchsize)
     end
 
     # Plot Learning Curve
     fig, _, _ = lines(1:epochs, report.learning_curve, axis=(;xlabel="Epochs", ylabel="Loss"))
     save("LearningCurve.png", fig)
 
     # Plot Learned Distribution
-    Ŷ = predict(machine, X)
+    Ŷ = model(X)
     fig, ax, plt = scatter(X[1,:], rand.(Ŷ), markersize=4, label="Predicted Distribution")
     scatter!(ax, X[1,:], Y[1,:], markersize=3, label="True Distribution")
     axislegend(ax, position=:lt)
     save("PredictedDistribution.png", fig)
 
     # Plot Conditional Distribution
-    cond = predict(machine, reshape([-2.0], (1,1)))[1]
+    cond = model(reshape([-2.1], (1,1)))[1]
     fig = Figure(resolution=(1000, 500))
     density(fig[1,1], rand(cond, 10000), npoints=10000)
     save("ConditionalDistribution.png", fig)

@@ -60,21 +61,22 @@ main()
 # Example (MLJ Interface)
 
 ```julia
-using MixtureDensityNetworks, Distributions, Logging, TerminalLoggers, CairoMakie, MLJ
+using MixtureDensityNetworks, Distributions, Logging, TerminalLoggers, CairoMakie, MLJ, Random
 
 const n_samples = 1000
-const epochs = 1000
-const mixtures = 6
+const epochs = 500
+const batchsize = 128
+const mixtures = 8
 const layers = [128, 128]
 
 function main()
     # Generate Data
     X, Y = generate_data(n_samples)
 
     # Create Model
-    mach = MLJ.machine(MDN(epochs=epochs, mixtures=mixtures, layers=layers), MLJ.table(X'), Y[1,:])
+    mach = MLJ.machine(MDN(epochs=epochs, mixtures=mixtures, layers=layers, batchsize=batchsize), MLJ.table(X'), Y[1,:])
 
-    # Evaluate Model
+    # Fit Model on Training Data, Then Evaluate on Test
     with_logger(TerminalLogger()) do
         @info "Evaluating..."
         evaluation = MLJ.evaluate!(

@@ -88,7 +90,7 @@ function main()
         @info "Metrics: " * join(["$name: $metric" for (name, metric) in zip(names, metrics)], ", ")
     end
 
-    # Fit Model
+    # Fit Model on Entire Dataset
     with_logger(TerminalLogger()) do
         @info "Training..."
         MLJ.fit!(mach)

docs/src/index.md

Lines changed: 1 addition & 1 deletion

@@ -60,7 +60,7 @@ density(fig[1,1], rand(cond, 10000), npoints=10000)
 
 Below is a script for running the complete example.
 ```julia
-using MixtureDensityNetworks, Distributions, CairoMakie, Logging, TerminalLoggers
+using Flux, MixtureDensityNetworks, Distributions, CairoMakie, Logging, TerminalLoggers
 
 const n_samples = 1000
 const epochs = 1000

docs/src/mlj.md

Lines changed: 0 additions & 2 deletions

@@ -10,8 +10,6 @@ with the MLJ ecosystem. Below is an example demonstrating the use of this packag
 ```julia
 using MixtureDensityNetworks, Distributions, Logging, TerminalLoggers, CairoMakie, MLJ, Random
 
-Random.seed!(123)
-
 const n_samples = 1000
 const epochs = 500
 const batchsize = 128

examples/mlj_example.jl

Lines changed: 0 additions & 2 deletions

@@ -1,7 +1,5 @@
 using MixtureDensityNetworks, Distributions, Logging, TerminalLoggers, CairoMakie, MLJ, Random
 
-Random.seed!(123)
-
 const n_samples = 1000
 const epochs = 500
 const batchsize = 128
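
With the explicit seed removed, the MLJ examples now draw fresh random data and weights on each run, so their plots and metrics will vary slightly between executions.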

examples/multivariate_example.jl

Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
+using Flux, MixtureDensityNetworks, Distributions, CairoMakie, Logging, TerminalLoggers
+
+const n_samples = 1000
+const epochs = 250
+const batchsize = 256
+const mixtures = 12
+const layers = [256, 512, 1024]
+
+function main()
+    # Generate Data
+    Y = rand(Uniform(-10.5, 10.5), 1, n_samples)
+    μ_x = (7sin.(0.75 .* Y) + 0.5 .* Y)
+    X = rand.(Normal.(μ_x, 0.5))
+    μ_z = (-0.5 .* X) .+ 2.0
+    Z = rand.(Normal.(μ_z, 0.6))
+    Y = cat(Y, Z, dims=1)
+
+    # Normalize Features
+    X̄ = (X .- mean(X, dims=2)) ./ std(X, dims=2)
+
+    # Create Model
+    model = MixtureDensityNetwork(1, 2, layers, mixtures)
+
+    # Fit Model
+    model, report = with_logger(TerminalLogger()) do
+        MixtureDensityNetworks.fit!(model, X̄, Y; batchsize=batchsize, epochs=epochs)
+    end
+
+    # Plot Learning Curve
+    fig, _, _ = lines(1:epochs, report.learning_curve, axis=(;xlabel="Epochs", ylabel="Loss"))
+    save("MultivariateLearningCurve.png", fig)
+
+    # Plot Learned Distribution
+    Ŷ = model(X̄) .|> rand
+    fig = Figure(resolution=(2000, 1000), figure_padding=100)
+    ax1 = Axis3(fig[1,1], title="True Distribution", elevation=0.2π, azimuth=0.25π, titlesize=48, titlegap=50)
+    ax2 = Axis3(fig[1,2], title="Predicted Distribution", elevation=0.2π, azimuth=0.25π, titlesize=48, titlegap=50)
+    scatter!(ax1, X[1,:], Y[1,:], Y[2,:], markersize=3.0)
+    scatter!(ax2, X[1,:], [x[1] for x in Ŷ], [x[2] for x in Ŷ], markersize=3.0)
+    xlims!(ax1, -15, 15); zlims!(ax1, -7, 10); ylims!(ax1, -13, 13)
+    xlims!(ax2, -15, 15); zlims!(ax2, -7, 10); ylims!(ax2, -13, 13)
+    save("MultivariateDistributions.png", fig)
+end
+
+main()
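
The new example extends the one-dimensional inverse-sine toy problem with a second target dimension `Z` that depends linearly on `X`, so the network must learn a two-dimensional, multimodal predictive distribution. This is precisely the case served by the new diagonal-covariance head.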

src/layers.jl

Lines changed: 3 additions & 8 deletions

@@ -84,7 +84,7 @@ function MultivariateGMM(input::Int, output::Int, mixtures::Int)
 
     # Construct Output Layer
     μ = Flux.Dense(input=>(output * mixtures), init=init)
-    Σ = Flux.Dense(input=>(output * output * mixtures), init=init)
+    Σ = Flux.Dense(input=>(output * mixtures), exp, init=init)
     w = Flux.Chain(Flux.Dense(input=>mixtures, init=init), x -> Flux.softmax(x; dims=1))
 
     # Return Layer

@@ -98,16 +98,11 @@ end
 function (m::MultivariateGMM)(X::AbstractMatrix{Float64})
     # Forward Pass
     μ = reshape(m.μ(X), (m.mixtures, m.outputs, :))
-    Σ = reshape(m.Σ(X), (m.mixtures, m.outputs, m.outputs, :))
+    D = reshape(m.Σ(X), (m.mixtures, m.outputs, :))
     w = reshape(m.w(X), (m.mixtures, :))
 
-    # Get Cholesky Decomposition Of Σ
-    d_mask = [b == c ? 1.0 : 0.0 for a in 1:1, b in 1:m.outputs, c in 1:m.outputs, d in 1:1]
-    u_mask = [b < c ? 1.0 : 0.0 for a in 1:1, b in 1:m.outputs, c in 1:m.outputs, d in 1:1]
-    U = exp.(Σ .* d_mask) .+ (Σ .* u_mask)
-
     # Return Distributions
     return map(eachindex(w[1,:])) do obs
-        MixtureModel([MultivariateNormal(μ[mixture,:,obs], U[mixture,:,:,obs]' * U[mixture,:,:,obs] + 1e-9I) for mixture in eachindex(μ[:,1,1])], w[:,obs])
+        MixtureModel([MultivariateNormal(μ[mixture,:,obs], Diagonal(D[mixture,:,obs])) for mixture in eachindex(μ[:,1,1])], w[:,obs])
     end
 end
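
Putting the new forward pass together: the `exp` activation on the Σ head guarantees positive variances, and each observation receives a mixture of diagonal-covariance Gaussians with softmax weights. Below is a minimal standalone sketch that follows the shapes in the diff; the loose layer variables stand in for the actual `MultivariateGMM` struct, and the dimensions are made up:

```julia
using Flux, Distributions, LinearAlgebra

input, output, mixtures = 4, 2, 3
μ_layer = Flux.Dense(input => output * mixtures)
Σ_layer = Flux.Dense(input => output * mixtures, exp)  # exp keeps variances positive
w_layer = Flux.Chain(Flux.Dense(input => mixtures), x -> Flux.softmax(x; dims=1))

X = rand(Float32, input, 5)  # 5 observations

# Reshape flat layer outputs into (mixtures, outputs, observations)
μ = reshape(μ_layer(X), (mixtures, output, :))
D = reshape(Σ_layer(X), (mixtures, output, :))
w = reshape(w_layer(X), (mixtures, :))

# One diagonal-covariance GMM per observation, as in the new method body
dists = map(1:size(X, 2)) do obs
    MixtureModel([MultivariateNormal(μ[k, :, obs], Diagonal(D[k, :, obs])) for k in 1:mixtures], w[:, obs])
end
```

Compared with the deleted Cholesky-mask construction, this drops the `U' * U + 1e-9I` product and the comprehension-built masks, and it avoids materializing an `output × output` matrix for every mixture and observation.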

src/losses.jl

Lines changed: 1 addition & 1 deletion

@@ -22,7 +22,7 @@ Compute the negative log-likelihood loss for a set of labels `y` under a set of
 
 # Parameters
 - `distributions`: A vector of multivariate Gaussian Mixture Model distributions.
-- `y`: A kxn matrix of labels where k is the dimension of each label and n is the number of samples.
+- `y`: A dxn matrix of labels where d is the dimension of each label and n is the number of samples.
 """
 function likelihood_loss(distributions::Vector{<:MixtureModel{Multivariate}}, y::Matrix{<:Real})
     return likelihood_loss(distributions, Float64.(y))
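
For reference, the documented contract (a d×n label matrix and one predicted mixture per sample) implies a loss of roughly the following shape. This is a hedged sketch, not necessarily the package's exact implementation, and `nll_sketch` is a hypothetical name:

```julia
using Distributions, Statistics

# Mean negative log-likelihood of d×n labels under n predicted mixtures
function nll_sketch(distributions::Vector{<:MixtureModel{Multivariate}}, y::Matrix{Float64})
    return -mean(logpdf(d, yᵢ) for (d, yᵢ) in zip(distributions, eachcol(y)))
end
```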

src/mlj_interface.jl

Lines changed: 1 addition & 1 deletion

@@ -129,7 +129,7 @@ MLJModelInterface.metadata_model(
     input_scitype=MMI.Table(MMI.Continuous),
     target_scitype=AbstractVector{<:MMI.Continuous},
     load_path="MixtureDensityNetworks.MDN",
-    human_name="MDN",
+    human_name="Mixture Density Network",
 )
 
 """

src/model.jl

Lines changed: 3 additions & 3 deletions

@@ -1,7 +1,7 @@
 """
 $(TYPEDEF)
 
-A custom Flux model whose predictions paramaterize a Gaussian Mixture Model.
+A Flux model for implementing a standard Mixture Density Network.
 
 # Parameters
 $(TYPEDFIELDS)

@@ -19,8 +19,8 @@ $(TYPEDSIGNATURES)
 Construct a standard Mixture Density Network.
 
 # Parameters
-- `input`: The length of the input feature vectors.
-- `output`: The length of the output feature vectors.
+- `input`: The dimension of the input features.
+- `output`: The dimension of the output. Setting output = 1 indicates a univariate model, whereas output > 1 indicates a multivariate model.
 - `layers`: The topology of the hidden layers, starting from the first layer.
 - `mixtures`: The number of Gaussian mixtures to use in the predicted distribution.
 """
