diff --git a/Project.toml b/Project.toml
index ed83fb94..4f5a393c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "MLJFlux"
 uuid = "094fc8d1-fd35-5302-93ea-dabda2abf845"
 authors = ["Anthony D. Blaom ", "Ayush Shridhar "]
-version = "0.6.6"
+version = "0.6.7"
 
 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
@@ -17,7 +17,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
-CategoricalArrays = "0.10"
+CategoricalArrays = "1"
 ColorTypes = "0.10.3, 0.11, 0.12"
 ComputationalResources = "0.3.2"
 Flux = "0.14, 0.15, 0.16"
@@ -25,7 +25,7 @@ MLJModelInterface = "1.11"
 Metalhead = "0.9.3"
 Optimisers = "0.3.2, 0.4"
 ProgressMeter = "1.7.1"
-StatisticalMeasures = "0.1"
+StatisticalMeasures = "0.3"
 Statistics = "<0.0.1, 1"
 Tables = "1.0"
 julia = "1.10"
diff --git a/src/classifier.jl b/src/classifier.jl
index 725697ac..c38857ac 100644
--- a/src/classifier.jl
+++ b/src/classifier.jl
@@ -8,7 +8,7 @@ data `X` and `y`.
 """
 function MLJFlux.shape(model::NeuralNetworkClassifier, X, y)
     X = X isa Matrix ? Tables.table(X) : X
-    levels = MLJModelInterface.classes(y[1])
+    levels = CategoricalArrays.levels(y[1])
     n_output = length(levels)
     n_input = Tables.schema(X).names |> length
     return (n_input, n_output)
@@ -31,7 +31,7 @@ MLJFlux.fitresult(
     y,
     ordinal_mappings = nothing,
     embedding_matrices = nothing,
-) = (chain, MLJModelInterface.classes(y[1]), ordinal_mappings, embedding_matrices)
+) = (chain, levels(y[1]), ordinal_mappings, embedding_matrices)
 
 function MLJModelInterface.predict(
     model::NeuralNetworkClassifier,
diff --git a/src/core.jl b/src/core.jl
index f866ee51..6a900340 100644
--- a/src/core.jl
+++ b/src/core.jl
@@ -253,7 +253,7 @@ tomat(y::Vector) = reshape(y, size(y, 1), 1)
 reformat(y, ::Type{<:AbstractVector{<:Union{Continuous,Count}}}) =
     reshape(y, 1, length(y))
 function reformat(y, ::Type{<:AbstractVector{<:Finite}})
-    levels = y |> first |> MLJModelInterface.classes
+    levels = y |> first |> CategoricalArrays.levels
     return Flux.onehotbatch(y, levels)
 end
 
@@ -285,7 +285,7 @@ end
 function collate(model::NeuralNetworkBinaryClassifier, X, y, verbosity)
     row_batches = Base.Iterators.partition(1:nrows(y), model.batch_size)
     Xmatrix = _f32(reformat(X), verbosity)
-    yvec = (y .== classes(y)[2])' # convert to boolean
+    yvec = (y .== levels(y)[2])' # convert to boolean
     return [_get(Xmatrix, b) for b in row_batches],
     [_get(yvec, b) for b in row_batches]
 end
@@ -294,4 +294,3 @@ function _f32(x::AbstractArray, verbosity)
     verbosity > 0 && @info "MLJFlux: converting input data to Float32"
     return Float32.(x)
 end
-
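Note on the recurring `classes` → `levels` substitution in the two files above: both calls recover the full level pool from a single `CategoricalValue`, which is what keeps the one-hot width computed in `shape` and `reformat` stable even when a particular batch happens to be missing a class. A minimal sketch of the behaviour the patch relies on (illustrative data only, assuming a CategoricalArrays 1.x environment):

```julia
using CategoricalArrays
import Flux, MLJModelInterface

y = categorical(["maybe", "no", "yes", "no"])

# Both calls see the whole pool, even when handed a single element:
MLJModelInterface.classes(y[1])   # old call: all classes in the pool
CategoricalArrays.levels(y[1])    # new call: same pool, via CategoricalArrays

# What the updated `reformat` does before training; classes absent from a
# batch still get a one-hot row, because the pool fixes the labels:
Flux.onehotbatch(y, levels(first(y)))   # 3×4 one-hot matrix
```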
diff --git a/src/encoders.jl b/src/encoders.jl
index 1a058e93..43af62fd 100644
--- a/src/encoders.jl
+++ b/src/encoders.jl
@@ -17,7 +17,7 @@ function ordinal_encoder_fit(X; featinds)
     # 2. Use feature mapper to compute the mapping of each level in each column
     for i in featinds
         feat_col = Tables.getcolumn(Tables.columns(X), i)
-        feat_levels = levels(feat_col)
+        feat_levels = CategoricalArrays.unwrap.(levels(feat_col))
         # Check if feat levels is already ordinal encoded in which case we skip
         (Set([Float32(i) for i in 1:length(feat_levels)]) == Set(feat_levels)) && continue
         # Compute the dict using the given feature_mapper function
@@ -64,10 +64,10 @@ function ordinal_encoder_transform(X, mapping_matrix)
         # Create the transformation function for each column
         if ind in keys(mapping_matrix)
             train_levels = keys(mapping_matrix[ind])
-            test_levels = levels(col)
+            test_levels = CategoricalArrays.unwrap.(levels(col))
             check_unkown_levels(train_levels, test_levels)
             level2scalar = mapping_matrix[ind]
-            new_col = unwrap.(recode(col, level2scalar...))
+            new_col = CategoricalArrays.unwrap.(recode(col, level2scalar...))
             push!(new_feats, new_col)
         else
             push!(new_feats, col)
diff --git a/src/image.jl b/src/image.jl
index 5af4a033..cfa9de94 100644
--- a/src/image.jl
+++ b/src/image.jl
@@ -1,5 +1,5 @@
 function shape(model::ImageClassifier, X, y)
-    levels = MLJModelInterface.classes(y[1])
+    levels = CategoricalArrays.levels(y)
     n_output = length(levels)
     n_input = size(X[1])
 
@@ -18,7 +18,7 @@ build(model::ImageClassifier, rng, shape) =
         model.finaliser)
 
 fitresult(model::ImageClassifier, chain, y, ::Any, ::Any) =
-    (chain, MLJModelInterface.classes(y[1]))
+    (chain, levels(y))
 
 function MLJModelInterface.predict(model::ImageClassifier, fitresult, Xnew)
     chain, levels = fitresult
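Note on the new `unwrap.` calls above: the diff pairs the compat bump to `CategoricalArrays = "1"` with `CategoricalArrays.unwrap.` around every `levels` call in the encoders, presumably because the returned levels are no longer guaranteed to be raw values there. The ordinal encoder needs raw values both for its `Float32` comparison and for building the recoding dictionary. A toy illustration (made-up column, not the package's test data):

```julia
using CategoricalArrays

col = categorical(Float32[1.0, 2.0, 3.0, 2.0])

# Unwrap the pool down to plain Float32 values:
raw = CategoricalArrays.unwrap.(levels(col))   # Float32[1.0, 2.0, 3.0]

# The "already ordinal encoded?" test from `ordinal_encoder_fit`,
# evaluated on the unwrapped levels:
Set([Float32(i) for i in 1:length(raw)]) == Set(raw)   # true

# `unwrap` falls back to the identity on non-categorical values, so the
# broadcast is harmless where values are already raw:
CategoricalArrays.unwrap(2.0f0) == 2.0f0   # true
```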
diff --git a/src/mlj_embedder_interface.jl b/src/mlj_embedder_interface.jl
index 37087288..2654a597 100644
--- a/src/mlj_embedder_interface.jl
+++ b/src/mlj_embedder_interface.jl
@@ -110,6 +110,7 @@ In the following example we wrap a `NeuralNetworkClassifier` as an `EntityEmbedd
 that it can be used to supply continuously encoded features to a nearest neighbor
 model, which does not support categorical features.
 
+## Simple Example
 ```julia
 using MLJ
 
@@ -129,21 +130,46 @@ EntityEmbedder = @load EntityEmbedder pkg=MLJFlux
 # Flux model to do learn the entity embeddings:
 NeuralNetworkClassifier = @load NeuralNetworkClassifier pkg=MLJFlux
 
-# Other supervised model type, requiring `Continuous` features:
-KNNClassifier = @load KNNClassifier pkg=NearestNeighborModels
-
 # Instantiate the models:
 clf = NeuralNetworkClassifier(embedding_dims=Dict(:b => 2, :c => 3))
 emb = EntityEmbedder(clf)
 
-# For illustrative purposes, train the embedder on its own:
+# Train and transform the data using the embedder:
 mach = machine(emb, X, y)
 fit!(mach)
 Xnew = transform(mach, X)
 
-# And compare feature scitypes:
+# Compare schemas before and after transformation:
 schema(X)
 schema(Xnew)
+```
+
+## Using with Downstream Models (Pipeline)
+```julia
+using MLJ
+
+# Set up some data
+N = 400
+X = (
+    a = rand(Float32, N),
+    b = categorical(rand("abcde", N)),
+    c = categorical(rand("ABCDEFGHIJ", N), ordered = true),
+)
+
+y = categorical(rand("YN", N));
+
+# Initiate model
+EntityEmbedder = @load EntityEmbedder pkg=MLJFlux
+
+# Flux model to learn the entity embeddings:
+NeuralNetworkClassifier = @load NeuralNetworkClassifier pkg=MLJFlux
+
+# Other supervised model type, requiring `Continuous` features:
+KNNClassifier = @load KNNClassifier pkg=NearestNeighborModels
+
+# Instantiate the models:
+clf = NeuralNetworkClassifier(embedding_dims=Dict(:b => 2, :c => 3))
+
+emb = EntityEmbedder(clf)
 
 # Now construct the pipeline:
 pipe = emb |> KNNClassifier()
diff --git a/test/entity_embedding.jl b/test/entity_embedding.jl
index fcb22f40..a1383adf 100644
--- a/test/entity_embedding.jl
+++ b/test/entity_embedding.jl
@@ -36,7 +36,7 @@ entityprops = [
    EE1 = Flux.trainables(embedder.embedders[2])[1] # (newdim, levels) = (5, 10)
    EE2 = Flux.trainables(embedder.embedders[4])[1] # (newdim, levels) = (2, 2)
 
-    ## One-hot encoding
+    ## One-hot encoding
    z2_hot = Flux.onehotbatch(z2, levels(z2))
    z4_hot = Flux.onehotbatch(z4, levels(z4))
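Closing note on the test hunk: the one-hot block it touches exercises the identity behind entity embeddings, namely that multiplying an embedding matrix by a one-hot matrix is a column lookup. A self-contained sketch with dimensions matching the `(newdim, levels) = (5, 10)` comment (random stand-in data, not the actual test fixtures):

```julia
using CategoricalArrays
import Flux

# A 10-level column and a 5×10 stand-in embedding matrix:
z2 = categorical(rand("abcdefghij", 6), levels = collect("abcdefghij"))
EE1 = rand(Float32, 5, 10)

z2_hot = Flux.onehotbatch(z2, levels(z2))   # 10×6 one-hot matrix
EE1 * z2_hot                                # 5×6: one 5-dim embedding per element
```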