diff --git a/data b/data index d03a7fc8df2..cbafe2a8141 160000 --- a/data +++ b/data @@ -1 +1 @@ -Subproject commit d03a7fc8df2d12f1c58b07895a2bc71c49b917c3 +Subproject commit cbafe2a81419f038ec35f545743f6cac7b18ce9a diff --git a/examples/meta/src/clustering/kmeans.sg.in b/examples/meta/src/clustering/kmeans.sg.in index 834be2d2d35..7aee417dd4b 100644 --- a/examples/meta/src/clustering/kmeans.sg.in +++ b/examples/meta/src/clustering/kmeans.sg.in @@ -16,9 +16,10 @@ Machine kmeans = create_machine("KMeans", k=2, distance=d, seed=1) kmeans.train() #![train_dataset] -#![extract_centers_and_radius] +#![extract_centers_radiuses_stds] RealMatrix c = kmeans.get_real_matrix("cluster_centers") RealVector r = kmeans.get_real_vector("radiuses") +RealMatrix s = kmeans.get_real_matrix("std_dev") #![extract_centers_and_radius] #![create_instance_mb] diff --git a/src/shogun/clustering/KMeans.cpp b/src/shogun/clustering/KMeans.cpp index f3922077b29..f82d63680c4 100644 --- a/src/shogun/clustering/KMeans.cpp +++ b/src/shogun/clustering/KMeans.cpp @@ -42,8 +42,7 @@ KMeans::~KMeans() void KMeans::Lloyd_KMeans(SGMatrix centers, int32_t num_centers) { - auto lhs = - std::dynamic_pointer_cast>(distance->get_lhs()); + auto lhs = distance->get_lhs()->as>(); int32_t lhs_size=lhs->get_num_vectors(); int32_t dim=lhs->get_num_features(); @@ -173,10 +172,7 @@ void KMeans::Lloyd_KMeans(SGMatrix centers, int32_t num_centers) if (iter%(max_iter/10) == 0) io::info("Iteration[{}/{}]: Assignment of {} patterns changed.", iter, max_iter, changed); } - distance->reset_precompute(); - distance->replace_rhs(rhs_cache); - - + distance->replace_rhs(rhs_cache); } bool KMeans::train_machine(std::shared_ptr data) diff --git a/src/shogun/clustering/KMeansBase.cpp b/src/shogun/clustering/KMeansBase.cpp index 88c41c63304..7a39b5992b5 100644 --- a/src/shogun/clustering/KMeansBase.cpp +++ b/src/shogun/clustering/KMeansBase.cpp @@ -130,6 +130,45 @@ void KMeansBase::compute_cluster_variances() } } +SGMatrix KMeansBase::compute_std_dev() const +{ + require(cluster_centers.size() > 0, "KMeans is not trained!"); + + SGMatrix points = distance->get_rhs() + ->as>() + ->get_feature_matrix(); + SGVector cluster_assignments = const_cast(this) + ->apply() + ->as() + ->get_labels(); + + SGVector counts(k); + SGMatrix means = cluster_centers.clone(); + SGMatrix squares_sums(dimensions, k); + + for (int32_t point_number : range(cluster_assignments.vlen)) + { + auto cluster_number = + static_cast(cluster_assignments[point_number]); + const auto& point = points.get_column(point_number); + auto& count = counts[cluster_number]; + auto mean = means.get_column(cluster_number); + auto squares_sum = squares_sums.get_column(cluster_number); + + count += 1; + auto delta1 = linalg::add(point, mean, 1., -1.); + linalg::add(mean, linalg::scale(delta1, 1. / count), mean); + auto delta2 = linalg::add(point, mean, 1., -1.); + linalg::add( + squares_sum, linalg::element_prod(delta1, delta2), squares_sum); + } + + linalg::scale(squares_sums, squares_sums, 1. / (points.num_cols - 1)); + for (float64_t& x : squares_sums) + x = std::sqrt(x); + return squares_sums; +} + void KMeansBase::initialize_training(const std::shared_ptr& data) { require(distance, "Distance is not provided"); @@ -153,7 +192,6 @@ void KMeansBase::initialize_training(const std::shared_ptr& data) require(lhs, "Lhs features of distance not provided"); int32_t lhs_size=lhs->get_num_vectors(); dimensions=lhs->get_num_features(); - const int32_t centers_size=dimensions*k; require(lhs_size>0, "Lhs features should not be empty"); require(dimensions>0, "Lhs features should have more than zero dimensions"); @@ -318,6 +356,7 @@ void KMeansBase::init() &use_kmeanspp, "kmeanspp", "Whether to use kmeans++", ParameterProperties::HYPER | ParameterProperties::SETTING); watch_method("cluster_centers", &KMeansBase::get_cluster_centers); + watch_method("std_dev", &KMeansBase::compute_std_dev); SG_ADD( &initial_centers, "initial_centers", "Initial centers", ParameterProperties::HYPER); diff --git a/src/shogun/clustering/KMeansBase.h b/src/shogun/clustering/KMeansBase.h index a275e19e70b..3255320c408 100644 --- a/src/shogun/clustering/KMeansBase.h +++ b/src/shogun/clustering/KMeansBase.h @@ -76,6 +76,12 @@ class KMeansBase : public RandomMixin */ SGMatrix get_cluster_centers() const; + /** get cluster standard deviations + * + * @return cluster deviations or throws an error if no ones are there (not trained yet) + */ + SGMatrix compute_std_dev() const; + /** @return object name */ virtual const char* get_name() const { return "KMeansBase"; } diff --git a/src/shogun/clustering/KMeansMiniBatch.cpp b/src/shogun/clustering/KMeansMiniBatch.cpp index aaa91ceb1ca..1a4df1208f5 100644 --- a/src/shogun/clustering/KMeansMiniBatch.cpp +++ b/src/shogun/clustering/KMeansMiniBatch.cpp @@ -56,13 +56,15 @@ void KMeansMiniBatch::minibatch_KMeans() auto lhs= distance->get_lhs()->as>(); auto rhs_mus = std::make_shared>(cluster_centers); - auto rhs_cache=distance->replace_rhs(rhs_mus); + auto rhs_cache = distance->get_rhs(); + distance->replace_rhs(rhs_mus); int32_t XSize=lhs->get_num_vectors(); - int32_t dims=lhs->get_num_features(); SGVector v=SGVector(k); v.zero(); + distance->precompute_lhs(); + for (auto i : SG_PROGRESS(range(max_iter))) { SGVector M=mbchoose_rand(batch_size,XSize); @@ -124,6 +126,9 @@ bool KMeansMiniBatch::train_machine(std::shared_ptr data) initialize_training(data); minibatch_KMeans(); compute_cluster_variances(); + auto cluster_centres = + std::make_shared>(cluster_centers); + distance->replace_lhs(cluster_centres); return true; }