@@ -18,27 +18,145 @@ under the European Union’s Horizon 2020 research and innovation programme
1818#include " ../../data/TensorTypes.hpp"
1919#include " ../../data/FluidMemory.hpp"
2020#include < Eigen/Core>
21+ #include < cassert>
2122#include < queue>
23+ #include < random>
2224#include < string>
2325
2426namespace fluid {
2527namespace algorithm {
2628
29+ namespace _impl ::kmeans_init {
30+
31+ // / @brief Initialize means based on randomly assigning each point to a cluster
32+ // / @param input input data
33+ // / @param k number of clusters
34+ // / @return a 2D Eigen array of means
35+ Eigen::ArrayXXd randomPartition (const Eigen::MatrixXd& input, index k)
36+ {
37+ // Means come from randomly assigning points and taking average
38+ std::random_device rd;
39+ std::mt19937 gen (rd ());
40+ std::uniform_int_distribution<index> distrib (0 , k - 1 );
41+
42+ Eigen::ArrayXXd means = Eigen::ArrayXXd::Zero (k, input.cols ());
43+ Eigen::ArrayXd assignments (input.rows ());
44+ Eigen::ArrayXd mask = Eigen::ArrayXd::Constant (input.rows (), 1.0 );
45+
46+ std::generate (assignments.begin (), assignments.end (),
47+ [&distrib, &gen]() { return distrib (gen); });
48+
49+ for (index i = 0 ; i < k; ++i)
50+ {
51+ means.row (i) =
52+ (input.array ().colwise () * (assignments == i).select (mask, 0.0 ))
53+ .colwise ()
54+ .mean ();
55+ }
56+
57+ return means;
58+ }
59+
60+ // / @brief Initialize means by sampling `k` random points ('Forgy initialization')
61+ // / @param input input data
62+ // / @param k number of clusters
63+ // / @return 2D Eigen expression of sampled input points
64+ auto randomPoints (const Eigen::MatrixXd& input, index k)
65+ {
66+ // Means come from k random points
67+ std::random_device rd;
68+ std::mt19937 gen (rd ());
69+ std::uniform_int_distribution<index> distrib (0 , input.rows () - 1 );
70+
71+ std::vector<index> rows (asUnsigned (k));
72+ std::generate (begin (rows), end (rows),
73+ [&distrib, &gen]() { return distrib (gen); });
74+ return input (rows, Eigen::all);
75+ }
76+
77+ auto squareEuclidiean = [](Eigen::Ref<const Eigen::MatrixXd> const & a,
78+ Eigen::Ref<const Eigen::MatrixXd> const & b,
79+ bool squared = true ) {
80+ double a_sqnorm = a.squaredNorm ();
81+ double b_sqnorm = b.squaredNorm ();
82+ Eigen::ArrayXXd result = (a * b.transpose ()).array ();
83+ result *= -2 ;
84+ result += (a_sqnorm + b_sqnorm);
85+ return squared ? result: result.sqrt ();
86+ };
87+
88+ auto cosine = [](auto a, auto b){
89+ return 1.0 - (a * b.transpose ()).array ();
90+ };
91+
92+ // / @brief initilaize means using markov chain montecarlo approximation of Kmeans++ (kmc2)
93+ // / @tparam DistanceFn function object that performs distance calculation
94+ // / @param input
95+ // / @param k
96+ // / @param distance
97+ // / @return
98+ template <class DistanceFn >
99+ auto akmc2 (Eigen::MatrixXd const & input, index k, DistanceFn distance)
100+ {
101+ std::random_device rd;
102+ std::mt19937 gen (rd ());
103+ Eigen::MatrixXd centres (k, input.cols ());
104+
105+ // First mean sampled at random from input
106+ const index centre0 =
107+ std::uniform_int_distribution<index>(0 , input.rows () - 1 )(gen);
108+ centres.row (0 ) = input.row (centre0);
109+
110+ Eigen::ArrayXd q = distance (input, centres.row (0 )).pow (2 );
111+ q /= (2 * q.sum () + 2 * q.rows ());
112+ std::discrete_distribution proposalDistribution (q.begin (), q.end ());
113+
114+ index chainLength = 200 ;
115+ auto candidateIdx = std::vector<index>(asUnsigned (chainLength));
116+ Eigen::VectorXd candidateProbs (chainLength);
117+ std::uniform_real_distribution<double > uniform;
118+
119+ std::generate_n (centres.rowwise ().begin () + 1 , k - 1 , [&, i = 0 ]() mutable {
120+ std::generate (
121+ candidateIdx.begin (), candidateIdx.end (),
122+ [&gen, &proposalDistribution]() { return proposalDistribution (gen); });
123+
124+ Eigen::VectorXd proposalProbabilities = q (candidateIdx);
125+
126+ // changes size every iteration
127+ Eigen::ArrayXXd dist = distance (input (candidateIdx, Eigen::all),
128+ centres (Eigen::seq (0 , i++), Eigen::all));
129+ candidateProbs = dist.rowwise ().minCoeff () / q (candidateIdx);
130+
131+ auto start = candidateProbs.begin ();
132+ auto current = start;
133+ for (auto it = start; it != candidateProbs.end (); ++it)
134+ {
135+ if (*current == 0.0 || *it / *current > uniform (gen)) current = it;
136+ }
137+ return input.row (candidateIdx[asUnsigned (std::distance (start, current))]);
138+ });
139+ return centres;
140+ }
141+ } // _impl::kmeans_init
142+
27143class KMeans
28144{
29145
30146public:
147+ enum class InitMethod {randomPartion, randomPoint, randomSampling};
148+
31149 void clear ()
32150 {
33151 mMeans .setZero ();
34- mAssignments .setZero ( );
152+ mAssignments .resize ( 0 );
35153 mTrained = false ;
36154 }
37155
38156 bool initialized () const { return mTrained ; }
39157
40158 void train (const FluidDataSet<std::string, double , 1 >& dataset, index k,
41- index maxIter)
159+ index maxIter, InitMethod init )
42160 {
43161 using namespace Eigen ;
44162 using namespace _impl ;
@@ -49,12 +167,24 @@ class KMeans
49167 {
50168 mK = k;
51169 mDims = dataset.pointSize ();
52- mMeans = ArrayXXd::Zero (mK , mDims );
170+
171+ using namespace _impl ::kmeans_init;
172+ switch (init)
173+ {
174+ case InitMethod::randomSampling:
175+ {
176+ mMeans = akmc2 (dataPoints, mK , squareEuclidiean);
177+ break ;
178+ }
179+ case InitMethod::randomPoint:
180+ {
181+ mMeans = randomPoints (dataPoints, mK );
182+ break ;
183+ }
184+ default : mMeans = randomPartition (dataPoints, mK );
185+ }
186+
53187 mEmpty = std::vector<bool >(asUnsigned (mK ), false );
54- mAssignments =
55- ((0.5 + (0.5 * ArrayXf::Random (dataPoints.rows ()))) * (mK - 1 ))
56- .round ()
57- .cast <int >();
58188 }
59189
60190 while (maxIter-- > 0 )
@@ -185,6 +315,7 @@ class KMeans
185315
186316 bool changed (const Eigen::VectorXi& newAssignments) const
187317 {
318+ if (mAssignments .rows () == 0 ) return true ;
188319 auto dif = (newAssignments - mAssignments ).cwiseAbs ().sum ();
189320 return dif > 0 ;
190321 }
0 commit comments