Skip to content

Commit 668bb9b

Browse files
committed
KMeans + SKMeans; Unify initialisation options
1 parent 3fdb022 commit 668bb9b

File tree

4 files changed

+190
-62
lines changed

4 files changed

+190
-62
lines changed

include/flucoma/algorithms/public/KMeans.hpp

Lines changed: 138 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,27 +18,145 @@ under the European Union’s Horizon 2020 research and innovation programme
1818
#include "../../data/TensorTypes.hpp"
1919
#include "../../data/FluidMemory.hpp"
2020
#include <Eigen/Core>
21+
#include <cassert>
2122
#include <queue>
23+
#include <random>
2224
#include <string>
2325

2426
namespace fluid {
2527
namespace algorithm {
2628

29+
namespace _impl::kmeans_init {
30+
31+
/// @brief Initialize means based on randomly assigning each point to a cluster
32+
/// @param input input data
33+
/// @param k number of clusters
34+
/// @return a 2D Eigen array of means
35+
Eigen::ArrayXXd randomPartition(const Eigen::MatrixXd& input, index k)
36+
{
37+
// Means come from randomly assigning points and taking average
38+
std::random_device rd;
39+
std::mt19937 gen(rd());
40+
std::uniform_int_distribution<index> distrib(0, k - 1);
41+
42+
Eigen::ArrayXXd means = Eigen::ArrayXXd::Zero(k, input.cols());
43+
Eigen::ArrayXd assignments(input.rows());
44+
Eigen::ArrayXd mask = Eigen::ArrayXd::Constant(input.rows(), 1.0);
45+
46+
std::generate(assignments.begin(), assignments.end(),
47+
[&distrib, &gen]() { return distrib(gen); });
48+
49+
for(index i = 0; i < k; ++i)
50+
{
51+
means.row(i) =
52+
(input.array().colwise() * (assignments == i).select(mask, 0.0))
53+
.colwise()
54+
.mean();
55+
}
56+
57+
return means;
58+
}
59+
60+
/// @brief Initialize means by sampling `k` random points ('Forgy initialization')
61+
/// @param input input data
62+
/// @param k number of clusters
63+
/// @return 2D Eigen expression of sampled input points
64+
auto randomPoints(const Eigen::MatrixXd& input, index k)
65+
{
66+
// Means come from k random points
67+
std::random_device rd;
68+
std::mt19937 gen(rd());
69+
std::uniform_int_distribution<index> distrib(0, input.rows() - 1);
70+
71+
std::vector<index> rows(asUnsigned(k));
72+
std::generate(begin(rows), end(rows),
73+
[&distrib, &gen]() { return distrib(gen); });
74+
return input(rows, Eigen::all);
75+
}
76+
77+
auto squareEuclidiean = [](Eigen::Ref<const Eigen::MatrixXd> const& a,
78+
Eigen::Ref<const Eigen::MatrixXd> const& b,
79+
bool squared = true) {
80+
double a_sqnorm = a.squaredNorm();
81+
double b_sqnorm = b.squaredNorm();
82+
Eigen::ArrayXXd result = (a * b.transpose()).array();
83+
result *= -2;
84+
result += (a_sqnorm + b_sqnorm);
85+
return squared ? result: result.sqrt();
86+
};
87+
88+
auto cosine = [](auto a, auto b){
89+
return 1.0 - (a * b.transpose()).array();
90+
};
91+
92+
/// @brief initilaize means using markov chain montecarlo approximation of Kmeans++ (kmc2)
93+
/// @tparam DistanceFn function object that performs distance calculation
94+
/// @param input
95+
/// @param k
96+
/// @param distance
97+
/// @return
98+
template<class DistanceFn>
99+
auto akmc2(Eigen::MatrixXd const& input, index k, DistanceFn distance)
100+
{
101+
std::random_device rd;
102+
std::mt19937 gen(rd());
103+
Eigen::MatrixXd centres(k, input.cols());
104+
105+
// First mean sampled at random from input
106+
const index centre0 =
107+
std::uniform_int_distribution<index>(0, input.rows() - 1)(gen);
108+
centres.row(0) = input.row(centre0);
109+
110+
Eigen::ArrayXd q = distance(input, centres.row(0)).pow(2);
111+
q /= (2 * q.sum() + 2 * q.rows());
112+
std::discrete_distribution proposalDistribution(q.begin(), q.end());
113+
114+
index chainLength = 200;
115+
auto candidateIdx = std::vector<index>(asUnsigned(chainLength));
116+
Eigen::VectorXd candidateProbs(chainLength);
117+
std::uniform_real_distribution<double> uniform;
118+
119+
std::generate_n(centres.rowwise().begin() + 1, k - 1, [&, i = 0]() mutable {
120+
std::generate(
121+
candidateIdx.begin(), candidateIdx.end(),
122+
[&gen, &proposalDistribution]() { return proposalDistribution(gen); });
123+
124+
Eigen::VectorXd proposalProbabilities = q(candidateIdx);
125+
126+
// changes size every iteration
127+
Eigen::ArrayXXd dist = distance(input(candidateIdx, Eigen::all),
128+
centres(Eigen::seq(0, i++), Eigen::all));
129+
candidateProbs = dist.rowwise().minCoeff() / q(candidateIdx);
130+
131+
auto start = candidateProbs.begin();
132+
auto current = start;
133+
for (auto it = start; it != candidateProbs.end(); ++it)
134+
{
135+
if (*current == 0.0 || *it / *current > uniform(gen)) current = it;
136+
}
137+
return input.row(candidateIdx[asUnsigned(std::distance(start, current))]);
138+
});
139+
return centres;
140+
}
141+
} //_impl::kmeans_init
142+
27143
class KMeans
28144
{
29145

30146
public:
147+
/// Strategy train() uses to seed the cluster means of an untrained model:
/// randomSampling selects akmc2(), randomPoint selects randomPoints(), and
/// any other value (i.e. randomPartion) falls through to randomPartition().
/// KMeansClient casts the index of its "initialize" option straight to this
/// type, so the enumerator order has to stay in step with that option list.
enum class InitMethod {randomPartion, randomPoint, randomSampling};
148+
31149
void clear()
32150
{
33151
mMeans.setZero();
34-
mAssignments.setZero();
152+
mAssignments.resize(0);
35153
mTrained = false;
36154
}
37155

38156
/// @return the value of the trained flag, which clear() resets to false
bool initialized() const { return mTrained; }
39157

40158
void train(const FluidDataSet<std::string, double, 1>& dataset, index k,
41-
index maxIter)
159+
index maxIter, InitMethod init)
42160
{
43161
using namespace Eigen;
44162
using namespace _impl;
@@ -49,12 +167,24 @@ class KMeans
49167
{
50168
mK = k;
51169
mDims = dataset.pointSize();
52-
mMeans = ArrayXXd::Zero(mK, mDims);
170+
171+
using namespace _impl::kmeans_init;
172+
switch(init)
173+
{
174+
case InitMethod::randomSampling:
175+
{
176+
mMeans = akmc2(dataPoints, mK, squareEuclidiean);
177+
break;
178+
}
179+
case InitMethod::randomPoint:
180+
{
181+
mMeans = randomPoints(dataPoints, mK);
182+
break;
183+
}
184+
default: mMeans = randomPartition(dataPoints, mK);
185+
}
186+
53187
mEmpty = std::vector<bool>(asUnsigned(mK), false);
54-
mAssignments =
55-
((0.5 + (0.5 * ArrayXf::Random(dataPoints.rows()))) * (mK - 1))
56-
.round()
57-
.cast<int>();
58188
}
59189

60190
while (maxIter-- > 0)
@@ -185,6 +315,7 @@ class KMeans
185315

186316
/// @brief Report whether a fresh set of cluster assignments differs from the
///        stored one
/// @param newAssignments cluster index of every point after the latest pass
/// @return true if nothing is stored yet, if the number of points differs
///         from the stored assignments, or if any point moved to another
///         cluster; false otherwise
bool changed(const Eigen::VectorXi& newAssignments) const
{
  // Nothing to compare against (fresh or cleared model), or the two vectors
  // describe a different number of points: report a change instead of
  // comparing vectors of different length
  if (mAssignments.rows() == 0 ||
      mAssignments.rows() != newAssignments.rows())
    return true;

  return (newAssignments.array() != mAssignments.array()).any();
}

include/flucoma/algorithms/public/SKMeans.hpp

Lines changed: 29 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ under the European Union’s Horizon 2020 research and innovation programme
1717
#include "../../data/FluidTensor.hpp"
1818
#include "../../data/TensorTypes.hpp"
1919
#include <Eigen/Core>
20+
#include <cassert>
2021
#include <queue>
2122
#include <random>
2223
#include <string>
@@ -26,26 +27,20 @@ namespace algorithm {
2627

2728
class SKMeans : public KMeans
2829
{
29-
30+
using MatrixLike = Eigen::Ref<const Eigen::MatrixXd>;
3031
public:
3132

32-
enum Initializer {
33-
// Random partition assigns points to random clusters at init
34-
Random_Partition,
35-
//'Forgy' initializes means with k random data points
36-
Forgy
37-
};
33+
using KMeans::InitMethod;
3834

3935
void train(const FluidDataSet<std::string, double, 1>& dataset, index k,
40-
index maxIter, unsigned initialize )
36+
index maxIter, InitMethod initialize )
4137
{
4238
using namespace Eigen;
4339
using namespace _impl;
4440
assert(!mTrained || (dataset.pointSize() == mDims && mK == k));
4541
MatrixXd dataPoints =
4642
asEigen<Matrix>(dataset.getData()).rowwise().normalized();
47-
MatrixXd dataPointsT = dataPoints.transpose();
48-
if (mTrained) { mAssignments = assignClusters(dataPointsT);}
43+
if (mTrained) { mAssignments = assignClusters(dataPoints.transpose());}
4944
else
5045
{
5146
mK = k;
@@ -55,7 +50,7 @@ class SKMeans : public KMeans
5550

5651
while (maxIter-- > 0)
5752
{
58-
mEmbedding = mMeans.matrix() * dataPointsT;
53+
mEmbedding.noalias() = mMeans.matrix() * dataPoints.transpose();
5954
auto assignments = assignClusters(mEmbedding);
6055
if (mAssignments.rows() && !changed(assignments)) { break; }
6156
else
@@ -66,7 +61,6 @@ class SKMeans : public KMeans
6661
mTrained = true;
6762
}
6863

69-
7064
void encode(RealMatrixView data, RealMatrixView out,
7165
double alpha = 0.25) const
7266
{
@@ -78,47 +72,43 @@ class SKMeans : public KMeans
7872
}
7973

8074
private:
81-
void initMeans(Eigen::MatrixXd& dataPoints, unsigned initializer)
75+
/// Seed mMeans for an untrained model with the requested strategy.
/// @param dataPoints input data, one point per row
/// @param init seeding strategy; randomSampling uses akmc2() with the cosine
///        distance, randomPoint uses randomPoints(), and every other value
///        uses randomPartition() followed by a row-wise normalisation
void initMeans(Eigen::MatrixXd& dataPoints, InitMethod init)
{
  namespace seeding = _impl::kmeans_init;

  mMeans = Eigen::ArrayXXd::Zero(mK, mDims);

  if (init == InitMethod::randomSampling)
  {
    mMeans = seeding::akmc2(dataPoints, mK, seeding::cosine);
  }
  else if (init == InitMethod::randomPoint)
  {
    mMeans = seeding::randomPoints(dataPoints, mK);
  }
  else
  {
    mMeans = seeding::randomPartition(dataPoints, mK);
    mMeans.matrix().rowwise().normalize();
  }
}
11099

111100
void updateEmbedding()
112101
{
113-
for (index i = 0; i < mAssignments.cols(); i++)
102+
for (index i = 0; i < mAssignments.rows(); i++)
114103
{
115104
mEmbedding.col(i).setZero();
116105
mEmbedding(mAssignments(i), i) = 1.0;
117106
}
118107
}
119108

120109

121-
Eigen::VectorXi assignClusters(Eigen::MatrixXd& embedding) const
110+
Eigen::VectorXi
111+
assignClusters(MatrixLike const& embedding) const
122112
{
123113
Eigen::VectorXi assignments = Eigen::VectorXi::Zero(embedding.cols());
124114
for (index i = 0; i < embedding.cols(); i++)
@@ -131,9 +121,9 @@ class SKMeans : public KMeans
131121
}
132122

133123

134-
void computeMeans(Eigen::MatrixXd& dataPoints)
124+
/// Recompute mMeans as the product of mEmbedding and the data, then scale
/// every mean (row) to unit length.
/// @param dataPoints input data, one point per row
void computeMeans(MatrixLike const& dataPoints)
{
  auto means = mMeans.matrix();
  // noalias: the product is written straight into the means
  means.noalias() = mEmbedding * dataPoints;
  means.rowwise().normalize();
}
139129

include/flucoma/clients/nrt/KMeansClient.hpp

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,18 +20,21 @@ namespace fluid {
2020
namespace client {
2121
namespace kmeans {
2222

23+
2324
// Parameters exposed by the KMeans client. The index of the selected
// "initialize" option is cast directly to algorithm::KMeans::InitMethod when
// the model is trained, so the options must be listed in the same order as
// the enumerators of that type.
constexpr auto KMeansParams = defineParameters(
2425
StringParam<Fixed<true>>("name", "Name"),
2526
LongParam("numClusters", "Number of Clusters", 4, Min(1)),
26-
LongParam("maxIter", "Max number of Iterations", 100, Min(1)));
27+
LongParam("maxIter", "Max number of Iterations", 100, Min(1)),
28+
EnumParam("initialize", "Initialize method", 0, "Random Assignment",
29+
"Random Points", "Sampling"));
2730

2831
class KMeansClient : public FluidBaseClient,
2932
OfflineIn,
3033
OfflineOut,
3134
ModelObject,
3235
public DataClient<algorithm::KMeans>
3336
{
34-
enum { kName, kNumClusters, kMaxIter };
37+
// Positions of the parameters in KMeansParams, in declaration order; used as
// the index for get<>() (e.g. get<kNumClusters>(), get<kInit>()).
enum {kName, kNumClusters, kMaxIter, kInit};
3538
ParameterTrackChanges<index> mTracker;
3639
public:
3740
using string = std::string;
@@ -69,6 +72,8 @@ class KMeansClient : public FluidBaseClient,
6972
return {};
7073
}
7174

75+
using InitMethod = algorithm::KMeans::InitMethod;
76+
7277
MessageResult<IndexVector> fit(InputDataSetClientRef datasetClient)
7378
{
7479
index k = get<kNumClusters>();
@@ -78,8 +83,8 @@ class KMeansClient : public FluidBaseClient,
7883
auto dataSet = datasetClientPtr->getDataSet();
7984
if (dataSet.size() == 0) return Error<IndexVector>(EmptyDataSet);
8085
if (k <= 1) return Error<IndexVector>(SmallK);
81-
if(mTracker.changed(k)) mAlgorithm.clear();
82-
mAlgorithm.train(dataSet, k, maxIter);
86+
if(mTracker.changed(k)) mAlgorithm.clear();
87+
mAlgorithm.train(dataSet, k, maxIter, static_cast<InitMethod>(get<kInit>()));
8388
IndexVector assignments(dataSet.size());
8489
mAlgorithm.getAssignments(assignments);
8590
return getCounts(assignments, k);
@@ -98,8 +103,8 @@ class KMeansClient : public FluidBaseClient,
98103
if (!labelsetClientPtr) return Error<IndexVector>(NoLabelSet);
99104
if (k <= 1) return Error<IndexVector>(SmallK);
100105
if (maxIter <= 0) maxIter = 100;
101-
if(mTracker.changed(k)) mAlgorithm.clear();
102-
mAlgorithm.train(dataSet, k, maxIter);
106+
if(mTracker.changed(k)) mAlgorithm.clear();
107+
mAlgorithm.train(dataSet, k, maxIter, static_cast<InitMethod>(get<kInit>()));
103108
IndexVector assignments(dataSet.size());
104109
mAlgorithm.getAssignments(assignments);
105110
StringVectorView ids = dataSet.getIds();
@@ -167,7 +172,7 @@ class KMeansClient : public FluidBaseClient,
167172
if (dataSet.size() == 0) return Error<IndexVector>(EmptyDataSet);
168173
if (k <= 1) return Error<IndexVector>(SmallK);
169174
if (maxIter <= 0) maxIter = 100;
170-
mAlgorithm.train(dataSet, k, maxIter);
175+
mAlgorithm.train(dataSet, k, maxIter, static_cast<InitMethod>(get<kInit>()));
171176
IndexVector assignments(dataSet.size());
172177
mAlgorithm.getAssignments(assignments);
173178
transform(srcClient, dstClient);

0 commit comments

Comments
 (0)