Skip to content

Commit 3ef2426

Browse files
fixup! Add standard deviation (std) to KMeans.
1 parent baf2486 commit 3ef2426

File tree

4 files changed

+113
-115
lines changed

4 files changed

+113
-115
lines changed

src/shogun/clustering/KMeans.cpp

Lines changed: 18 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -44,31 +44,29 @@ void KMeans::Lloyd_KMeans(SGMatrix<float64_t> centers, int32_t num_centers)
4444
{
4545
auto lhs = distance->get_lhs()->as<DenseFeatures<float64_t>>();
4646

47-
int32_t lhs_size = lhs->get_num_vectors();
47+
int32_t lhs_size=lhs->get_num_vectors();
4848

49-
SGVector<int32_t> cluster_assignments = SGVector<int32_t>(lhs_size);
49+
auto rhs_cache = distance->get_rhs();
50+
51+
SGVector<int32_t> cluster_assignments=SGVector<int32_t>(lhs_size);
5052
cluster_assignments.zero();
5153

5254
/* Weights : Number of points in each cluster */
5355
SGVector<int64_t> weights_set(num_centers);
5456
weights_set.zero();
55-
/* Initially set all weights for zeroth cluster, Changes in assignement step
56-
*/
57-
weights_set[0] = lhs_size;
57+
/* Initially assign all weight to the zeroth cluster; this changes in the assignment step */
58+
weights_set[0]=lhs_size;
5859

5960
distance->precompute_lhs();
6061

6162
for (auto iter : SG_PROGRESS(range(max_iter)))
6263
{
63-
if (iter == max_iter - 1)
64-
io::warn(
65-
"KMeans clustering has reached maximum number of ( {} ) "
66-
"iterations without having converged. Terminating. ",
67-
iter);
64+
if (iter==max_iter-1)
65+
io::warn("KMeans clustering has reached maximum number of ( {} ) iterations without having converged. \
66+
Terminating. ", iter);
6867

6968
int32_t changed;
70-
auto rhs_mus =
71-
std::make_shared<DenseFeatures<float64_t>>(centers.clone());
69+
auto rhs_mus = std::make_shared<DenseFeatures<float64_t>>(centers.clone());
7270
distance->replace_rhs(rhs_mus);
7371

7472
auto change_centers_step = [this,
@@ -120,26 +118,26 @@ void KMeans::Lloyd_KMeans(SGMatrix<float64_t> centers, int32_t num_centers)
120118
num_centers, change_centers_step, cluster_assignments,
121119
weights_set);
122120

123-
if (changed == 0)
121+
if(changed==0)
124122
break;
125123

126124
/* Update Step : Calculate new means */
127125
if (!fixed_centers)
128126
{
129127
centers.zero();
130128

131-
for (int32_t i = 0; i < lhs_size; i++)
129+
for (int32_t i=0; i<lhs_size; i++)
132130
{
133-
int32_t cluster_i = cluster_assignments[i];
131+
int32_t cluster_i=cluster_assignments[i];
134132

135133
auto vec = lhs->get_feature_vector(i);
136134
linalg::add_col_vec(centers, cluster_i, vec, centers);
137135
lhs->free_feature_vector(vec, i);
138136
}
139137

140-
for (int32_t i = 0; i < num_centers; i++)
138+
for (int32_t i=0; i<num_centers; i++)
141139
{
142-
if (weights_set[i] != 0)
140+
if (weights_set[i]!=0)
143141
{
144142
auto col = centers.get_column(i);
145143
linalg::scale(col, col, 1.0 / weights_set[i]);
@@ -149,20 +147,16 @@ void KMeans::Lloyd_KMeans(SGMatrix<float64_t> centers, int32_t num_centers)
149147

150148
observe<SGMatrix<float64_t>>(iter, "cluster_centers");
151149

152-
if (iter % (max_iter / 10) == 0)
153-
io::info(
154-
"Iteration[{}/{}]: Assignment of {} patterns changed.", iter,
155-
max_iter, changed);
150+
if (iter%(max_iter/10) == 0)
151+
io::info("Iteration[{}/{}]: Assignment of {} patterns changed.", iter, max_iter, changed);
156152
}
153+
distance->replace_rhs(rhs_cache);
157154
}
158155

159156
bool KMeans::train_machine(std::shared_ptr<Features> data)
160157
{
161158
initialize_training(data);
162-
auto rhs_cache = distance->get_rhs();
163159
Lloyd_KMeans(cluster_centers, k);
164-
compute_stds();
165-
distance->replace_rhs(rhs_cache);
166160
compute_cluster_variances();
167161
auto cluster_centres =
168162
std::make_shared<DenseFeatures<float64_t>>(cluster_centers);

src/shogun/clustering/KMeansBase.cpp

Lines changed: 48 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -130,15 +130,52 @@ void KMeansBase::compute_cluster_variances()
130130
}
131131
}
132132

133+
SGMatrix<float64_t> KMeansBase::get_stds() const
134+
{
135+
SGMatrix<float64_t> stds(dimensions, k);
136+
137+
auto rhs = distance->get_lhs()->as<DenseFeatures<float64_t>>();
138+
SGVector<int32_t> cluster_assignments;
139+
SGVector<int64_t> weights_set;
140+
std::tie(cluster_assignments, weights_set, std::ignore) =
141+
compute_cluster_assignments(k);
142+
143+
auto cluster_indexes = std::vector<SGVector<index_t>>(k);
144+
auto cluster_counters = SGVector<index_t>(k);
145+
for (int32_t current_cluster = 0; current_cluster < k; ++current_cluster)
146+
{
147+
cluster_indexes[current_cluster] =
148+
SGVector<index_t>(weights_set[current_cluster]);
149+
cluster_counters[current_cluster] = 0;
150+
}
151+
152+
for (int32_t i = 0; i < rhs->get_num_vectors(); ++i)
153+
{
154+
int point_cluster = cluster_assignments[i];
155+
int& cluster_counter = cluster_counters[point_cluster];
156+
cluster_indexes[point_cluster][cluster_counter] = i;
157+
++cluster_counter;
158+
}
159+
160+
for (int32_t i = 0; i < k; ++i)
161+
{
162+
stds.set_column(
163+
i, rhs->copy_subset(cluster_indexes[i])
164+
->as<DenseFeatures<float64_t>>()
165+
->std());
166+
}
167+
168+
return stds;
169+
}
170+
133171
std::tuple<SGVector<int32_t>, SGVector<int64_t>, int32_t>
134172
KMeansBase::compute_cluster_assignments(
135173
int32_t num_centers,
136174
std::function<void(ChangeCentersContext)> change_centers,
137175
SGVector<int32_t> cluster_assignments,
138-
SGVector<int64_t> weights_set)
176+
SGVector<int64_t> weights_set) const
139177
{
140-
auto lhs =
141-
std::dynamic_pointer_cast<DenseFeatures<float64_t>>(distance->get_lhs());
178+
auto lhs = distance->get_lhs()->as<DenseFeatures<float64_t>>();
142179

143180
int32_t lhs_size=lhs->get_num_vectors();
144181
int32_t dim=lhs->get_num_features();
@@ -199,37 +236,6 @@ KMeansBase::compute_cluster_assignments(
199236
return {cluster_assignments, weights_set, changed};
200237
}
201238

202-
void KMeansBase::compute_stds()
203-
{
204-
auto lhs=
205-
distance->get_lhs()->as<DenseFeatures<float64_t>>();
206-
SGVector<int32_t> cluster_assignments;
207-
SGVector<int64_t> weights_set;
208-
std::tie(cluster_assignments, weights_set, std::ignore) =
209-
compute_cluster_assignments(k);
210-
211-
auto cluster_indexes = new SGVector<index_t>[k];
212-
auto cluster_counters = new index_t[k];
213-
for (int32_t current_cluster = 0; current_cluster < k; ++current_cluster) {
214-
cluster_indexes[current_cluster] = SGVector<index_t>(weights_set[current_cluster]);
215-
cluster_counters[current_cluster] = 0;
216-
}
217-
218-
for (int32_t i = 0; i < lhs->get_num_vectors(); ++i) {
219-
int point_cluster = cluster_assignments[i];
220-
int& cluster_counter = cluster_counters[point_cluster];
221-
cluster_indexes[point_cluster][cluster_counter] = i;
222-
++cluster_counter;
223-
}
224-
225-
for (int32_t i = 0; i < k; ++i) {
226-
stds.set_column(i, lhs->copy_subset(cluster_indexes[0])->as<DenseFeatures<float64_t>>()->std());
227-
}
228-
229-
delete[] cluster_indexes;
230-
delete[] cluster_counters;
231-
}
232-
233239
void KMeansBase::initialize_training(const std::shared_ptr<Features>& data)
234240
{
235241
require(distance, "Distance is not provided");
@@ -247,23 +253,21 @@ void KMeansBase::initialize_training(const std::shared_ptr<Features>& data)
247253
if (data)
248254
distance->init(data, data);
249255

250-
auto lhs = distance->get_lhs()->as<DenseFeatures<float64_t>>();
256+
auto lhs=
257+
distance->get_lhs()->as<DenseFeatures<float64_t>>();
251258

252259
require(lhs, "Lhs features of distance not provided");
253-
int32_t lhs_size = lhs->get_num_vectors();
254-
dimensions = lhs->get_num_features();
260+
int32_t lhs_size=lhs->get_num_vectors();
261+
dimensions=lhs->get_num_features();
255262

256-
require(lhs_size > 0, "Lhs features should not be empty");
257-
require(
258-
dimensions > 0, "Lhs features should have more than zero dimensions");
263+
require(lhs_size>0, "Lhs features should not be empty");
264+
require(dimensions>0, "Lhs features should have more than zero dimensions");
259265

260266
/* if kmeans++ to be used */
261267
if (use_kmeanspp)
262268
initial_centers = kmeanspp();
263269

264-
R = SGVector<float64_t>(k);
265-
266-
stds = SGMatrix<float64_t>(dimensions, k);
270+
R=SGVector<float64_t>(k);
267271

268272
cluster_centers = SGMatrix<float64_t>(dimensions, k);
269273

@@ -415,11 +419,11 @@ void KMeansBase::init()
415419
&fixed_centers, "fixed_centers", "Whether to use fixed centers",
416420
ParameterProperties::HYPER | ParameterProperties::SETTING);
417421
SG_ADD(&R, "radiuses", "Cluster radiuses", ParameterProperties::MODEL);
418-
SG_ADD(&stds, "stds", "Cluster standard deviations", ParameterProperties::MODEL);
419422
SG_ADD(
420423
&use_kmeanspp, "kmeanspp", "Whether to use kmeans++",
421424
ParameterProperties::HYPER | ParameterProperties::SETTING);
422425
watch_method("cluster_centers", &KMeansBase::get_cluster_centers);
426+
watch_method("stds", &KMeansBase::get_stds);
423427
SG_ADD(
424428
&initial_centers, "initial_centers", "Initial centers",
425429
ParameterProperties::HYPER);

src/shogun/clustering/KMeansBase.h

Lines changed: 29 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,12 @@ class KMeansBase : public RandomMixin<DistanceMachine>
7676
*/
7777
SGMatrix<float64_t> get_cluster_centers() const;
7878

79+
/** get cluster standard deviations
80+
*
81+
* @return matrix of cluster standard deviations (dimensions x k, one column per cluster)
82+
*/
83+
SGMatrix<float64_t> get_stds() const;
84+
7985
/** @return object name */
8086
virtual const char* get_name() const { return "KMeansBase"; }
8187

@@ -113,31 +119,29 @@ class KMeansBase : public RandomMixin<DistanceMachine>
113119

114120
void compute_cluster_variances();
115121

116-
void compute_stds();
117-
118-
struct ChangeCentersContext
119-
{
120-
std::shared_ptr<DenseFeatures<float64_t>> lhs;
121-
int32_t i, dim, cluster_assignments_i, min_cluster;
122-
SGVector<int64_t> weights_set;
123-
};
124-
125-
/** Matches points and clusters
126-
* @param change_centers optional coroutine to change centers in
127-
* Lloyd Kmeans
128-
* @return A tuple of: \n
129-
* 1) an assignments vector whose index is a point number and value is
130-
* a point cluster \n
131-
* 2) a weights vector whose index is a cluster number and value is
132-
* a number of points belong to this cluster \n
133-
* 3) a number of changed assignments
134-
*/
135-
std::tuple<SGVector<int32_t>, SGVector<int64_t>, int32_t>
136-
compute_cluster_assignments(
137-
int32_t num_centers,
138-
std::function<void(ChangeCentersContext)> change_centers = nullptr,
139-
SGVector<int32_t> cluster_assignments = SGVector<int32_t>(),
140-
SGVector<int64_t> weights_set = SGVector<int64_t>());
122+
struct ChangeCentersContext
123+
{
124+
std::shared_ptr<DenseFeatures<float64_t>> lhs;
125+
int32_t i, dim, cluster_assignments_i, min_cluster;
126+
SGVector<int64_t> weights_set;
127+
};
128+
129+
/** Matches points and clusters
130+
* @param change_centers optional coroutine to change centers in
131+
* Lloyd Kmeans
132+
* @return A tuple of: \n
133+
* 1) an assignments vector whose index is a point number and value is
134+
* a point cluster \n
135+
* 2) a weights vector whose index is a cluster number and value is
136+
* the number of points belonging to this cluster \n
137+
* 3) a number of changed assignments
138+
*/
139+
std::tuple<SGVector<int32_t>, SGVector<int64_t>, int32_t>
140+
compute_cluster_assignments(
141+
int32_t num_centers,
142+
std::function<void(ChangeCentersContext)> change_centers = nullptr,
143+
SGVector<int32_t> cluster_assignments = SGVector<int32_t>(),
144+
SGVector<int64_t> weights_set = SGVector<int64_t>()) const;
141145
protected:
142146
/** Maximum number of iterations */
143147
int32_t max_iter;
@@ -154,9 +158,6 @@ class KMeansBase : public RandomMixin<DistanceMachine>
154158
/** Radii of the clusters (size k) */
155159
SGVector<float64_t> R;
156160

157-
/** Std of the clusters (size k) */
158-
SGMatrix<float64_t> stds;
159-
160161
/** Initial centers supplied */
161162
SGMatrix<float64_t> initial_centers;
162163

src/shogun/clustering/KMeansMiniBatch.cpp

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -53,39 +53,41 @@ void KMeansMiniBatch::minibatch_KMeans()
5353
"iterations {} ",
5454
max_iter);
5555

56-
auto lhs = distance->get_lhs()->as<DenseFeatures<float64_t>>();
57-
56+
auto lhs=
57+
distance->get_lhs()->as<DenseFeatures<float64_t>>();
5858
auto rhs_mus = std::make_shared<DenseFeatures<float64_t>>(cluster_centers);
59-
distance->replace_rhs(rhs_mus);
60-
59+
auto rhs_cache = distance->get_rhs();
60+
distance->replace_rhs(rhs_mus);
6161
int32_t XSize=lhs->get_num_vectors();
6262

6363
SGVector<float64_t> v=SGVector<float64_t>(k);
6464
v.zero();
6565

6666
for (auto i : SG_PROGRESS(range(max_iter)))
6767
{
68-
SGVector<int32_t> M = mbchoose_rand(batch_size, XSize);
69-
SGVector<int32_t> ncent = SGVector<int32_t>(batch_size);
70-
for (int32_t j = 0; j < batch_size; j++)
68+
SGVector<int32_t> M=mbchoose_rand(batch_size,XSize);
69+
SGVector<int32_t> ncent=SGVector<int32_t>(batch_size);
70+
for (int32_t j=0; j<batch_size; j++)
7171
{
72-
SGVector<float64_t> dists = SGVector<float64_t>(k);
73-
for (int32_t p = 0; p < k; p++)
74-
dists[p] = distance->distance(M[j], p);
72+
SGVector<float64_t> dists=SGVector<float64_t>(k);
73+
for (int32_t p=0; p<k; p++)
74+
dists[p]=distance->distance(M[j],p);
7575
ncent[j] = Math::arg_min(dists.vector, 1, dists.vlen);
7676
}
77-
for (int32_t j = 0; j < batch_size; j++)
77+
for (int32_t j=0; j<batch_size; j++)
7878
{
79-
int32_t near = ncent[j];
80-
SGVector<float64_t> c_alive = rhs_mus->get_feature_vector(near);
81-
SGVector<float64_t> x = lhs->get_feature_vector(M[j]);
82-
v[near] += 1.0;
83-
float64_t eta = 1.0 / v[near];
79+
int32_t near=ncent[j];
80+
SGVector<float64_t> c_alive=rhs_mus->get_feature_vector(near);
81+
SGVector<float64_t> x=lhs->get_feature_vector(M[j]);
82+
v[near]+=1.0;
83+
float64_t eta=1.0/v[near];
8484
linalg::add(c_alive, x, c_alive, 1.0 - eta, eta);
8585
}
8686
cluster_centers = rhs_mus->get_feature_matrix();
8787
observe<SGMatrix<float64_t>>(i, "cluster_centers");
8888
}
89+
90+
distance->replace_rhs(rhs_cache);
8991
}
9092

9193
SGVector<int32_t> KMeansMiniBatch::mbchoose_rand(int32_t b, int32_t num)
@@ -120,10 +122,7 @@ void KMeansMiniBatch::init_mb_params()
120122
bool KMeansMiniBatch::train_machine(std::shared_ptr<Features> data)
121123
{
122124
initialize_training(data);
123-
auto rhs_cache = distance->get_rhs();
124125
minibatch_KMeans();
125-
compute_stds();
126-
distance->replace_rhs(rhs_cache);
127126
compute_cluster_variances();
128127
return true;
129128
}

0 commit comments

Comments
 (0)