Skip to content

Versions of algorithms implementing tighter LB update and other improvements #6

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 33 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
c4d7264
The newer version of kmeans++
petrrysavy Aug 16, 2015
bcf004e
Aggregation of results from multiple runs
petrrysavy Aug 16, 2015
5f432f7
Merge https://github.com/BaylorCS/baylorml
petrrysavy Aug 16, 2015
0567fd8
Merge remote-tracking branch 'origin/bugfix2'
petrrysavy Aug 30, 2015
b09cd36
Modified update code for hamerly, heap & elkan
petrrysavy Sep 3, 2015
ea676f9
cleanup of comments
petrrysavy Sep 4, 2015
c1761a2
cleanup of comments
petrrysavy Sep 4, 2015
6a98a1b
Heap hmeans enhanced by upper bound array and modified update.
petrrysavy Sep 6, 2015
29b0e30
heap kmeans with neighbors iteration
petrrysavy Sep 19, 2015
52e2410
version of annular algorithm with the modified update
petrrysavy Sep 19, 2015
75dc650
merged modified_update_triangle_based_kmeans and
petrrysavy Sep 20, 2015
500b58f
hamerly's algorithm with neighbors in the first iteration
petrrysavy Sep 20, 2015
08aa03f
Version of Hamerly's algorithm that implements iteration over neighbors
petrrysavy Sep 20, 2015
d3c8ec8
Merge https://github.com/BaylorCS/baylorml into modified_update
petrrysavy Sep 20, 2015
8307c71
small change in elkan's algorithm that had influenced runtime a lot (…
petrrysavy Sep 20, 2015
55cf5ec
Version of elkan kmeans that supports iteration over neighbors
petrrysavy Sep 20, 2015
b59b34b
Version of the Elkan's algorithm where the upper/lower bounds are stored
petrrysavy Sep 20, 2015
581a843
Reviewed comments.
petrrysavy Nov 12, 2015
338f73b
Removed an unnecessary condition in ElkanKmeansNeighbors
petrrysavy Nov 12, 2015
36d1ab3
cleaned method for tighter update calculation
petrrysavy Nov 14, 2015
0411f28
Simplified code of HamerlyKmeansNeighbors1st
petrrysavy Nov 17, 2015
0c074f6
draft of multithreaded version
petrrysavy Nov 19, 2015
6e2fd5a
Made HamerlyKmeansModified to work in parallel.
petrrysavy Nov 19, 2015
aeb9cc5
Fixes of multithreaded versions
petrrysavy Nov 29, 2015
1d2d7d2
Parallelized calculate_max_upper_bound
petrrysavy Nov 30, 2015
6435a9c
Made the heap algorithm multithreaded
petrrysavy Dec 1, 2015
15c940d
Formatting changes
petrrysavy Dec 2, 2015
9714219
Made the first iteration multithreaded in HamerlyKmeansNeighbors1st
petrrysavy Dec 2, 2015
8a29934
Removed code duplication in multithreaded calculation of m(c)
petrrysavy Dec 4, 2015
0411cf1
cleaned #include directives
petrrysavy Dec 4, 2015
6a7b72b
formatting changes
petrrysavy Dec 4, 2015
8532ecc
fixed signature of inner_product method in
petrrysavy Dec 13, 2015
cf88c11
Small changes in docs
petrrysavy Mar 24, 2016
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion fast_kmeans/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@ CPPFLAGS += -g
CPPFLAGS += -O3
CPPFLAGS += -Wno-long-long

CPPFLAGS += -std=c++0x

# Verify algorithm correctness while debugging
#CPPFLAGS += -DDEBUG
#CPPFLAGS += -DVERIFY_ASSIGNMENTS

# To use pthreads, uncomment both lines below.
#CPPFLAGS += -DUSE_THREADS
#LDFLAGS += -lpthread
#CPPFLAGS += -lpthread

# Monitor internal algorithm effectiveness
#CPPFLAGS += -DCOUNT_DISTANCES
Expand Down
136 changes: 136 additions & 0 deletions fast_kmeans/annulus_kmeans_modified.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
/* Authors: Greg Hamerly and Jonathan Drake and Petr Ryšavý
* Feedback: [email protected]
* See: http://cs.baylor.edu/~hamerly/software/kmeans.php
* Copyright 2015
*/

#include "annulus_kmeans_modified.h"
#include "general_functions.h"
#include <cmath>
#include <numeric>
#include <algorithm>

/* Releases the memory held by this class.
 *
 * The parent class releases its own state first; afterwards the three arrays
 * declared by AnnulusKmeansModified are deleted and their pointers reset to
 * NULL, so the delete [] statements below are no-ops if free() runs again.
 */
void AnnulusKmeansModified::free() {
    HamerlyKmeansModified::free();

    // per-point guard indices
    delete [] guard;
    guard = NULL;

    // per-point norms
    delete [] xNorm;
    xNorm = NULL;

    // (norm, center index) pairs
    delete [] cOrder;
    cOrder = NULL;
}

/* Body of one worker thread of the annulus k-means variant.
 *
 * threadId      identifies the worker; it selects the point range
 *               [start(threadId), end(threadId)) processed here, and thread 0
 *               additionally re-sorts the centers at the top of every iteration.
 * maxIterations upper limit on the number of passes of the main loop.
 *
 * Returns the number of iterations this thread executed.
 *
 * NOTE(review): synchronizeAllThreads() is presumably a barrier shared by all
 * workers - confirm against the base class. The relative order of those calls
 * and the work between them must not be changed.
 */
int AnnulusKmeansModified::runThread(int threadId, int maxIterations) {
int iterations = 0;

// half-open range of point indices handled by this thread
int startNdx = start(threadId);
int endNdx = end(threadId);

// here we need to calculate s & the centroid-centroid distances before the first iteration
// the remaining calls to this method are hidden by move_centers
update_s(threadId);
synchronizeAllThreads();

while ((iterations < maxIterations) && !converged) {
++iterations;

// only thread 0 rebuilds cOrder; the other threads wait at the call below
// before they read it
if (threadId == 0) {
sort_means_by_norm();
}
synchronizeAllThreads();

for (int i = startNdx; i < endNdx; ++i) {
unsigned short closest = assignment[i];

// first test, using the stored bounds only: if the upper bound does not
// exceed max(s[closest], lower[i]) the point is skipped
double upper_comparison_bound = std::max(s[closest], lower[i]);

if (upper[i] <= upper_comparison_bound) {
continue;
}

// replace the upper bound by the exact distance to the assigned center
// and repeat the same test
double u2 = pointCenterDist2(i, closest);
upper[i] = sqrt(u2);

if (upper[i] <= upper_comparison_bound) {
continue;
}

// exact distance to the guard center becomes the lower bound
double l2 = pointCenterDist2(i, guard[i]);
lower[i] = sqrt(l2);

double beta = std::max(lower[i], upper[i]);

// cOrder holds (norm, center index) pairs sorted by sort_means_by_norm();
// the two binary searches select the entries between the keys
// (xNorm[i] - beta, k) and (xNorm[i] + beta, k)
std::pair<double, int>* begin = std::lower_bound(cOrder, cOrder + k, std::make_pair(xNorm[i] - beta, k));
std::pair<double, int>* end = std::lower_bound(begin, cOrder + k, std::make_pair(xNorm[i] + beta, k));

// scan the selected centers, keeping u2 as the smallest squared distance
// seen (center `closest`) and l2 as the runner-up (center guard[i])
for (std::pair<double, int>* jp = begin; jp != end; ++jp) {
if (jp->second == closest) continue;

double dist2 = pointCenterDist2(i, jp->second);
if (dist2 <= u2) {
if (dist2 == u2) {
// exact tie: the smaller center index wins; l2 and guard[i] are
// left unchanged in this branch
if (jp->second < closest) closest = jp->second;
} else {
// strictly closer center: the previous closest becomes the guard
l2 = u2;
u2 = dist2;
guard[i] = closest;
closest = jp->second;
}
} else if (dist2 < l2) {
// new runner-up
l2 = dist2;
guard[i] = jp->second;
}
}

lower[i] = sqrt(l2);

// upper[i] already holds the distance to the old center, so it is only
// rewritten when the point moves to another center
if (assignment[i] != closest) {
upper[i] = sqrt(u2);
changeAssignment(i, closest, threadId);
}
}

verifyAssignment(iterations, startNdx, endNdx);

synchronizeAllThreads();
move_centers(threadId);

synchronizeAllThreads();
// bounds are refreshed only when another iteration will follow
if (!converged) {
update_bounds(startNdx, endNdx);
}

synchronizeAllThreads();
}

return iterations;
}

/* Sets the algorithm up for a run.
 *
 * All four arguments are handed unchanged to
 * HamerlyKmeansModified::initialize. Afterwards the arrays owned by this
 * class are allocated and given their starting values:
 *   guard  - n entries, each set to 1
 *   xNorm  - n entries, square root of each point's inner product with itself
 *   cOrder - k entries, (0.0, center index)
 */
void AnnulusKmeansModified::initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads) {
    HamerlyKmeansModified::initialize(aX, aK, initialAssignment, aNumThreads);

    guard = new unsigned short[n];
    xNorm = new double[n];
    cOrder = new std::pair<double, int>[k];

    // placeholder ordering; the norms are filled in by sort_means_by_norm()
    for (int center = 0; center < k; ++center) {
        cOrder[center] = std::make_pair(0.0, center);
    }

    // per-point data: starting guard index and the point's norm
    for (int point = 0; point < n; ++point) {
        guard[point] = 1;
        xNorm[point] = sqrt(pointPointInnerProduct(point, point));
    }
}

/* Rebuilds cOrder from the current centers.
 *
 * Each entry becomes (square root of the center's inner product with itself,
 * center index); the k entries are then sorted with std::sort using the
 * default ordering of std::pair.
 */
void AnnulusKmeansModified::sort_means_by_norm() {
    for (int center = 0; center < k; ++center) {
        const double norm = sqrt(centerCenterInnerProduct(center, center));
        cOrder[center] = std::make_pair(norm, center);
    }

    std::pair<double, int> *const first = cOrder;
    std::sort(first, first + k);
}


39 changes: 39 additions & 0 deletions fast_kmeans/annulus_kmeans_modified.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#ifndef ANNULUS_KMEANS_MODIFIED_H
#define ANNULUS_KMEANS_MODIFIED_H

/* Authors: Greg Hamerly and Jonathan Drake and Petr Ryšavý
* Feedback: [email protected]
* See: http://cs.baylor.edu/~hamerly/software/kmeans.php
* Copyright 2015
*
* This version of the Annulus algorithm implements the tighter update. Most of
* the code is copied from the default version of the Annulus algorithm; only
* the parent class differs: it is HamerlyKmeansModified.
*/

#include "hamerly_kmeans_modified.h"
#include <utility>

class AnnulusKmeansModified;

/* Annulus k-means built on top of HamerlyKmeansModified.
 *
 * In addition to the parent's state, the class owns three heap arrays
 * (xNorm, cOrder, guard). They are allocated in initialize() and released in
 * free(), which the destructor calls.
 */
class AnnulusKmeansModified : public HamerlyKmeansModified {
public:
// all owned arrays start out as NULL, i.e. nothing is allocated yet
AnnulusKmeansModified() : xNorm(NULL), cOrder(NULL), guard(NULL) {}
virtual ~AnnulusKmeansModified() { free(); }
// releases the parent's state and the arrays declared below
virtual void free();
// forwards the arguments to the parent class, then allocates and fills the
// arrays declared below
virtual void initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads);
// identifier of this algorithm variant
virtual std::string getName() const { return "annulusmodified"; }

protected:
// per-thread main loop; returns the number of iterations performed
virtual int runThread(int threadId, int maxIterations);
// recomputes the center norms and sorts cOrder by them
void sort_means_by_norm();

// one entry per point: square root of the point's inner product with itself
double *xNorm;

// one entry per center: (norm of the center, center index)
std::pair<double, int> *cOrder;

// one entry per point: index of the center whose distance is used as the
// point's lower bound
unsigned short *guard;
};

#endif

5 changes: 2 additions & 3 deletions fast_kmeans/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,9 @@ class Dataset {
// destroys the dataset safely
~Dataset() {
    n = d = nd = 0;
    // Release each buffer exactly once. Deleting through a saved alias as well
    // as through the member itself would free the same array twice.
    delete [] data;
    delete [] sumDataSquared;
    // clear the members so no dangling pointer is left behind
    data = sumDataSquared = NULL;
}

// operator= is the standard deep-copy assignment operator, which
Expand Down
Loading