Skip to content

Versions of algorithms implementing tighter LB update and other improvements #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 30 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
7372ad7
The newer version of kmeans++ * cleaned the code * added pruning of …
petrrysavy Aug 16, 2015
2fd0188
Aggregation of results from multiple runs
petrrysavy Aug 16, 2015
92a7334
Modified update code for hamerly, heap & elkan * the comments need t…
petrrysavy Sep 3, 2015
b622fb0
cleanup of comments
petrrysavy Sep 4, 2015
91b9ae7
cleanup of comments
petrrysavy Sep 4, 2015
4864fb5
Heap hmeans enhanced by upper bound array and modified update.
petrrysavy Sep 6, 2015
d824abb
heap kmeans with neighbors iteration
petrrysavy Sep 19, 2015
74aa03b
version of annular algorithm with the modified update
petrrysavy Sep 19, 2015
ad991c2
merged modified_update_triangle_based_kmeans and triangle_based_kmean…
petrrysavy Sep 20, 2015
7b1abcf
hamerly's algorithm with neighbors in the first iteration
petrrysavy Sep 20, 2015
074e718
Version of Hamerly's algorithm that implements iteration over neighbo…
petrrysavy Sep 20, 2015
775085b
small change in elkan's algorithm that had influenced runtime a lot (…
petrrysavy Sep 20, 2015
cb22959
Version of elkan kmeans that supports iteration over neighbors
petrrysavy Sep 20, 2015
cff8e6c
Version of the Elkan's algorithm where the upper/lower bounds are sto…
petrrysavy Sep 20, 2015
972bd30
Reviewed comments.
petrrysavy Nov 12, 2015
d3b1d88
Removed an unnecessary condition in ElkanKmeansNeighbors
petrrysavy Nov 12, 2015
c687788
cleaned method for tighter update calculation
petrrysavy Nov 14, 2015
aeca512
Simplified code of HamerlyKmeansNeighbors1st
petrrysavy Nov 17, 2015
5ce5ec3
draft of multithreaded version
petrrysavy Nov 19, 2015
ca3589b
Made HamerlyKmeansModified to work in parallel.
petrrysavy Nov 19, 2015
67282b7
Fixes of multithreaded versions
petrrysavy Nov 29, 2015
e47d508
Parallelized calculate_max_upper_bound
petrrysavy Nov 30, 2015
58ccb4a
Made the heap algorithm multithreaded
petrrysavy Dec 1, 2015
3793d3d
Formatting changes
petrrysavy Dec 2, 2015
236909f
Made the first iteration multithreaded in HamerlyKmeansNeighbors1st
petrrysavy Dec 2, 2015
f8daec8
Removed code duplication in multithreaded calculation of m(c)
petrrysavy Dec 4, 2015
f65b730
cleaned #include directives
petrrysavy Dec 4, 2015
b7dde13
formatting changes
petrrysavy Dec 4, 2015
3bc8c5b
fixed signature of inner_product method in modified_update_triangle_b…
petrrysavy Dec 13, 2015
6235bfe
Small changes in docs
petrrysavy Mar 24, 2016
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@ CPPFLAGS += -g
CPPFLAGS += -O3
CPPFLAGS += -Wno-long-long

CPPFLAGS += -std=c++0x

# Verify algorithm correctness while debugging
#CPPFLAGS += -DDEBUG
#CPPFLAGS += -DVERIFY_ASSIGNMENTS

# To use pthreads, uncomment both lines below.
#CPPFLAGS += -DUSE_THREADS
#LDFLAGS += -lpthread
#CPPFLAGS += -lpthread

# Monitor internal algorithm effectiveness
#CPPFLAGS += -DCOUNT_DISTANCES
Expand Down
134 changes: 134 additions & 0 deletions annulus_kmeans_modified.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
/* Authors: Greg Hamerly and Jonathan Drake and Petr Ryšavý
* Feedback: [email protected]
* See: http://cs.baylor.edu/~hamerly/software/kmeans.php
* Copyright 2015
*/

#include "annulus_kmeans_modified.h"
#include "general_functions.h"
#include <cmath>
#include <numeric>
#include <algorithm>

// Releases the parent's resources and then the three arrays owned by this
// class. Every pointer is reset to NULL right after its array is released, so
// a repeated call finds only null pointers here (delete [] on NULL is a no-op).
void AnnulusKmeansModified::free() {
    // Parent cleanup runs first, before any of our own arrays are touched.
    HamerlyKmeansModified::free();

    // Per-point index of the guard center.
    delete [] guard;
    guard = NULL;

    // Per-point norms.
    delete [] xNorm;
    xNorm = NULL;

    // (norm, index) table of the centers.
    delete [] cOrder;
    cOrder = NULL;
}

// Per-thread main loop of the annulus k-means variant.
//
// Each thread processes the points in [start(threadId), end(threadId)) and
// meets the other threads at synchronizeAllThreads() between phases
// (presumably a barrier across all worker threads -- confirm in the parent).
//
// threadId      - index of the calling worker; thread 0 additionally re-sorts
//                 the centers by norm at the start of every iteration.
// maxIterations - upper limit on the number of iterations performed.
//
// Returns the number of iterations this thread executed.
int AnnulusKmeansModified::runThread(int threadId, int maxIterations) {
int iterations = 0;

// Half-open range of point indices handled by this thread.
int startNdx = start(threadId);
int endNdx = end(threadId);

// here we need to calculate s & the centroid-centroid distances before the first iteration
// the remaining calls to this method are hidden by move_centers
update_s(threadId);
synchronizeAllThreads();

while ((iterations < maxIterations) && !converged) {
++iterations;

// Only one thread rebuilds the shared, norm-sorted center table; everyone
// waits afterwards so that all threads search the same finished table.
if (threadId == 0) {
sort_means_by_norm();
}
synchronizeAllThreads();

for (int i = startNdx; i < endNdx; ++i) {
unsigned short closest = assignment[i];

// Threshold used for both pruning tests below.
// NOTE(review): s[closest] is presumably half the distance from this center
// to its nearest other center, as in Hamerly's algorithm -- confirm in the parent class.
double upper_comparison_bound = std::max(s[closest], lower[i]);

// First test: the stored upper bound is already small enough, skip the point.
if (upper[i] <= upper_comparison_bound) {
continue;
}

// Tighten the upper bound with the distance to the assigned center
// (pointCenterDist2 is taken to return a squared distance, hence the sqrt).
double u2 = pointCenterDist2(i, closest);
upper[i] = sqrt(u2);

// Second test with the tightened upper bound.
if (upper[i] <= upper_comparison_bound) {
continue;
}

// Refresh the lower bound from the guard center recorded for this point.
double l2 = pointCenterDist2(i, guard[i]);
lower[i] = sqrt(l2);

// Radius of the search annulus around the point's norm.
double beta = std::max(lower[i], upper[i]);

// Binary-search the norm-sorted centers. The probe's second component is k,
// which is larger than every center index stored in cOrder, so lower_bound
// steps past entries whose norm equals the probe value. [begin, end) thus
// holds the centers whose norm lies in (xNorm[i] - beta, xNorm[i] + beta].
std::pair<double, int>* begin = std::lower_bound(cOrder, cOrder + k, std::make_pair(xNorm[i] - beta, k));
std::pair<double, int>* end = std::lower_bound(begin, cOrder + k, std::make_pair(xNorm[i] + beta, k));

// Scan the candidates, tracking the smallest (u2, closest) and the
// second-smallest (l2, guard[i]) squared distance seen so far.
for (std::pair<double, int>* jp = begin; jp != end; ++jp) {
if (jp->second == closest) continue;

double dist2 = pointCenterDist2(i, jp->second);
if (dist2 <= u2) {
if (dist2 == u2) {
// Exact tie: prefer the center with the lower index.
if (jp->second < closest) closest = jp->second;
} else {
// New closest center; the previous closest becomes the guard.
l2 = u2;
u2 = dist2;
guard[i] = closest;
closest = jp->second;
}
} else if (dist2 < l2) {
// New second-closest center.
l2 = dist2;
guard[i] = jp->second;
}
}

lower[i] = sqrt(l2);

// upper[i] only needs rewriting when the point moved to another center;
// otherwise it already holds the value stored before the scan.
if (assignment[i] != closest) {
upper[i] = sqrt(u2);
changeAssignment(i, closest, threadId);
}
}

verifyAssignment(iterations, startNdx, endNdx);

// All assignments must be final before the centers are moved.
synchronizeAllThreads();
move_centers(threadId);

// Wait for move_centers on every thread before reading `converged`.
synchronizeAllThreads();
if (!converged) {
update_bounds(startNdx, endNdx);
}

synchronizeAllThreads();
}

return iterations;
}

// Prepares the algorithm for a run: delegates to the parent class and then
// allocates and fills the annulus-specific arrays.
//
// aX                - dataset to cluster (forwarded to the parent).
// aK                - number of clusters (forwarded to the parent).
// initialAssignment - starting assignment of points (forwarded to the parent).
// aNumThreads       - number of worker threads (forwarded to the parent).
void AnnulusKmeansModified::initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads) {
    // The parent goes first; n and k are read below.
    HamerlyKmeansModified::initialize(aX, aK, initialAssignment, aNumThreads);

    guard = new unsigned short[n];
    xNorm = new double[n];
    cOrder = new std::pair<double, int>[k];

    // Placeholder table: zero norm, identity order. The real norms are
    // written by sort_means_by_norm().
    for (int center = 0; center < k; ++center) {
        cOrder[center] = std::pair<double, int>(0.0, center);
    }

    // Every point starts with center 1 as its guard; its norm is the square
    // root of the point's inner product with itself.
    for (int point = 0; point < n; ++point) {
        guard[point] = 1;
        xNorm[point] = sqrt(pointPointInnerProduct(point, point));
    }
}

// Rebuilds cOrder as (norm of center, center index) pairs and sorts it in
// ascending order; pairs compare by norm first and by index on equal norms.
void AnnulusKmeansModified::sort_means_by_norm() {
    for (int center = 0; center < k; ++center) {
        // Norm is the square root of the center's inner product with itself.
        const double norm = sqrt(centerCenterInnerProduct(center, center));
        cOrder[center] = std::pair<double, int>(norm, center);
    }

    std::pair<double, int> *const tableEnd = cOrder + k;
    std::sort(cOrder, tableEnd);
}
38 changes: 38 additions & 0 deletions annulus_kmeans_modified.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#ifndef ANNULUS_KMEANS_MODIFIED_H
#define ANNULUS_KMEANS_MODIFIED_H

/* Authors: Greg Hamerly and Jonathan Drake and Petr Ryšavý
* Feedback: [email protected]
* See: http://cs.baylor.edu/~hamerly/software/kmeans.php
* Copyright 2015
*
* This version of Annulus algorithm implements the tighter update. Most of the
* code is copied from the default version of Annulus algorithm, only the parent
* class is HamerlyKmeansModified.
*/

#include "hamerly_kmeans_modified.h"
#include <utility>

// Forward declaration of the class defined directly below.
class AnnulusKmeansModified;

// Annulus k-means built on top of HamerlyKmeansModified. In addition to the
// parent's state it owns three heap arrays (xNorm, cOrder, guard) that are
// allocated in initialize() and released in free().
class AnnulusKmeansModified : public HamerlyKmeansModified {
public:
// All owned arrays start out NULL, so free() is safe before initialize().
AnnulusKmeansModified() : xNorm(NULL), cOrder(NULL), guard(NULL) {}
// Inside a destructor the virtual call resolves to this class's free(),
// never to an override in a further-derived class.
virtual ~AnnulusKmeansModified() { free(); }
// Releases the parent's resources and the arrays owned by this class.
virtual void free();
// Initializes the parent and allocates/fills xNorm, cOrder and guard.
virtual void initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads);
// Identifier of this algorithm variant.
virtual std::string getName() const { return "annulusmodified"; }

protected:
// Per-thread main loop; returns the number of iterations executed.
virtual int runThread(int threadId, int maxIterations);
// Refreshes cOrder with the current center norms and sorts it.
void sort_means_by_norm();

// Norm of every point, indexed by point (n entries).
double *xNorm;

// (norm, center index) pairs for the k centers, kept sorted by
// sort_means_by_norm().
std::pair<double, int> *cOrder;

// For every point, the index of the center used to refresh its lower
// bound (n entries).
unsigned short *guard;
};

#endif
5 changes: 2 additions & 3 deletions dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,9 @@ class Dataset {
// Destroys the dataset safely: each owned array is released exactly once.
// The earlier text released `data` and `sumDataSquared` directly and then a
// second time through local aliases (dp / sdsp), i.e. a double delete [].
~Dataset() {
    n = d = nd = 0;
    // delete [] on a null pointer is a no-op, so no guards are required.
    delete [] data;
    delete [] sumDataSquared;
    // Clear the members so no stale pointer to freed memory remains.
    data = sumDataSquared = NULL;
}

// operator= is the standard deep-copy assignment operator, which
Expand Down
Loading