diff --git a/fast_kmeans/Makefile b/fast_kmeans/Makefile
index f648746..b44fa33 100644
--- a/fast_kmeans/Makefile
+++ b/fast_kmeans/Makefile
@@ -7,13 +7,15 @@ CPPFLAGS += -g
 CPPFLAGS += -O3
 CPPFLAGS += -Wno-long-long
 
+CPPFLAGS += -std=c++0x
+
 # Verify algorithm correctness while debugging
 #CPPFLAGS += -DDEBUG
 #CPPFLAGS += -DVERIFY_ASSIGNMENTS
 
 # To use pthreads, uncomment both lines below.
 #CPPFLAGS += -DUSE_THREADS
-#LDFLAGS += -lpthread
+#CPPFLAGS += -lpthread
 
 # Monitor internal algorithm effectiveness
 #CPPFLAGS += -DCOUNT_DISTANCES
diff --git a/fast_kmeans/annulus_kmeans_modified.cpp b/fast_kmeans/annulus_kmeans_modified.cpp
new file mode 100644
index 0000000..9abf73e
--- /dev/null
+++ b/fast_kmeans/annulus_kmeans_modified.cpp
@@ -0,0 +1,136 @@
+/* Authors: Greg Hamerly and Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ */
+
+#include "annulus_kmeans_modified.h"
+#include "general_functions.h"
+#include <cmath>
+#include <numeric>
+#include <algorithm>
+
+void AnnulusKmeansModified::free() {
+    HamerlyKmeansModified::free();
+
+    delete [] guard;
+    delete [] xNorm;
+    delete [] cOrder;
+    guard = NULL;
+    xNorm = NULL;
+    cOrder = NULL;
+}
+
+int AnnulusKmeansModified::runThread(int threadId, int maxIterations) {
+    int iterations = 0;
+
+    int startNdx = start(threadId);
+    int endNdx = end(threadId);
+
+    // here we need to calculate s & the centroid-centroid distances before the first iteration
+    // the remaining calls to this method are hidden by move_centers
+    update_s(threadId);
+    synchronizeAllThreads();
+
+    while ((iterations < maxIterations) && !converged) {
+        ++iterations;
+
+        if (threadId == 0) {
+            sort_means_by_norm();
+        }
+        synchronizeAllThreads();
+
+        for (int i = startNdx; i < endNdx; ++i) {
+            unsigned short closest = assignment[i];
+
+            double upper_comparison_bound = std::max(s[closest], lower[i]);
+
+            if (upper[i] <= upper_comparison_bound) {
+                continue;
+            }
+
+            double u2 = pointCenterDist2(i, closest);
+            upper[i] = sqrt(u2);
+
+            if (upper[i] <= upper_comparison_bound) {
+                continue;
+            }
+
+            double l2 = pointCenterDist2(i, guard[i]);
+            lower[i] = sqrt(l2);
+
+            double beta = std::max(lower[i], upper[i]);
+
+            std::pair<double, int>* begin = std::lower_bound(cOrder, cOrder + k, std::make_pair(xNorm[i] - beta, k));
+            std::pair<double, int>* end = std::lower_bound(begin, cOrder + k, std::make_pair(xNorm[i] + beta, k));
+
+            for (std::pair<double, int>* jp = begin; jp != end; ++jp) {
+                if (jp->second == closest) continue;
+
+                double dist2 = pointCenterDist2(i, jp->second);
+                if (dist2 <= u2) {
+                    if (dist2 == u2) {
+                        if (jp->second < closest) closest = jp->second;
+                    } else {
+                        l2 = u2;
+                        u2 = dist2;
+                        guard[i] = closest;
+                        closest = jp->second;
+                    }
+                } else if (dist2 < l2) {
+                    l2 = dist2;
+                    guard[i] = jp->second;
+                }
+            }
+
+            lower[i] = sqrt(l2);
+
+            if (assignment[i] != closest) {
+                upper[i] = sqrt(u2);
+                changeAssignment(i, closest, threadId);
+            }
+        }
+
+        verifyAssignment(iterations, startNdx, endNdx);
+
+        synchronizeAllThreads();
+        move_centers(threadId);
+
+        synchronizeAllThreads();
+        if (!converged) {
+            update_bounds(startNdx, endNdx);
+        }
+
+        synchronizeAllThreads();
+    }
+
+    return iterations;
+}
+
+void AnnulusKmeansModified::initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads) {
+    HamerlyKmeansModified::initialize(aX, aK, initialAssignment, aNumThreads);
+
+    guard = new unsigned short[n];
+    xNorm = new double[n];
+    cOrder = new std::pair<double, int>[k];
+
+    for (int i = 0; i < k; ++i) {
+        cOrder[i].first = 0.0;
+        cOrder[i].second = i;
+    }
+
+    std::fill(guard, guard + n, 1);
+    for (int i = 0; i < n; ++i) {
+        xNorm[i] = sqrt(pointPointInnerProduct(i, i));
+    }
+}
+
+void AnnulusKmeansModified::sort_means_by_norm() {
+    for (int c1 = 0; c1 < k; ++c1) {
+        cOrder[c1].first = sqrt(centerCenterInnerProduct(c1, c1));
+        cOrder[c1].second = c1;
+    }
+    std::sort(cOrder, cOrder + k);
+}
+
+
diff --git a/fast_kmeans/annulus_kmeans_modified.h b/fast_kmeans/annulus_kmeans_modified.h
new file mode 100644
index 0000000..3fb62da
--- /dev/null
+++ b/fast_kmeans/annulus_kmeans_modified.h
@@ -0,0 +1,39 @@
+#ifndef ANNULUS_KMEANS_MODIFIED_H
+#define ANNULUS_KMEANS_MODIFIED_H
+
+/* Authors: Greg Hamerly and Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ *
+ * This version of Annulus algorithm implements the tighter update. Most of the
+ * code is copied from the default version of Annulus algorithm, only the parent
+ * class is HamerlyKmeansModified.
+ */
+
+#include "hamerly_kmeans_modified.h"
+#include <utility>
+
+class AnnulusKmeansModified;
+
+class AnnulusKmeansModified : public HamerlyKmeansModified {
+    public:
+        AnnulusKmeansModified() : xNorm(NULL), cOrder(NULL), guard(NULL) {}
+        virtual ~AnnulusKmeansModified() { free(); }
+        virtual void free();
+        virtual void initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads);
+        virtual std::string getName() const { return "annulusmodified"; }
+
+    protected:
+        virtual int runThread(int threadId, int maxIterations);
+        void sort_means_by_norm();
+
+        double *xNorm;
+
+        std::pair<double, int> *cOrder;
+
+        unsigned short *guard;
+};
+
+#endif
+
diff --git a/fast_kmeans/dataset.h b/fast_kmeans/dataset.h
index f7a16a7..70e1d97 100644
--- a/fast_kmeans/dataset.h
+++ b/fast_kmeans/dataset.h
@@ -37,10 +37,9 @@ class Dataset {
         // destroys the dataset safely
         ~Dataset() {
             n = d = nd = 0; 
-            double *dp = data, *sdsp = sumDataSquared;
+			delete [] data;
+			delete [] sumDataSquared;
             data = sumDataSquared = NULL;
-            delete [] dp;
-            delete [] sdsp;
         }
 
         // operator= is the standard deep-copy assignment operator, which
diff --git a/fast_kmeans/driver.cpp b/fast_kmeans/driver.cpp
index 6b321c4..16fb5f8 100644
--- a/fast_kmeans/driver.cpp
+++ b/fast_kmeans/driver.cpp
@@ -38,16 +38,28 @@
 
 #include "dataset.h"
 #include "general_functions.h"
+#include "stats_functions.h"
 #include "hamerly_kmeans.h"
+#include "hamerly_kmeans_modified.h"
+#include "hamerly_kmeans_neighbors.h"
+#include "hamerly_kmeans_neighbors1st.h"
+#include "hamerly_kmeans_neighbors_only.h"
 #include "annulus_kmeans.h"
+#include "annulus_kmeans_modified.h"
 #include "drake_kmeans.h"
 #include "naive_kmeans.h"
 #include "elkan_kmeans.h"
 #include "compare_kmeans.h"
 #include "sort_kmeans.h"
 #include "heap_kmeans.h"
+#include "heap_kmeans_modified.h"
+#include "heap_kmeans_ubarr.h"
+#include "heap_kmeans_ubarr_neighbors.h"
 #include "naive_kernel_kmeans.h"
 #include "elkan_kernel_kmeans.h"
+#include "elkan_kmeans_modified.h"
+#include "elkan_kmeans_neighbors.h"
+#include "elkan_kmeans_neighbors_rel.h"
 #include <iostream>
 #include <fstream>
 #include <iomanip>
@@ -59,7 +71,7 @@
 #include <sys/resource.h>
 #include <unistd.h>
 
-void execute(std::string command, Kmeans *algorithm, Dataset const *x, unsigned short k, unsigned short const *assignment,
+eval_result* execute(std::string command, Kmeans *algorithm, Dataset const *x, unsigned short k, unsigned short const *assignment,
         int xcNdx, int numThreads, int maxIterations,
         std::vector<int> *numItersHistory
         #ifdef MONITOR_ACCURACY
@@ -84,21 +96,10 @@ int main(int argc, char **argv) {
 
     int numThreads = 1;
     int maxIterations = std::numeric_limits<int>::max();
+	int repeats = 1;
 
     // Print header row
-    std::cout << std::setw(35) << "algorithm" << "\t"
-              << std::setw(5) << "iters" << "\t"
-              << std::setw(10) << "numThreads" << "\t"
-              << std::setw(10) << "cpu_secs" << "\t"
-              << std::setw(10) << "wall_secs" << "\t"
-              << std::setw(8) << "MB"
-              #ifdef MONITOR_ACCURACY
-              << "\t" << std::setw(8) << "sse"
-              #endif
-              #ifdef COUNT_DISTANCES
-              << "\t" << std::setw(11) << "#distances"
-              #endif
-              << std::endl;
+    print_header_row();
 
     // Read the command file
     for (std::string command; std::cin >> command; ) {
@@ -115,6 +116,8 @@ int main(int argc, char **argv) {
             if (maxIterations < 0) {
                 maxIterations = std::numeric_limits<int>::max();
             }
+        } else if (command == "repeats") {
+            std::cin >> repeats;
         } else if (command == "dataset" || command == "data") {
             xcNdx++;
 
@@ -161,7 +164,7 @@ int main(int argc, char **argv) {
             if (method == "kpp" || method == "kmeansplusplus") {
                 // Perform k-means++ initialization
                 std::cout << "initializing with kmeans++: k = " << k << std::endl;
-                c = init_centers_kmeanspp_v2(*x, k);
+                c = init_centers_kmeanspp(*x, k);
             } else if (method == "random") {
                 // Perform random initialization
                 std::cout << "initializing with random:  k = " << k << std::endl;
@@ -184,10 +187,26 @@ int main(int argc, char **argv) {
             algorithm = new NaiveKmeans();
         } else if (command == "hamerly") {
             algorithm = new HamerlyKmeans();
+        } else if (command == "hamerlymodified") {
+			algorithm = new HamerlyKmeansModified();
+		} else if (command == "hamerlyneighbors") {
+			algorithm = new HamerlyKmeansNeighbors();
+		} else if (command == "hamerlyneighbors1st") {
+            algorithm = new HamerlyKmeansNeighbors1st();
+        } else if (command == "hamerlyneighborsonly") {
+            algorithm = new HamerlyKmeansNeighborsOnly();
         } else if (command == "annulus" || command == "norm") {
             algorithm = new AnnulusKmeans();
+        } else if (command == "annulusmodified") {
+            algorithm = new AnnulusKmeansModified();
         } else if (command == "elkan") {
             algorithm = new ElkanKmeans();
+        } else if (command == "elkanmodified") {
+			algorithm = new ElkanKmeansModified();
+		} else if (command == "elkanneighbors") {
+            algorithm = new ElkanKmeansNeighbors();
+        } else if (command == "elkanneighborsrel") {
+            algorithm = new ElkanKmeansNeighborsRel();
         } else if (command == "drake") {
             // Read the number of bounds
             int b;
@@ -215,6 +234,12 @@ int main(int argc, char **argv) {
             algorithm = new SortKmeans();
         } else if (command == "heap") {
             algorithm = new HeapKmeans();
+        } else if (command == "heapmodified") {
+			algorithm = new HeapKmeansModified();
+        } else if (command == "heapubarr") {
+            algorithm = new HeapKmeansUBarr();
+		} else if (command == "heapubarrneighbors") {
+            algorithm = new HeapKmeansUBarrNeighbors();
         } else if (command == "kernel" || command == "elkan_kernel") {
             std::string kernelType;
             std::cin >> kernelType;
@@ -252,11 +277,33 @@ int main(int argc, char **argv) {
         }
 
         if (algorithm) {
-            execute(command, algorithm, x, k, assignment, xcNdx, numThreads, maxIterations, &numItersHistory
-                    #ifdef MONITOR_ACCURACY
-                    , &sseHistory
-                    #endif
-                   );
+            if (repeats <= 1) {
+                execute(command, algorithm, x, k, assignment, xcNdx, numThreads, maxIterations, &numItersHistory
+                        #ifdef MONITOR_ACCURACY
+                        , &sseHistory
+                        #endif
+                        );
+            } else {
+                std::vector<eval_result*> results;
+                bool valid = true;
+                for (int i = 0; i < repeats; ++i) {
+                    eval_result* result = execute(command, algorithm, x, k, assignment, xcNdx, numThreads, maxIterations, &numItersHistory
+                            #ifdef MONITOR_ACCURACY
+                            , &sseHistory
+                            #endif
+                            );
+                    if (result == NULL) {
+                        valid = false;
+                        break;
+                    }
+					results.push_back(result);
+				}
+				if(valid) {
+					print_summary(&results);
+				}
+				for(unsigned i = 0; i < results.size(); ++i)
+					delete results.at(i);
+			}
             delete algorithm;
             algorithm = NULL;
         }
@@ -316,11 +363,11 @@ double elapsed_time(rusage *start) {
     return (double)diff.tv_sec + (double)diff.tv_usec / 1e6;
 }
 
-void execute(std::string command, Kmeans *algorithm, Dataset const *x, unsigned short k, unsigned short const *assignment,
-        int xcNdx,
-        int numThreads,
-        int maxIterations,
-        std::vector<int> *numItersHistory
+eval_result* execute(std::string command, Kmeans *algorithm, Dataset const *x, unsigned short k, unsigned short const *assignment,
+                     int xcNdx,
+                     int numThreads,
+                     int maxIterations,
+                     std::vector<int> *numItersHistory
         #ifdef MONITOR_ACCURACY
         , std::vector<double> *sseHistory
         #endif
@@ -328,11 +375,11 @@ void execute(std::string command, Kmeans *algorithm, Dataset const *x, unsigned
     // Check for missing initialization
     if (assignment == NULL) {
         std::cerr << "initialize centers first!" << std::endl;
-        return;
+        return NULL;
     }
     if (x == NULL) {
         std::cerr << "load a dataset first!\n" << std::endl;
-        return;
+        return NULL;
     }
 
     #ifdef COUNT_DISTANCES
@@ -350,30 +397,31 @@ void execute(std::string command, Kmeans *algorithm, Dataset const *x, unsigned
     unsigned short *workingAssignment = new unsigned short[x->n];
     std::copy(assignment, assignment + x->n, workingAssignment);
 
+    // here we will store the result of the run
+    eval_result* run_result = new eval_result();
+    run_result->num_threads = numThreads;
+
     // Time the execution and get the number of iterations
     rusage start_clustering_time = get_time();
     double start_clustering_wall_time = get_wall_time();
     algorithm->initialize(x, k, workingAssignment, numThreads);
-    int iterations = algorithm->run(maxIterations);
+    run_result->iterations = algorithm->run(maxIterations);
     #ifdef COUNT_DISTANCES
-    long long numDistances = algorithm->numDistances;
+    run_result->num_distances = algorithm->numDistances;
+    run_result->inner_products = algorithm->numInnerProducts;
+    run_result->assignment_changes = algorithm->assignmentChanges;
+    run_result->bounds_updates = algorithm->boundsUpdates;
     #endif
-    double cluster_time = elapsed_time(&start_clustering_time);
-    double cluster_wall_time = get_wall_time() - start_clustering_wall_time;
-
-    // Report results
-    std::cout << std::setw(5) << iterations << "\t";
-    std::cout << std::setw(10) << numThreads << "\t";
-    std::cout << std::setw(10) << cluster_time << "\t";
-    std::cout << std::setw(10) << cluster_wall_time << "\t";
-    std::cout << std::setw(8) << (getMemoryUsage() / 1024.0); // from kilo to mega
+    run_result->cluster_time = elapsed_time(&start_clustering_time);
+    run_result->cluster_wall_time = get_wall_time() - start_clustering_wall_time;
+    run_result->memory_usage = getMemoryUsage() / 1024.0;
+
     #ifdef MONITOR_ACCURACY
     {
-        double sse = algorithm->getSSE();
-        std::cout << "\t" << std::setw(11) << sse;
+        run_result->sse = algorithm->getSSE();
 
         // verification that we get the same distortion with different algorithms
-        while (sseHistory->size() <= (size_t)xcNdx) {
+        while (sseHistory->size() <= (size_t) xcNdx) {
             sseHistory->push_back(sse);
         }
         if (sse != sseHistory->back()) {
@@ -381,22 +429,25 @@ void execute(std::string command, Kmeans *algorithm, Dataset const *x, unsigned
         }
     }
     #endif
-    #ifdef COUNT_DISTANCES
-    {
-        std::cout << "\t" << std::setw(11) << numDistances;
-    }
-    #endif
+
+    // output the result
+    print_result(run_result);
 
     // verification that we get the same number of iterations with different algorithms
-    while (numItersHistory->size() <= (size_t)xcNdx) {
-        numItersHistory->push_back(iterations);
+    while (numItersHistory->size() <= (size_t) xcNdx) {
+        numItersHistory->push_back(run_result->iterations);
     }
-    if (iterations != numItersHistory->back()) {
-        std::cerr << "ERROR: iterations = " << iterations << " but last iterations was " << numItersHistory->back() << std::endl;
+    if (run_result->iterations != numItersHistory->back()) {
+        std::cerr << "ERROR: iterations = " << run_result->iterations << " but last iterations was " << numItersHistory->back() << std::endl;
     }
 
     std::cout << std::endl;
 
+    #ifdef ITERATION_STATS
+    print_iteration_stats(&(algorithm->stats));
+    #endif
+
     delete [] workingAssignment;
+    return run_result;
 }
 
diff --git a/fast_kmeans/elkan_kmeans_modified.cpp b/fast_kmeans/elkan_kmeans_modified.cpp
new file mode 100644
index 0000000..7e5392e
--- /dev/null
+++ b/fast_kmeans/elkan_kmeans_modified.cpp
@@ -0,0 +1,122 @@
+/* Authors: Greg Hamerly and Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ */
+
+#include "elkan_kmeans_modified.h"
+#include "general_functions.h"
+#include <cmath>
+
+void ElkanKmeansModified::calculate_lower_bound_update(int threadId) {
+    // big C is the point for that we calculate the update
+    for (int C = 0; C < k; ++C)
+        #ifdef USE_THREADS
+        if (C % numThreads == threadId)
+            #endif
+            // and small c is the other point that moved
+            for (int c = 0; c < k; ++c)
+                // ignore the diagonal of the matrix, the bound is not used
+                if (c != C) {
+                    // calculate the update and store it on the place
+                    double update = 0.0;
+                    if (centerMovement[c] != 0.0)
+                        update = calculate_update(C, c, true);
+                    lowerBoundUpdate[C * k + c] = update;
+                }
+}
+
+void ElkanKmeansModified::initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads) {
+    numLowerBounds = aK;
+    ModifiedUpdateTriangleBasedKmeans::initialize(aX, aK, initialAssignment, aNumThreads);
+
+    std::fill(lowerBoundUpdate, lowerBoundUpdate + k * k, 0.0);
+}
+
+int ElkanKmeansModified::runThread(int threadId, int maxIterations) {
+    int iterations = 0;
+
+    int startNdx = start(threadId);
+    int endNdx = end(threadId);
+
+    // here we need to calculate s & the centroid-centroid distances before the first iteration
+    // the remaining calls to this method are hidden by move_centers
+    update_s(threadId);
+
+    while ((iterations < maxIterations) && !converged) {
+        ++iterations;
+
+        synchronizeAllThreads();
+
+        for (int i = startNdx; i < endNdx; ++i) {
+            unsigned short closest = assignment[i];
+            bool r = true;
+
+            if (upper[i] <= s[closest]) {
+                continue;
+            }
+
+            for (int j = 0; j < k; ++j) {
+                if (j == closest) {
+                    continue;
+                }
+                if (upper[i] <= lower[i * k + j]) {
+                    continue;
+                }
+                if (upper[i] <= centerCenterDistDiv2[closest * k + j]) {
+                    continue;
+                }
+
+                // ELKAN 3(a)
+                if (r) {
+                    upper[i] = sqrt(pointCenterDist2(i, closest));
+                    lower[i * k + closest] = upper[i];
+                    r = false;
+                    if ((upper[i] <= lower[i * k + j]) || (upper[i] <= centerCenterDistDiv2[closest * k + j])) {
+                        continue;
+                    }
+                }
+
+                // ELKAN 3(b)
+                lower[i * k + j] = sqrt(pointCenterDist2(i, j));
+                if (lower[i * k + j] < upper[i]) {
+                    closest = j;
+                    upper[i] = lower[i * k + j];
+                }
+            }
+            if (assignment[i] != closest) {
+                changeAssignment(i, closest, threadId);
+            }
+        }
+
+        verifyAssignment(iterations, startNdx, endNdx);
+
+        // ELKAN 4, 5, AND 6
+        synchronizeAllThreads();
+        move_centers(threadId);
+
+        synchronizeAllThreads();
+        if (!converged) {
+            update_bounds(startNdx, endNdx);
+        }
+        synchronizeAllThreads();
+    }
+
+    return iterations;
+}
+
+void ElkanKmeansModified::update_bounds(int startNdx, int endNdx) {
+    #ifdef COUNT_DISTANCES
+    for (int i = 0; i < k; ++i)
+        for (int j = 0; j < numLowerBounds; ++j) {
+            boundsUpdates += ((double) clusterSize[0][i]) * (lowerBoundUpdate[i * numLowerBounds + j]);
+        }
+    #endif
+    for (int i = startNdx; i < endNdx; ++i) {
+        upper[i] += centerMovement[assignment[i]];
+        for (int j = 0; j < k; ++j) {
+            // each lower bound is updated by specified value, not by the center movement
+            lower[i * numLowerBounds + j] -= lowerBoundUpdate[assignment[i] * numLowerBounds + j];
+        }
+    }
+}
\ No newline at end of file
diff --git a/fast_kmeans/elkan_kmeans_modified.h b/fast_kmeans/elkan_kmeans_modified.h
new file mode 100644
index 0000000..426ed05
--- /dev/null
+++ b/fast_kmeans/elkan_kmeans_modified.h
@@ -0,0 +1,45 @@
+#ifndef ELKAN_KMEANS_MODIFIED_H
+#define ELKAN_KMEANS_MODIFIED_H
+
+/* Authors: Greg Hamerly, Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ *
+ * Elkan's k-means algorithm that uses k lower bounds per point to prune
+ * distance calculations.
+ * 
+ * This adds to the Elkan's algorithm tighter lower bound udpate.
+ */
+
+#include "modified_update_triangle_based_kmeans.h"
+
+class ElkanKmeansModified : public ModifiedUpdateTriangleBasedKmeans {
+public:
+
+	ElkanKmeansModified() {};
+
+	virtual ~ElkanKmeansModified() {
+		free();
+	}
+
+	virtual void initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads);
+
+	virtual std::string getName() const {
+		return "elkanmodified";
+	}
+
+protected:
+
+	// override this as we are calculating each bound explicitly
+	// this is different from the default implemntation
+	virtual void calculate_lower_bound_update(int threadId);
+
+	virtual int runThread(int threadId, int maxIterations);
+
+	// Update the upper and lower bounds for the range of points given.
+	void update_bounds(int startNdx, int endNdx);
+
+};
+
+#endif
diff --git a/fast_kmeans/elkan_kmeans_neighbors.cpp b/fast_kmeans/elkan_kmeans_neighbors.cpp
new file mode 100644
index 0000000..0dfa297
--- /dev/null
+++ b/fast_kmeans/elkan_kmeans_neighbors.cpp
@@ -0,0 +1,105 @@
+/* Authors: Greg Hamerly and Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ */
+
+#include "elkan_kmeans_neighbors.h"
+#include "general_functions.h"
+#include <cmath>
+
+int ElkanKmeansNeighbors::runThread(int threadId, int maxIterations) {
+    int iterations = 0;
+
+    int startNdx = start(threadId);
+    int endNdx = end(threadId);
+
+    // here we need to calculate s & the centroid-centroid distances before the first iteration
+    // the remaining calls to this method are hidden by move_centers
+    update_s(threadId);
+
+    while ((iterations < maxIterations) && !converged) {
+        ++iterations;
+
+        // now we need to filter neighbors so that we remain only with those
+        // that fulfil the stronger condition that is used for elkan_kmeans
+        if (iterations != 1)
+            filter_neighbors(threadId);
+
+        synchronizeAllThreads();
+
+        for (int i = startNdx; i < endNdx; ++i) {
+            unsigned short closest = assignment[i];
+            bool r = true;
+
+            if (upper[i] <= s[closest]) {
+                continue;
+            }
+
+            // iterate only over centroids that can be closest to some x in the dataset
+            for (int* ptr = neighbours[closest]; (*ptr) != -1; ++ptr) {
+                // now j has the same meaning as before
+                const int j = (*ptr);
+                if (upper[i] <= lower[i * k + j]) {
+                    continue;
+                }
+                if (upper[i] <= centerCenterDistDiv2[closest * k + j]) {
+                    continue;
+                }
+
+                // ELKAN 3(a)
+                if (r) {
+                    upper[i] = sqrt(pointCenterDist2(i, closest));
+                    lower[i * k + closest] = upper[i];
+                    r = false;
+                    if ((upper[i] <= lower[i * k + j]) || (upper[i] <= centerCenterDistDiv2[closest * k + j])) {
+                        continue;
+                    }
+                }
+
+                // ELKAN 3(b)
+                lower[i * k + j] = sqrt(pointCenterDist2(i, j));
+                if (lower[i * k + j] < upper[i]) {
+                    closest = j;
+                    upper[i] = lower[i * k + j];
+                }
+            }
+            if (assignment[i] != closest) {
+                changeAssignment(i, closest, threadId);
+            }
+        }
+
+        verifyAssignment(iterations, startNdx, endNdx);
+
+        // ELKAN 4, 5, AND 6
+        synchronizeAllThreads();
+        move_centers(threadId);
+
+        synchronizeAllThreads();
+        if (!converged) {
+            update_bounds(startNdx, endNdx);
+        }
+        synchronizeAllThreads();
+    }
+
+    return iterations;
+}
+
+void ElkanKmeansNeighbors::filter_neighbors(int threadId) {
+    for (int c = 0; c < k; ++c)
+        #ifdef USE_THREADS
+        if (c % numThreads == threadId)
+            #endif
+        {
+            // use the stronger condition without s[c] on the left
+            double boundOnOtherDistance = maxUpperBound[c] + centerMovement[c];
+            int neighboursPos = 0;
+            // fill the array in the similar manner as we did before
+            for (int C = 0; C < k; ++C) {
+                // select only centroids that fulfil the stronger condition
+                if (c != C && boundOnOtherDistance > centerCenterDistDiv2[c * k + C])
+                    neighbours[c][neighboursPos++] = C;
+            }
+            neighbours[c][neighboursPos] = -1;
+        }
+}
\ No newline at end of file
diff --git a/fast_kmeans/elkan_kmeans_neighbors.h b/fast_kmeans/elkan_kmeans_neighbors.h
new file mode 100644
index 0000000..6bd1c7a
--- /dev/null
+++ b/fast_kmeans/elkan_kmeans_neighbors.h
@@ -0,0 +1,41 @@
+#ifndef ELKAN_KMEANS_NEIGHBORS_H
+#define ELKAN_KMEANS_NEIGHBORS_H
+
+/* Authors: Greg Hamerly, Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ *
+ * Elkan's k-means algorithm that uses k lower bounds per point to prune
+ * distance calculations. This adds to the Elkan's algorithm tighter
+ * lower bound udpate and iteration over neighbors. Note that for Elkan's
+ * algorithm this condition is more strong than for other algorithms.
+ */
+
+#include "elkan_kmeans_modified.h"
+
+class ElkanKmeansNeighbors : public ElkanKmeansModified {
+public:
+
+	ElkanKmeansNeighbors() {};
+
+	virtual ~ElkanKmeansNeighbors() { free(); }
+
+	virtual std::string getName() const {
+		return "elkanneighbors";
+	}
+
+protected:
+
+	/*
+	 * This method filter neighbors so that they contain only points
+	 * that fulfil the stronger condition for elkan kmeans, i.e.
+	 * m(cj) > 1/2 \| ci - cj \|.
+	 */
+	void filter_neighbors(int threadId);
+	
+	virtual int runThread(int threadId, int maxIterations);
+
+};
+
+#endif
diff --git a/fast_kmeans/elkan_kmeans_neighbors_rel.cpp b/fast_kmeans/elkan_kmeans_neighbors_rel.cpp
new file mode 100644
index 0000000..c0e1a80
--- /dev/null
+++ b/fast_kmeans/elkan_kmeans_neighbors_rel.cpp
@@ -0,0 +1,138 @@
+/* Authors: Greg Hamerly and Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ */
+
+#include "elkan_kmeans_neighbors_rel.h"
+#include "general_functions.h"
+#include <cmath>
+
+void ElkanKmeansNeighborsRel::free() {
+    ElkanKmeansNeighbors::free();
+    delete [] upperRel;
+    delete [] lowerRel;
+    upperRel = NULL;
+    lowerRel = NULL;
+}
+
+/* This function initializes the the upper and lower bound arrays that
+ * are used for storing the sum of center movement.
+ */
+void ElkanKmeansNeighborsRel::initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads) {
+    ElkanKmeansNeighbors::initialize(aX, aK, initialAssignment, aNumThreads);
+    upperRel = new double[k];
+    std::fill(upperRel, upperRel + k, 0.0);
+    lowerRel = new double[k * k];
+    std::fill(lowerRel, lowerRel + k*k, 0.0);
+}
+
+void ElkanKmeansNeighborsRel::calculate_max_upper_bound(int threadId) {
+    ElkanKmeansNeighbors::calculate_max_upper_bound(threadId);
+    synchronizeAllThreads();
+    // do not forget that the upper bound array is smaller by upperRel
+    addVectors(maxUpperBound, upperRel, k);
+}
+
+int ElkanKmeansNeighborsRel::runThread(int threadId, int maxIterations) {
+    int iterations = 0;
+
+    int startNdx = start(threadId);
+    int endNdx = end(threadId);
+
+    // here we need to calculate s & the centroid-centroid distances before the first iteration
+    // the remaining calls to this method are hidden by move_centers
+    update_s(threadId);
+
+    while ((iterations < maxIterations) && !converged) {
+        ++iterations;
+
+        // now we need to filter neighbors so that we remain only with those
+        // that fulfil the stronger condition that is used for elkan_kmeans
+        if (iterations != 1)
+            filter_neighbors(threadId);
+
+        synchronizeAllThreads();
+
+        for (int i = startNdx; i < endNdx; ++i) {
+            unsigned short closest = assignment[i];
+            bool r = true;
+            // the true value of the upper bound
+            double upperI = upper[i] + upperRel[closest];
+
+            if (upperI <= s[closest]) {
+                continue;
+            }
+
+            // iterate only over centroids that can be closest to some x in the dataset
+            for (int* ptr = neighbours[closest]; (*ptr) != -1; ++ptr) {
+                // now j has the same meaning as before
+                int j = (*ptr);
+                // here is the true value of the lower bound
+                double lowerIJ = lower[i * k + j] - lowerRel[assignment[i] * k + j];
+                if (upperI <= lowerIJ) {
+                    continue;
+                }
+                if (upperI <= centerCenterDistDiv2[closest * k + j]) {
+                    continue;
+                }
+
+                // ELKAN 3(a)
+                if (r) {
+                    upperI = sqrt(pointCenterDist2(i, closest));
+                    // the lower bound has to be stored relative to the movement
+                    lower[i * k + closest] = upperI + lowerRel[assignment[i] * k + closest];
+                    r = false;
+                    if ((upperI <= lowerIJ) || (upperI <= centerCenterDistDiv2[closest * k + j])) {
+                        continue;
+                    }
+                }
+
+                // ELKAN 3(b)
+                lowerIJ = sqrt(pointCenterDist2(i, j));
+                if (lowerIJ < upperI) {
+                    closest = j;
+                    upperI = lowerIJ;
+                }
+                // agan, the lower bound needs to be stored relative to the movement
+                lower[i * k + j] = lowerIJ + lowerRel[assignment[i] * k + j];
+            }
+            // and the upper as well
+            upper[i] = upperI - upperRel[assignment[i]];
+            if (assignment[i] != closest) {
+                // update the upper bound so that it is relative to the new assignment
+                upper[i] += upperRel[assignment[i]] - upperRel[closest];
+                // do the same for all k lower bounds
+                for (int j = 0; j < k; ++j)
+                    lower[i * k + j] -= lowerRel[assignment[i] * k + j] - lowerRel[closest * k + j];
+                changeAssignment(i, closest, threadId);
+            }
+        }
+
+        verifyAssignment(iterations, startNdx, endNdx);
+
+        // ELKAN 4, 5, AND 6
+        synchronizeAllThreads();
+        move_centers(threadId);
+
+        synchronizeAllThreads();
+        if (threadId == 0 && !converged)
+            update_bounds(startNdx, endNdx);
+        synchronizeAllThreads();
+    }
+
+    return iterations;
+}
+
+void ElkanKmeansNeighborsRel::update_bounds(int startNdx, int endNdx) {
+    #ifdef COUNT_DISTANCES
+    for (int i = 0; i < k; ++i)
+        for (int j = 0; j < k; ++j) {
+            boundsUpdates += ((double) clusterSize[0][i]) * (lowerBoundUpdate[i * k + j]);
+        }
+    #endif
+
+    // only add the center movement to the upperRel and lower bound update to the lowerRel
+    addVectors(upperRel, centerMovement, k);
+    addVectors(lowerRel, lowerBoundUpdate, k * k);
+}
\ No newline at end of file
diff --git a/fast_kmeans/elkan_kmeans_neighbors_rel.h b/fast_kmeans/elkan_kmeans_neighbors_rel.h
new file mode 100644
index 0000000..4eab219
--- /dev/null
+++ b/fast_kmeans/elkan_kmeans_neighbors_rel.h
@@ -0,0 +1,52 @@
+#ifndef ELKAN_KMEANS_NEIGHBORS_REL_H
+#define ELKAN_KMEANS_NEIGHBORS_REL_H
+
+/* Authors: Greg Hamerly, Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ *
+ * Elkan's k-means algorithm that uses k lower bounds per point to prune
+ * distance calculations. This adds to the Elkan's algorithm tighter
+ * lower bound udpate, iteration over neighbors and also stores the upper/lower
+ * bounds relative to the center movements.
+ */
+
+#include "elkan_kmeans_neighbors.h"
+
+class ElkanKmeansNeighborsRel : public ElkanKmeansNeighbors {
+public:
+
+	ElkanKmeansNeighborsRel() : upperRel(NULL), lowerRel(NULL) {};
+
+	virtual ~ElkanKmeansNeighborsRel() { free(); }
+
+	virtual std::string getName() const {
+		return "elkanneighborsrel";
+	}
+	
+	virtual void free();
+	virtual void initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads);
+
+protected:
+
+	virtual int runThread(int threadId, int maxIterations);
+	
+	// now we have effectively to update k bounds, instead of n
+	virtual void update_bounds(int startNdx, int endNdx);
+	
+	// the maximum upper bound needs to be updated by the relative movement
+	virtual void calculate_max_upper_bound(int threadId);
+	
+	// in those two arrays we will store how much the upper/lower bound
+	// is changed so that we do not need to update them, but only the accumulated
+	// center movement
+	// the upper bound is relative to the movement of the assigned centroid
+	double * upperRel;
+	// and each lower bound is relative to the sum of all lower bound updates
+	// that were calculated for this bound
+	double * lowerRel;
+
+};
+
+#endif
diff --git a/fast_kmeans/general_functions.cpp b/fast_kmeans/general_functions.cpp
index 63b6cd1..7fba838 100644
--- a/fast_kmeans/general_functions.cpp
+++ b/fast_kmeans/general_functions.cpp
@@ -39,7 +39,6 @@ double distance2silent(double const *a, double const *b, int d) {
     return d2;
 }
 
-
 void centerDataset(Dataset *x) {
     double *xCentroid = new double[x->d];
 
@@ -55,7 +54,7 @@ void centerDataset(Dataset *x) {
     for (int d = 0; d < x->d; ++d) {
         xCentroid[d] /= x->n;
     }
-    
+
     // re-center the dataset
     const double *xEnd = x->data + x->n * x->d;
     for (double *xp = x->data; xp != xEnd; xp += x->d) {
@@ -79,7 +78,7 @@ Dataset *init_centers(Dataset const &x, unsigned short k) {
                     break;
                 }
             }
-        } while (! acceptable);
+        } while (!acceptable);
         double *cdp = c->data + i * x.d;
         memcpy(cdp, x.data + chosen_pts[i] * x.d, sizeof(double) * x.d);
         if (c->sumDataSquared) {
@@ -92,89 +91,17 @@ Dataset *init_centers(Dataset const &x, unsigned short k) {
     return c;
 }
 
-
 Dataset *init_centers_kmeanspp(Dataset const &x, unsigned short k) {
     int *chosen_pts = new int[k];
-    std::pair<double, int> *dist2 = new std::pair<double, int>[x.n];
-    double *distribution = new double[x.n];
-
-    // initialize dist2
-    for (int i = 0; i < x.n; ++i) {
-        dist2[i].first = std::numeric_limits<double>::max();
-        dist2[i].second = i;
-    }
-
-    // choose the first point randomly
-    int ndx = 1;
-    chosen_pts[ndx - 1] = rand() % x.n;
-
-    while (ndx < k) {
-        double sum_distribution = 0.0;
-        // look for the point that is furthest from any center
-        for (int i = 0; i < x.n; ++i) {
-            int example = dist2[i].second;
-            double d2 = 0.0, diff;
-            for (int j = 0; j < x.d; ++j) {
-                diff = x(example,j) - x(chosen_pts[ndx - 1],j);
-                d2 += diff * diff;
-            }
-            if (d2 < dist2[i].first) {
-                dist2[i].first = d2;
-            }
-
-            sum_distribution += dist2[i].first;
-        }
-
-        // sort the examples by their distance from centers
-        sort(dist2, dist2 + x.n);
-
-        // turn distribution into a CDF
-        distribution[0] = dist2[0].first / sum_distribution;
-        for (int i = 1; i < x.n; ++i) {
-            distribution[i] = distribution[i - 1] + dist2[i].first / sum_distribution;
-        }
-
-        // choose a random interval according to the new distribution
-        double r = (double)rand() / (double)RAND_MAX;
-        double *new_center_ptr = std::lower_bound(distribution, distribution + x.n, r);
-        int distribution_ndx = (int)(new_center_ptr - distribution);
-        chosen_pts[ndx] = dist2[distribution_ndx].second;
-        /*
-        cout << "chose " << distribution_ndx << " which is actually " 
-             << chosen_pts[ndx] << " with distance " 
-             << dist2[distribution_ndx].first << std::endl;
-             */
-
-        ++ndx;
-    }
-
-    Dataset *c = new Dataset(k, x.d);
-
-    for (int i = 0; i < k; ++i) {
-        double *cdp = c->data + i * x.d;
-        memcpy(cdp, x.data + chosen_pts[i] * x.d, sizeof(double) * x.d);
-        if (c->sumDataSquared) {
-            c->sumDataSquared[i] = std::inner_product(cdp, cdp + x.d, cdp, 0.0);
-        }
-    }
-
-    delete [] chosen_pts;
-    delete [] dist2;
-    delete [] distribution;
-
-    return c;
-}
-
-
-Dataset *init_centers_kmeanspp_v2(Dataset const &x, unsigned short k) {
-    int *chosen_pts = new int[k];
-    std::pair<double, int> *dist2 = new std::pair<double, int>[x.n];
+    double *dist2 = new double[x.n];
+    int *closest = new int[x.n]; // index of centroid that is closest to a point
+    // distances between new centroid and all others divided by 2
+    double *centroid_dist2_div4 = new double[k - 1];
 
     // initialize dist2
-    for (int i = 0; i < x.n; ++i) {
-        dist2[i].first = std::numeric_limits<double>::max();
-        dist2[i].second = i;
-    }
+    std::fill(dist2, dist2 + x.n, std::numeric_limits<double>::max());
+    std::fill(closest, closest + x.n, 0);
+    std::fill(centroid_dist2_div4, centroid_dist2_div4 + k - 1, 0.0);
 
     // choose the first point randomly
     int ndx = 1;
@@ -185,39 +112,44 @@ Dataset *init_centers_kmeanspp_v2(Dataset const &x, unsigned short k) {
         // look for the point that is furthest from any center
         double max_dist = 0.0;
         for (int i = 0; i < x.n; ++i) {
-            int example = dist2[i].second;
-            double d2 = 0.0, diff;
-            for (int j = 0; j < x.d; ++j) {
-                diff = x(example,j) - x(chosen_pts[ndx - 1],j);
-                d2 += diff * diff;
-            }
-            if (d2 < dist2[i].first) {
-                dist2[i].first = d2;
+            // we need to consider the new centroid (ndx) as closest to i
+            if (dist2[i] > centroid_dist2_div4[closest[i]]) {
+                double d2 = distance2silent(x.data + i * x.d, x.data + chosen_pts[ndx - 1] * x.d, x.d);
+
+                if (d2 < dist2[i]) {
+                    dist2[i] = d2;
+                    closest[i] = ndx - 1;
+                }
             }
 
-            if (dist2[i].first > max_dist) {
-                max_dist = dist2[i].first;
+            if (dist2[i] > max_dist) {
+                max_dist = dist2[i];
             }
 
-            sum_distribution += dist2[i].first;
+            sum_distribution += dist2[i];
         }
 
         bool unique = true;
 
         do {
             // choose a random interval according to the new distribution
-            double r = sum_distribution * (double)rand() / (double)RAND_MAX;
-            double sum_cdf = dist2[0].first;
+            double r = sum_distribution * (double) rand() / (double) RAND_MAX;
+            double sum_cdf = dist2[0];
             int cdf_ndx = 0;
             while (sum_cdf < r) {
-                sum_cdf += dist2[++cdf_ndx].first;
+                sum_cdf += dist2[++cdf_ndx];
             }
             chosen_pts[ndx] = cdf_ndx;
 
             for (int i = 0; i < ndx; ++i) {
                 unique = unique && (chosen_pts[ndx] != chosen_pts[i]);
             }
-        } while (! unique);
+        } while (!unique);
+
+        // calculate the squared distance between the new point and all others div 4
+        for (int i = 0; i < ndx; ++i) {
+            centroid_dist2_div4[i] = distance2silent(x.data + chosen_pts[i] * x.d, x.data + chosen_pts[ndx] * x.d, x.d) / 4.0;
+        }
 
         ++ndx;
     }
@@ -237,13 +169,12 @@ Dataset *init_centers_kmeanspp_v2(Dataset const &x, unsigned short k) {
     return c;
 }
 
-
 /**
  * in MB
  */
 double getMemoryUsage() {
     char buf[30];
-    snprintf(buf, 30, "/proc/%u/statm", (unsigned)getpid());
+    snprintf(buf, 30, "/proc/%u/statm", (unsigned) getpid());
     FILE* pf = fopen(buf, "r");
     unsigned int totalProgramSizeInPages = 0;
     unsigned int residentSetSizeInPages = 0;
@@ -253,17 +184,16 @@ double getMemoryUsage() {
             return 0.0;
         }
     }
-    
+
     fclose(pf);
     pf = NULL;
-    
+
     double sizeInKilobytes = residentSetSizeInPages * 4.0; // assumes 4096 byte page
     // getconf PAGESIZE
-    
+
     return sizeInKilobytes;
 }
 
-
 void assign(Dataset const &x, Dataset const &c, unsigned short *assignment) {
     for (int i = 0; i < x.n; ++i) {
         double shortestDist2 = std::numeric_limits<double>::max();
diff --git a/fast_kmeans/hamerly_kmeans_modified.cpp b/fast_kmeans/hamerly_kmeans_modified.cpp
new file mode 100644
index 0000000..f7ea2f9
--- /dev/null
+++ b/fast_kmeans/hamerly_kmeans_modified.cpp
@@ -0,0 +1,103 @@
+/* Authors: Greg Hamerly and Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ */
+
+#include "hamerly_kmeans_modified.h"
+#include "general_functions.h"
+#include <cmath>
+
+/* This class is extension to Hamerly's algorithm with tighter lower bound update.
+ *  - For information about Hamerly's algorithm see hamerly_kmeans.cpp
+ *  - For information about tighter lower bound update see
+ *    modified_update_triangle_based_kmeans.h.
+ */
+int HamerlyKmeansModified::runThread(int threadId, int maxIterations) {
+    int iterations = 0;
+
+    int startNdx = start(threadId);
+    int endNdx = end(threadId);
+
+    // here we need to calculate s & the centroid-centroid distances before the first iteration
+    // the remaining calls to this method are hidden by move_centers
+    update_s(threadId);
+    synchronizeAllThreads();
+
+    while ((iterations < maxIterations) && !converged) {
+        ++iterations;
+
+        for (int i = startNdx; i < endNdx; ++i) {
+            unsigned short closest = assignment[i];
+
+            double upper_comparison_bound = std::max(s[closest], lower[i]);
+
+            if (upper[i] <= upper_comparison_bound) {
+                continue;
+            }
+
+            double u2 = pointCenterDist2(i, closest);
+            upper[i] = sqrt(u2);
+
+            if (upper[i] <= upper_comparison_bound) {
+                continue;
+            }
+
+            double l2 = std::numeric_limits<double>::max(); // the squared lower bound
+            for (int j = 0; j < k; ++j) {
+                if (j == closest) {
+                    continue;
+                }
+                double dist2 = pointCenterDist2(i, j);
+
+                if (dist2 < u2) {
+                    l2 = u2;
+                    u2 = dist2;
+                    closest = j;
+                } else if (dist2 < l2) {
+                    l2 = dist2;
+                }
+            }
+
+            lower[i] = sqrt(l2);
+
+            if (assignment[i] != closest) {
+                upper[i] = sqrt(u2);
+                changeAssignment(i, closest, threadId);
+            }
+        }
+
+        verifyAssignment(iterations, startNdx, endNdx);
+
+        synchronizeAllThreads();
+        move_centers(threadId);
+
+        synchronizeAllThreads();
+
+        if (!converged) {
+            update_bounds(startNdx, endNdx);
+        }
+
+        synchronizeAllThreads();
+    }
+
+    return iterations;
+}
+
+void HamerlyKmeansModified::update_bounds(int startNdx, int endNdx) {
+    #ifdef COUNT_DISTANCES
+    for (int i = 0; i < k; ++i)
+        boundsUpdates += ((double) clusterSize[0][i]) * (lowerBoundUpdate[i]);
+    #endif
+
+    // update upper/lower bounds
+    for (int i = startNdx; i < endNdx; ++i) {
+        // the upper bound increases by the amount that its center moved 
+        upper[i] += centerMovement[assignment[i]];
+
+        // The lower bound decreases by the update that was calculated by
+        // the superclass
+        lower[i] -= lowerBoundUpdate[assignment[i]];
+    }
+}
+
diff --git a/fast_kmeans/hamerly_kmeans_modified.h b/fast_kmeans/hamerly_kmeans_modified.h
new file mode 100644
index 0000000..0579dec
--- /dev/null
+++ b/fast_kmeans/hamerly_kmeans_modified.h
@@ -0,0 +1,30 @@
+#ifndef HAMERLY_KMEANS_MODIFIED_H
+#define HAMERLY_KMEANS_MODIFIED_H
+
+/* Authors: Greg Hamerly, Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ *
+ * HamerlyKmeans implements Hamerly's k-means algorithm that uses one lower
+ * bound per point. This implementation adds to Hamerly's algorithm tighter
+ * lower bound update.
+ */
+
+#include "modified_update_triangle_based_kmeans.h"
+
+class HamerlyKmeansModified : public ModifiedUpdateTriangleBasedKmeans {
+    public:
+        HamerlyKmeansModified() { numLowerBounds = 1; }
+        virtual ~HamerlyKmeansModified() { free(); }
+        virtual std::string getName() const { return "hamerlymodified"; }
+
+    protected:
+        // Update the upper and lower bounds for the given range of points.
+        void update_bounds(int startNdx, int endNdx);
+
+        virtual int runThread(int threadId, int maxIterations);
+};
+
+#endif
+
diff --git a/fast_kmeans/hamerly_kmeans_neighbors.cpp b/fast_kmeans/hamerly_kmeans_neighbors.cpp
new file mode 100644
index 0000000..a1ae390
--- /dev/null
+++ b/fast_kmeans/hamerly_kmeans_neighbors.cpp
@@ -0,0 +1,84 @@
+/* Authors: Greg Hamerly and Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ */
+
+#include "hamerly_kmeans_neighbors.h"
+#include "general_functions.h"
+#include <cmath>
+
+/* This class implements Hamerly's algorithm together with tighter lower bound
+ * update and iteration over neighbors.
+ *  - For information about Hamerly's algorithm see hamerly_kmeans.cpp
+ *  - For information about tighter lower bound update and neighbors see
+ *    modified_update_triangle_based_kmeans.h.
+ */
+int HamerlyKmeansNeighbors::runThread(int threadId, int maxIterations) {
+    int iterations = 0;
+
+    int startNdx = start(threadId);
+    int endNdx = end(threadId);
+
+    // here we need to calculate s & the centroid-centroid distances before the first iteration
+    // the remaining calls to this method are hidden by move_centers
+    update_s(threadId);
+    synchronizeAllThreads();
+
+    while ((iterations < maxIterations) && !converged) {
+        ++iterations;
+
+        for (int i = startNdx; i < endNdx; ++i) {
+            unsigned short closest = assignment[i];
+
+            double upper_comparison_bound = std::max(s[closest], lower[i]);
+
+            if (upper[i] <= upper_comparison_bound) {
+                continue;
+            }
+
+            double u2 = pointCenterDist2(i, closest);
+            upper[i] = sqrt(u2);
+
+            if (upper[i] <= upper_comparison_bound) {
+                continue;
+            }
+
+            double l2 = std::numeric_limits<double>::max(); // the squared lower bound
+            // iterate only over centroids that can be closest or second closest to some x in the dataset
+            for (int* ptr = neighbours[closest]; (*ptr) != -1; ++ptr) {
+                double dist2 = pointCenterDist2(i, (*ptr));
+
+                if (dist2 < u2) {
+                    l2 = u2;
+                    u2 = dist2;
+                    closest = (*ptr);
+                } else if (dist2 < l2) {
+                    l2 = dist2;
+                }
+            }
+
+            lower[i] = sqrt(l2);
+
+            if (assignment[i] != closest) {
+                upper[i] = sqrt(u2);
+                changeAssignment(i, closest, threadId);
+            }
+        }
+
+        verifyAssignment(iterations, startNdx, endNdx);
+
+        synchronizeAllThreads();
+        move_centers(threadId);
+
+        synchronizeAllThreads();
+
+        if (!converged) {
+            update_bounds(startNdx, endNdx);
+        }
+
+        synchronizeAllThreads();
+    }
+
+    return iterations;
+}
diff --git a/fast_kmeans/hamerly_kmeans_neighbors.h b/fast_kmeans/hamerly_kmeans_neighbors.h
new file mode 100644
index 0000000..5acdd4c
--- /dev/null
+++ b/fast_kmeans/hamerly_kmeans_neighbors.h
@@ -0,0 +1,29 @@
+#ifndef HAMERLY_KMEANS_NEIGHBORS_H
+#define HAMERLY_KMEANS_NEIGHBORS_H
+
+/* Authors: Greg Hamerly, Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ *
+ * HamerlyKmeans implements Hamerly's k-means algorithm that uses one lower
+ * bound per point.
+ * 
+ * This adds to the Hamerly's algorith tighter lower bound update and
+ * iteration over neighbours.
+ */
+
+#include "hamerly_kmeans_modified.h"
+
+class HamerlyKmeansNeighbors : public HamerlyKmeansModified {
+    public:
+        HamerlyKmeansNeighbors() {}
+        virtual ~HamerlyKmeansNeighbors() { free(); }
+        virtual std::string getName() const { return "hamerlyneighbors"; }
+
+    protected:
+        virtual int runThread(int threadId, int maxIterations);
+};
+
+#endif
+
diff --git a/fast_kmeans/hamerly_kmeans_neighbors1st.cpp b/fast_kmeans/hamerly_kmeans_neighbors1st.cpp
new file mode 100644
index 0000000..27042fe
--- /dev/null
+++ b/fast_kmeans/hamerly_kmeans_neighbors1st.cpp
@@ -0,0 +1,135 @@
+/* Authors: Greg Hamerly and Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ */
+
+#include "hamerly_kmeans_neighbors1st.h"
+#include "general_functions.h"
+#include <cmath>
+#include <iostream>
+#include <algorithm>
+
+/* This class implements Hamerly's algorithm together with tighter lower bound
+ * update and iteration over neighbors (including the first iteration).
+ *  - For information about Hamerly's algorithm see hamerly_kmeans.cpp
+ *  - For information about tighter lower bound update and neighbors see
+ *    modified_update_triangle_based_kmeans.h.
+ * The iteration over neighbors is based on the following:
+ *  - If the initial assignment is good and tight, we can reuse its results.
+ *  - We use the neighbors condition in the first iteration.
+ *  - To use it we evaluate the maximum upper bound.
+ *  - Therefore we calculate the upper bound separately.
+ */
+int HamerlyKmeansNeighbors1st::runThread(int threadId, int maxIterations) {
+    int iterations = 0;
+
+    int startNdx = start(threadId);
+    int endNdx = end(threadId);
+
+    // here we need to calculate s & the centroid-centroid distances before the first iteration
+    // the remaining calls to this method are hidden by move_centers
+    update_s(threadId);
+    synchronizeAllThreads();
+
+    ++iterations;
+
+    // calculate the distance between each point and its assigned centroid
+    // we would have to calculate this anyway
+    if (threadId == 0)
+        std::fill(maxUpperBound, maxUpperBound + k, 0.0);
+    #ifdef USE_THREADS
+    if (threadId == 0)
+        std::fill(maxUpperBoundAgg, maxUpperBoundAgg + k * numThreads, 0.0);
+    #endif
+    for (int i = startNdx; i < endNdx; ++i) {
+        unsigned short closest = assignment[i];
+        upper[i] = sqrt(pointCenterDist2(i, closest));
+
+        // also evaluate the maximum upper bound
+        #ifdef USE_THREADS
+        if (maxUpperBoundAgg[threadId * k + closest] < upper[i])
+            maxUpperBoundAgg[threadId * k + closest] = upper[i];
+        #else
+        if (upper[i] > maxUpperBound[closest])
+            maxUpperBound[closest] = upper[i];
+        #endif
+    }
+    #ifdef USE_THREADS
+    synchronizeAllThreads();
+    aggregate_maximum_upper_bound(threadId);
+    #endif
+    synchronizeAllThreads();
+
+    // now get the neighbours, but we cannot use the superclass as centroids have
+    // not moved yet, therefore there is no centerMovement
+    for (int C = 0; C < k; ++C)
+        #ifdef USE_THREADS
+        if (C % numThreads == threadId)
+            #endif
+        {
+            double boundOnOtherDistance = maxUpperBound[C] + s[C];
+            int neighboursPos = 0;
+            for (int c = 0; c < k; ++c) {
+                if (c != C && boundOnOtherDistance >= centerCenterDistDiv2[C * k + c])
+                    neighbours[C][neighboursPos++] = c;
+            }
+            neighbours[C][neighboursPos] = -1;
+        }
+    synchronizeAllThreads();
+
+    for (int i = startNdx; i < endNdx; ++i) {
+        unsigned short closest = assignment[i];
+        // check that we can skip the distance calculations
+        if (s[closest] >= upper[i]) {
+            // if so, initialize the lower bound
+            lower[i] = 2 * s[closest] - upper[i];
+            continue;
+        }
+
+        // we cannot tighten the upper bound - it is already tight, skip it therefore
+
+        double u2 = upper[i] * upper[i];
+        double l2 = std::numeric_limits<double>::max(); // the squared lower bound
+        // now iterate over the neighbors
+        for (int* ptr = neighbours[closest]; (*ptr) != -1; ++ptr) {
+            double dist2 = pointCenterDist2(i, (*ptr));
+
+            if (dist2 < u2) {
+                l2 = u2;
+                u2 = dist2;
+                closest = (*ptr);
+            } else if (dist2 < l2) {
+                l2 = dist2;
+            }
+        }
+
+        lower[i] = sqrt(l2);
+
+        if (assignment[i] != closest) {
+            upper[i] = sqrt(u2);
+            changeAssignment(i, closest, threadId);
+        }
+    }
+
+    verifyAssignment(iterations, startNdx, endNdx);
+
+    synchronizeAllThreads();
+    move_centers(threadId);
+
+    synchronizeAllThreads();
+
+    if (!converged) {
+        update_bounds(startNdx, endNdx);
+    }
+
+    synchronizeAllThreads();
+
+    // end of the first iteration
+
+    // ... now do everything in the same way as we did before
+    // therefore call the superclass
+    // note that this implementation costs us unnecessary k*k-k distance calculations
+    // on update_s() function - this k*(k-1) distances can be eliminated by the change
+    return HamerlyKmeansNeighbors::runThread(threadId, maxIterations - 1) + 1;
+}
diff --git a/fast_kmeans/hamerly_kmeans_neighbors1st.h b/fast_kmeans/hamerly_kmeans_neighbors1st.h
new file mode 100644
index 0000000..82189a2
--- /dev/null
+++ b/fast_kmeans/hamerly_kmeans_neighbors1st.h
@@ -0,0 +1,31 @@
+#ifndef HAMERLY_KMEANS_NEIGHBORS_1ST_H
+#define HAMERLY_KMEANS_NEIGHBORS_1ST_H
+
+/* Authors: Greg Hamerly, Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ *
+ * HamerlyKmeans implements Hamerly's k-means algorithm that uses one lower
+ * bound per point.
+ * 
+ * This adds to the Hamerly's algorith tighter lower bound update and
+ * iteration over neighbours. This iteration over neighbors is implemented
+ * also for the first iteration, which means some additional modifications
+ * to the runThread method.
+ */
+
+#include "hamerly_kmeans_neighbors.h"
+
+class HamerlyKmeansNeighbors1st : public HamerlyKmeansNeighbors {
+    public:
+        HamerlyKmeansNeighbors1st() {}
+        virtual ~HamerlyKmeansNeighbors1st() { free(); }
+        virtual std::string getName() const { return "hamerlyneighbors1st"; }
+
+    protected:
+        virtual int runThread(int threadId, int maxIterations);
+};
+
+#endif
+
diff --git a/fast_kmeans/hamerly_kmeans_neighbors_only.cpp b/fast_kmeans/hamerly_kmeans_neighbors_only.cpp
new file mode 100644
index 0000000..47e04a0
--- /dev/null
+++ b/fast_kmeans/hamerly_kmeans_neighbors_only.cpp
@@ -0,0 +1,55 @@
+/* Authors: Greg Hamerly and Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ */
+
+#include "hamerly_kmeans_neighbors_only.h"
+#include "general_functions.h"
+#include <cmath>
+
+void HamerlyKmeansNeighborsOnly::move_centers(int threadId) {
+    if (threadId == 0) {
+        // move the centers
+        int furthestMovingCenter = TriangleInequalityBaseKmeans::move_centers();
+        converged = (0.0 == centerMovement[furthestMovingCenter]);
+    }
+    synchronizeAllThreads();
+
+    if (!converged) {
+        // ... calculate the neighbors and the lower bound update given by the triangle inequality
+        update_s(threadId);
+        calculate_max_upper_bound(threadId);
+        synchronizeAllThreads();
+        for (int c = 0; c < k; ++c)
+            #ifdef USE_THREADS
+            if (c % numThreads == threadId)
+                #endif
+                calculate_neighbors(c);
+        synchronizeAllThreads();
+        if (threadId == 0)
+            calculate_lower_bound_update(threadId);
+    }
+}
+
+/* This method fills the lower bound update array in the same manner as it
+ * would be filled for the default Hamerly's kmeans implementation. The
+ * code is copied from hamerly_kmeans.cpp with small modification. */
+void HamerlyKmeansNeighborsOnly::calculate_lower_bound_update(int threadId) {
+    int furthestMovingCenter = 0;
+    double longest = centerMovement[furthestMovingCenter];
+    double secondLongest = 0.0;
+    for (int j = 0; j < k; ++j) {
+        if (longest < centerMovement[j]) {
+            secondLongest = longest;
+            longest = centerMovement[j];
+            furthestMovingCenter = j;
+        } else if (secondLongest < centerMovement[j]) {
+            secondLongest = centerMovement[j];
+        }
+    }
+
+    // store the update instead of updating the bound directly
+    for (int i = 0; i < k; ++i)
+        lowerBoundUpdate[i] = (i == furthestMovingCenter) ? secondLongest : longest;
+}
\ No newline at end of file
diff --git a/fast_kmeans/hamerly_kmeans_neighbors_only.h b/fast_kmeans/hamerly_kmeans_neighbors_only.h
new file mode 100644
index 0000000..5a8a391
--- /dev/null
+++ b/fast_kmeans/hamerly_kmeans_neighbors_only.h
@@ -0,0 +1,37 @@
+#ifndef HAMERLY_KMEANS_NEIGHBORS_ONLY_H
+#define HAMERLY_KMEANS_NEIGHBORS_ONLY_H
+
+/* Authors: Greg Hamerly and Jonathan Drake
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2014
+ *
+ * HamerlyKmeans implements Hamerly's k-means algorithm that uses one lower
+ * bound per point.
+ * 
+ * In this implementation the iteration over neighbors is implemented.
+ * - For information about neighbors see
+ *    modified_update_triangle_based_kmeans.h
+ *   note the the tighter lower bound update is not used.
+ */
+
+#include "hamerly_kmeans_neighbors.h"
+
+class HamerlyKmeansNeighborsOnly : public HamerlyKmeansNeighbors {
+    public:
+        HamerlyKmeansNeighborsOnly() {}
+        virtual ~HamerlyKmeansNeighborsOnly() { free(); }
+        virtual std::string getName() const { return "hamerlyneighborsonly"; }
+
+    protected:
+		// we have to override this method in order to skip the tighter lower
+		// bound update calculation
+		virtual void move_centers(int threadId);
+		
+		// here we need to consider only the centroid movement
+		// therefore we get the default update given by the triangle inequality
+		virtual void calculate_lower_bound_update(int threadId);
+};
+
+#endif
+
diff --git a/fast_kmeans/heap_kmeans_modified.cpp b/fast_kmeans/heap_kmeans_modified.cpp
new file mode 100644
index 0000000..ed7a39f
--- /dev/null
+++ b/fast_kmeans/heap_kmeans_modified.cpp
@@ -0,0 +1,138 @@
+/* Authors: Greg Hamerly and Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ */
+
+#include "modified_update_triangle_based_kmeans.h"
+#include "general_functions.h"
+#include "heap_kmeans_modified.h"
+#include <cmath>
+#include <algorithm>
+
+void HeapKmeansModified::free() {
+    ModifiedUpdateTriangleBasedKmeans::free();
+    for (int t = 0; t < numThreads; ++t) {
+        delete [] heaps[t];
+    }
+    delete [] heaps;
+    delete [] heapBounds;
+    heaps = NULL;
+    heapBounds = NULL;
+}
+
+void HeapKmeansModified::initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads) {
+    ModifiedUpdateTriangleBasedKmeans::initialize(aX, aK, initialAssignment, aNumThreads);
+
+    heaps = new Heap*[numThreads];
+    heapBounds = new double[k];
+
+    for (int t = 0; t < numThreads; ++t) {
+        heaps[t] = new Heap[k];
+        int startNdx = start(t);
+        int endNdx = end(t);
+        heaps[t][0].resize(endNdx - startNdx, std::make_pair(-1.0, 0));
+        for (int i = 0; i < endNdx - startNdx; ++i) {
+            heaps[t][0][i].second = i + startNdx;
+        }
+    }
+
+    std::fill(heapBounds, heapBounds + k, 0.0);
+    // start with zeros here
+    std::fill(maxUpperBound, maxUpperBound + k, 0.0);
+}
+
+int HeapKmeansModified::runThread(int threadId, int maxIterations) {
+    int iterations = 0;
+
+    std::greater<std::pair<double, int>> heapComp;
+
+    while ((iterations < maxIterations) && !converged) {
+        ++iterations;
+
+        for (int h = 0; h < k; ++h) {
+            Heap &heap = heaps[threadId][h];
+
+            while (heap.size() > 0) {
+                if (heapBounds[h] <= heap[0].first)
+                    break;
+
+                int i = heap[0].second;
+
+                std::pop_heap(heap.begin(), heap.end(), heapComp);
+                heap.pop_back();
+
+                unsigned short closest = assignment[i];
+                unsigned short nextClosest = 0;
+
+                double u2 = pointCenterDist2(i, closest);
+                double l2 = std::numeric_limits<double>::max();
+
+                for (unsigned short j = 0; j < k; ++j) {
+                    if (j == closest) continue;
+
+                    double dist2 = pointCenterDist2(i, j);
+                    if (dist2 < u2) {
+                        l2 = u2;
+                        u2 = dist2;
+                        nextClosest = closest;
+                        closest = j;
+                    } else if (dist2 < l2) {
+                        l2 = dist2;
+                        nextClosest = j;
+                    }
+                }
+
+                const double u = sqrt(u2);
+                const double bound = sqrt(l2) - u;
+
+                if ((bound == 0.0) && (nextClosest < closest)) {
+                    closest = nextClosest;
+                }
+
+                // save the maximum upper bound, should be active only in the first iteration and when assignment changes
+                if (u > maxUpperBound[closest])
+                    maxUpperBound[closest] = u;
+
+                if (closest != assignment[i]) {
+                    changeAssignment(i, closest, threadId);
+                }
+
+                Heap &newHeap = heaps[threadId][closest];
+                newHeap.push_back(std::make_pair(heapBounds[closest] + bound, i));
+                std::push_heap(newHeap.begin(), newHeap.end(), heapComp);
+            }
+        }
+
+        verifyAssignment(iterations, start(threadId), end(threadId));
+
+        synchronizeAllThreads();
+        move_centers(threadId);
+
+        synchronizeAllThreads();
+        if (threadId == 0 && !converged)
+            update_bounds();
+
+        synchronizeAllThreads();
+    }
+    return iterations;
+}
+
+void HeapKmeansModified::update_bounds() {
+    #ifdef COUNT_DISTANCES
+    for (int i = 0; i < k; ++i)
+        boundsUpdates += ((double) clusterSize[0][i]) * (lowerBoundUpdate[i]);
+    #endif
+    for (int j = 0; j < k; ++j) {
+        // this is the worst case by that the maximum upper bound can grow
+        maxUpperBound[j] += centerMovement[j];
+        heapBounds[j] += centerMovement[j];
+        // update the lower bound by the calculated tighter update
+        heapBounds[j] += lowerBoundUpdate[j];
+    }
+}
+
+// do not do anything, we will update maximum upper bound in update_bounds
+
+void HeapKmeansModified::calculate_max_upper_bound(int threadId) {
+}
diff --git a/fast_kmeans/heap_kmeans_modified.h b/fast_kmeans/heap_kmeans_modified.h
new file mode 100644
index 0000000..c5fe98b
--- /dev/null
+++ b/fast_kmeans/heap_kmeans_modified.h
@@ -0,0 +1,53 @@
+#ifndef HEAP_KMEANS_MODIFIED_H
+#define HEAP_KMEANS_MODIFIED_H
+
+/* Authors: Greg Hamerly, Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ *
+ * This class is extension to the HeapKmeans. It implement the tigher upper bound update
+ * that uses only estimate of m(c_i). To get more information about heap kmeans see
+ * heap_kmeans.cpp/h. To get information about the tighter update see
+ * modified_update_triangle_based_kmeans.h. The most of the code is inherited from the
+ * parent class or copied from the HeapKmeans.h.
+ */
+
+#include "modified_update_triangle_based_kmeans.h"
+#include <vector>
+
+typedef std::vector<std::pair<double, int> > Heap;
+
+class HeapKmeansModified : public ModifiedUpdateTriangleBasedKmeans {
+public:
+
+	HeapKmeansModified() : heaps(NULL), heapBounds(NULL) {
+	}
+
+	virtual ~HeapKmeansModified() { free(); }
+
+	virtual void initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads);
+	virtual void free();
+
+	virtual std::string getName() const {
+		return "heapmodified";
+	}
+
+
+protected:
+
+	virtual int runThread(int threadId, int maxIterations);
+	
+	void update_bounds();
+	
+	// this method is overridden, takes as maximum upper bound the
+	// maximum upper bound from the first iteration increased in
+	// each iteration by the center movement
+	virtual void calculate_max_upper_bound(int threadId);
+	
+	Heap **heaps;
+
+	double *heapBounds;
+};
+
+#endif
\ No newline at end of file
diff --git a/fast_kmeans/heap_kmeans_ubarr.cpp b/fast_kmeans/heap_kmeans_ubarr.cpp
new file mode 100644
index 0000000..0f41862
--- /dev/null
+++ b/fast_kmeans/heap_kmeans_ubarr.cpp
@@ -0,0 +1,216 @@
+/* Authors: Greg Hamerly and Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ */
+
+#include "heap_kmeans_ubarr.h"
+#include "general_functions.h"
+#include <cmath>
+#include <algorithm>
+
+void HeapKmeansUBarr::free() {
+    HeapKmeansModified::free();
+    for (int t = 0; t < numThreads; ++t) {
+        delete [] maxUBHeap[t];
+    }
+    delete[] maxUBHeap;
+    delete[] ubHeapBounds;
+    maxUBHeap = NULL;
+    ubHeapBounds = NULL;
+}
+
+/* Calculates the maximum upper bound over each cluster. This is achieved by
+ * looking onto the top of the heaps that contain the upper bound.
+ */
+void HeapKmeansUBarr::calculate_max_upper_bound(int threadId) {
+    for (int c = 0; c < k; ++c) {
+        Heap &heap = maxUBHeap[threadId][c];
+        while (heap.size() > 1) {
+            // look onto the top of the heap
+            // check that the point is still assigned to this cluter
+            if (c == assignment[heap[0].second]) {
+                // also the upper bound may be invalid - it may have been tigtened
+                if (upper[heap[0].second] == heap[0].first)
+                    break;
+
+                // if the entry is invalid, push a new one with the correct upper bound
+                heap.push_back(std::make_pair(upper[heap[0].second], heap[0].second));
+                std::push_heap(heap.begin(), heap.end());
+            }
+
+            // drop the outdated entries
+            std::pop_heap(heap.begin(), heap.end());
+            heap.pop_back();
+        }
+
+        // and do not forget that the upper bound is relative to the center movement history
+        #ifdef USE_THREADS
+        // remember that in multithreaded version the size of the heap may be zero
+        maxUpperBoundAgg[threadId * k + c] = heap.empty() ? -1.0 : heap[0].first + ubHeapBounds[c];
+        #else
+        maxUpperBound[c] = heap[0].first + ubHeapBounds[c];
+        #endif
+    }
+
+    #ifdef USE_THREADS
+    if (threadId == 0)
+        std::fill(maxUpperBound, maxUpperBound + k, 0.0);
+    synchronizeAllThreads();
+    aggregate_maximum_upper_bound(threadId);
+    #endif
+}
+
+void HeapKmeansUBarr::initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads) {
+    HeapKmeansModified::initialize(aX, aK, initialAssignment, aNumThreads);
+
+    maxUBHeap = new Heap*[numThreads];
+    ubHeapBounds = new double[k];
+
+    for (int t = 0; t < numThreads; ++t) {
+        maxUBHeap[t] = new Heap[k];
+    }
+    std::fill(ubHeapBounds, ubHeapBounds + k, 0.0);
+}
+
+int HeapKmeansUBarr::runThread(int threadId, int maxIterations) {
+    int iterations = 0;
+
+    std::greater<std::pair<double, int>> heapComp;
+
+    update_s(threadId);
+    synchronizeAllThreads();
+
+    while ((iterations < maxIterations) && !converged) {
+        ++iterations;
+
+        for (int h = 0; h < k; ++h) {
+            Heap &heap = heaps[threadId][h];
+
+            while (heap.size() > 0) {
+                if (heapBounds[h] <= heap[0].first) {
+                    break;
+                }
+
+                int i = heap[0].second;
+                unsigned short closest = assignment[i];
+                unsigned short nextClosest = 0;
+                double bound = heap[0].first - heapBounds[closest];
+
+                std::pop_heap(heap.begin(), heap.end(), heapComp);
+                heap.pop_back();
+
+                // calculate the real value of the upper bound
+                double u = upper[i] + ubHeapBounds[closest];
+                const double originalLower = bound + u;
+
+                // if the upper bound is less than s, closest, push the point to the heap
+                // with new key - using s[closest] to estimate the lower bound
+                // this is equivalent to tightening the upper bound in Hamerly's aglorithm
+                if (u <= s[closest]) { // note that this cannot happen in the 1st iteration (u = inf)
+                    const double newLower = heapBounds[closest] + 2 * (s[closest] - u);
+                    heap.push_back(std::make_pair(newLower, i));
+                    std::push_heap(heap.begin(), heap.end(), heapComp);
+                    continue;
+                }
+
+                double u2 = pointCenterDist2(i, closest);
+                u = sqrt(u2);
+
+                // the same condition as in the case of Hamerly's algorithm
+                if (u <= std::max(s[closest], originalLower) && iterations != 1) {
+                    upper[i] = u - ubHeapBounds[closest];
+                    const double newLowerUpper = heapBounds[closest] + std::max(originalLower, 2 * s[closest] - u) - u;
+                    heap.push_back(std::make_pair(newLowerUpper, i));
+                    std::push_heap(heap.begin(), heap.end(), heapComp);
+                    continue;
+                }                    // in the case of first iteration, store the upper bound in the heap
+                    // using s[closest] as estimate, if possible
+                else if (iterations == 1 && u < s[closest]) {
+                    upper[i] = u - ubHeapBounds[closest];
+                    Heap &newHeap = heaps[threadId][closest];
+                    const double newLower = heapBounds[closest] + 2 * (s[closest] - u);
+                    newHeap.push_back(std::make_pair(newLower, i));
+                    std::push_heap(newHeap.begin(), newHeap.end(), heapComp);
+
+                    // we have to store into the maxUBHeap as we need to ensure that
+                    // *all* points are in this heap with at least some upper bound
+                    Heap &ubHeap = maxUBHeap[threadId][closest];
+                    ubHeap.push_back(std::make_pair(u - ubHeapBounds[closest], i));
+                    std::push_heap(ubHeap.begin(), ubHeap.end());
+                    continue;
+                }
+
+                double l2 = std::numeric_limits<double>::max();
+
+                for (unsigned short j = 0; j < k; ++j) {
+                    if (j == closest) continue;
+
+                    double dist2 = pointCenterDist2(i, j);
+                    if (dist2 < u2) {
+                        l2 = u2;
+                        u2 = dist2;
+                        nextClosest = closest;
+                        closest = j;
+                    } else if (dist2 < l2) {
+                        l2 = dist2;
+                        nextClosest = j;
+                    }
+                }
+
+                u = sqrt(u2);
+                bound = sqrt(l2) - u;
+
+                if ((bound == 0.0) && (nextClosest < closest)) {
+                    closest = nextClosest;
+                }
+
+                if (closest != assignment[i] || iterations == 1) // iterations == 1 needed for points that are assigned correctly
+                {
+                    // we need to make sure that all points are in ubHeap after 1st iteration
+                    // or after the assignment change
+                    Heap &ubHeap = maxUBHeap[threadId][closest];
+                    ubHeap.push_back(std::make_pair(u - ubHeapBounds[closest], i));
+                    std::push_heap(ubHeap.begin(), ubHeap.end());
+                }
+
+                if (closest != assignment[i]) {
+                    changeAssignment(i, closest, threadId);
+                }
+
+                upper[i] = u - ubHeapBounds[closest];
+                Heap &newHeap = heaps[threadId][closest];
+                newHeap.push_back(std::make_pair(heapBounds[closest] + bound, i));
+                std::push_heap(newHeap.begin(), newHeap.end(), heapComp);
+            }
+        }
+
+        verifyAssignment(iterations, start(threadId), end(threadId));
+
+        synchronizeAllThreads();
+        move_centers(threadId);
+
+        synchronizeAllThreads();
+        if (threadId == 0 && !converged)
+            update_bounds();
+
+        synchronizeAllThreads();
+    }
+
+    return iterations;
+}
+
+void HeapKmeansUBarr::update_bounds() {
+    #ifdef COUNT_DISTANCES
+    for (int i = 0; i < k; ++i)
+        boundsUpdates += ((double) clusterSize[0][i]) * (lowerBoundUpdate[i]);
+    #endif
+    for (int j = 0; j < k; ++j) {
+        // this is by what grow the upper bounds
+        ubHeapBounds[j] += centerMovement[j];
+        heapBounds[j] += centerMovement[j];
+        heapBounds[j] += lowerBoundUpdate[j];
+    }
+}
+
+
diff --git a/fast_kmeans/heap_kmeans_ubarr.h b/fast_kmeans/heap_kmeans_ubarr.h
new file mode 100644
index 0000000..d462bdc
--- /dev/null
+++ b/fast_kmeans/heap_kmeans_ubarr.h
@@ -0,0 +1,59 @@
+#ifndef HEAP_KMEANS_UBARR_H
+#define HEAP_KMEANS_UBARR_H
+
+/* Authors: Greg Hamerly, Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ *
+ * HeapKmeansUBarr extends HeapKmeansModified. This algorithm enhances the
+ * heap algorithm by array with upper bound and also a heap for finding
+ * maximum over this array. As a result this algorithm can take advantage
+ * from tightening the upper bound and other advantages that Hamerly's algorithm
+ * have over the heap algorithm.
+ */
+
+#include "heap_kmeans_modified.h"
+#include <vector>
+
+class HeapKmeansUBarr : public HeapKmeansModified {
+public:
+
+	HeapKmeansUBarr() : maxUBHeap(NULL), ubHeapBounds(NULL) {}
+
+	virtual ~HeapKmeansUBarr() {
+		free();
+	}
+
+	virtual void initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads);
+	virtual void free();
+
+	virtual std::string getName() const {
+		return "heapubarr";
+	}
+
+
+protected:
+
+	virtual int runThread(int threadId, int maxIterations);
+	
+	void update_bounds();
+
+	// When we will calculate the maximum upper bound, we will use the values
+	// stored in the *maxUBHeap to find the maximum of the upper bound
+	virtual void calculate_max_upper_bound(int threadId);
+
+	// this will be set of k heaps, one for each centroid
+	// each heap will store a pair of upper bound and point index
+	// this allows us to find maximum over upper bound
+	Heap **maxUBHeap;
+	
+	// this array relativizes the upper bound, similarly as the
+	// heap key, so that we do not have to update all the n
+	// upper bounds, but updating k is enough
+	// ... note that the upper bound array is in parent class,
+	//     but it is not used there
+	double *ubHeapBounds;
+};
+
+#endif
\ No newline at end of file
diff --git a/fast_kmeans/heap_kmeans_ubarr_neighbors.cpp b/fast_kmeans/heap_kmeans_ubarr_neighbors.cpp
new file mode 100644
index 0000000..df2b5b5
--- /dev/null
+++ b/fast_kmeans/heap_kmeans_ubarr_neighbors.cpp
@@ -0,0 +1,128 @@
+/* Authors: Greg Hamerly and Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ */
+
+#include "heap_kmeans_ubarr_neighbors.h"
+#include "general_functions.h"
+#include <cmath>
+#include <algorithm>
+
+int HeapKmeansUBarrNeighbors::runThread(int threadId, int maxIterations) {
+    int iterations = 0;
+
+    std::greater<std::pair<double, int>> heapComp;
+
+    update_s(threadId);
+    synchronizeAllThreads();
+
+    while ((iterations < maxIterations) && !converged) {
+        ++iterations;
+
+        for (int h = 0; h < k; ++h) {
+            Heap &heap = heaps[threadId][h];
+
+            while (heap.size() > 0) {
+                if (heapBounds[h] <= heap[0].first) {
+                    break;
+                }
+
+                int i = heap[0].second;
+                unsigned short closest = assignment[i];
+                unsigned short nextClosest = 0;
+                double bound = heap[0].first - heapBounds[closest];
+
+                std::pop_heap(heap.begin(), heap.end(), heapComp);
+                heap.pop_back();
+
+                double u = upper[i] + ubHeapBounds[closest];
+                const double originalLower = bound + u;
+
+                if (u <= s[closest]) {
+                    const double newLower = heapBounds[closest] + 2 * (s[closest] - u);
+                    heap.push_back(std::make_pair(newLower, i));
+                    std::push_heap(heap.begin(), heap.end(), heapComp);
+                    continue;
+                }
+
+                double u2 = pointCenterDist2(i, closest);
+                u = sqrt(u2);
+
+                if (u <= std::max(s[closest], originalLower) && iterations != 1) {
+                    upper[i] = u - ubHeapBounds[closest];
+                    const double newLowerUpper = heapBounds[closest] + std::max(originalLower, 2 * s[closest] - u) - u;
+                    heap.push_back(std::make_pair(newLowerUpper, i));
+                    std::push_heap(heap.begin(), heap.end(), heapComp);
+                    continue;
+                } else if (iterations == 1 && u < s[closest]) {
+                    upper[i] = u - ubHeapBounds[closest];
+                    Heap &newHeap = heaps[threadId][closest];
+                    const double newLower = heapBounds[closest] + 2 * (s[closest] - u);
+                    newHeap.push_back(std::make_pair(newLower, i));
+                    std::push_heap(newHeap.begin(), newHeap.end(), heapComp);
+
+                    Heap &ubHeap = maxUBHeap[threadId][closest];
+                    ubHeap.push_back(std::make_pair(u - ubHeapBounds[closest], i));
+                    std::push_heap(ubHeap.begin(), ubHeap.end());
+                    continue;
+                }
+
+                double l2 = std::numeric_limits<double>::max();
+
+                // this is the difference from the parent class, we iterate only over selected centroids here
+                for (int* ptr = neighbours[closest]; (*ptr) != -1; ++ptr) {
+                    // now j has the same meaning as in the parent class
+                    int j = (*ptr);
+                    if (j == closest) continue;
+
+                    double dist2 = pointCenterDist2(i, j);
+                    if (dist2 < u2) {
+                        l2 = u2;
+                        u2 = dist2;
+                        nextClosest = closest;
+                        closest = j;
+                    } else if (dist2 < l2) {
+                        l2 = dist2;
+                        nextClosest = j;
+                    }
+                }
+
+                u = sqrt(u2);
+                bound = sqrt(l2) - u;
+
+                if ((bound == 0.0) && (nextClosest < closest)) {
+                    closest = nextClosest;
+                }
+
+                if (closest != assignment[i] || iterations == 1) {
+                    Heap &ubHeap = maxUBHeap[threadId][closest];
+                    ubHeap.push_back(std::make_pair(u - ubHeapBounds[closest], i));
+                    std::push_heap(ubHeap.begin(), ubHeap.end());
+                }
+
+                if (closest != assignment[i]) {
+                    changeAssignment(i, closest, threadId);
+                }
+
+                upper[i] = u - ubHeapBounds[closest];
+                Heap &newHeap = heaps[threadId][closest];
+                newHeap.push_back(std::make_pair(heapBounds[closest] + bound, i));
+                std::push_heap(newHeap.begin(), newHeap.end(), heapComp);
+            }
+        }
+
+        verifyAssignment(iterations, start(threadId), end(threadId));
+
+        synchronizeAllThreads();
+        move_centers(threadId);
+
+        synchronizeAllThreads();
+        if (threadId == 0 && !converged)
+            update_bounds();
+
+        synchronizeAllThreads();
+    }
+
+    return iterations;
+}
diff --git a/fast_kmeans/heap_kmeans_ubarr_neighbors.h b/fast_kmeans/heap_kmeans_ubarr_neighbors.h
new file mode 100644
index 0000000..3b1f93e
--- /dev/null
+++ b/fast_kmeans/heap_kmeans_ubarr_neighbors.h
@@ -0,0 +1,39 @@
+#ifndef HEAP_KMEANS_UBARR_NEIGHBORS_H
+#define HEAP_KMEANS_UBARR_NEIGHBORS_H
+
+/* Authors: Greg Hamerly, Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ *
+ * HeapKmeansUBarrNeighbors extends HeapKmeansModifiedUBarr. This algorithm
+ * enhances the the previous algorithm by iteration over neighbors, which
+ * allows it to eliminate centroids from the innermost loop. Everything is
+ * exactly same as in the parent class.
+ */
+
+#include "heap_kmeans_ubarr.h"
+#include <vector>
+
+typedef std::vector<std::pair<double, int> > Heap;
+
+class HeapKmeansUBarrNeighbors : public HeapKmeansUBarr {
+public:
+
+	HeapKmeansUBarrNeighbors() {}
+
+	virtual ~HeapKmeansUBarrNeighbors() {
+		free();
+	}
+
+	virtual std::string getName() const {
+		return "heapubarrneighbors";
+	}
+
+
+protected:
+
+	virtual int runThread(int threadId, int maxIterations);
+};
+
+#endif
\ No newline at end of file
diff --git a/fast_kmeans/kmeans.cpp b/fast_kmeans/kmeans.cpp
index dc7f719..6cc9157 100644
--- a/fast_kmeans/kmeans.cpp
+++ b/fast_kmeans/kmeans.cpp
@@ -10,9 +10,12 @@
 #include <cmath>
 
 Kmeans::Kmeans() : x(NULL), n(0), k(0), d(0), numThreads(0), converged(false),
-    clusterSize(NULL), centerMovement(NULL), assignment(NULL) {
+clusterSize(NULL), centerMovement(NULL), assignment(NULL) {
     #ifdef COUNT_DISTANCES
     numDistances = 0;
+    numInnerProducts = 0;
+    assignmentChanges = 0;
+    boundsUpdates = 0.0;
     #endif
 }
 
@@ -28,8 +31,6 @@ void Kmeans::free() {
     n = k = d = numThreads = 0;
 }
 
-
-
 void Kmeans::initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads) {
     free();
 
@@ -61,10 +62,16 @@ void Kmeans::initialize(Dataset const *aX, unsigned short aK, unsigned short *in
 
     #ifdef COUNT_DISTANCES
     numDistances = 0;
+    numInnerProducts = 0;
+    assignmentChanges = 0;
+    boundsUpdates = 0.0;
     #endif
 }
 
 void Kmeans::changeAssignment(int xIndex, int closestCluster, int threadId) {
+    #ifdef COUNT_DISTANCES
+    ++assignmentChanges;
+    #endif
     --clusterSize[threadId][assignment[xIndex]];
     ++clusterSize[threadId][closestCluster];
     assignment[xIndex] = closestCluster;
@@ -72,19 +79,19 @@ void Kmeans::changeAssignment(int xIndex, int closestCluster, int threadId) {
 
 #ifdef USE_THREADS
 struct ThreadInfo {
-    public:
-        ThreadInfo() : km(NULL), threadId(0), pthread_id(0) {}
-        Kmeans *km;
-        int threadId;
-        pthread_t pthread_id;
-        int numIterations;
-        int maxIterations;
+public:
+    ThreadInfo() : km(NULL), threadId(0), pthread_id(0) {}
+    Kmeans *km;
+    int threadId;
+    pthread_t pthread_id;
+    int numIterations;
+    int maxIterations;
 };
 #endif
 
 void *Kmeans::runner(void *args) {
     #ifdef USE_THREADS
-    ThreadInfo *ti = (ThreadInfo *)args;
+    ThreadInfo *ti = (ThreadInfo *) args;
     ti->numIterations = ti->km->runThread(ti->threadId, ti->maxIterations);
     #endif
     return NULL;
@@ -151,11 +158,11 @@ void Kmeans::verifyAssignment(int iteration, int startNdx, int endNdx) const {
         // the program
         if (closest != assignment[i]) {
             std::cerr << "assignment error:" << std::endl;
-            std::cerr << "iteration             = " << iteration              << std::endl;
-            std::cerr << "point index           = " << i                      << std::endl;
-            std::cerr << "closest center        = " << closest                << std::endl;
-            std::cerr << "closest center dist2  = " << closest_dist2          << std::endl;
-            std::cerr << "assigned center       = " << assignment[i]          << std::endl;
+            std::cerr << "iteration             = " << iteration << std::endl;
+            std::cerr << "point index           = " << i << std::endl;
+            std::cerr << "closest center        = " << closest << std::endl;
+            std::cerr << "closest center dist2  = " << closest_dist2 << std::endl;
+            std::cerr << "assigned center       = " << assignment[i] << std::endl;
             std::cerr << "assigned center dist2 = " << original_closest_dist2 << std::endl;
             assert(false);
         }
diff --git a/fast_kmeans/kmeans.h b/fast_kmeans/kmeans.h
index 25643e1..48b32c8 100644
--- a/fast_kmeans/kmeans.h
+++ b/fast_kmeans/kmeans.h
@@ -86,6 +86,9 @@ class Kmeans {
             #error Counting distances and using multiple threads is not supported.
             #endif
         mutable long long numDistances;
+        mutable long long numInnerProducts;
+        mutable long long assignmentChanges;
+        mutable double boundsUpdates;
         #endif
 
     protected:
diff --git a/fast_kmeans/modified_update_triangle_based_kmeans.cpp b/fast_kmeans/modified_update_triangle_based_kmeans.cpp
new file mode 100644
index 0000000..1636c0a
--- /dev/null
+++ b/fast_kmeans/modified_update_triangle_based_kmeans.cpp
@@ -0,0 +1,344 @@
+/* Authors: Greg Hamerly and Jonathan Drake and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ */
+
+#include "modified_update_triangle_based_kmeans.h"
+#include "general_functions.h"
+#include <cmath>
+#include <string.h>
+#include <algorithm>
+#include <functional>
+
+void ModifiedUpdateTriangleBasedKmeans::free() {
+    TriangleInequalityBaseKmeans::free();
+    centersByMovement.clear();
+    delete oldCenters;
+    delete [] lowerBoundUpdate;
+    delete [] maxUpperBound;
+    #ifdef USE_THREADS
+    delete [] maxUpperBoundAgg;
+    #endif
+    delete [] oldCentroidsNorm2;
+    delete [] centroidsNorm2;
+    delete [] oldNewCentroidInnerProduct;
+    delete [] centerCenterDistDiv2;
+    delete [] neighbours;
+    oldCenters = NULL;
+    lowerBoundUpdate = NULL;
+    maxUpperBound = NULL;
+    maxUpperBoundAgg = NULL;
+    oldCentroidsNorm2 = NULL;
+    centroidsNorm2 = NULL;
+    oldNewCentroidInnerProduct = NULL;
+    centerCenterDistDiv2 = NULL;
+    neighbours = NULL;
+}
+
+/* This method moves the newCenters to their new locations, based on the
+ * sufficient statistics in sumNewCenters. It also computes the centerMovement
+ * and the center that moved the furthest.
+ * 
+ * Here the implementation adds the lower bound update. This includes copying
+ * the old centroid locations and calculating the update. After this method
+ * the center-center distances are known, the s array is filled and the tighter
+ * lower bound update is calculated.
+ *
+ * Parameters: none
+ *
+ * Return value: none
+ */
+void ModifiedUpdateTriangleBasedKmeans::move_centers(int threadId) {
+    // copy the location of centers into oldCenters so that we know it
+    if (threadId == 0) {
+        memcpy(oldCenters->data, centers->data, sizeof (double) * (d * k));
+
+        // move the centers as we do in the original algorithms
+        int furthestMovingCenter = TriangleInequalityBaseKmeans::move_centers();
+        converged = (0.0 == centerMovement[furthestMovingCenter]);
+    }
+    synchronizeAllThreads();
+
+    // if not converged ...
+    if (!converged) {
+        // ... calculate the lower bound update
+        update_s(threadId); // first update center-center distances
+        calculate_max_upper_bound(threadId); // get m(c_i)
+        update_cached_inner_products(threadId); // update precalculated norms
+        synchronizeAllThreads();
+        calculate_lower_bound_update(threadId); // and get the update
+    }
+}
+
+void ModifiedUpdateTriangleBasedKmeans::update_cached_inner_products(int threadId) {
+    // copy the oldCentroids norms, we need to store this value
+    #ifdef USE_THREADS
+    if (threadId == 0)
+        #endif
+        memcpy(oldCentroidsNorm2, centroidsNorm2, sizeof (double) * k);
+    synchronizeAllThreads();
+
+    for (int c = 0; c < k; ++c) // for each centroid
+        #ifdef USE_THREADS
+        if (c % numThreads == threadId)
+            #endif
+        {
+            int offset = c*d;
+            // calculate norm of each centroid
+            centroidsNorm2[c] = inner_product(centers->data + offset, centers->data + offset);
+            // calculate the inner product of new and old location
+            oldNewCentroidInnerProduct[c] = inner_product(centers->data + offset, oldCenters->data + offset);
+        }
+
+    // sort centers by movemenet, use function center_movement_comparator_function as comparator
+    #ifdef USE_THREADS
+    if (threadId == 0)
+        #endif
+        std::sort(centersByMovement.begin(), centersByMovement.end(), std::bind(&ModifiedUpdateTriangleBasedKmeans::center_movement_comparator_function, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+/* This method does the aggregation of the updates so that we can
+ * use it in algorithms with single bound. It is overridden in modified
+ * versions of Elkan's algorithm.
+ */
+void ModifiedUpdateTriangleBasedKmeans::calculate_lower_bound_update(int threadId) {
+    // big C is the point for that we calculate the update
+    for (int C = 0; C < k; ++C)
+        #ifdef USE_THREADS
+        if (C % numThreads == threadId)
+            #endif
+        {
+            calculate_neighbors(C);
+
+            double maxUpdate = 0;
+
+            // iterate in decreasing order of centroid movement
+            // ... it is already stored this way in neighbors array
+            for (int* ptr = neighbours[C]; (*ptr) != -1; ++ptr) {
+                // and small c is the other point that moved
+                const int c = (*ptr);
+
+                // if all remaining centroids moved less than the current update, we do not
+                // need to consider them - the case of Hamerly's & heap
+                if (centerMovement[c] <= maxUpdate)
+                    break;
+
+                // calculate update and overwrite if it is bigger than the current value
+                double update = calculate_update(C, c);
+                if (update > maxUpdate)
+                    maxUpdate = update;
+            }
+
+            // store the tighter update for this cluster
+            lowerBoundUpdate[C] = maxUpdate;
+        }
+}
+
+// notion: C is the point c_i in the thesis c is the point c_j in the thesis
+
+double ModifiedUpdateTriangleBasedKmeans::calculate_update(const unsigned int C, const unsigned int c, bool consider_negative) {
+    // those values will be needed
+    const double cCInnerProduct = inner_product(oldCenters->data + c * d, oldCenters->data + C * d);
+    const double cPrimeCInnerProduct = inner_product(centers->data + c * d, oldCenters->data + C * d);
+    const double ccPrimeInnerProduct = oldNewCentroidInnerProduct[c];
+    const double cNorm2 = oldCentroidsNorm2[c];
+    const double cPrimeNorm2 = centroidsNorm2[c];
+    const double CNorm2 = oldCentroidsNorm2[C];
+
+    double maxUpperBoundC = maxUpperBound[C];
+    double cMovement = centerMovement[c];
+
+    // t, that specifies the projection of C onto c cPrime line (P(c_i) = c_j + t * (c_j' - c_j))
+    double factor = (cNorm2 - cCInnerProduct + cPrimeCInnerProduct - ccPrimeInnerProduct) / cMovement / cMovement;
+
+    // calculate the distance ||P(c_i)-c_i|| using the extended form, now only square
+    double distanceOfCFromLine = cNorm2 * (1 - factor) * (1 - factor)
+        + ccPrimeInnerProduct * 2 * factor * (1 - factor)
+        - cCInnerProduct * 2 * (1 - factor)
+        - 2 * factor * cPrimeCInnerProduct
+        + CNorm2
+        + factor * factor * cPrimeNorm2;
+    // rounding errors make this sometimes negative if the distance is low
+    // then this sqrt causes NaN - do abs value therefore
+    // ... from the definition this should never happen
+    if (distanceOfCFromLine < 0)
+        distanceOfCFromLine = -distanceOfCFromLine;
+    // calculate the distance
+    distanceOfCFromLine = sqrt(distanceOfCFromLine);
+
+    // project the y coordinate
+    double y = 1 - factor * 2;
+    // project the radius of the sphere around the cluster
+    double r = 2 * maxUpperBoundC / cMovement;
+
+    // here we will store the update divided by cMovement
+    double update;
+    // the case when the sphere with radius r around C goes through c-c' line
+    if (distanceOfCFromLine < maxUpperBoundC) {
+        // take the bottommost point where the sphere can be = bound by hyperplane
+        // perpendicular to the line through c and c'
+        update = r - y;
+        if (update >= 1.0) // this is too bad, triangle inequality gives us better result
+            return cMovement;
+            // put there zero, because sphere can be curved less than the hyperbola and therefore
+            // condition that while circle is above the hyperbola may be invalid
+            // bound therefore by a hyperplane that goes throught the origin and is perpendicular to c-c'
+        else if (update < 0)
+            return 0.0; // we do not need to scale zero by multiplying, return here
+        return update * cMovement;
+    }
+
+    // this is the case when the circle contains no point with a negative coordinate
+    if (y > r) {
+        if (!consider_negative)
+            return 0.0;
+        // Note that if the bottommost point of the sphere has y-coordinate
+        // from interval [0,1], we can use update 0 (update from line 213 would be >0)
+        if (y - r <= 1.0)
+            return 0.0;
+
+        // handle the negative update ... it is the same as decreasing y by 1,
+        // i.e. moving sphere by half center movement down
+        y -= 1.0;
+    }
+
+    // the case that fulfils conditions of Lemma 3.1 (or the case when y = y-1)
+    // encode the formula
+    const double x = 2 * distanceOfCFromLine / cMovement;
+    const double xSqPlusYSq = x * x + y*y;
+    const double aNorm = sqrt(xSqPlusYSq - r * r);
+    update = (x * r - y * aNorm) / xSqPlusYSq;
+
+    // multiply back by the movement of c
+    return update * cMovement;
+}
+
+void ModifiedUpdateTriangleBasedKmeans::calculate_max_upper_bound(int threadId) {
+    if (threadId == 0)
+        std::fill(maxUpperBound, maxUpperBound + k, 0.0);
+    #ifdef USE_THREADS
+    if (threadId == 0)
+        std::fill(maxUpperBoundAgg, maxUpperBoundAgg + k * numThreads, 0.0);
+    synchronizeAllThreads();
+    for (int i = 0; i < n; ++i)
+        if (maxUpperBoundAgg[threadId * k + assignment[i]] < upper[i])
+            maxUpperBoundAgg[threadId * k + assignment[i]] = upper[i];
+    synchronizeAllThreads();
+    aggregate_maximum_upper_bound(threadId);
+    #else
+    // calculate over the array of upper bound and find maximum for each cluster
+    for (int i = 0; i < n; ++i)
+        if (maxUpperBound[assignment[i]] < upper[i])
+            maxUpperBound[assignment[i]] = upper[i];
+    #endif
+}
+
+#ifdef USE_THREADS
+// for each cluster: get maximum upper bound for each thread and find maximum
+void ModifiedUpdateTriangleBasedKmeans::aggregate_maximum_upper_bound(int threadId) {
+    for (int c = 0; c < k; ++c)
+        if (c % numThreads == threadId)
+            for (int tId = 0; tId < numThreads; tId++)
+                if (maxUpperBound[c] < maxUpperBoundAgg[tId * k + c])
+                    maxUpperBound[c] = maxUpperBoundAgg[tId * k + c];
+}
+#endif
+
+bool ModifiedUpdateTriangleBasedKmeans::center_movement_comparator_function(int c1, int c2) {
+    return (centerMovement[c1] > centerMovement[c2]); // values must be decreaing
+}
+
+// copied from elkan_kmeans.cpp
+// finds the center-center distances
+
+void ModifiedUpdateTriangleBasedKmeans::update_s(int threadId) {
+    // find the inter-center distances
+    for (int c1 = 0; c1 < k; ++c1)
+        #ifdef USE_THREADS
+        if (c1 % numThreads == threadId)
+            #endif
+        {
+            s[c1] = std::numeric_limits<double>::max();
+
+            for (int c2 = 0; c2 < k; ++c2)
+                if (c1 != c2) {
+                    centerCenterDistDiv2[c1 * k + c2] = sqrt(centerCenterDist2(c1, c2)) / 2.0;
+
+                    if (centerCenterDistDiv2[c1 * k + c2] < s[c1])
+                        s[c1] = centerCenterDistDiv2[c1 * k + c2];
+                }
+        }
+}
+
+/* This function initializes the old centers, bound update, maxUpperBound,
+ * cached inner product, the vector of centroids sorted by movement, distances
+ * between centroids and also norms of centroids.
+ *
+ * Parameters: none
+ *
+ * Return value: none
+ */
+void ModifiedUpdateTriangleBasedKmeans::initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads) {
+    TriangleInequalityBaseKmeans::initialize(aX, aK, initialAssignment, aNumThreads);
+
+    oldCenters = new Dataset(k, d);
+
+    // set length k if heap (numLB = 0), otherwise k*numLowerBounds
+    lowerBoundUpdate = new double[k * (numLowerBounds == 0 ? 1 : numLowerBounds)];
+    maxUpperBound = new double[k];
+    #ifdef USE_THREADS
+    maxUpperBoundAgg = new double[k * aNumThreads];
+    #endif
+
+    // the cached inner products
+    oldCentroidsNorm2 = new double[k];
+    centroidsNorm2 = new double[k];
+    oldNewCentroidInnerProduct = new double[k];
+
+    centerCenterDistDiv2 = new double[k * k];
+    std::fill(centerCenterDistDiv2, centerCenterDistDiv2 + k * k, 0.0);
+
+    for (int c = 0; c < k; ++c) {
+        int offset = c*d;
+        // calcualte norms of initial centers
+        centroidsNorm2[c] = inner_product(centers->data + offset, centers->data + offset);
+        // let centers by movement contain initially 0-1-2-3-...-k, it will be sorted each iteration
+        centersByMovement.push_back(c);
+    }
+
+    // there we will store neighbouring clusters to each cluster
+    // in first iteration we have to go through all neighbours as all the upper bounds are invalid
+    neighbours = new int*[k];
+    for (int i = 0; i < k; ++i) {
+        neighbours[i] = new int[k];
+        int pos = 0;
+        for (int j = 0; j < k; ++j)
+            if (i != j)
+                neighbours[i][pos++] = j;
+        // place the termination sign, iteration will stop there
+        neighbours[i][k - 1] = -1;
+    }
+}
+
+/* Here we will calculate the neighbors of the cluster centered at C. */
+void ModifiedUpdateTriangleBasedKmeans::calculate_neighbors(const int C) {
+    // This is half of the bound on distance between center C and some
+    // other c. If || C-c || is greater than twice this, then c is not
+    // neighbor of C.
+    double boundOnOtherDistance = maxUpperBound[C] + s[C] + centerMovement[C];
+
+    // find out which clusters are neighbours of cluster C
+    int neighboursPos = 0;
+    for (int i = 0; i < k; ++i) {
+        // let them be sorted by movement, we need to go through in this order in the second loop
+        // so as to eliminate the updates calculations for Hamerly & heap
+        int c = centersByMovement[i];
+        // exclude also C from negihbors and check the condition
+        if (c != C && boundOnOtherDistance >= centerCenterDistDiv2[C * k + c])
+            neighbours[C][neighboursPos++] = c;
+    }
+    // place the stop mark
+    neighbours[C][neighboursPos] = -1;
+}
+
diff --git a/fast_kmeans/modified_update_triangle_based_kmeans.h b/fast_kmeans/modified_update_triangle_based_kmeans.h
new file mode 100644
index 0000000..c0f3b57
--- /dev/null
+++ b/fast_kmeans/modified_update_triangle_based_kmeans.h
@@ -0,0 +1,182 @@
+#ifndef MODIFIED_UPDATE_TRIANGLE_BASED_KMEANS_H
+#define MODIFIED_UPDATE_TRIANGLE_BASED_KMEANS_H
+
+/* Authors: Greg Hamerly and Petr Ryšavý
+ * Feedback: hamerly@cs.baylor.edu
+ * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
+ * Copyright 2015
+ *
+ * ModifiedUpdateTriangleBasedKmeans is a base class for kmeans algorithms
+ * that want to benefit from the tigher bound update, iteration over neighbors
+ * and other improvements proposed in (Ryšavý, Hamerly 2015). Goal of this class
+ * is to provide infrastructure that allows efficient calculations of this
+ * tigher update and neighbors set.
+ * 
+ * The idea about the tighter lower bound is the following:
+ *  - If we update the lower bound using the triangle inequality, we are too
+ *    pessimistic as the triangle inequality holds everywhere in the space.
+ *  - Therefore we bound the cluster into a sphere.
+ *  - And on the sphere we calculate the maximal update that will be needed
+ *    for the lower bound considering each centroid move.
+ * If we consider the neighbors set, the idea is the following:
+ *  - Again we bound the cluster in a sphere.
+ *  - We evaluate for each centroid a condition, which must be fulfilled if
+ *    the point is the closest or the second closest to some point in the
+ *    cluster.
+ *  - We can ignore the poitns that violate this condition from the innermost
+ *    loop in Hamerly's algorithm and also in the tighter update calculation.
+ */
+
+#include "triangle_inequality_base_kmeans.h"
+#include <numeric>
+#include <vector>
+
+class ModifiedUpdateTriangleBasedKmeans : public TriangleInequalityBaseKmeans {
+public:
+
+	ModifiedUpdateTriangleBasedKmeans() : oldCenters(NULL),
+	lowerBoundUpdate(NULL), maxUpperBound(NULL), maxUpperBoundAgg(NULL),
+	oldCentroidsNorm2(NULL), centroidsNorm2(NULL), oldNewCentroidInnerProduct(NULL),
+	centerCenterDistDiv2(NULL), neighbours(NULL) {
+	}
+
+	virtual ~ModifiedUpdateTriangleBasedKmeans() {
+		free();
+	}
+
+	virtual void initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads);
+	virtual void free();
+
+	/* This function claculated inner product of two vectors of dimension d.
+	 * The number of inner products is counted in the superclass.
+	 */
+	inline double inner_product(const double *a, double const *b) const {
+#ifdef COUNT_DISTANCES
+		++numInnerProducts;
+#endif
+		return std::inner_product(a, a + d, b, 0.0);
+	};
+
+
+protected:
+	/* This method is overloaded because we use it also for calculating the
+	 * center-center distances. They need to be stored so that we can use them
+	 * for the optimizations.
+	 */
+	virtual void update_s(int threadId);
+
+	/* Override how the centers are moved. Before the movement, we need to
+	 * backup the old location of centroids and after movement we need to
+	 * calculate the tigher upper bound update and the neighbors set.
+	 */
+	virtual void move_centers(int threadId);
+
+	/* Calculates the lower bound update assuming that all the cached values
+	 * are properly calculated. This, default implementation assumes a single
+	 * lower bound and therefore takes max update over centroids that fulfil
+	 * the neighbor condition. This is true for Hamerly's algorithm, the heap
+	 * algorithm and the annular algorithm. In the case of Elkan's algorithm
+	 * this method is implemented differently.
+	 */
+	virtual void calculate_lower_bound_update(int threadId);
+
+	/* Updates the cached inner products. We cache for each centroid its new
+	 * norm, its old norm and also the inner product of the old centroid location
+	 * and the new centroid location.
+	 */
+	void update_cached_inner_products(int threadId);
+
+	/* Calculates the maximum upper bound by simple iteration over upper
+	 * bound array.
+	 */
+	virtual void calculate_max_upper_bound(int threadId);
+
+	#ifdef USE_THREADS
+	/* This method is used in multithreaded version to collect the maximum upper
+	 * bound from the results that are produced by each thread. Each thread outputs
+	 * the maximum upper bound per cluster for points that it manages into the
+	 * maxUpperBoundAgg array. This method converts the result into maximumUpperBound
+	 * array. All threads must be synchronized before calling this method. */
+	void aggregate_maximum_upper_bound(int threadId);
+	#endif
+
+	/* Comarator of two points by their movement. The points that have moved
+	 * more will be first if the standard order is used.
+	 * 
+	 * Parameters: The indices of centroids that have moved to compare.
+	 */
+	bool center_movement_comparator_function(int c1, int c2);
+
+	/**
+	 * Calculates the tigher update.
+	 * 
+	 * Parameters:
+	 *  C the center whose update should be calculated
+	 *  c the center that moved
+	 *  consider_negative Should the calculation consider the negative value.
+	 *       This option should be true for Elkan kmeans. Negative update calculation
+	 *       costs a bit more, but gives tighter update.
+	 * Returns: tigher update of the lower bound of points assigned to C if we
+	 *       consider only centroid c
+	 */
+	double calculate_update(const unsigned int C, const unsigned int c, bool consider_negative = false);
+
+	/*
+	 * Calculates the neighbors of centroid C and fills the corresponding row
+	 * of the neighbors array. The neighbor array contains for each cluster
+	 * a list of centroids that can be second closest to any point of the cluster.
+	 * In this implementation we exclude from the list the cluster centroid.
+	 * 
+	 * Parameters:
+	 *  C the centroid for that we need to know the neighbors
+	 * Returns: nothing
+	 */
+	void calculate_neighbors(const int C);
+
+	/* Backup here the old location of the centers. */
+	Dataset *oldCenters;
+
+	/* Array for the lower bound update. Here we will store the more tighter
+	 * lower bound update.
+	 * 
+	 * Size is set automatically using the numLowerBounds * k.
+	 * When numLowerBounds is zero (the heap algorithm), then this has size of k.
+	 */
+	double *lowerBoundUpdate;
+
+	/* Array for storing maximum upper bound per cluster. */
+	double *maxUpperBound;
+
+	/* Temporary array used for finding the maximal upper bound using multiple threads. */
+	double *maxUpperBoundAgg;
+
+	/* Norm of the centroids in the last iteration. */
+	double *oldCentroidsNorm2;
+
+	/* Norm of the current centroids. */
+	double *centroidsNorm2;
+
+	/* Inner product of each centroid's new location with old location. */
+	double *oldNewCentroidInnerProduct;
+
+	/* Keep track of the distance (divided by 2) between each pair of
+	 * points. */
+	double *centerCenterDistDiv2;
+
+	/* Here we will store centroids sorted by how much they moved. This is used for
+	 * more effective calculation of the tighter lower bound update so that we can
+	 * skip some of the points from the iteration. */
+	std::vector<int> centersByMovement;
+
+	/* 
+	 * Here we will store the set of neighbors. It is k array of k arrays, which
+	 * contains the set of neighbors. For each centroid there is list of neighbors,
+	 * that ends by -1, which means end.
+	 * 
+	 * As this is needed for evaluating the tighter lower bound update, this array
+	 * is filled every time and it is algorithm's responsibility to use it or not.
+	 */
+	int** neighbours;
+};
+
+#endif
diff --git a/fast_kmeans/stats_functions.cpp b/fast_kmeans/stats_functions.cpp
new file mode 100644
index 0000000..20028ba
--- /dev/null
+++ b/fast_kmeans/stats_functions.cpp
@@ -0,0 +1,141 @@
+#include "stats_functions.h"
+#include <iostream>
+#include <iomanip>
+#include <math.h>
+#include <cstring>
+
+void stdev(eval_result* target, eval_result* sum, eval_result* sum_of_squares, int n);
+double stdev(int n, double sum_of_squares, double sum);
+eval_result* get_min(std::vector<eval_result*> *data);
+eval_result* get_max(std::vector<eval_result*> *data);
+eval_result* get_average(std::vector<eval_result*> *data);
+eval_result* get_stdev(std::vector<eval_result*> *data);
+
+void print_result(eval_result* result) {
+    // Report results
+    std::cout << std::setw(5) << result->iterations << "\t";
+    std::cout << std::setw(10) << result->num_threads << "\t";
+    std::cout << std::setw(10) << result->cluster_time << "\t";
+    std::cout << std::setw(10) << result->cluster_wall_time << "\t";
+    std::cout << std::setw(8) << result->memory_usage; // from kilo to mega
+    #ifdef MONITOR_ACCURACY
+    std::cout << "\t" << std::setw(8) << result->sse;
+    #endif
+    #ifdef COUNT_DISTANCES
+    std::cout << "\t" << std::setw(11) << result->num_distances;
+    std::cout << "\t" << std::setw(11) << result->inner_products;
+    std::cout << "\t" << std::setw(11) << result->assignment_changes;
+    std::cout << "\t" << std::setw(11) << result->bounds_updates;
+    #endif
+}
+
+void print_summary(std::vector<eval_result*>* results) {
+    std::cout << "---------------------------------------" << std::endl;
+    std::cout << std::setw(35) << "min\t";
+    eval_result* min = get_min(results);
+    print_result(min);
+    delete min;
+    std::cout << std::endl << std::setw(35) << "max\t";
+    eval_result* max = get_max(results);
+    print_result(max);
+    delete max;
+    std::cout << std::endl << std::setw(35) << "average\t";
+    eval_result* avg = get_average(results);
+    print_result(avg);
+    delete avg;
+    std::cout << std::endl << std::setw(35) << "standard deviation\t";
+    eval_result* stdev = get_stdev(results);
+    print_result(stdev);
+    delete stdev;
+    std::cout << std::endl << "---------------------------------------" << std::endl;
+}
+
+void print_header_row() {
+    std::cout << std::setw(35) << "algorithm" << "\t"
+            << std::setw(5) << "iters" << "\t"
+            << std::setw(10) << "numThreads" << "\t"
+            << std::setw(10) << "cpu_secs" << "\t"
+            << std::setw(10) << "wall_secs" << "\t"
+            << std::setw(8) << "MB"
+            #ifdef MONITOR_ACCURACY
+            << "\t" << std::setw(8) << "sse"
+            #endif
+            #ifdef COUNT_DISTANCES
+            << "\t" << std::setw(11) << "#distances"
+            << "\t" << std::setw(11) << "#i. products"
+            << "\t" << std::setw(11) << "#assig. changes"
+            << "\t" << std::setw(11) << "#boundsUpdts."
+            #endif
+            << std::endl;
+}
+
+void stdev(eval_result* target, eval_result* sum, eval_result* sum_of_squares, int n) {
+    target->iterations = stdev(n, sum_of_squares->iterations, sum->iterations);
+    target->num_threads = stdev(n, sum_of_squares->num_threads, sum->num_threads);
+    target->cluster_time = stdev(n, sum_of_squares->cluster_time, sum->cluster_time);
+    target->cluster_wall_time = stdev(n, sum_of_squares->cluster_wall_time, sum->cluster_wall_time);
+    target->memory_usage = stdev(n, sum_of_squares->memory_usage, sum->memory_usage);
+    #ifdef MONITOR_ACCURACY
+    target->sse = stdev(n, sum_of_squares->sse, sum->sse);
+    #endif
+    #ifdef COUNT_DISTANCES
+    target->num_distances = stdev(n, sum_of_squares->num_distances, sum->num_distances);
+    target->inner_products = stdev(n, sum_of_squares->inner_products, sum->inner_products);
+    target->assignment_changes = stdev(n, sum_of_squares->assignment_changes, sum->assignment_changes);
+    target->bounds_updates = stdev(n, sum_of_squares->bounds_updates, sum->bounds_updates);
+    #endif
+}
+
+double stdev(int n, double sum_of_squares, double sum) {
+    return sqrt(1.0 / n * sum_of_squares - 1.0 / n / n * sum * sum);
+}
+
+eval_result* get_min(std::vector<eval_result*> *data) {
+    eval_result* retval = new eval_result();
+    std::memcpy(retval, data->at(0), sizeof(eval_result));
+    for (unsigned int i = 1; i < data->size(); ++i) {
+        eval_result* result = data->at(i);
+        retval->min(result);
+    }
+    return retval;
+}
+
+eval_result* get_max(std::vector<eval_result*> *data) {
+    eval_result* retval = new eval_result();
+    std::memcpy(retval, data->at(0), sizeof(eval_result));
+    for (unsigned int i = 1; i < data->size(); ++i) {
+        eval_result* result = data->at(i);
+        retval->max(result);
+    }
+    return retval;
+}
+
+eval_result* get_average(std::vector<eval_result*> *data) {
+    eval_result* retval = new eval_result();
+    const unsigned int count = data->size();
+    for (unsigned int i = 0; i < count; ++i) {
+        eval_result* result = data->at(i);
+        retval->add(result);
+    }
+
+    retval->divide(count);
+    return retval;
+}
+
+eval_result* get_stdev(std::vector<eval_result*> *data) {
+    eval_result* retval = new eval_result();
+    eval_result* sum = new eval_result();
+    eval_result* sumSq = new eval_result();
+    const unsigned int count = data->size();
+    for (unsigned int i = 0; i < count; ++i) {
+        eval_result* result = data->at(i);
+        sum->add(result);
+        sumSq->add_square(result);
+    }
+
+    stdev(retval, sum, sumSq, count);
+
+    delete sum;
+    delete sumSq;
+    return retval;
+}
\ No newline at end of file
diff --git a/fast_kmeans/stats_functions.h b/fast_kmeans/stats_functions.h
new file mode 100644
index 0000000..0d7db56
--- /dev/null
+++ b/fast_kmeans/stats_functions.h
@@ -0,0 +1,130 @@
+/* 
+ * File:   stats_functions.h
+ * Author: Petr Rysavy
+ *
+ * Created on 16. August 2015, 12:57
+ */
+
+#ifndef STATS_FUNCTIONS_H
+#define	STATS_FUNCTIONS_H
+
+#include <vector>
+
+/* The result of execution of algorithm. */
+struct eval_result {
+    // number of iterations that were needed to reach the optimum
+    int iterations;
+    // number of threads used
+    int num_threads;
+    // CPU time needed for calculations
+    double cluster_time;
+    // wall time needed for calculations
+    double cluster_wall_time;
+    // memory usage
+    double memory_usage;
+    #ifdef MONITOR_ACCURACY
+    double sse;
+    #endif
+    #ifdef COUNT_DISTANCES
+    // number of distances evaluated by the algorithm
+    long long num_distances;
+    // number of inner products evaluated by the algorithm; does not include num_distances
+    long long inner_products;
+    // how many times did a point change its assignment
+    long long assignment_changes;
+    // total update of the bounds
+    double bounds_updates;
+    #endif
+
+    void add(const eval_result* a) {
+        iterations += a->iterations;
+        num_threads += a->num_threads;
+        cluster_time += a->cluster_time;
+        cluster_wall_time += a->cluster_wall_time;
+        memory_usage += a->memory_usage;
+        #ifdef MONITOR_ACCURACY
+        sse += a->sse;
+        #endif
+        #ifdef COUNT_DISTANCES
+        num_distances += a->num_distances;
+        inner_products += a->inner_products;
+        assignment_changes += a->assignment_changes;
+        bounds_updates += a->bounds_updates;
+        #endif
+    }
+
+    void divide(const int n) {
+        iterations = ((double) iterations) / n;
+        num_threads /= n;
+        cluster_time /= n;
+        cluster_wall_time /= n;
+        memory_usage /= n;
+        #ifdef MONITOR_ACCURACY
+        sse /= n;
+        #endif
+        #ifdef COUNT_DISTANCES
+        num_distances /= n;
+        inner_products /= n;
+        assignment_changes /= n;
+        bounds_updates /= n;
+        #endif
+    }
+
+    void min(const eval_result* a) {
+        iterations = std::min(iterations, a->iterations);
+        num_threads = std::min(num_threads, a->num_threads);
+        cluster_time = std::min(cluster_time, a->cluster_time);
+        cluster_wall_time = std::min(cluster_wall_time, a->cluster_wall_time);
+        memory_usage = std::min(memory_usage, a->memory_usage);
+        #ifdef MONITOR_ACCURACY
+        sse = std::min(sse, a->sse);
+        #endif
+        #ifdef COUNT_DISTANCES
+        num_distances = std::min(num_distances, a->num_distances);
+        inner_products = std::min(inner_products, a->inner_products);
+        assignment_changes = std::min(assignment_changes, a->assignment_changes);
+        bounds_updates = std::min(bounds_updates, a->bounds_updates);
+        #endif
+    }
+
+    void max(const eval_result* a) {
+        iterations = std::max(iterations, a->iterations);
+        num_threads = std::max(num_threads, a->num_threads);
+        cluster_time = std::max(cluster_time, a->cluster_time);
+        cluster_wall_time = std::max(cluster_wall_time, a->cluster_wall_time);
+        memory_usage = std::max(memory_usage, a->memory_usage);
+        #ifdef MONITOR_ACCURACY
+        sse = std::max(sse, a->sse);
+        #endif
+        #ifdef COUNT_DISTANCES
+        num_distances = std::max(num_distances, a->num_distances);
+        inner_products = std::max(inner_products, a->inner_products);
+        assignment_changes = std::max(assignment_changes, a->assignment_changes);
+        bounds_updates = std::max(bounds_updates, a->bounds_updates);
+        #endif
+    }
+
+    void add_square(const eval_result* a) {
+        iterations += a->iterations * a->iterations;
+        num_threads += a->num_threads * a->num_threads;
+        cluster_time += a->cluster_time * a->cluster_time;
+        cluster_wall_time += a->cluster_wall_time * a->cluster_wall_time;
+        memory_usage += a->memory_usage * a->memory_usage;
+        #ifdef MONITOR_ACCURACY
+        sse += a->sse * a->sse;
+        #endif
+        #ifdef COUNT_DISTANCES
+        num_distances += a->num_distances * a->num_distances;
+        inner_products += a->inner_products * a->inner_products;
+        assignment_changes += a->assignment_changes * a->assignment_changes;
+        bounds_updates += a->bounds_updates * a->bounds_updates;
+        #endif
+    }
+};
+
+void print_result(eval_result* result);
+void print_summary(std::vector<eval_result*>* results);
+void print_header_row();
+
+#endif	/* STATS_FUNCTIONS_H */
+