-
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathkmeans
More file actions
128 lines (106 loc) · 5.22 KB
/
kmeans
File metadata and controls
128 lines (106 loc) · 5.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
/*
Copyright (C) 2018-2024 Geoffrey Daniels. https://gpdaniels.com/
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, version 3 of the License only.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
#pragma once
#ifndef GTL_ALGORITHM_KMEANS_HPP
#define GTL_ALGORITHM_KMEANS_HPP
// Summary: Implementation of the KMeans clustering algorithm.
#ifndef NDEBUG
#if defined(_MSC_VER)
#define __builtin_trap() __debugbreak()
#endif
/// @brief A simple assert macro to break the program if the kmeans is misused.
/// @note The MESSAGE argument is documentation only, it is not evaluated or printed.
#define GTL_KMEANS_ASSERT(ASSERTION, MESSAGE) static_cast<void>((ASSERTION) || (__builtin_trap(), 0))
#else
/// @brief At release time the assert macro is implemented as a nop.
#define GTL_KMEANS_ASSERT(ASSERTION, MESSAGE) static_cast<void>(0)
#endif

#if defined(_MSC_VER)
#pragma warning(push, 0)
#endif

#include <cstddef>
#include <vector>

#if defined(_MSC_VER)
#pragma warning(pop)
#endif

namespace gtl {
    /// @brief The kmeans class implements the K-means clustering algorithm for partitioning data into k clusters.
    /// @tparam data_type The type of data points to be clustered. It must be default constructible and copyable,
    ///         and support `data_type + data_type` and `data_type / int`. A default constructed value is used as
    ///         the starting point of each per-cluster sum, so it is expected to behave as zero under addition.
    /// @tparam distance_type The type used for distance calculations. It must be constructible from `0` and
    ///         support `+=` and `<`.
    template <typename data_type, typename distance_type>
    class kmeans final {
    private:
        /// @brief Type alias for the distance function that calculates distance between two data points.
        using distance_function_type = distance_type(const data_type&, const data_type&);

    public:
        /// @brief Performs K-means clustering on the provided data.
        ///
        /// The centroids are seeded with the first @p cluster_count data points. Each iteration assigns every
        /// point to its nearest centroid (ties go to the lowest cluster index) and then moves every centroid to
        /// the mean of the points assigned to it. A cluster that ends up with no points keeps its previous
        /// centroid and does not contribute to the convergence measure.
        ///
        /// @param data The vector of data points to be clustered.
        /// @param cluster_count The number of clusters to create, must be less than the number of data points.
        /// @param max_iterations The maximum number of iterations to perform. If this is not positive no
        ///        iteration runs and every point is reported as belonging to cluster zero.
        /// @param min_delta Iteration stops once the summed centroid movement of an iteration is below this value.
        /// @param distance_function The function used to calculate distance between data points.
        /// @return A vector containing the cluster assignment for each data point. If @p cluster_count is zero or
        ///         larger than the number of data points (a misuse that asserts in debug builds) the returned
        ///         assignments are all zero and carry no meaning.
        static std::vector<std::size_t> compute(
            const std::vector<data_type>& data,
            std::size_t cluster_count,
            int max_iterations,
            distance_type min_delta,
            distance_function_type distance_function) {
            GTL_KMEANS_ASSERT(cluster_count < data.size(), "KMeans requires more data points than the number of clusters.");

            // Vector to store the cluster assignment for each data point.
            std::vector<std::size_t> cluster_data(data.size());

            // The assert above compiles away in release builds, so refuse inputs that would otherwise
            // index past the end of the data or of an empty centroid vector.
            if ((cluster_count == 0) || (cluster_count > data.size())) {
                return cluster_data;
            }

            // Vector to store the centroids of each cluster.
            std::vector<data_type> cluster_centroids(cluster_count);

            // Initialise cluster centroids using data (could be done using random points).
            for (std::size_t i = 0; i < cluster_count; ++i) {
                cluster_centroids[i] = data[i];
            }

            // Accumulators for the centroid update, allocated once and reset at the start of every iteration.
            std::vector<data_type> centroid_data_sum;
            std::vector<int> centroid_data_count;

            // Iteratively improve the clusters.
            for (int iteration = 0; iteration < max_iterations; ++iteration) {
                // Allocate points to clusters.
                for (std::size_t i = 0; i < data.size(); ++i) {
                    // Assume first cluster.
                    cluster_data[i] = 0;
                    distance_type cluster_distance = distance_function(data[i], cluster_centroids[0]);
                    // Search remaining clusters for closer centroids, strict comparison keeps the lowest index on ties.
                    for (std::size_t j = 1; j < cluster_count; ++j) {
                        distance_type distance = distance_function(data[i], cluster_centroids[j]);
                        if (distance < cluster_distance) {
                            cluster_data[i] = j;
                            cluster_distance = distance;
                        }
                    }
                }

                // Sum the points assigned to each cluster and count them.
                centroid_data_sum.assign(cluster_count, data_type());
                centroid_data_count.assign(cluster_count, 0);
                for (std::size_t i = 0; i < data.size(); ++i) {
                    const std::size_t cluster = cluster_data[i];
                    centroid_data_sum[cluster] = centroid_data_sum[cluster] + data[i];
                    ++centroid_data_count[cluster];
                }

                // Recalculate cluster centroids and measure how far they moved in total.
                distance_type delta = 0;
                for (std::size_t i = 0; i < cluster_count; ++i) {
                    // An empty cluster has no mean, dividing by its zero count would be undefined for integral
                    // data and would poison the centroid and delta with NaN for floating point data.
                    // Leave the centroid where it is so that it can still attract points in a later iteration.
                    if (centroid_data_count[i] == 0) {
                        continue;
                    }
                    data_type centroid = centroid_data_sum[i] / centroid_data_count[i];
                    delta += distance_function(cluster_centroids[i], centroid);
                    cluster_centroids[i] = centroid;
                }

                // Stop once the centroids have settled.
                if (delta < min_delta) {
                    break;
                }
            }

            return cluster_data;
        }
    };
}

#undef GTL_KMEANS_ASSERT
#endif // GTL_ALGORITHM_KMEANS_HPP