Skip to content

Commit b66a8d6

Browse files
authored
Merge pull request rapidsai#2882 from rapidsai/release/25.12
Forward-merge release/25.12 into main
2 parents 630efd7 + 4301fd5 commit b66a8d6

28 files changed

Lines changed: 3269 additions & 5 deletions

cpp/include/raft/cluster/detail/kmeans_deprecated.cuh

Lines changed: 990 additions & 0 deletions
Large diffs are not rendered by default.

cpp/include/raft/cluster/kmeans.cuh

Lines changed: 726 additions & 0 deletions
Large diffs are not rendered by default.

cpp/include/raft/cluster/kmeans_balanced.cuh

Lines changed: 360 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
#pragma once
6+
7+
#include <raft/cluster/detail/kmeans_deprecated.cuh>
8+
9+
namespace raft {
10+
namespace cluster {
11+
12+
/**
13+
* @brief Find clusters with k-means algorithm.
14+
* Initial centroids are chosen with k-means++ algorithm. Empty
15+
* clusters are reinitialized by choosing new centroids with
16+
* k-means++ algorithm.
17+
* @tparam index_type_t the type of data used for indexing.
18+
* @tparam value_type_t the type of data used for weights, distances.
19+
* @param handle the raft handle.
20+
* @param n Number of observation vectors.
21+
* @param d Dimension of observation vectors.
22+
* @param k Number of clusters.
23+
* @param tol Tolerance for convergence. k-means stops when the
24+
* change in residual divided by n is less than tol.
25+
* @param maxiter Maximum number of k-means iterations.
26+
* @param obs (Input, device memory, d*n entries) Observation
27+
* matrix. Matrix is stored column-major and each column is an
28+
* observation vector. Matrix dimensions are d x n.
29+
* @param codes (Output, device memory, n entries) Cluster
30+
* assignments.
31+
* @param residual On exit, residual sum of squares (sum of squares
32+
* of distances between observation vectors and centroids).
33+
* @param iters on exit, number of k-means iterations.
34+
* @param seed random seed to be used.
35+
* @return error flag
36+
*/
37+
template <typename index_type_t, typename value_type_t>
38+
int kmeans(raft::resources const& handle,
39+
index_type_t n,
40+
index_type_t d,
41+
index_type_t k,
42+
value_type_t tol,
43+
index_type_t maxiter,
44+
const value_type_t* __restrict__ obs,
45+
index_type_t* __restrict__ codes,
46+
value_type_t& residual,
47+
index_type_t& iters,
48+
unsigned long long seed = 123456)
49+
{
50+
return detail::kmeans<index_type_t, value_type_t>(
51+
handle, n, d, k, tol, maxiter, obs, codes, residual, iters, seed);
52+
}
53+
} // namespace cluster
54+
} // namespace raft
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
#pragma once
6+
7+
#include <raft/cluster/detail/single_linkage.cuh>
8+
#include <raft/cluster/single_linkage_types.hpp>
9+
#include <raft/core/device_mdspan.hpp>
10+
11+
namespace raft::cluster {
12+
13+
/**
14+
* Note: All of the functions below in the raft::cluster namespace are deprecated
15+
* and will be removed in a future release. Please use raft::cluster::hierarchy
16+
* instead.
17+
*/
18+
19+
/**
20+
* Single-linkage clustering, capable of constructing a KNN graph to
21+
* scale the algorithm beyond the n^2 memory consumption of implementations
22+
* that use the fully-connected graph of pairwise distances by connecting
23+
* a knn graph when k is not large enough to connect it.
24+
25+
* @tparam value_idx
26+
* @tparam value_t
27+
* @tparam dist_type method to use for constructing connectivities graph
28+
* @param[in] handle raft handle
29+
* @param[in] X dense input matrix in row-major layout
30+
* @param[in] m number of rows in X
31+
* @param[in] n number of columns in X
32+
* @param[in] metric distance metrix to use when constructing connectivities graph
33+
* @param[out] out struct containing output dendrogram and cluster assignments
34+
* @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect
35+
control
36+
* of k. The algorithm will set `k = log(n) + c`
37+
* @param[in] n_clusters number of clusters to assign data samples
38+
*/
39+
template <typename value_idx,
40+
typename value_t,
41+
LinkageDistance dist_type = LinkageDistance::KNN_GRAPH>
42+
[[deprecated("Use cuVS instead")]] void single_linkage(raft::resources const& handle,
43+
const value_t* X,
44+
value_idx m,
45+
value_idx n,
46+
raft::distance::DistanceType metric,
47+
linkage_output<value_idx>* out,
48+
int c,
49+
size_t n_clusters)
50+
{
51+
detail::single_linkage<value_idx, value_t, dist_type>(
52+
handle, X, m, n, metric, out, c, n_clusters);
53+
}
54+
}; // namespace raft::cluster
55+
56+
namespace raft::cluster::hierarchy {
57+
58+
constexpr int DEFAULT_CONST_C = 15;
59+
60+
/**
61+
* Single-linkage clustering, capable of constructing a KNN graph to
62+
* scale the algorithm beyond the n^2 memory consumption of implementations
63+
* that use the fully-connected graph of pairwise distances by connecting
64+
* a knn graph when k is not large enough to connect it.
65+
66+
* @tparam value_idx
67+
* @tparam value_t
68+
* @tparam dist_type method to use for constructing connectivities graph
69+
* @param[in] handle raft handle
70+
* @param[in] X dense input matrix in row-major layout
71+
* @param[out] dendrogram output dendrogram (size [n_rows - 1] * 2)
72+
* @param[out] labels output labels vector (size n_rows)
73+
* @param[in] metric distance metrix to use when constructing connectivities graph
74+
* @param[in] n_clusters number of clusters to assign data samples
75+
* @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect
76+
control of k. The algorithm will set `k = log(n) + c`
77+
*/
78+
template <typename value_t, typename idx_t, LinkageDistance dist_type = LinkageDistance::KNN_GRAPH>
79+
[[deprecated("Use cuVS instead")]] void single_linkage(
80+
raft::resources const& handle,
81+
raft::device_matrix_view<const value_t, idx_t, row_major> X,
82+
raft::device_matrix_view<idx_t, idx_t, row_major> dendrogram,
83+
raft::device_vector_view<idx_t, idx_t> labels,
84+
raft::distance::DistanceType metric,
85+
size_t n_clusters,
86+
std::optional<int> c = std::make_optional<int>(DEFAULT_CONST_C))
87+
{
88+
linkage_output<idx_t> out_arrs;
89+
out_arrs.children = dendrogram.data_handle();
90+
out_arrs.labels = labels.data_handle();
91+
92+
raft::cluster::single_linkage<idx_t, value_t, dist_type>(
93+
handle,
94+
X.data_handle(),
95+
X.extent(0),
96+
X.extent(1),
97+
metric,
98+
&out_arrs,
99+
c.has_value() ? c.value() : DEFAULT_CONST_C,
100+
n_clusters);
101+
}
102+
}; // namespace raft::cluster::hierarchy

cpp/include/raft/neighbors/brute_force_types.hpp

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,71 @@ struct index : ann::index {
7474
auto operator=(index&&) -> index& = default;
7575
~index() = default;
7676

77+
/** Construct a brute force index from dataset
78+
*
79+
* Constructs a brute force index from a dataset. This lets us precompute norms for
80+
* the dataset, providing a speed benefit over doing this at query time.
81+
82+
* If the dataset is already in GPU memory, then this class stores a non-owning reference to
83+
* the dataset. If the dataset is in host memory, it will be copied to the device and the
84+
* index will own the device memory.
85+
*/
86+
87+
template <typename data_accessor>
88+
[[deprecated("Use cuVS instead")]] index(
89+
raft::resources const& res,
90+
mdspan<const T, matrix_extent<int64_t>, row_major, data_accessor> dataset,
91+
std::optional<raft::device_vector<T, int64_t>>&& norms,
92+
raft::distance::DistanceType metric,
93+
T metric_arg = 0.0)
94+
: ann::index(),
95+
metric_(metric),
96+
dataset_(make_device_matrix<T, int64_t>(res, 0, 0)),
97+
norms_(std::move(norms)),
98+
metric_arg_(metric_arg)
99+
{
100+
if (norms_) { norms_view_ = make_const_mdspan(norms_.value().view()); }
101+
update_dataset(res, dataset);
102+
resource::sync_stream(res);
103+
}
104+
105+
/** Construct a brute force index from dataset
106+
*
107+
* This class stores a non-owning reference to the dataset and norms here.
108+
* Having precomputed norms gives us a performance advantage at query time.
109+
*/
110+
[[deprecated("Use cuVS instead")]] index(
  raft::resources const& res,
  raft::device_matrix_view<const T, int64_t, row_major> dataset_view,
  std::optional<raft::device_vector_view<const T, int64_t>> norms_view,
  raft::distance::DistanceType metric,
  T metric_arg = 0.0)
  : ann::index(),
    metric_(metric),
    // The owning matrix stays empty (0x0); only the caller's views are recorded.
    dataset_(make_device_matrix<T, int64_t>(res, 0, 0)),
    dataset_view_(dataset_view),
    norms_view_(norms_view),
    metric_arg_(metric_arg)
{
  // Nothing to do: every member is set in the initializer list.
}
124+
125+
template <typename data_accessor>
126+
[[deprecated("Use cuVS instead")]] index(
127+
raft::resources const& res,
128+
index_params const& params,
129+
mdspan<const T, matrix_extent<int64_t>, row_major, data_accessor> dataset,
130+
std::optional<raft::device_vector<T, int64_t>>&& norms = std::nullopt)
131+
: ann::index(),
132+
metric_(params.metric),
133+
dataset_(make_device_matrix<T, int64_t>(res, 0, 0)),
134+
norms_(std::move(norms)),
135+
metric_arg_(params.metric_arg)
136+
{
137+
if (norms_) { norms_view_ = make_const_mdspan(norms_.value().view()); }
138+
update_dataset(res, dataset);
139+
resource::sync_stream(res);
140+
}
141+
77142
/**
78143
* Replace the dataset with a new dataset.
79144
*/

cpp/include/raft/neighbors/cagra.cuh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,3 +374,12 @@ void search(raft::resources const& res,
374374
/** @} */ // end group cagra
375375

376376
} // namespace raft::neighbors::cagra
377+
378+
// TODO: Remove deprecated experimental namespace (removal was planned for the 23.12 release and is overdue)
379+
namespace raft::neighbors::experimental::cagra {
380+
using raft::neighbors::cagra::build;
381+
using raft::neighbors::cagra::build_knn_graph;
382+
using raft::neighbors::cagra::optimize;
383+
using raft::neighbors::cagra::search;
384+
using raft::neighbors::cagra::sort_knn_graph;
385+
} // namespace raft::neighbors::experimental::cagra

cpp/include/raft/neighbors/cagra_serialize.cuh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,3 +217,10 @@ index<T, IdxT> deserialize(raft::resources const& handle, const std::string& fil
217217
/**@}*/
218218

219219
} // namespace raft::neighbors::cagra
220+
221+
// TODO: Remove deprecated experimental namespace (removal was planned for the 23.12 release and is overdue)
222+
namespace raft::neighbors::experimental::cagra {
223+
using raft::neighbors::cagra::deserialize;
224+
using raft::neighbors::cagra::serialize;
225+
226+
} // namespace raft::neighbors::experimental::cagra

cpp/include/raft/neighbors/cagra_types.hpp

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,21 @@ struct index : ann::index {
154154
return graph_view_.extent(1);
155155
}
156156

157+
/**
158+
* DEPRECATED: please use data() instead.
159+
* If you need to query dataset dimensions, use the dim() and size() of the cagra index.
160+
* The data_handle() is not always available: you need to do a dynamic_cast to the expected
161+
* dataset type at runtime.
162+
*/
163+
[[nodiscard]] [[deprecated("Use data()")]] inline auto dataset() const noexcept
164+
-> device_matrix_view<const T, int64_t, layout_stride>
165+
{
166+
auto p = dynamic_cast<strided_dataset<T, int64_t>*>(dataset_.get());
167+
if (p != nullptr) { return p->view(); }
168+
auto d = dataset_->dim();
169+
return make_device_strided_matrix_view<const T, int64_t>(nullptr, 0, d, d);
170+
}
171+
157172
/** Dataset [size, dim] */
158173
[[nodiscard]] inline auto data() const noexcept -> const neighbors::dataset<int64_t>&
159174
{
@@ -174,6 +189,88 @@ struct index : ann::index {
174189
auto operator=(index&&) -> index& = default;
175190
~index() = default;
176191

192+
/** Construct an empty index. */
193+
/**
 * Construct an empty index: a 0x0 graph and an empty_dataset placeholder.
 *
 * @param[in] res raft resources used to allocate the (empty) graph matrix
 * @param[in] metric distance metric stored in the index (defaults to L2Expanded)
 */
[[deprecated("Use cuVS instead")]] index(
  raft::resources const& res,
  raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded)
  : ann::index(),
    metric_(metric),
    graph_(make_device_matrix<IdxT, int64_t>(res, 0, 0)),
    // make_unique instead of a naked `new`; the result converts to the
    // smart pointer type of dataset_.
    dataset_(std::make_unique<neighbors::empty_dataset<int64_t>>(0))
{
}
202+
203+
/** Construct an index from dataset and knn_graph arrays
204+
*
205+
* If the dataset and graph are already in GPU memory, then the index is just a thin wrapper around
206+
* these that stores a non-owning reference to the arrays.
207+
*
208+
* The constructor also accepts host arrays. In that case they are copied to the device, and the
209+
* device arrays will be owned by the index.
210+
*
211+
* In case the dataset rows are not 16-byte aligned, we create a padded copy in device
212+
* memory to ensure alignment for vectorized load.
213+
*
214+
* Usage examples:
215+
*
216+
* - Cagra index is normally created by the cagra::build
217+
* @code{.cpp}
218+
* using namespace raft::neighbors;
219+
* auto dataset = raft::make_host_matrix<float, int64_t>(n_rows, n_cols);
220+
* load_dataset(dataset.view());
221+
* // use default index parameters
222+
* cagra::index_params index_params;
223+
* // create and fill the index from a [N, D] dataset
224+
* auto index = cagra::build(res, index_params, dataset);
225+
* // use default search parameters
226+
* cagra::search_params search_params;
227+
* // search K nearest neighbours
228+
* auto neighbors = raft::make_device_matrix<uint32_t, int64_t>(res, n_queries, k);
229+
* auto distances = raft::make_device_matrix<float, int64_t>(res, n_queries, k);
230+
* cagra::search(res, search_params, index, queries, neighbors, distances);
231+
* @endcode
232+
* In the above example, we have passed a host dataset to build. The returned index will own a
233+
* device copy of the dataset and the knn_graph. In contrast, if we pass the dataset as a
234+
* device_mdspan to build, then it will only store a reference to it.
235+
*
236+
* - Constructing index using existing knn-graph
237+
* @code{.cpp}
238+
* using namespace raft::neighbors;
239+
*
240+
* auto dataset = raft::make_device_matrix<float, int64_t>(res, n_rows, n_cols);
241+
* auto knn_graph = raft::make_device_matrix<uint32_t, int64_t>(res, n_rows, graph_degree);
242+
*
243+
* // custom loading and graph creation
244+
* // load_dataset(dataset.view());
245+
* // create_knn_graph(knn_graph.view());
246+
*
247+
* // Wrap the existing device arrays into an index structure
248+
* cagra::index<T, IdxT> index(res, metric, raft::make_const_mdspan(dataset.view()),
249+
* raft::make_const_mdspan(knn_graph.view()));
250+
*
251+
* // Both knn_graph and dataset objects have to be in scope while the index is used because
252+
* // the index only stores a reference to these.
253+
* cagra::search(res, search_params, index, queries, neighbors, distances);
254+
* @endcode
255+
*
256+
*/
257+
template <typename data_accessor, typename graph_accessor>
258+
[[deprecated("Use cuVS instead")]] index(
259+
raft::resources const& res,
260+
raft::distance::DistanceType metric,
261+
mdspan<const T, matrix_extent<int64_t>, row_major, data_accessor> dataset,
262+
mdspan<const IdxT, matrix_extent<int64_t>, row_major, graph_accessor> knn_graph)
263+
: ann::index(),
264+
metric_(metric),
265+
graph_(make_device_matrix<IdxT, int64_t>(res, 0, 0)),
266+
dataset_(make_aligned_dataset(res, dataset, 16))
267+
{
268+
RAFT_EXPECTS(dataset.extent(0) == knn_graph.extent(0),
269+
"Dataset and knn_graph must have equal number of rows");
270+
update_graph(res, knn_graph);
271+
resource::sync_stream(res);
272+
}
273+
177274
/**
178275
* Replace the dataset with a new dataset.
179276
*
@@ -263,3 +360,13 @@ struct index : ann::index {
263360
/** @} */
264361

265362
} // namespace raft::neighbors::cagra
363+
364+
// TODO: Remove deprecated experimental namespace (removal was planned for the 23.12 release and is overdue)
365+
namespace raft::neighbors::experimental::cagra {
366+
using raft::neighbors::cagra::graph_build_algo;
367+
using raft::neighbors::cagra::hash_mode;
368+
using raft::neighbors::cagra::index;
369+
using raft::neighbors::cagra::index_params;
370+
using raft::neighbors::cagra::search_algo;
371+
using raft::neighbors::cagra::search_params;
372+
} // namespace raft::neighbors::experimental::cagra

cpp/include/raft/neighbors/hnsw_types.hpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,20 @@ struct search_params : ann::search_params {
3030
template <typename T>
3131
struct index : ann::index {
3232
public:
33+
/**
34+
* @brief load a base-layer-only hnswlib index originally saved from a built CAGRA index.
35+
* This is a virtual class and it cannot be used directly. To create an index, use the factory
36+
* function `raft::neighbors::hnsw::from_cagra` from the header
37+
* `raft/neighbors/hnsw.hpp`
38+
*
39+
* @param[in] dim dimensions of the training dataset
40+
* @param[in] metric distance metric to search. Supported metrics ("L2Expanded", "InnerProduct")
41+
*/
42+
[[deprecated("Use cuVS instead")]] index(int dim, raft::distance::DistanceType metric)
43+
: dim_{dim}, metric_{metric}
44+
{
45+
}
46+
3347
/**
3448
@brief Get underlying index
3549
*/

0 commit comments

Comments
 (0)