Skip to content

Commit 81532b6

Browse files
authored
PCA C and Python API (#1987)
Resolves #1977 and Resolves #1994 Authors: - Anupam (https://github.com/aamijar) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Micka (https://github.com/lowener) - Divye Gala (https://github.com/divyegala) URL: #1987
1 parent 60a7088 commit 81532b6

15 files changed

Lines changed: 1508 additions & 0 deletions

File tree

c/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ add_library(
9999
src/neighbors/refine.cpp
100100
src/neighbors/tiered_index.cpp
101101
src/neighbors/all_neighbors.cpp
102+
src/preprocessing/pca.cpp
102103
src/preprocessing/quantize/binary.cpp
103104
src/preprocessing/quantize/pq.cpp
104105
src/preprocessing/quantize/scalar.cpp

c/include/cuvs/core/all.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include <cuvs/neighbors/mg_ivf_pq.h>
3939
#endif
4040

41+
#include <cuvs/preprocessing/pca.h>
4142
#include <cuvs/preprocessing/quantize/binary.h>
4243
#include <cuvs/preprocessing/quantize/pq.h>
4344
#include <cuvs/preprocessing/quantize/scalar.h>

c/include/cuvs/preprocessing/pca.h

Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
#pragma once
7+
8+
#include <cuvs/core/c_api.h>
9+
#include <dlpack/dlpack.h>
10+
#include <stdbool.h>
11+
#include <stdint.h>
12+
13+
#ifdef __cplusplus
14+
extern "C" {
15+
#endif
16+
17+
/**
18+
* @defgroup preprocessing_c_pca C API for PCA (Principal Component Analysis)
19+
* @{
20+
*/
21+
22+
/**
23+
* @brief Solver algorithm used for the eigendecomposition step of PCA.
24+
*/
25+
enum cuvsPcaSolver {
26+
/** Eigendecomposition of the covariance matrix using a divide-and-conquer solver */
27+
CUVS_PCA_COV_EIG_DQ = 0,
28+
/** Eigendecomposition of the covariance matrix using a Jacobi solver */
29+
CUVS_PCA_COV_EIG_JACOBI = 1
30+
};
31+
32+
/**
33+
* @brief Parameters for PCA decomposition; cuvsPcaParamsCreate allocates one with default values.
34+
*/
35+
struct cuvsPcaParams {
36+
/** Number of principal components to keep. */
37+
int n_components;
38+
39+
/**
40+
* If false, the input passed to cuvsPcaFit is overwritten, so calling cuvsPcaFit and then
41+
* cuvsPcaTransform on that data will not give the expected results; use cuvsPcaFitTransform.
42+
*/
43+
bool copy;
44+
45+
/**
46+
* When true, the component vectors are multiplied by the square root of n_samples and then
47+
* divided by the singular values to ensure uncorrelated outputs with unit component-wise
48+
* variances.
49+
*/
50+
bool whiten;
51+
52+
/** Solver algorithm to use (one of the cuvsPcaSolver values). */
53+
enum cuvsPcaSolver algorithm;
54+
55+
/** Tolerance for singular values (used by Jacobi solver). */
56+
float tol;
57+
58+
/** Number of iterations for the power method (Jacobi solver). */
59+
int n_iterations;
60+
};
61+
62+
/** Pointer to a cuvsPcaParams struct; the parameter handle type taken by the PCA functions. */
typedef struct cuvsPcaParams* cuvsPcaParams_t;
63+
64+
/**
65+
* @brief Allocate PCA params and populate with default values.
66+
*
67+
* @param[out] params cuvsPcaParams_t to allocate; release it with cuvsPcaParamsDestroy
68+
* @return cuvsError_t
69+
*/
70+
cuvsError_t cuvsPcaParamsCreate(cuvsPcaParams_t* params);
71+
72+
/**
73+
* @brief De-allocate PCA params previously allocated with cuvsPcaParamsCreate.
74+
*
75+
* @param[in] params cuvsPcaParams_t to de-allocate
76+
* @return cuvsError_t
77+
*/
78+
cuvsError_t cuvsPcaParamsDestroy(cuvsPcaParams_t params);
79+
80+
/**
81+
* @brief Perform PCA fit operation.
82+
*
83+
* Computes the principal components, explained variances and their ratios, singular values,
84+
* column means, and noise variance from the input data.
85+
*
86+
* @code {.c}
87+
* #include <cuvs/core/c_api.h>
88+
* #include <cuvs/preprocessing/pca.h>
89+
*
90+
* // Create cuvsResources_t
91+
* cuvsResources_t res;
92+
* cuvsResourcesCreate(&res);
93+
*
94+
* // Create PCA params
95+
* cuvsPcaParams_t params;
96+
* cuvsPcaParamsCreate(&params);
97+
* params->n_components = 2;
98+
*
99+
* // Assume populated DLManagedTensor objects (col-major, float32, device memory)
100+
* DLManagedTensor input; // [n_rows x n_cols]
101+
* DLManagedTensor components; // [n_components x n_cols]
102+
* DLManagedTensor explained_var; // [n_components]
103+
* DLManagedTensor explained_var_ratio; // [n_components]
104+
* DLManagedTensor singular_vals; // [n_components]
105+
* DLManagedTensor mu; // [n_cols]
106+
* DLManagedTensor noise_vars; // [1] (scalar)
107+
*
108+
* cuvsPcaFit(res, params, &input, &components, &explained_var,
109+
* &explained_var_ratio, &singular_vals, &mu, &noise_vars, false);
110+
*
111+
* // Cleanup
112+
* cuvsPcaParamsDestroy(params);
113+
* cuvsResourcesDestroy(res);
114+
* @endcode
115+
*
116+
* @param[in] res cuvsResources_t opaque C handle
117+
* @param[in] params PCA parameters
118+
* @param[inout] input input data [n_rows x n_cols] (col-major, float32, device); overwritten if params->copy is false
119+
* @param[out] components principal components [n_components x n_cols] (col-major, float32, device)
120+
* @param[out] explained_var explained variances [n_components] (float32, device)
121+
* @param[out] explained_var_ratio explained variance ratios [n_components] (float32, device)
122+
* @param[out] singular_vals singular values [n_components] (float32, device)
123+
* @param[out] mu column means [n_cols] (float32, device)
124+
* @param[out] noise_vars noise variance [1] (float32, device)
125+
* @param[in] flip_signs_based_on_U whether signs are determined from U (true) or from V.T (false)
126+
* @return cuvsError_t
127+
*/
128+
cuvsError_t cuvsPcaFit(cuvsResources_t res,
129+
cuvsPcaParams_t params,
130+
DLManagedTensor* input,
131+
DLManagedTensor* components,
132+
DLManagedTensor* explained_var,
133+
DLManagedTensor* explained_var_ratio,
134+
DLManagedTensor* singular_vals,
135+
DLManagedTensor* mu,
136+
DLManagedTensor* noise_vars,
137+
bool flip_signs_based_on_U);
138+
139+
/**
140+
* @brief Perform PCA fit and transform in a single operation.
141+
*
142+
* Computes the principal components and stores the eigenspace-transformed input in trans_input.
143+
*
144+
* @param[in] res cuvsResources_t opaque C handle
145+
* @param[in] params PCA parameters
146+
* @param[inout] input input data [n_rows x n_cols] (col-major, float32, device)
147+
* @param[out] trans_input transformed data [n_rows x n_components] (col-major, float32, device)
148+
* @param[out] components principal components [n_components x n_cols] (col-major, float32, device)
149+
* @param[out] explained_var explained variances [n_components] (float32, device)
150+
* @param[out] explained_var_ratio explained variance ratios [n_components] (float32, device)
151+
* @param[out] singular_vals singular values [n_components] (float32, device)
152+
* @param[out] mu column means [n_cols] (float32, device)
153+
* @param[out] noise_vars noise variance [1] (float32, device)
154+
* @param[in] flip_signs_based_on_U whether signs are determined from U (true) or from V.T (false)
155+
* @return cuvsError_t
156+
*/
157+
cuvsError_t cuvsPcaFitTransform(cuvsResources_t res,
158+
cuvsPcaParams_t params,
159+
DLManagedTensor* input,
160+
DLManagedTensor* trans_input,
161+
DLManagedTensor* components,
162+
DLManagedTensor* explained_var,
163+
DLManagedTensor* explained_var_ratio,
164+
DLManagedTensor* singular_vals,
165+
DLManagedTensor* mu,
166+
DLManagedTensor* noise_vars,
167+
bool flip_signs_based_on_U);
168+
169+
/**
170+
* @brief Perform PCA transform operation.
171+
*
172+
* Transforms the input into the eigenspace using components computed earlier (e.g. by cuvsPcaFit).
173+
*
174+
* @param[in] res cuvsResources_t opaque C handle
175+
* @param[in] params PCA parameters
176+
* @param[inout] input data to transform [n_rows x n_cols] (col-major, float32, device)
177+
* @param[in] components principal components [n_components x n_cols] (col-major, float32, device)
178+
* @param[in] singular_vals singular values [n_components] (float32, device)
179+
* @param[in] mu column means [n_cols] (float32, device)
180+
* @param[out] trans_input transformed data [n_rows x n_components] (col-major, float32, device)
181+
* @return cuvsError_t
182+
*/
183+
cuvsError_t cuvsPcaTransform(cuvsResources_t res,
184+
cuvsPcaParams_t params,
185+
DLManagedTensor* input,
186+
DLManagedTensor* components,
187+
DLManagedTensor* singular_vals,
188+
DLManagedTensor* mu,
189+
DLManagedTensor* trans_input);
190+
191+
/**
192+
* @brief Perform PCA inverse transform operation.
193+
*
194+
* Transforms data from the eigenspace back to the original space, writing the result to output.
195+
*
196+
* @param[in] res cuvsResources_t opaque C handle
197+
* @param[in] params PCA parameters
198+
* @param[in] trans_input transformed data [n_rows x n_components] (col-major, float32, device)
199+
* @param[in] components principal components [n_components x n_cols] (col-major, float32, device)
200+
* @param[in] singular_vals singular values [n_components] (float32, device)
201+
* @param[in] mu column means [n_cols] (float32, device)
202+
* @param[out] output reconstructed data [n_rows x n_cols] (col-major, float32, device)
203+
* @return cuvsError_t
204+
*/
205+
cuvsError_t cuvsPcaInverseTransform(cuvsResources_t res,
206+
cuvsPcaParams_t params,
207+
DLManagedTensor* trans_input,
208+
DLManagedTensor* components,
209+
DLManagedTensor* singular_vals,
210+
DLManagedTensor* mu,
211+
DLManagedTensor* output);
212+
213+
/**
214+
* @}
215+
*/
216+
217+
#ifdef __cplusplus
218+
}
219+
#endif

0 commit comments

Comments
 (0)