Upgrade cuVS Version to 26.02 (facebookresearch#4788)

tarang-jain · facebook-github-bot · commit 0b7fc8737f6d · 2026-03-17T09:33:05.000-07:00
Summary: Pull Request resolved: facebookresearch#4788 Reviewed By: yingufan Differential Revision: D96931953 Pulled By: mnorris11
diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
@@ -74,7 +74,7 @@ runs:
           conda install -y -q cuda-toolkit=12.6 gxx_linux-64=12.4 -c "nvidia/label/cuda-12.6"
         # and CUDA from cuVS channel for cuVS builds
         elif [ "${{ inputs.cuvs }}" = "ON" ]; then
-          conda install -y -q libcuvs=25.10 'cuda-version=12.6' cuda-toolkit=12.6 gxx_linux-64=13.4.0 -c rapidsai -c rapidsai-nightly -c conda-forge
+          conda install -y -q libcuvs=26.02 'cuda-version=12.9' cuda-toolkit=12.9 sysroot_linux-64=2.34 -c rapidsai -c rapidsai-nightly -c conda-forge
         fi
 
         # install SVS runtime for SVS builds
@@ -87,7 +87,7 @@ runs:
           : # skip torch install via conda, we need to install via pip to get
             #  ROCm-enabled version until it's supported in conda by PyTorch
         elif [ "${{ inputs.gpu }}" = "ON" ]; then
-          conda install -y -q "pytorch>=2.7" "pytorch-gpu>=2.7" -c pytorch -c "nvidia/label/12.6"
+          conda install -y -q "pytorch>=2.7" "pytorch-gpu>=2.7" -c pytorch -c "nvidia/label/12.9"
         else
           # TestLowLevelIVF.IVFRQ hangs on pytorch>=2.7, so left it as <2.5 for now.
           conda install -y -q "pytorch<2.5" -c pytorch
diff --git a/INSTALL.md b/INSTALL.md
@@ -6,7 +6,7 @@ pre-release nightly builds.
 
 - The CPU-only faiss-cpu conda package is currently available on Linux (x86-64 and aarch64), OSX (arm64 only), and Windows (x86-64)
 - faiss-gpu, containing both CPU and GPU indices, is available on Linux (x86-64 only) for CUDA 11.4 and 12.1
-- faiss-gpu-cuvs package containing GPU indices provided by [NVIDIA cuVS](https://github.com/rapidsai/cuvs/) version 25.10, is available on Linux (x86-64 only) for CUDA 12.4.
+- The faiss-gpu-cuvs package, containing GPU indices provided by [NVIDIA cuVS](https://github.com/rapidsai/cuvs/) version 26.02, is available on Linux (x86-64 only) for CUDA 12.9.
 
 To install the latest stable release:
 
@@ -72,7 +72,7 @@ The optional requirements are:
 - for AMD GPUs:
   - AMD ROCm,
 - for using NVIDIA cuVS implementations:
-  - libcuvs=25.10
+  - libcuvs=26.02
 - for the python bindings:
   - python 3,
   - numpy,
@@ -87,9 +87,9 @@ section of the wiki](https://github.com/facebookresearch/faiss/wiki/Troubleshoot
 
 The libcuvs dependency should be installed via conda:
 ```
-conda install -c rapidsai -c conda-forge -c nvidia libcuvs=25.10 'cuda-version=12.6'
+conda install -c rapidsai -c conda-forge -c nvidia libcuvs=26.02 'cuda-version=12.9'
 ```
-For more ways to install cuVS 25.10, refer to the [RAPIDS Installation Guide](https://docs.rapids.ai/install).
+For more ways to install cuVS 26.02, refer to the [RAPIDS Installation Guide](https://docs.rapids.ai/install).
 
 ### Building with Intel(R) SVS
 
diff --git a/cmake/thirdparty/fetch_rapids.cmake b/cmake/thirdparty/fetch_rapids.cmake
@@ -15,11 +15,11 @@
 # or implied. See the License for the specific language governing permissions and limitations under
 # the License.
 # =============================================================================
-set(RAPIDS_VERSION "25.10")
+set(RAPIDS_VERSION "26.02")
 set(rapids-cmake-version ${RAPIDS_VERSION})
 
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake)
-    file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake
+    file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/release/${RAPIDS_VERSION}/RAPIDS.cmake
             ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake)
 endif()
 include(${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake)
diff --git a/conda/faiss-gpu-cuvs/meta.yaml b/conda/faiss-gpu-cuvs/meta.yaml
@@ -6,9 +6,9 @@
 {% set version = environ.get('GIT_DESCRIBE_TAG').lstrip('v') %}
 {% set suffix = "_nightly" if environ.get('PACKAGE_TYPE') == 'nightly' else "" %}
 {% set number = GIT_DESCRIBE_NUMBER %}
-{% set cuda_constraints=">=12.6,<12.7" %}
-{% set libcublas_constraints=">=12.6,<12.7" %}
-{% set cudart_constraints=">=12.6,<12.7" %}
+{% set cuda_constraints=">=12.9,<13.0" %}
+{% set libcublas_constraints=">=12.9,<13.0" %}
+{% set cudart_constraints=">=12.9,<13.0" %}
 
 package:
   name: faiss-pkg
@@ -60,7 +60,7 @@ outputs:
         - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
         - mkl >=2024.2.2  # [x86_64]
         - openblas =0.3.30 # [not x86_64]
-        - libcuvs =25.10
+        - libcuvs =26.02
         - cuda-version {{ cuda_constraints }}
         - libsvs-runtime =0.2.0  # [x86_64 and linux]
       run:
@@ -69,7 +69,7 @@ outputs:
         - openblas =0.3.30 # [not x86_64]
         - cuda-cudart {{ cuda_constraints }}
         - libcublas {{ libcublas_constraints }}
-        - libcuvs =25.10
+        - libcuvs =26.02
         - cuda-version {{ cuda_constraints }}
         - libnvjitlink
         - libsvs-runtime =0.2.0  # [x86_64 and linux]
@@ -91,10 +91,10 @@ outputs:
       string: "py{{ PY_VER }}_h{{ PKG_HASH }}_{{ number }}_cuda{{ cudatoolkit }}{{ suffix }}"
     requirements:
       build:
-        - {{ compiler('cxx') }} =12.4
-        - sysroot_linux-64 =2.17 # [linux64]
+        - {{ compiler('cxx') }} =14.2
+        - sysroot_linux-64 =2.34 # [linux64]
         - swig =4.0
-        - cmake >=3.26.4
+        - cmake >=3.30.4
         - make =4.2 # [not win]
         - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
         - mkl >=2024.2.2  # [x86_64]
diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu
@@ -238,7 +238,7 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
     if (should_use_cuvs(args) && args.queriesRowMajor == args.vectorsRowMajor &&
         args.outIndicesType == IndicesDataType::I64 &&
         args.vectorType == DistanceDataType::F32 && args.k > 0) {
-        cuvsDistanceType distance = metricFaissToCuvs(args.metric, false);
+        auto distance = metricFaissToCuvs(args.metric, false);
 
         auto resImpl = prov->getResources();
         auto res = resImpl.get();
diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
@@ -193,14 +193,14 @@ struct GpuIndexCagraConfig : public GpuIndexConfig {
 
 enum class search_algo {
     /// For large batch sizes.
-    SINGLE_CTA,
+    SINGLE_CTA = 0,
     /// For small batch sizes.
-    MULTI_CTA,
-    MULTI_KERNEL,
-    AUTO
+    MULTI_CTA = 1,
+    MULTI_KERNEL = 2,
+    AUTO = 100
 };
 
-enum class hash_mode { HASH, SMALL, AUTO };
+enum class hash_mode { HASH = 0, SMALL = 1, AUTO = 100 };
 
 struct SearchParametersCagra : SearchParameters {
     /// Maximum number of queries to search at the same time (batch size). Auto
diff --git a/faiss/gpu/GpuResources.h b/faiss/gpu/GpuResources.h
@@ -33,7 +33,7 @@
 
 #if defined USE_NVIDIA_CUVS
 #include <raft/core/device_resources.hpp>
-#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device_memory_resource.hpp>
 #endif
 
 namespace faiss {
diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp
@@ -23,9 +23,9 @@
 
 #if defined USE_NVIDIA_CUVS
 #include <raft/core/device_resources.hpp>
-#include <rmm/mr/device/managed_memory_resource.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/mr/host/pinned_memory_resource.hpp>
+#include <rmm/mr/managed_memory_resource.hpp>
+#include <rmm/mr/per_device_resource.hpp>
+#include <rmm/mr/pinned_host_memory_resource.hpp>
 #include <memory>
 #endif
 
@@ -93,7 +93,7 @@ StandardGpuResourcesImpl::StandardGpuResourcesImpl()
         :
 #if defined USE_NVIDIA_CUVS
           mmr_(new rmm::mr::managed_memory_resource),
-          pmr_(new rmm::mr::pinned_memory_resource),
+          pmr_(new rmm::mr::pinned_host_memory_resource),
 #endif
           pinnedMemAlloc_(nullptr),
           pinnedMemAllocSize_(0),
@@ -164,7 +164,7 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
 
     if (pinnedMemAlloc_) {
 #if defined USE_NVIDIA_CUVS
-        pmr_->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_);
+        pmr_->deallocate_sync(pinnedMemAlloc_, pinnedMemAllocSize_);
 #else
         auto err = cudaFreeHost(pinnedMemAlloc_);
         FAISS_ASSERT_FMT(
@@ -350,7 +350,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
         // pinned memory allocation
         if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
             try {
-                pinnedMemAlloc_ = pmr_->allocate(pinnedMemSize_);
+                pinnedMemAlloc_ = pmr_->allocate_sync(pinnedMemSize_);
             } catch (const std::bad_alloc& rmm_ex) {
                 FAISS_THROW_MSG("CUDA memory allocation error");
             }
@@ -549,7 +549,7 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
             rmm::mr::device_memory_resource* current_mr =
                     rmm::mr::get_per_device_resource(
                             rmm::cuda_device_id{adjReq.device});
-            p = current_mr->allocate_async(adjReq.size, adjReq.stream);
+            p = current_mr->allocate(adjReq.stream, adjReq.size);
             adjReq.mr = current_mr;
         } catch (const std::bad_alloc& rmm_ex) {
             FAISS_THROW_MSG("CUDA memory allocation error");
@@ -584,7 +584,7 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
             // TODO: change this to use the current device resource once RMM has
             // a way to retrieve a "guaranteed" managed memory resource for a
             // device.
-            p = mmr_->allocate_async(adjReq.size, adjReq.stream);
+            p = mmr_->allocate(adjReq.stream, adjReq.size);
             adjReq.mr = mmr_.get();
         } catch (const std::bad_alloc& rmm_ex) {
             FAISS_THROW_MSG("CUDA memory allocation error");
@@ -648,7 +648,7 @@ void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
             req.space == MemorySpace::Device ||
             req.space == MemorySpace::Unified) {
 #if defined USE_NVIDIA_CUVS
-        req.mr->deallocate_async(p, req.size, req.stream);
+        req.mr->deallocate(req.stream, p, req.size);
 #else
         auto err = cudaFree(p);
         FAISS_ASSERT_FMT(
diff --git a/faiss/gpu/StandardGpuResources.h b/faiss/gpu/StandardGpuResources.h
@@ -25,7 +25,8 @@
 
 #if defined USE_NVIDIA_CUVS
 #include <raft/core/device_resources.hpp>
-#include <rmm/mr/host/pinned_memory_resource.hpp>
+#include <rmm/mr/device_memory_resource.hpp>
+#include <rmm/mr/pinned_host_memory_resource.hpp>
 #endif
 
 #include <faiss/gpu/GpuResources.h>
@@ -172,8 +173,8 @@ class StandardGpuResourcesImpl : public GpuResources {
     // managed_memory_resource
     std::unique_ptr<rmm::mr::device_memory_resource> mmr_;
 
-    // pinned_memory_resource
-    std::unique_ptr<rmm::mr::host_memory_resource> pmr_;
+    // pinned_host_memory_resource
+    std::unique_ptr<rmm::mr::pinned_host_memory_resource> pmr_;
 #endif
 
     /// Pinned memory allocation for use with this GPU
diff --git a/faiss/gpu/impl/BinaryCuvsCagra.cu b/faiss/gpu/impl/BinaryCuvsCagra.cu
@@ -49,9 +49,9 @@ BinaryCuvsCagra::BinaryCuvsCagra(
         IndicesOptions indicesOptions)
         : resources_(resources),
           dim_(dim),
+          store_dataset_(store_dataset),
           graph_build_algo_(graph_build_algo),
-          nn_descent_niter_(nn_descent_niter),
-          store_dataset_(store_dataset) {
+          nn_descent_niter_(nn_descent_niter) {
     FAISS_THROW_IF_NOT_MSG(
             indicesOptions == faiss::gpu::INDICES_64_BIT,
             "only INDICES_64_BIT is supported for cuVS CAGRA index");
@@ -161,6 +161,11 @@ void BinaryCuvsCagra::train(idx_t n, const uint8_t* x) {
         cuvs::neighbors::cagra::graph_build_params::iterative_search_params
                 graph_build_params;
         index_params_.graph_build_params = graph_build_params;
+        if (index_params_.graph_degree ==
+            index_params_.intermediate_graph_degree) {
+            index_params_.intermediate_graph_degree =
+                    1.5 * index_params_.graph_degree;
+        }
     }
 
     if (getDeviceForAddress(x) >= 0) {
diff --git a/faiss/gpu/impl/CuvsCagra.cuh b/faiss/gpu/impl/CuvsCagra.cuh
@@ -39,9 +39,14 @@ namespace faiss {
 /// Algorithm used to build underlying CAGRA graph
 enum class cagra_build_algo { IVF_PQ, NN_DESCENT };
 
-enum class cagra_search_algo { SINGLE_CTA, MULTI_CTA };
+enum class cagra_search_algo {
+    SINGLE_CTA = 0,
+    MULTI_CTA = 1,
+    MULTI_KERNEL = 2,
+    AUTO = 100
+};
 
-enum class cagra_hash_mode { HASH, SMALL, AUTO };
+enum class cagra_hash_mode { HASH = 0, SMALL = 1, AUTO = 100 };
 
 namespace gpu {
 
diff --git a/faiss/gpu/impl/CuvsFlatIndex.cu b/faiss/gpu/impl/CuvsFlatIndex.cu
@@ -91,7 +91,7 @@ void CuvsFlatIndex::query(
                 outDistances.getSize(0),
                 outDistances.getSize(1));
 
-        cuvsDistanceType distance = metricFaissToCuvs(metric, exactDistance);
+        auto distance = metricFaissToCuvs(metric, exactDistance);
 
         std::optional<raft::device_vector_view<const float, int64_t>>
                 norms_view = raft::make_device_vector_view(
diff --git a/faiss/gpu/impl/CuvsIVFPQ.cu b/faiss/gpu/impl/CuvsIVFPQ.cu
diff --git a/faiss/gpu/utils/CuvsUtils.cu b/faiss/gpu/utils/CuvsUtils.cu
diff --git a/faiss/gpu/utils/CuvsUtils.h b/faiss/gpu/utils/CuvsUtils.h
diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt