Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions conda/recipes/libcuvs/recipe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ cache:
- cuda-cudart-dev
- cuda-nvrtc-dev
- cuda-profiler-api
- cutile-python
- libcublas-dev
- libcurand-dev
- libcusolver-dev
Expand Down Expand Up @@ -117,6 +118,7 @@ outputs:
- cuda-cudart-dev
- cuda-nvrtc-dev
- cuda-profiler-api
- cutile-python
- libcublas-dev
- libcurand-dev
- libcusolver-dev
Expand Down Expand Up @@ -179,6 +181,7 @@ outputs:
- cuda-cudart-dev
- cuda-nvrtc-dev
- cuda-profiler-api
- cutile-python
- libcublas-dev
- libcurand-dev
- libcusolver-dev
Expand Down Expand Up @@ -240,6 +243,7 @@ outputs:
- cuda-cudart-dev
- cuda-nvrtc-dev
- cuda-profiler-api
- cutile-python
- libcublas-dev
- libcurand-dev
- libcusolver-dev
Expand Down Expand Up @@ -299,6 +303,7 @@ outputs:
- openblas # required by some CPU algos in benchmarks
- cuda-cudart-dev
- cuda-profiler-api
- cutile-python
- libcublas-dev
- libcurand-dev
- libcusolver-dev
Expand Down
2 changes: 2 additions & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,8 @@ ConfigureTest(
PERCENT 100
)

add_subdirectory(cutile)

# ##################################################################################################
# Install tests ####################################################################################
# ##################################################################################################
Expand Down
23 changes: 23 additions & 0 deletions cpp/tests/cutile/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# =============================================================================
# cmake-format: off
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
# cmake-format: on
# =============================================================================

include("${CMAKE_CURRENT_LIST_DIR}/generate_cutile_cubins.cmake")

generate_cutile_vector_add_cubins(CUTILE_GENERATED_INCLUDE_DIR)

ConfigureTest(
NAME CUTILE_VECTOR_ADD_TEST
PATH "${CMAKE_CURRENT_LIST_DIR}/cutile_vector_add.cu"
GPUS 1
PERCENT 100
)

add_dependencies(CUTILE_VECTOR_ADD_TEST cutile_vector_add_cubins)

target_include_directories(
CUTILE_VECTOR_ADD_TEST PRIVATE "${CUTILE_GENERATED_INCLUDE_DIR}"
)
128 changes: 128 additions & 0 deletions cpp/tests/cutile/cutile_vector_add.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

#include "../test_utils.cuh"

#include "vector_add_kernel_symbol.h"
#include "vector_add_sm_100_cubin.h"
#include "vector_add_sm_120_cubin.h"
#include "vector_add_sm_80_cubin.h"
#include "vector_add_sm_86_cubin.h"
#include "vector_add_sm_90_cubin.h"

#include <cuda_runtime_api.h>

#include <cstdint>

namespace cuvs {
namespace {

struct EmbeddedCubin {
int cc_major;
int cc_minor;
const unsigned char* data;
size_t size;
};

// Lookup table for cubins built at configure time (see export_vector_add_cubin.py).
constexpr EmbeddedCubin kEmbeddedCubins[] = {
{8, 0, vector_add_sm_80_cubin, sizeof(vector_add_sm_80_cubin)},
{8, 6, vector_add_sm_86_cubin, sizeof(vector_add_sm_86_cubin)},
{9, 0, vector_add_sm_90_cubin, sizeof(vector_add_sm_90_cubin)},
{10, 0, vector_add_sm_100_cubin, sizeof(vector_add_sm_100_cubin)},
{12, 0, vector_add_sm_120_cubin, sizeof(vector_add_sm_120_cubin)},
};

const EmbeddedCubin* find_embedded_cubin(int cc_major, int cc_minor)
{
for (const auto& entry : kEmbeddedCubins) {
if (entry.cc_major == cc_major && entry.cc_minor == cc_minor) { return &entry; }
}
// Fall back to a cubin for the same major version (e.g. minor SKUs within a generation).
for (const auto& entry : kEmbeddedCubins) {
if (entry.cc_major == cc_major) { return &entry; }
}
return nullptr;
}

class CutileVectorAddTest : public ::testing::Test {
protected:
void SetUp() override
{
int device = 0;
RAFT_CUDA_TRY(cudaGetDevice(&device));
RAFT_CUDA_TRY(
cudaDeviceGetAttribute(&cc_major_, cudaDevAttrComputeCapabilityMajor, device));
RAFT_CUDA_TRY(
cudaDeviceGetAttribute(&cc_minor_, cudaDevAttrComputeCapabilityMinor, device));
}

int cc_major_{};
int cc_minor_{};
};

} // namespace

TEST_F(CutileVectorAddTest, EmbeddedCubinVectorAdd)
{
const EmbeddedCubin* cubin = find_embedded_cubin(cc_major_, cc_minor_);
ASSERT_NE(cubin, nullptr)
<< "No embedded cuTile cubin for compute capability " << cc_major_ << "." << cc_minor_;

cudaLibrary_t library{};
ASSERT_EQ(cudaSuccess,
cudaLibraryLoadData(
&library, cubin->data, nullptr, nullptr, 0, nullptr, nullptr, 0))
<< "cudaLibraryLoadData failed: " << cudaGetErrorString(cudaGetLastError());

cudaKernel_t kernel{};
ASSERT_EQ(cudaSuccess,
cudaLibraryGetKernel(&kernel, library, CUTILE_VECTOR_ADD_KERNEL_SYMBOL))
<< "cudaLibraryGetKernel failed: " << cudaGetErrorString(cudaGetLastError());

constexpr int kN = 1024;
constexpr int kTile = 256;
constexpr int kGridDim = (kN + kTile - 1) / kTile;

float *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;
RAFT_CUDA_TRY(cudaMalloc(&d_a, kN * sizeof(float)));
RAFT_CUDA_TRY(cudaMalloc(&d_b, kN * sizeof(float)));
RAFT_CUDA_TRY(cudaMalloc(&d_c, kN * sizeof(float)));

std::vector<float> h_a(kN), h_b(kN);
for (int i = 0; i < kN; ++i) {
h_a[i] = static_cast<float>(i);
h_b[i] = static_cast<float>(i * 2);
}
RAFT_CUDA_TRY(cudaMemcpy(d_a, h_a.data(), kN * sizeof(float), cudaMemcpyHostToDevice));
RAFT_CUDA_TRY(cudaMemcpy(d_b, h_b.data(), kN * sizeof(float), cudaMemcpyHostToDevice));
RAFT_CUDA_TRY(cudaMemset(d_c, 0, kN * sizeof(float)));

int64_t shape = kN;
int64_t stride = 1;
void* kernel_args[] = {
&d_a, &shape, &stride, &d_b, &shape, &stride, &d_c, &shape, &stride,
};

dim3 grid(kGridDim);
dim3 block(1);
ASSERT_EQ(cudaSuccess, cudaLaunchKernel(kernel, grid, block, kernel_args, 0, 0))
<< "cudaLaunchKernel failed: " << cudaGetErrorString(cudaGetLastError());
RAFT_CUDA_TRY(cudaDeviceSynchronize());

std::vector<float> h_c(kN);
RAFT_CUDA_TRY(cudaMemcpy(h_c.data(), d_c, kN * sizeof(float), cudaMemcpyDeviceToHost));

for (int i = 0; i < kN; ++i) {
ASSERT_FLOAT_EQ(h_a[i] + h_b[i], h_c[i]) << "@" << i;
}

RAFT_CUDA_TRY(cudaFree(d_a));
RAFT_CUDA_TRY(cudaFree(d_b));
RAFT_CUDA_TRY(cudaFree(d_c));
RAFT_CUDA_TRY(cudaLibraryUnload(library));
}

} // namespace cuvs
101 changes: 101 additions & 0 deletions cpp/tests/cutile/export_vector_add_cubin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
"""Export the cuTile vector-add kernel to a cubin for a single GPU target."""

from __future__ import annotations

import argparse
import sys
from pathlib import Path

import cuda.tile as ct
from cuda.tile.compilation import (
ArrayConstraint,
CallingConvention,
ConstantConstraint,
KernelSignature,
export_kernel,
)

from vector_add_kernel import TILE_SIZE, vector_add

# cuTile / tileiras gpu_code values used at build time. These correspond to the
# cuvs library CUDA 13 real targets as follows (tileiras has no sm_*a/sm_*f names):
# sm_80 -> 80-real
# sm_86 -> 86-real
# sm_90 -> 90a-real
# sm_100 -> 100f-real
# sm_120 -> 120a-real
SUPPORTED_GPU_CODES = ("sm_80", "sm_86", "sm_90", "sm_100", "sm_120")


def _kernel_signature() -> KernelSignature:
array = ArrayConstraint(
ct.float32,
1,
index_dtype=ct.int64,
stride_lower_bound_incl=0,
alias_groups=(),
may_alias_internally=False,
stride_constant=(1,),
)
return KernelSignature(
parameters=[array, array, array, ConstantConstraint(TILE_SIZE)],
calling_convention=CallingConvention.cutile_python_v1(),
).with_mangled_symbol("vector_add")


def export_cubin(output_file: Path, gpu_code: str, symbol_header: Path | None) -> str:
if gpu_code not in SUPPORTED_GPU_CODES:
raise ValueError(
f"Unsupported gpu_code {gpu_code!r}; expected one of {SUPPORTED_GPU_CODES}"
)

signature = _kernel_signature()
export_kernel(
vector_add,
signatures=[signature],
output_file=str(output_file),
gpu_code=gpu_code,
output_format="cubin",
)

if symbol_header is not None:
symbol_header.write_text(
"\n".join(
[
"// Generated by export_vector_add_cubin.py; do not edit.",
"#pragma once",
f'#define CUTILE_VECTOR_ADD_KERNEL_SYMBOL "{signature.symbol}"',
"",
]
)
)

return signature.symbol


def main() -> int:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("output_file", type=Path, help="Output cubin path")
parser.add_argument(
"--gpu-code",
required=True,
choices=SUPPORTED_GPU_CODES,
help="tileiras / export_kernel target (e.g. sm_120)",
)
parser.add_argument(
"--symbol-header",
type=Path,
default=None,
help="Optional header that defines CUTILE_VECTOR_ADD_KERNEL_SYMBOL",
)
args = parser.parse_args()

symbol = export_cubin(args.output_file, args.gpu_code, args.symbol_header)
print(symbol)
return 0


if __name__ == "__main__":
sys.exit(main())
90 changes: 90 additions & 0 deletions cpp/tests/cutile/generate_cutile_cubins.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# =============================================================================
# cmake-format: off
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
# cmake-format: on
# =============================================================================

include_guard(GLOBAL)

# Build-time cuTile cubin targets. Maps to cuvs CUDA 13 -real library arches (75-real omitted).
set(CUTILE_VECTOR_ADD_GPU_CODES sm_80 sm_86 sm_90 sm_100 sm_120)

function(generate_cutile_vector_add_cubins output_include_dir_var)
find_package(Python3 REQUIRED COMPONENTS Interpreter)
find_package(CUDAToolkit REQUIRED)

find_program(
CUTILE_BIN2C
NAMES bin2c
PATHS ${CUDAToolkit_BIN_DIR}
REQUIRED
)

execute_process(
COMMAND "${Python3_EXECUTABLE}" -c "import cuda.tile"
RESULT_VARIABLE _cutile_import_result
OUTPUT_QUIET
ERROR_QUIET
)
if(NOT _cutile_import_result EQUAL 0)
message(
FATAL_ERROR
"cuda.tile (cuTile Python) is required to build CUTILE_VECTOR_ADD_TEST. "
"Install it in the active Python environment, e.g. pip install cuda-tile[tileiras]."
)
endif()

set(_cutile_source_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}")
set(_cutile_binary_dir "${CMAKE_CURRENT_BINARY_DIR}/cutile_generated")
file(MAKE_DIRECTORY "${_cutile_binary_dir}")

set(_symbol_header "${_cutile_binary_dir}/vector_add_kernel_symbol.h")
set(_first_gpu_code TRUE)

foreach(_gpu_code IN LISTS CUTILE_VECTOR_ADD_GPU_CODES)
set(_cubin_file "${_cutile_binary_dir}/vector_add_${_gpu_code}.cubin")
set(_cubin_header "${_cutile_binary_dir}/vector_add_${_gpu_code}_cubin.h")

if(_first_gpu_code)
set(_symbol_arg --symbol-header "${_symbol_header}")
set(_cubin_outputs "${_cubin_file}" "${_symbol_header}")
set(_first_gpu_code FALSE)
else()
set(_symbol_arg)
set(_cubin_outputs "${_cubin_file}")
endif()

add_custom_command(
OUTPUT ${_cubin_outputs}
COMMAND
"${Python3_EXECUTABLE}" "${_cutile_source_dir}/export_vector_add_cubin.py"
"${_cubin_file}" --gpu-code "${_gpu_code}" ${_symbol_arg}
DEPENDS "${_cutile_source_dir}/export_vector_add_cubin.py"
"${_cutile_source_dir}/vector_add_kernel.py"
COMMENT "Exporting cuTile vector_add cubin for ${_gpu_code}"
VERBATIM
)

add_custom_command(
OUTPUT "${_cubin_header}"
COMMAND "${CUTILE_BIN2C}" --const --name "vector_add_${_gpu_code}_cubin" --static
"${_cubin_file}" > "${_cubin_header}"
DEPENDS "${_cubin_file}"
COMMENT "Embedding vector_add ${_gpu_code} cubin via bin2c"
VERBATIM
)

list(APPEND _generated_headers "${_cubin_header}")
endforeach()

add_custom_target(
cutile_vector_add_cubins
DEPENDS "${_symbol_header}" ${_generated_headers}
)

set(${output_include_dir_var}
"${_cutile_binary_dir}"
PARENT_SCOPE
)
endfunction()
Loading
Loading