1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -30,3 +30,4 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
### Enhancements
* Added engine as a top-level optional parameter while creating vector field [#2736](https://github.com/opensearch-project/k-NN/pull/2736)
* Migrate k-NN plugin to use GRPC transport-grpc SPI interface [#2833](https://github.com/opensearch-project/k-NN/pull/2833)
* Native scoring for FP16 [#2922](https://github.com/opensearch-project/k-NN/pull/2922)
2 changes: 1 addition & 1 deletion build.gradle
@@ -437,7 +437,7 @@ tasks.register('buildJniLib', Exec) {
args.add("--build")
args.add("jni/build")
args.add("--target")
def knn_libs = ['opensearchknn_faiss', 'opensearchknn_common', 'opensearchknn_nmslib']
def knn_libs = ['opensearchknn_faiss', 'opensearchknn_common', 'opensearchknn_nmslib', 'opensearchknn_simd']
if (project.hasProperty('knn_libs')) {
knn_libs = ['opensearchknn_common'] + project.knn_libs.split(',').collect { it.trim() }
}
43 changes: 43 additions & 0 deletions jni/CMakeLists.txt
@@ -16,6 +16,7 @@ set(TARGET_LIB_UTIL opensearchknn_util)
set(TARGET_LIB_COMMON opensearchknn_common) # common lib for JNI
set(TARGET_LIB_NMSLIB opensearchknn_nmslib) # nmslib JNI
set(TARGET_LIB_FAISS opensearchknn_faiss) # faiss JNI
set(TARGET_LIB_SIMD opensearchknn_simd) # SIMD computing JNI
set(TARGET_LIBS "") # Libs to be installed

set(CMAKE_CXX_STANDARD 17)
@@ -127,6 +128,48 @@ endif ()

# ---------------------------------------------------------------------------

# ----------------------------- SIMD Computing ------------------------------
# Init SIMD computing properties
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/init-simd.cmake)

# Set target library + source
set(SIMD_COMPUTING_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/src/org_opensearch_knn_jni_SimdVectorComputeService.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/simd/similarity_function/similarity_function.cpp
)

# Make it a shared library
add_library(${TARGET_LIB_SIMD} SHARED ${SIMD_COMPUTING_SRCS})

# Set SIMD compile option
target_compile_options(${TARGET_LIB_SIMD} PRIVATE ${FP16_SIMD_FLAGS} ${SIMD_FLAGS})

# Link util library
target_link_libraries(${TARGET_LIB_SIMD} ${TARGET_LINK_FAISS_LIB} ${TARGET_LIB_UTIL})

# Add include headers
target_include_directories(${TARGET_LIB_SIMD} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/jni/include
$ENV{JAVA_HOME}/include
$ENV{JAVA_HOME}/include/${JVM_OS_TYPE}
${CMAKE_CURRENT_SOURCE_DIR}/external/faiss
)

# Set common properties
opensearch_set_common_properties(${TARGET_LIB_SIMD})

# Set the output name to embed the SIMD extension (if any)
set_target_properties(${TARGET_LIB_SIMD} PROPERTIES
OUTPUT_NAME "${TARGET_LIB_SIMD}"
)

# Add target SIMD library to TARGET_LIBS list
list(APPEND TARGET_LIBS ${TARGET_LIB_SIMD})
# ---------------------------------------------------------------------------



# --------------------------------- TESTS -----------------------------------
# Windows: the TESTS are commented out for now because jni_tests.exe fails to build when the target libraries are built as SHARED libraries.
# TODO: Fix the failing JNI TESTS on Windows
138 changes: 138 additions & 0 deletions jni/cmake/init-simd.cmake
@@ -0,0 +1,138 @@
#
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#

include(CheckCXXSourceCompiles)

# Allow user overrides
if(NOT DEFINED AVX2_ENABLED)
set(AVX2_ENABLED true) # set default value as true if the argument is not set
endif()

if(NOT DEFINED AVX512_ENABLED)
set(AVX512_ENABLED true) # set default value as true if the argument is not set
endif()

if(NOT DEFINED AVX512_SPR_ENABLED)
# Check if the system is Intel(R) Sapphire Rapids or a newer-generation processor
execute_process(COMMAND bash -c "lscpu | grep -q 'GenuineIntel' && lscpu | grep -i 'avx512_fp16' | grep -i 'avx512_bf16' | grep -i 'avx512_vpopcntdq'" OUTPUT_VARIABLE SPR_FLAGS OUTPUT_STRIP_TRAILING_WHITESPACE)
if (NOT "${SPR_FLAGS}" STREQUAL "")
set(AVX512_SPR_ENABLED true)
else()
set(AVX512_SPR_ENABLED false)
endif()
endif()

# Default SIMD state
set(KNN_HAVE_AVX2_F16C OFF)
set(KNN_HAVE_AVX512 OFF)
set(KNN_HAVE_AVX512_SPR OFF)
set(KNN_HAVE_ARM_FP16 OFF)
set(SIMD_OPT_LEVEL "")
set(SIMD_FLAGS "")

if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows" OR (NOT AVX2_ENABLED AND NOT AVX512_ENABLED AND NOT AVX512_SPR_ENABLED))
message(STATUS "[SIMD] Windows or SIMD explicitly disabled. Falling back to generic.")
set(SIMD_OPT_LEVEL "generic") # SIMD is not supported on Windows due to a MinGW-w64 compiler issue, so keep the optimization level generic.
set(SIMD_FLAGS "")

elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm64")
set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+fp16")
check_cxx_source_compiles("
#include <arm_neon.h>
int main() {
float32x4_t f = vdupq_n_f32(1.0f);
float16x4_t h = vcvt_f16_f32(f);
(void)h;
return 0;
}" HAVE_NEON_FP16)
unset(CMAKE_REQUIRED_FLAGS)

if(HAVE_NEON_FP16)
set(KNN_HAVE_ARM_FP16 ON)
set(SIMD_OPT_LEVEL "generic") # AVX2 is not available on aarch64, so keep the generic level.
set(SIMD_FLAGS -march=armv8.4-a+fp16)
add_definitions(-DKNN_HAVE_ARM_FP16)
message(STATUS "[SIMD] ARM NEON with FP16 supported.")
else()
message(STATUS "[SIMD] ARM NEON FP16 instructions not supported by compiler. Falling back to generic.")
endif()

elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux" AND AVX512_SPR_ENABLED)
set(CMAKE_REQUIRED_FLAGS "-mavx512f -mavx512fp16 -mf16c")
check_cxx_source_compiles("
#include <immintrin.h>
int main() {
__m512 v = _mm512_set1_ps(1.0f);
__m256i h = _mm512_cvt_roundps_ph(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
__m512 w = _mm512_cvtph_ps(h);
(void)w;
return 0;
}" HAVE_AVX512_SPR_COMPILER)
unset(CMAKE_REQUIRED_FLAGS)

if(HAVE_AVX512_SPR_COMPILER)
set(KNN_HAVE_AVX512_SPR ON)
set(SIMD_OPT_LEVEL "avx512_spr")
set(SIMD_FLAGS -mavx512f -mavx512fp16 -mf16c)
add_definitions(-DKNN_HAVE_AVX512_SPR)
message(STATUS "[SIMD] AVX512_SPR supported by compiler.")
else()
message(FATAL_ERROR "[SIMD] AVX512_SPR was explicitly enabled, but compiler does not support it.")
endif()

elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux" AND AVX512_ENABLED)
set(CMAKE_REQUIRED_FLAGS "-mavx512f -mf16c")
check_cxx_source_compiles("
#include <immintrin.h>
int main() {
__m512 v = _mm512_setzero_ps();
__m256i h = _mm512_cvtps_ph(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
(void)h;
return 0;
}" HAVE_AVX512_COMPILER)
unset(CMAKE_REQUIRED_FLAGS)

if(HAVE_AVX512_COMPILER)
set(KNN_HAVE_AVX512 ON)
set(SIMD_OPT_LEVEL "avx512") # avx512 improves performance on Linux; it is not available on macOS and is currently not supported on Windows.
set(SIMD_FLAGS -mavx512f -mf16c)
add_definitions(-DKNN_HAVE_AVX512)
message(STATUS "[SIMD] AVX512 + F16C supported by compiler.")
else()
message(FATAL_ERROR "[SIMD] AVX512 + FP16 was explicitly enabled, but compiler does not support it.")
endif()

else()
set(CMAKE_REQUIRED_FLAGS "-mavx2 -mf16c -mfma")
check_cxx_source_compiles("
#include <immintrin.h>
int main() {
__m256 v = _mm256_setzero_ps();
__m128i h = _mm256_cvtps_ph(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
(void)h;
return 0;
}" HAVE_AVX2_COMPILER)
unset(CMAKE_REQUIRED_FLAGS)

if(HAVE_AVX2_COMPILER)
set(KNN_HAVE_AVX2_F16C ON)
set(SIMD_OPT_LEVEL "avx2") # Keep optimization level as avx2 to improve performance on Linux and Mac.
set(SIMD_FLAGS -mavx2 -mf16c -mfma)
add_definitions(-DKNN_HAVE_AVX2_F16C)
message(STATUS "[SIMD] AVX2 + F16C supported by compiler.")
else()
message(FATAL_ERROR "[SIMD] AVX2 + F16C was explicitly enabled, but compiler does not support it.")
endif()
endif()

# Fallback if nothing matched
if(SIMD_OPT_LEVEL STREQUAL "")
message(WARNING "[SIMD] No SIMD support detected or all SIMD options disabled. Falling back to Java encoding/decoding.")
set(SIMD_OPT_LEVEL "generic")
set(SIMD_FLAGS "")
endif()

# Always-used flags
set(FP16_SIMD_FLAGS "-O3" "-fPIC")
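
The KNN_HAVE_* definitions added above become preprocessor macros in the C++ sources, so downstream code can select a path at compile time. A minimal sketch, assuming only the macro names from this file; the reporting function itself is hypothetical and not part of this PR:

#include <cstdio>

// Reports which SIMD level init-simd.cmake selected at compile time,
// based on the macros it defines via add_definitions.
static const char* compiled_simd_level() {
#if defined(KNN_HAVE_AVX512_SPR)
    return "avx512_spr";
#elif defined(KNN_HAVE_AVX512)
    return "avx512";
#elif defined(KNN_HAVE_AVX2_F16C)
    return "avx2";
#elif defined(KNN_HAVE_ARM_FP16)
    return "neon_fp16";
#else
    return "generic";
#endif
}

int main() {
    std::printf("SIMD level: %s\n", compiled_simd_level());
    return 0;
}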
16 changes: 16 additions & 0 deletions jni/include/memory_util.h
@@ -20,4 +20,20 @@
#define RESTRICT
#endif

#if defined(__GNUC__) || defined(__clang__)
/**
* Generic wrapper for GCC/Clang's __builtin_assume_aligned.
* This tells the compiler that 'ptr' is guaranteed to be aligned to 'align' bytes.
*/
#define BUILTIN_ASSUME_ALIGNED(ptr, align) \
(typeof(ptr))__builtin_assume_aligned((ptr), (align))
#else

/**
* Fallback for other compilers (e.g., MSVC or others without __builtin_assume_aligned).
* Returns the original pointer, relying on explicit aligned intrinsics like _mm512_load_ps.
*/
#define BUILTIN_ASSUME_ALIGNED(ptr, align) (ptr)
#endif

#endif //KNNPLUGIN_JNI_INCLUDE_MEMORY_UTIL_H_
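
A short usage sketch, assuming memory_util.h is included; the function below is hypothetical and only illustrates how the macro communicates an alignment guarantee to the compiler:

#include <cstddef>
#include "memory_util.h"  // assumed include path for the header above

// Hypothetical: the caller promises `data` is 64-byte aligned (e.g. from std::aligned_alloc(64, ...)).
// On GCC/Clang the hint can enable aligned vector loads; on other compilers it is a no-op.
float sum_aligned(const float* data, std::size_t n) {
    const float* p = BUILTIN_ASSUME_ALIGNED(data, 64);
    float acc = 0.0f;
    for (std::size_t i = 0; i < n; ++i) {
        acc += p[i];
    }
    return acc;
}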
37 changes: 37 additions & 0 deletions jni/include/org_opensearch_knn_jni_SimdVectorComputeService.h

Generated file; diff not rendered by default.

13 changes: 13 additions & 0 deletions jni/include/platform_defs.h
@@ -0,0 +1,13 @@
#pragma once

#if defined(__GNUC__) || defined(__clang__)
#define LIKELY(x) (__builtin_expect(!!(x), 1))
#define UNLIKELY(x) (__builtin_expect(!!(x), 0))
#elif defined(_MSC_VER)
// MSVC doesn't have __builtin_expect; just pass through
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif
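
As a usage illustration (the function is hypothetical, not from this PR), LIKELY/UNLIKELY annotate branches whose outcome is strongly skewed on a hot path:

#include <cstddef>
#include "platform_defs.h"  // assumed include path for the header above

// Hypothetical hot-path lookup: the error branch is expected to be rare,
// so UNLIKELY nudges the compiler to lay out the common path as the fall-through.
int first_element_or_default(const int* values, std::size_t count) {
    if (UNLIKELY(values == nullptr || count == 0)) {
        return -1;  // rare error path
    }
    return values[0];  // common path
}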
86 changes: 86 additions & 0 deletions jni/include/simd/similarity_function/similarity_function.h
@@ -0,0 +1,86 @@
#ifndef OPENSEARCH_KNN_SIMD_SIMILARITY_FUNCTION_H
#define OPENSEARCH_KNN_SIMD_SIMILARITY_FUNCTION_H

#include <cstdint>
#include <vector>
#include <memory>
#include "faiss/impl/DistanceComputer.h"

namespace knn_jni::simd::similarity_function {
enum class NativeSimilarityFunctionType {
// Max inner product for FP16.
// Max inner product transforms the raw inner product v into a positive score: v < 0 ? 1 / (1 - v) : 1 + v
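// Worked example (illustrative):
//   v = -3.0 -> 1 / (1 - (-3.0)) = 0.25
//   v =  0.0 -> 1 + 0.0          = 1.0
//   v =  2.5 -> 1 + 2.5          = 3.5
// The mapping is continuous at v = 0, keeps all scores positive, and preserves ranking order.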
FP16_MAXIMUM_INNER_PRODUCT,
// L2 for FP16
FP16_L2
};

struct SimilarityFunction;

struct SimdVectorSearchContext {
// SIMD aligned query bytes
void* queryVectorSimdAligned = nullptr;
// Query vector byte size
int32_t queryVectorByteSize = 0;
// Vector dimension
int32_t dimension = 0;
// Stored vector byte size. Depending on the quantization status, this value can differ from `queryVectorByteSize`.
// For example, for FP16 this value would be 2 * dimension.
int64_t oneVectorByteSize = 0;
// Underlying mmap page table. If the Faiss index is large, it can be covered by several mapped regions.
std::vector<void*> mmapPages;
// Mapped page size for each. mmapPageSizes[i] -> mmapPages[i]'s size
std::vector<int64_t> mmapPageSizes;
// Function type index
int32_t nativeFunctionTypeOrd = -1;
// Similarity function chosen based on `nativeFunctionTypeOrd`.
SimilarityFunction* similarityFunction;
// Faiss distance computation function.
std::unique_ptr<faiss::DistanceComputer> faissFunction;
// Temp buffer which is reset per search.
std::vector<uint8_t> tmpBuffer;

~SimdVectorSearchContext();

// Looks up the internal mapping table, acquires raw pointers to the vectors with the passed vector
// ids, then puts them into `vectors`.
void getVectorPointersInBulk(uint8_t* vectors[], int32_t* internalVectorIds, int32_t numVectors);

// Similar to `getVectorPointersInBulk`, but returns the raw pointer to a single vector.
uint8_t* getVectorPointer(int32_t internalVectorId);
};

// This class's responsibility is to calculate similarity between the query and stored vectors.
// It first saves the search context in a static thread-local object, then uses it during search.
struct SimilarityFunction {
virtual ~SimilarityFunction() = default;

// Saves the information required during search in static thread-local storage.
// The thread-local storage is bounded by O(sizeof(query vector)).
static SimdVectorSearchContext* saveSearchContext(
uint8_t* queryPtr,
int32_t queryByteSize,
int32_t dimension,
int64_t* mmapAddressAndSize,
int32_t numAddressAndSize,
int32_t nativeFunctionTypeOrd);

// Returns the thread-local search context it is holding.
static SimdVectorSearchContext* getSearchContext();

// Given vector ids, calculates similarity in bulk and writes the scores into `scores`.
virtual void calculateSimilarityInBulk(SimdVectorSearchContext* srchContext,
int32_t* internalVectorIds,
float* scores,
int32_t numVectors) = 0;

// Similar to `calculateSimilarityInBulk`, but this targets a single vector and returns a score.
virtual float calculateSimilarity(SimdVectorSearchContext* srchContext, int32_t internalVectorId) = 0;

private:
// Select similarity function based on the function type.
static SimilarityFunction* selectSimilarityFunction(NativeSimilarityFunctionType nativeFunctionType);
};
}

#endif // OPENSEARCH_KNN_SIMD_SIMILARITY_FUNCTION_H
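
Taken together, the declarations suggest a two-step call pattern: register the query and mmap layout once per search, then score candidates against the thread-local context. A minimal sketch under that assumption; the driver function, its arguments, and the enum-to-ordinal cast are illustrative, not part of this PR:

#include <cstdint>
#include "simd/similarity_function/similarity_function.h"

using namespace knn_jni::simd::similarity_function;

// Hypothetical driver showing how the API above appears intended to be used.
// `query`, `mmapLayout`, and the candidate ids are placeholders.
void score_candidates(uint8_t* query, int32_t queryByteSize, int32_t dimension,
                      int64_t* mmapLayout, int32_t numAddressAndSize,
                      int32_t* candidateIds, float* scores, int32_t numCandidates) {
    // 1. Save the per-search state in thread-local storage; the ordinal cast assumes
    //    nativeFunctionTypeOrd mirrors the enum order.
    SimdVectorSearchContext* ctx = SimilarityFunction::saveSearchContext(
        query, queryByteSize, dimension, mmapLayout, numAddressAndSize,
        static_cast<int32_t>(NativeSimilarityFunctionType::FP16_L2));

    // 2. Score all candidates in one bulk call, assuming saveSearchContext has
    //    populated `similarityFunction`.
    ctx->similarityFunction->calculateSimilarityInBulk(ctx, candidateIds, scores, numCandidates);

    // 3. Or score a single candidate.
    float single = ctx->similarityFunction->calculateSimilarity(ctx, candidateIds[0]);
    (void)single;
}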