Skip to content

Commit a77f422

Browse files
authored
Merge pull request #1810 from tursodatabase/vector-search-avoid-unaligned-reads
vector-search: avoid unaligned reads
2 parents e673973 + 27b3fc4 commit a77f422

File tree

8 files changed

+188
-104
lines changed

8 files changed

+188
-104
lines changed

libsql-ffi/bundled/SQLite3MultipleCiphers/src/sqlite3.c

Lines changed: 46 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -85673,8 +85673,9 @@ void blobSpotFree(BlobSpot *pBlobSpot);
8567385673

8567485674
/*
8567585675
* Accessor for node binary format
85676-
* - v1 format is the following:
85677-
* [u64 nRowid] [u16 nEdges] [node vector] [edge vector] * nEdges [trash vector] * (nMaxEdges - nEdges) ([u64 legacyField] [u64 edgeId]) * nEdges
85676+
* - default format is the following:
85677+
* [u64 nRowid] [u16 nEdges] [6 byte padding] [node vector] [edge vector] * nEdges [trash vector] * (nMaxEdges - nEdges) ([u32 unused] [f32 distance] [u64 edgeId]) * nEdges
85678+
* Note that the 6-byte padding after nEdges is required to align [node vector] to a word boundary and avoid unaligned reads
8567885679
* Note that the node vector and the edge vector can have different representations (and the edge vector can be smaller in size than the node vector)
8567985680
*/
8568085681
int nodeEdgesMaxCount(const DiskAnnIndex *pIndex);
@@ -85713,9 +85714,11 @@ typedef u8 MetricType;
8571385714
/*
8571485715
* 1 - v1 version; node block format: [node meta] [node vector] [edge vectors] ... [ [u64 unused ] [u64 edge rowid] ] ...
8571585716
* 2 - v2 version; node block format: [node meta] [node vector] [edge vectors] ... [ [u32 unused] [f32 distance] [u64 edge rowid] ] ...
85717+
* 3 - v3 version; node meta is padded to an 8-byte boundary (instead of occupying u64 + u16 bytes, its size is rounded up to u64 + u64)
8571685718
*/
8571785719
#define VECTOR_FORMAT_V1 1
85718-
#define VECTOR_FORMAT_DEFAULT 2
85720+
#define VECTOR_FORMAT_V2 2
85721+
#define VECTOR_FORMAT_DEFAULT 3
8571985722

8572085723
/* type of the vector index */
8572185724
#define VECTOR_INDEX_TYPE_PARAM_ID 2
@@ -212727,8 +212730,6 @@ SQLITE_PRIVATE void sqlite3RegisterVectorFunctions(void){
212727212730
*/
212728212731
#define DISKANN_BLOCK_SIZE_SHIFT 9
212729212732

212730-
#define VECTOR_NODE_METADATA_SIZE (sizeof(u64) + sizeof(u16))
212731-
#define VECTOR_EDGE_METADATA_SIZE (sizeof(u64) + sizeof(u64))
212732212733

212733212734
typedef struct VectorPair VectorPair;
212734212735
typedef struct DiskAnnSearchCtx DiskAnnSearchCtx;
@@ -212951,46 +212952,58 @@ void blobSpotFree(BlobSpot *pBlobSpot) {
212951212952
** Layout specific utilities
212952212953
**************************************************************************/
212953212954

212954-
int nodeEdgeOverhead(int nEdgeVectorSize){
212955-
return nEdgeVectorSize + VECTOR_EDGE_METADATA_SIZE;
212955+
int nodeMetadataSize(int nFormatVersion){
212956+
if( nFormatVersion <= VECTOR_FORMAT_V2 ){
212957+
return (sizeof(u64) + sizeof(u16));
212958+
}else{
212959+
return (sizeof(u64) + sizeof(u64));
212960+
}
212961+
}
212962+
212963+
int edgeMetadataSize(int nFormatVersion){
212964+
return (sizeof(u64) + sizeof(u64));
212965+
}
212966+
212967+
int nodeEdgeOverhead(int nFormatVersion, int nEdgeVectorSize){
212968+
return nEdgeVectorSize + edgeMetadataSize(nFormatVersion);
212956212969
}
212957212970

212958-
int nodeOverhead(int nNodeVectorSize){
212959-
return nNodeVectorSize + VECTOR_NODE_METADATA_SIZE;
212971+
int nodeOverhead(int nFormatVersion, int nNodeVectorSize){
212972+
return nNodeVectorSize + nodeMetadataSize(nFormatVersion);
212960212973
}
212961212974

212962212975
int nodeEdgesMaxCount(const DiskAnnIndex *pIndex){
212963-
unsigned int nMaxEdges = (pIndex->nBlockSize - nodeOverhead(pIndex->nNodeVectorSize)) / nodeEdgeOverhead(pIndex->nEdgeVectorSize);
212976+
unsigned int nMaxEdges = (pIndex->nBlockSize - nodeOverhead(pIndex->nFormatVersion, pIndex->nNodeVectorSize)) / nodeEdgeOverhead(pIndex->nFormatVersion, pIndex->nEdgeVectorSize);
212964212977
assert( nMaxEdges > 0);
212965212978
return nMaxEdges;
212966212979
}
212967212980

212968212981
int nodeEdgesMetadataOffset(const DiskAnnIndex *pIndex){
212969212982
unsigned int offset;
212970212983
unsigned int nMaxEdges = nodeEdgesMaxCount(pIndex);
212971-
offset = VECTOR_NODE_METADATA_SIZE + pIndex->nNodeVectorSize + nMaxEdges * pIndex->nEdgeVectorSize;
212984+
offset = nodeMetadataSize(pIndex->nFormatVersion) + pIndex->nNodeVectorSize + nMaxEdges * pIndex->nEdgeVectorSize;
212972212985
assert( offset <= pIndex->nBlockSize );
212973212986
return offset;
212974212987
}
212975212988

212976212989
void nodeBinInit(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, u64 nRowid, Vector *pVector){
212977-
assert( VECTOR_NODE_METADATA_SIZE + pIndex->nNodeVectorSize <= pBlobSpot->nBufferSize );
212990+
assert( nodeMetadataSize(pIndex->nFormatVersion) + pIndex->nNodeVectorSize <= pBlobSpot->nBufferSize );
212978212991

212979212992
memset(pBlobSpot->pBuffer, 0, pBlobSpot->nBufferSize);
212980212993
writeLE64(pBlobSpot->pBuffer, nRowid);
212981212994
// neighbours count already zero after memset - no need to set it explicitly
212982212995

212983-
vectorSerializeToBlob(pVector, pBlobSpot->pBuffer + VECTOR_NODE_METADATA_SIZE, pIndex->nNodeVectorSize);
212996+
vectorSerializeToBlob(pVector, pBlobSpot->pBuffer + nodeMetadataSize(pIndex->nFormatVersion), pIndex->nNodeVectorSize);
212984212997
}
212985212998

212986212999
void nodeBinVector(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, Vector *pVector) {
212987-
assert( VECTOR_NODE_METADATA_SIZE + pIndex->nNodeVectorSize <= pBlobSpot->nBufferSize );
213000+
assert( nodeMetadataSize(pIndex->nFormatVersion) + pIndex->nNodeVectorSize <= pBlobSpot->nBufferSize );
212988213001

212989-
vectorInitStatic(pVector, pIndex->nNodeVectorType, pIndex->nVectorDims, pBlobSpot->pBuffer + VECTOR_NODE_METADATA_SIZE);
213002+
vectorInitStatic(pVector, pIndex->nNodeVectorType, pIndex->nVectorDims, pBlobSpot->pBuffer + nodeMetadataSize(pIndex->nFormatVersion));
212990213003
}
212991213004

212992213005
u16 nodeBinEdges(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot) {
212993-
assert( VECTOR_NODE_METADATA_SIZE <= pBlobSpot->nBufferSize );
213006+
assert( nodeMetadataSize(pIndex->nFormatVersion) <= pBlobSpot->nBufferSize );
212994213007

212995213008
return readLE16(pBlobSpot->pBuffer + sizeof(u64));
212996213009
}
@@ -213000,20 +213013,20 @@ void nodeBinEdge(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, int iEdg
213000213013
int offset = nodeEdgesMetadataOffset(pIndex);
213001213014

213002213015
if( pRowid != NULL ){
213003-
assert( offset + (iEdge + 1) * VECTOR_EDGE_METADATA_SIZE <= pBlobSpot->nBufferSize );
213004-
*pRowid = readLE64(pBlobSpot->pBuffer + offset + iEdge * VECTOR_EDGE_METADATA_SIZE + sizeof(u64));
213016+
assert( offset + (iEdge + 1) * edgeMetadataSize(pIndex->nFormatVersion) <= pBlobSpot->nBufferSize );
213017+
*pRowid = readLE64(pBlobSpot->pBuffer + offset + iEdge * edgeMetadataSize(pIndex->nFormatVersion) + sizeof(u64));
213005213018
}
213006213019
if( pIndex->nFormatVersion != VECTOR_FORMAT_V1 && pDistance != NULL ){
213007-
distance = readLE32(pBlobSpot->pBuffer + offset + iEdge * VECTOR_EDGE_METADATA_SIZE + sizeof(u32));
213020+
distance = readLE32(pBlobSpot->pBuffer + offset + iEdge * edgeMetadataSize(pIndex->nFormatVersion) + sizeof(u32));
213008213021
*pDistance = *((float*)&distance);
213009213022
}
213010213023
if( pVector != NULL ){
213011-
assert( VECTOR_NODE_METADATA_SIZE + pIndex->nNodeVectorSize + iEdge * pIndex->nEdgeVectorSize < offset );
213024+
assert( nodeMetadataSize(pIndex->nFormatVersion) + pIndex->nNodeVectorSize + iEdge * pIndex->nEdgeVectorSize < offset );
213012213025
vectorInitStatic(
213013213026
pVector,
213014213027
pIndex->nEdgeVectorType,
213015213028
pIndex->nVectorDims,
213016-
pBlobSpot->pBuffer + VECTOR_NODE_METADATA_SIZE + pIndex->nNodeVectorSize + iEdge * pIndex->nEdgeVectorSize
213029+
pBlobSpot->pBuffer + nodeMetadataSize(pIndex->nFormatVersion) + pIndex->nNodeVectorSize + iEdge * pIndex->nEdgeVectorSize
213017213030
);
213018213031
}
213019213032
}
@@ -213050,11 +213063,11 @@ void nodeBinReplaceEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iRe
213050213063
nEdges++;
213051213064
}
213052213065

213053-
edgeVectorOffset = VECTOR_NODE_METADATA_SIZE + pIndex->nNodeVectorSize + iReplace * pIndex->nEdgeVectorSize;
213054-
edgeMetaOffset = nodeEdgesMetadataOffset(pIndex) + iReplace * VECTOR_EDGE_METADATA_SIZE;
213066+
edgeVectorOffset = nodeMetadataSize(pIndex->nFormatVersion) + pIndex->nNodeVectorSize + iReplace * pIndex->nEdgeVectorSize;
213067+
edgeMetaOffset = nodeEdgesMetadataOffset(pIndex) + iReplace * edgeMetadataSize(pIndex->nFormatVersion);
213055213068

213056213069
assert( edgeVectorOffset + pIndex->nEdgeVectorSize <= pBlobSpot->nBufferSize );
213057-
assert( edgeMetaOffset + VECTOR_EDGE_METADATA_SIZE <= pBlobSpot->nBufferSize );
213070+
assert( edgeMetaOffset + edgeMetadataSize(pIndex->nFormatVersion) <= pBlobSpot->nBufferSize );
213058213071

213059213072
vectorSerializeToBlob(pVector, pBlobSpot->pBuffer + edgeVectorOffset, pIndex->nEdgeVectorSize);
213060213073
writeLE32(pBlobSpot->pBuffer + edgeMetaOffset + sizeof(u32), *((u32*)&distance));
@@ -213070,19 +213083,19 @@ void nodeBinDeleteEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iDel
213070213083

213071213084
assert( 0 <= iDelete && iDelete < nEdges );
213072213085

213073-
edgeVectorOffset = VECTOR_NODE_METADATA_SIZE + pIndex->nNodeVectorSize + iDelete * pIndex->nEdgeVectorSize;
213074-
lastVectorOffset = VECTOR_NODE_METADATA_SIZE + pIndex->nNodeVectorSize + (nEdges - 1) * pIndex->nEdgeVectorSize;
213075-
edgeMetaOffset = nodeEdgesMetadataOffset(pIndex) + iDelete * VECTOR_EDGE_METADATA_SIZE;
213076-
lastMetaOffset = nodeEdgesMetadataOffset(pIndex) + (nEdges - 1) * VECTOR_EDGE_METADATA_SIZE;
213086+
edgeVectorOffset = nodeMetadataSize(pIndex->nFormatVersion) + pIndex->nNodeVectorSize + iDelete * pIndex->nEdgeVectorSize;
213087+
lastVectorOffset = nodeMetadataSize(pIndex->nFormatVersion) + pIndex->nNodeVectorSize + (nEdges - 1) * pIndex->nEdgeVectorSize;
213088+
edgeMetaOffset = nodeEdgesMetadataOffset(pIndex) + iDelete * edgeMetadataSize(pIndex->nFormatVersion);
213089+
lastMetaOffset = nodeEdgesMetadataOffset(pIndex) + (nEdges - 1) * edgeMetadataSize(pIndex->nFormatVersion);
213077213090

213078213091
assert( edgeVectorOffset + pIndex->nEdgeVectorSize <= pBlobSpot->nBufferSize );
213079213092
assert( lastVectorOffset + pIndex->nEdgeVectorSize <= pBlobSpot->nBufferSize );
213080-
assert( edgeMetaOffset + VECTOR_EDGE_METADATA_SIZE <= pBlobSpot->nBufferSize );
213081-
assert( lastMetaOffset + VECTOR_EDGE_METADATA_SIZE <= pBlobSpot->nBufferSize );
213093+
assert( edgeMetaOffset + edgeMetadataSize(pIndex->nFormatVersion) <= pBlobSpot->nBufferSize );
213094+
assert( lastMetaOffset + edgeMetadataSize(pIndex->nFormatVersion) <= pBlobSpot->nBufferSize );
213082213095

213083213096
if( edgeVectorOffset < lastVectorOffset ){
213084213097
memmove(pBlobSpot->pBuffer + edgeVectorOffset, pBlobSpot->pBuffer + lastVectorOffset, pIndex->nEdgeVectorSize);
213085-
memmove(pBlobSpot->pBuffer + edgeMetaOffset, pBlobSpot->pBuffer + lastMetaOffset, VECTOR_EDGE_METADATA_SIZE);
213098+
memmove(pBlobSpot->pBuffer + edgeMetaOffset, pBlobSpot->pBuffer + lastMetaOffset, edgeMetadataSize(pIndex->nFormatVersion));
213086213099
}
213087213100

213088213101
writeLE16(pBlobSpot->pBuffer + sizeof(u64), nEdges - 1);
@@ -213168,9 +213181,9 @@ int diskAnnCreateIndex(
213168213181
if( maxNeighborsParam == 0 ){
213169213182
// 3 * sqrt(D) neighbours, where D is the number of vector dimensions, gives good recall values (90%+)
213170213183
// we also want to keep disk overhead at a moderate level - a 50x increase in disk size is the current upper bound
213171-
maxNeighborsParam = MIN(3 * ((int)(sqrt(dims)) + 1), (50 * nodeOverhead(vectorDataSize(type, dims))) / nodeEdgeOverhead(vectorDataSize(neighbours, dims)) + 1);
213184+
maxNeighborsParam = MIN(3 * ((int)(sqrt(dims)) + 1), (50 * nodeOverhead(VECTOR_FORMAT_DEFAULT, vectorDataSize(type, dims))) / nodeEdgeOverhead(VECTOR_FORMAT_DEFAULT, vectorDataSize(neighbours, dims)) + 1);
213172213185
}
213173-
blockSizeBytes = nodeOverhead(vectorDataSize(type, dims)) + maxNeighborsParam * (u64)nodeEdgeOverhead(vectorDataSize(neighbours, dims));
213186+
blockSizeBytes = nodeOverhead(VECTOR_FORMAT_DEFAULT, vectorDataSize(type, dims)) + maxNeighborsParam * (u64)nodeEdgeOverhead(VECTOR_FORMAT_DEFAULT, vectorDataSize(neighbours, dims));
213174213187
if( blockSizeBytes > DISKANN_MAX_BLOCK_SZ ){
213175213188
return SQLITE_ERROR;
213176213189
}

libsql-ffi/bundled/bindings/bindgen.rs

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -940,7 +940,7 @@ extern "C" {
940940
extern "C" {
941941
pub fn sqlite3_vmprintf(
942942
arg1: *const ::std::os::raw::c_char,
943-
arg2: va_list,
943+
arg2: *mut __va_list_tag,
944944
) -> *mut ::std::os::raw::c_char;
945945
}
946946
extern "C" {
@@ -956,7 +956,7 @@ extern "C" {
956956
arg1: ::std::os::raw::c_int,
957957
arg2: *mut ::std::os::raw::c_char,
958958
arg3: *const ::std::os::raw::c_char,
959-
arg4: va_list,
959+
arg4: *mut __va_list_tag,
960960
) -> *mut ::std::os::raw::c_char;
961961
}
962962
extern "C" {
@@ -2503,7 +2503,7 @@ extern "C" {
25032503
pub fn sqlite3_str_vappendf(
25042504
arg1: *mut sqlite3_str,
25052505
zFormat: *const ::std::os::raw::c_char,
2506-
arg2: va_list,
2506+
arg2: *mut __va_list_tag,
25072507
);
25082508
}
25092509
extern "C" {
@@ -3570,4 +3570,12 @@ extern "C" {
35703570
extern "C" {
35713571
pub static sqlite3_wal_manager: libsql_wal_manager;
35723572
}
3573-
pub type __builtin_va_list = *mut ::std::os::raw::c_char;
3573+
pub type __builtin_va_list = [__va_list_tag; 1usize];
3574+
#[repr(C)]
3575+
#[derive(Debug, Copy, Clone)]
3576+
pub struct __va_list_tag {
3577+
pub gp_offset: ::std::os::raw::c_uint,
3578+
pub fp_offset: ::std::os::raw::c_uint,
3579+
pub overflow_arg_area: *mut ::std::os::raw::c_void,
3580+
pub reg_save_area: *mut ::std::os::raw::c_void,
3581+
}

0 commit comments

Comments
 (0)