Skip to content

Commit a71b165

Browse files
authored
uint64 header support for cuvs_bench (#2130)
Closes #2129. This PR adds extended uint64 header support for `bin` data files. The header layout is detected automatically from the file size, so existing files continue to load unchanged. This is useful for datasets whose number of rows exceeds the `uint32` maximum (~4.29B). Authors: - Jinsol Park (https://github.com/jinsolp) Approvers: - Dante Gama Dessavre (https://github.com/dantegd) URL: #2130
1 parent e7a6f59 commit a71b165

7 files changed

Lines changed: 662 additions & 58 deletions

File tree

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
#
2+
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
6+
"""
7+
On-disk header helpers for the cuvs-bench binary file format.
8+
9+
cuvs-bench inherits the big-ann-benchmarks binary layout: a small header
10+
listing ``n_rows`` and ``n_cols`` followed by a dense ``n_rows * n_cols``
11+
array of the dtype implied by the file extension. Two layouts are supported:
12+
13+
- **Legacy**: ``[uint32 n_rows, uint32 n_cols, data ...]`` (8-byte header).
14+
This is what every existing ``.fbin`` / ``.ibin`` / ``.u8bin`` / ``.i8bin``
15+
/ ``.f16bin`` / ``.hbin`` / ``.u64bin`` file on disk uses today.
16+
17+
- **Extended**: ``[uint64 n_rows, uint64 n_cols, data ...]`` (16-byte header).
18+
For datasets whose ``n_rows`` or ``n_cols`` exceeds ``UINT32_MAX`` (~4.29B).
19+
20+
Detection is **size-based**: a well-formed cuvs-bench binary is exactly
21+
``header_bytes + n_rows * n_cols * itemsize`` bytes long. :func:`read_bin_header` reads up to the first 16 bytes
22+
of the file and:
23+
24+
1. Tries the legacy layout (first 8 bytes as two ``uint32``s, 8-byte
25+
header). The layout is accepted if ``8 + n_rows * n_cols * itemsize``
26+
matches the on-disk file size.
27+
2. Otherwise tries the extended layout (first 16 bytes as two
28+
``uint64``s, 16-byte header). Accepted if
29+
``16 + n_rows * n_cols * itemsize`` matches the file size instead.
30+
3. If neither layout matches, raises ``ValueError`` -- the file is
31+
truncated, padded, or has a mismatched dtype extension.
32+
"""
33+
34+
import os
35+
import struct
36+
from typing import BinaryIO, Tuple
37+
38+
import numpy as np
39+
40+
# Largest value an unsigned 32-bit header field can hold; row/column counts
# above this cannot be written with the legacy layout.
UINT32_MAX = 0xFFFFFFFF

# On-disk header sizes in bytes, derived from the little-endian pack formats
# used for each layout: two uint32 fields (legacy) or two uint64 fields
# (extended).
LEGACY_HEADER_BYTES = struct.calcsize("<II")
EXTENDED_HEADER_BYTES = struct.calcsize("<QQ")
44+
45+
46+
def read_bin_header(path: str, itemsize: int) -> Tuple[int, int, int]:
    """Read the header of a cuvs-bench binary file.

    Auto-detects the on-disk layout from the file size by checking which
    of the two layouts (legacy 8-byte uint32 header, extended 16-byte uint64
    header) makes ``file_size == header_bytes + n_rows * n_cols * itemsize``
    balance. The legacy interpretation is tried first; the extended one is
    only tried when the legacy equation does not balance and the file holds
    at least 16 bytes.

    Parameters
    ----------
    path : str
        Path to the binary file.
    itemsize : int
        Per-element size in bytes (e.g. ``4`` for ``float32``, ``1`` for
        ``int8``) used for the size-equation check.

    Returns
    -------
    (n_rows, n_cols, header_bytes) : Tuple[int, int, int]
        Row count, column count, and the number of bytes the header
        occupies on disk (``8`` for legacy, ``16`` for extended).

    Raises
    ------
    ValueError
        If ``itemsize`` is not a positive integer, if the file is shorter
        than a legacy header, or if neither the legacy nor the extended
        interpretation matches the file size.
    FileNotFoundError
        If ``path`` does not exist.
    """
    if itemsize < 1 or itemsize != int(itemsize):
        raise ValueError(
            f"itemsize must be a positive integer, got {itemsize!r}"
        )
    # Use a plain Python int so the size equations below are evaluated with
    # arbitrary precision even when a NumPy integer is passed in; the
    # trial-decoded uint64 fields of a non-extended file can be enormous.
    itemsize = int(itemsize)

    with open(path, "rb") as f:
        # Take the size from the open handle rather than from a second path
        # lookup, so the size and the header bytes describe the same file.
        file_size = os.fstat(f.fileno()).st_size
        head = f.read(EXTENDED_HEADER_BYTES)

    if len(head) < LEGACY_HEADER_BYTES:
        raise ValueError(
            f"File too small to contain a valid header (expected at least "
            f"{LEGACY_HEADER_BYTES} bytes, got {len(head)}): {path}"
        )

    # Legacy layout: two little-endian uint32 fields.
    n_rows_32, n_cols_32 = struct.unpack_from("<II", head)
    if file_size == LEGACY_HEADER_BYTES + n_rows_32 * n_cols_32 * itemsize:
        return int(n_rows_32), int(n_cols_32), LEGACY_HEADER_BYTES

    # Extended layout: two little-endian uint64 fields. Only possible when
    # the file actually contains a full 16-byte header.
    if len(head) == EXTENDED_HEADER_BYTES:
        n_rows_64, n_cols_64 = struct.unpack("<QQ", head)
        if (
            file_size
            == EXTENDED_HEADER_BYTES + n_rows_64 * n_cols_64 * itemsize
        ):
            return int(n_rows_64), int(n_cols_64), EXTENDED_HEADER_BYTES

    raise ValueError(
        f"File size {file_size:,} bytes does not match either the legacy "
        f"(8-byte uint32) or extended (16-byte uint64) header layout for "
        f"itemsize={itemsize}: {path}. The file may be truncated, padded, "
        f"or have a mismatched dtype extension."
    )
107+
108+
109+
def write_bin_header(
    f: BinaryIO,
    n_rows: int,
    n_cols: int,
    *,
    size_dtype=np.uint32,
) -> int:
    """Write the canonical cuvs-bench binary header at the current position.

    The legacy 8-byte uint32 layout is used whenever both ``n_rows`` and
    ``n_cols`` fit in a ``uint32``. The 16-byte uint64 layout is used
    otherwise, or when explicitly requested via ``size_dtype=np.uint64``.
    Both fields are written little-endian.

    Parameters
    ----------
    f : BinaryIO
        Open binary file handle, positioned where the header should go.
    n_rows, n_cols : int
        Header values to write. Must be non-negative integers.
    size_dtype : numpy dtype
        ``np.uint32`` for the legacy 8-byte header (default), or
        ``np.uint64`` to force the extended 16-byte header.

    Returns
    -------
    int
        Number of bytes written (``8`` for legacy, ``16`` for extended).

    Raises
    ------
    ValueError
        If ``n_rows`` or ``n_cols`` is negative or not integral.
    struct.error
        If a value does not fit in a ``uint64``.
    """
    if n_rows < 0 or n_cols < 0:
        raise ValueError(
            f"n_rows and n_cols must be non-negative, got ({n_rows}, {n_cols})"
        )
    # Convert once, and refuse values that int() would silently truncate
    # (e.g. 2.7 -> 2): a truncated count would make the header disagree
    # with the payload that follows it.
    rows, cols = int(n_rows), int(n_cols)
    if rows != n_rows or cols != n_cols:
        raise ValueError(
            f"n_rows and n_cols must be integers, got ({n_rows!r}, {n_cols!r})"
        )

    # NOTE: any ``size_dtype`` other than uint64 is treated as a request for
    # the legacy layout; oversized values still promote to the extended one.
    use_uint64 = (
        np.dtype(size_dtype) == np.uint64
        or rows > UINT32_MAX
        or cols > UINT32_MAX
    )
    if use_uint64:
        f.write(struct.pack("<QQ", rows, cols))
        return EXTENDED_HEADER_BYTES
    f.write(struct.pack("<II", rows, cols))
    return LEGACY_HEADER_BYTES

python/cuvs_bench/cuvs_bench/backends/_utils.py

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727

2828
import numpy as np
2929

30+
from cuvs_bench._bin_format import read_bin_header
31+
3032

3133
def dtype_from_filename(filename):
3234
"""Map file extension to numpy dtype.
@@ -53,6 +55,8 @@ def dtype_from_filename(filename):
5355
return np.float16
5456
elif ext == ".ibin":
5557
return np.int32
58+
elif ext == ".u64bin":
59+
return np.uint64
5660
elif ext == ".u8bin":
5761
return np.ubyte
5862
elif ext == ".i8bin":
@@ -65,17 +69,18 @@ def load_vectors(path: str, subset_size: Optional[int] = None) -> np.ndarray:
6569
"""
6670
Read a binary vector file into a numpy array.
6771
68-
Supports the standard big-ann-bench binary format used by cuvs-bench
69-
datasets: a 4-byte uint32 ``n_rows``, a 4-byte uint32 ``n_cols``,
70-
followed by ``n_rows * n_cols`` elements of the dtype inferred from
71-
the file extension via ``dtype_from_filename``.
72+
Supports the cuvs-bench binary format with either the legacy 8-byte
73+
``[uint32 n_rows, uint32 n_cols]`` header or the extended 16-byte
74+
``[uint64 n_rows, uint64 n_cols]`` header used for datasets with more
75+
than ``UINT32_MAX`` rows or columns. The layout is auto-detected from
76+
the file size by :func:`cuvs_bench._bin_format.read_bin_header`.
7277
7378
Parameters
7479
----------
7580
path : str
7681
Path to the binary file. The dtype is inferred from the extension:
7782
``.fbin`` (float32), ``.f16bin`` (float16), ``.u8bin`` (uint8),
78-
``.i8bin`` (int8), ``.ibin`` (int32).
83+
``.i8bin`` (int8), ``.ibin`` (int32), ``.u64bin`` (uint64).
7984
subset_size : Optional[int]
8085
If provided, only the first ``subset_size`` rows are loaded.
8186
@@ -93,27 +98,24 @@ def load_vectors(path: str, subset_size: Optional[int] = None) -> np.ndarray:
9398
or the file is truncated.
9499
"""
95100
dtype = dtype_from_filename(path)
96-
if subset_size is not None and subset_size < 1:
101+
itemsize = np.dtype(dtype).itemsize
102+
if subset_size is not None and (
103+
isinstance(subset_size, float) or subset_size < 1
104+
):
97105
raise ValueError(
98106
f"subset_size must be a positive integer, got {subset_size}"
99107
)
108+
n_rows, n_cols, header_bytes = read_bin_header(path, itemsize)
109+
if subset_size is not None:
110+
n_rows = min(n_rows, subset_size)
100111
with open(path, "rb") as f:
101-
header = f.read(8)
102-
if len(header) < 8:
103-
raise ValueError(
104-
f"File too small to contain a valid header (expected 8 bytes, "
105-
f"got {len(header)}): {path}"
106-
)
107-
n_rows = int(np.frombuffer(header[:4], dtype=np.uint32)[0])
108-
n_cols = int(np.frombuffer(header[4:], dtype=np.uint32)[0])
109-
if subset_size is not None:
110-
n_rows = min(n_rows, subset_size)
111-
expected_bytes = n_rows * n_cols * np.dtype(dtype).itemsize
112+
f.seek(header_bytes)
113+
expected_bytes = n_rows * n_cols * itemsize
112114
raw = f.read(expected_bytes)
113115
if len(raw) < expected_bytes:
114116
raise ValueError(
115117
f"File is truncated: expected {expected_bytes} bytes of data "
116-
f"({n_rows} rows x {n_cols} cols x {np.dtype(dtype).itemsize} bytes), "
118+
f"({n_rows} rows x {n_cols} cols x {itemsize} bytes), "
117119
f"got {len(raw)}: {path}"
118120
)
119121
data = np.frombuffer(raw, dtype=dtype)

python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,14 @@
99
import sys
1010
import warnings
1111

12-
from .utils import memmap_bin_file, suffix_from_dtype, write_bin
12+
from .utils import (
13+
groundtruth_neighbors_filename,
14+
memmap_bin_file,
15+
offset_neighbor_indices,
16+
suffix_from_dtype,
17+
write_bin,
18+
write_groundtruth_neighbors,
19+
)
1320

1421

1522
def import_with_fallback(primary_lib, secondary_lib=None, alias=None):
@@ -193,7 +200,7 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"):
193200
D, Ind = cpu_search(X, queries, k, metric=metric)
194201

195202
D, Ind = xp.asarray(D), xp.asarray(Ind)
196-
Ind += i # shift neighbor index by offset i
203+
Ind = offset_neighbor_indices(Ind, i, n_samples)
197204

198205
if distances is None:
199206
distances = D
@@ -358,9 +365,11 @@ def main():
358365
print("Calculating true nearest neighbors")
359366
distances, indices = calc_truth(dataset, queries, args.k, args.metric)
360367

361-
write_bin(
362-
os.path.join(args.output, "groundtruth.neighbors.ibin"),
363-
indices.astype(xp.uint32),
368+
n_base = dataset.shape[0]
369+
write_groundtruth_neighbors(
370+
os.path.join(args.output, groundtruth_neighbors_filename(n_base)),
371+
indices,
372+
n_base,
364373
)
365374
write_bin(
366375
os.path.join(args.output, "groundtruth.distances.fbin"),

0 commit comments

Comments
 (0)