Skip to content

Commit 08fd6ec

Browse files
bessa75meta-codesync[bot]
authored and committed
add dataset DINO10B (facebookresearch#4686)
Summary: Hi, I am Matthijs Douze's intern Swann Bessa. I made this pull request to add a new dataset we have created recently. It consists of 10 billion patch features from images in the YFCC100M dataset. The features were computed using the facebook/dinov3-vitl16-pretrain-lvd1689m model. I added some bvecs readers and implemented the DatasetDino10B class. Note: I did not use any additional libraries. I just imported os in the vecs_io.py file. Pull Request resolved: facebookresearch#4686 Reviewed By: mdouze Differential Revision: D87897571 Pulled By: pankajsingh88 fbshipit-source-id: 256e1ecb28c2cbb58af3dfb7c3ffe77536047918
1 parent 8cff802 commit 08fd6ec

2 files changed

Lines changed: 195 additions & 1 deletion

File tree

contrib/datasets.py

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import getpass
1010

1111

12-
from .vecs_io import fvecs_read, ivecs_read, bvecs_mmap, fvecs_mmap
12+
from .vecs_io import fvecs_read, ivecs_read, bvecs_mmap, fvecs_mmap, bvecs_iter, bvecs_iter_chunked
1313
from .exhaustive_search import knn
1414

1515

@@ -350,6 +350,78 @@ def get_groundtruth(self, k=None):
350350
gt = gt[:, :k]
351351
return gt
352352

353+
class DatasetDINO10B(Dataset):
    """
    Data from https://dl.fbaipublicfiles.com/large_objects/dino_vitl_10B/
    The dataset contains 10 billion 1024-d vectors extracted from image patches from the YFCC100M dataset, using a Dino-ViT-L 16 model (facebook/dinov3-vitl16-pretrain-lvd1689m).
    The dataset is sharded in multiple chunked .bvecs files. Downloading instructions can be obtained with "wget https://dl.fbaipublicfiles.com/large_objects/dino_vitl_10B/README.md".
    Supported sizes : 100k 200k 500k 1M ... 5B 10B listed in supported_nbs (see __init__).

    Expected layout under ``dataset_basedir + "dino_vitl_10B/"``:
        chunked_base_10B/          database shards (chunk_XXXX.bvecs)
        queries_clean.bvecs        query vectors
        train_queries_99M.bvecs    training vectors
        gts/gts_dino_patch_<nb>_k10.npy   ground truth for a database of size nb
    """

    def __init__(self, nb, ignore_supported=False):
        """
        Args:
            nb: database size, i.e. the number of leading database vectors to use
            ignore_supported: when True, skip the check that nb is one of the
                supported sizes (the ground-truth file path is still derived
                from nb)

        Raises:
            ValueError: if nb is unsupported (and ignore_supported is False) or
                the dataset base directory does not exist
            AssertionError: if the database folder or the query file is missing
        """
        Dataset.__init__(self)
        supported_nbs = [
            100_000, 200_000, 500_000,
            1_000_000, 2_000_000, 5_000_000,
            10_000_000, 20_000_000, 50_000_000,
            100_000_000, 200_000_000, 500_000_000,
            1_000_000_000, 2_000_000_000, 5_000_000_000,
            10_000_000_000,
        ]
        if nb not in supported_nbs and not ignore_supported:
            raise ValueError(f"Unsupported dataset size: {nb}, supported values are: {supported_nbs}")
        if not os.path.exists(dataset_basedir):
            raise ValueError(f"Provided dataset base directory does not exist: {dataset_basedir}")

        # Paths are built by plain concatenation, so dataset_basedir is
        # expected to end with a path separator.
        self.basedir = dataset_basedir + "dino_vitl_10B/"
        self.indexdir = self.basedir + "chunked_base_10B"
        # NOTE(review): these asserts are stripped under `python -O`; they are
        # kept as asserts so the exception type seen by callers is unchanged.
        assert os.path.exists(self.indexdir), f"Index path should exist, check your dataset path: {self.indexdir}"
        self.queriesdir = self.basedir + "queries_clean.bvecs"
        assert os.path.exists(self.queriesdir), f"Queries path should exist as dataset size {nb} is supported: {self.queriesdir}"
        self.gtsdir = self.basedir + "gts/" + "gts_dino_patch_" + str(nb) + "_" + "k10.npy"
        self.train_queriesdir = self.basedir + "train_queries_99M.bvecs"

        self.nb = nb
        self.d = 1024
        self.nq = 100_000
        self.nt = 99_000_000
        self.metric = "L2"

    def get_queries(self):
        """Get all query vectors as a single array"""
        queries = bvecs_mmap(self.queriesdir)
        return sanitize(queries)

    def get_train(self, maxtrain=None):
        """Get the first maxtrain training vectors as a single array

        Raises:
            NotImplementedError: if maxtrain is None or above 10_000_000;
                use train_iterator for larger training sets
        """
        if maxtrain is None or maxtrain > 10_000_000:
            raise NotImplementedError("The training set is potentially too large to fit in RAM (400 GB of data). Please use train_iterator or use maxtrain parameter below 10_000_000 to get the first maxtrain training vectors.")
        return sanitize(bvecs_mmap(self.train_queriesdir)[:maxtrain])

    def get_database(self):
        """Get all database vectors as a single array

        Raises:
            NotImplementedError: if nb is above 10_000_000; use
                database_iterator for larger databases
        """
        if self.nb > 10_000_000:
            raise NotImplementedError("The dataset is potentially too large to fit in RAM. Please use database_iterator or use a dataset size equal to or below 10_000_000.")
        # Asking for a single batch of nb vectors returns the first nb
        # vectors of the sharded database in one array.
        first_batch = next(bvecs_iter_chunked(self.indexdir, batch_size=self.nb))
        return sanitize(first_batch)

    def database_iterator(self, bs=10_000):
        """Iterator over the database of size nb, corresponding to the first nb vectors in the .bvecs file"""
        total_read = 0
        for batch in bvecs_iter_chunked(self.indexdir, batch_size=bs):
            if total_read + batch.shape[0] > self.nb:
                # Trim the last batch so that exactly nb vectors are produced.
                batch = batch[:self.nb - total_read]
            yield sanitize(batch)
            total_read += batch.shape[0]
            if total_read >= self.nb:
                break

    def train_iterator(self, bs=10_000):
        """Iterator over all training query vectors in the .bvecs file"""
        for batch in bvecs_iter(self.train_queriesdir, batch_size=bs):
            yield sanitize(batch)

    def get_groundtruth(self, k=10):
        """Get ground truth from .npy file

        Args:
            k: number of neighbors to keep per query (at most 10); None
                returns every column stored in the file

        Raises:
            NotImplementedError: if k is above 10
        """
        # k=None is accepted for consistency with the other datasets'
        # get_groundtruth(k=None); comparing None > 10 would raise TypeError.
        if k is not None and k > 10:
            raise NotImplementedError("Ground truth files only available for k<=10")
        gts = np.load(self.gtsdir)
        return gts[:, :k]

    def distance(self):
        """Name of the distance used by this dataset"""
        return "euclidean"
353425

354426
def dataset_from_name(dataset='deep1M', download=False):
355427
""" converts a string describing a dataset to a Dataset object
@@ -385,5 +457,9 @@ def dataset_from_name(dataset='deep1M', download=False):
385457
elif dataset == "glove":
386458
return DatasetGlove(download=download)
387459

460+
elif dataset.startswith("dino"):
461+
dbsize = 10_000_000_000 if dataset == "dino10B" else int(dataset[4:])
462+
return DatasetDINO10B(nb=dbsize)
463+
388464
else:
389465
raise RuntimeError("unknown dataset " + dataset)

contrib/vecs_io.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import sys
77
import numpy as np
8+
import os
89

910
"""
1011
I/O functions in fvecs, bvecs, ivecs formats
@@ -58,3 +59,120 @@ def ivecs_write(fname, m):
5859
def fvecs_write(fname, m):
5960
m = m.astype('float32')
6061
ivecs_write(fname, m.view('int32'))
62+
63+
def bvecs_iter(filepath, batch_size=100_000):
    """
    Memory-mapped iterator over a .bvecs file - only loads requested slices into RAM

    Each record of the file is read as a 4-byte little-endian int32 dimension
    followed by that many uint8 components; the dimension of the first record
    is assumed to hold for the whole file.

    Args:
        filepath: path to the .bvecs file
        batch_size: maximum number of vectors per yielded batch (must be > 0)

    Yields:
        read-only uint8 views of shape (n, d) with n <= batch_size; a
        zero-byte file yields nothing

    Raises:
        ValueError: if batch_size is not positive, the header is truncated or
            holds a non-positive dimension, or the file size is not a whole
            number of records
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    file_size = os.path.getsize(filepath)
    if file_size == 0:
        # A zero-byte file encodes zero vectors, and np.memmap cannot map it.
        return

    with open(filepath, 'rb') as f:
        header = f.read(4)
    if len(header) < 4:
        raise ValueError(f"File too short to hold a bvecs header: {filepath}")

    # Plain int so the size arithmetic below cannot overflow an int32.
    dim = int(np.frombuffer(header, dtype='<i4')[0])
    if dim <= 0:
        raise ValueError(f"Invalid vector dimension {dim} in {filepath}")

    bytes_per_vec = 4 + dim
    if file_size % bytes_per_vec != 0:
        raise ValueError(
            f"Size of {filepath} ({file_size} bytes) is not a multiple of "
            f"the record size ({bytes_per_vec} bytes)"
        )
    n_vectors = file_size // bytes_per_vec

    mm = np.memmap(filepath, mode='r', dtype=np.uint8)
    records = mm.reshape(n_vectors, bytes_per_vec)

    for start in range(0, n_vectors, batch_size):
        end = min(start + batch_size, n_vectors)
        # Drop the 4-byte dimension prefix of every record.
        yield records[start:end, 4:]
81+
82+
def _sorted_chunk_paths(chunk_folder):
    """
    Return the paths of the chunk_XXXX.bvecs files in chunk_folder, ordered by
    chunk number.

    Raises:
        ValueError: if no chunk file is found, a filename does not carry an
            integer chunk number, or the numbers are not exactly 0..n-1
    """
    numbered = []
    for entry in os.scandir(chunk_folder):
        name = entry.name
        if not (entry.is_file() and name.startswith("chunk_") and name.endswith(".bvecs")):
            continue
        try:
            number = int(name.split('_')[1].split('.')[0])
        except (IndexError, ValueError):
            raise ValueError(f"Invalid chunk filename format: {name}") from None
        numbered.append((number, entry.path))

    if not numbered:
        raise ValueError(f"No chunk files found in {chunk_folder}")

    # Order by the parsed number, not by path: a lexicographic sort would put
    # chunk_10 before chunk_2 when the names are not zero-padded.
    numbered.sort()
    chunk_numbers = [number for number, _ in numbered]
    expected_chunks = list(range(len(chunk_numbers)))
    if chunk_numbers != expected_chunks:
        missing = set(expected_chunks) - set(chunk_numbers)
        raise ValueError(
            f"Gap detected in chunk sequence! Missing chunks: {sorted(missing)}\n"
            f"Found chunks: {chunk_numbers}\n"
            f"Expected continuous sequence from 0 to {len(chunk_numbers)-1}"
        )
    return [path for _, path in numbered]


def bvecs_iter_chunked(chunk_folder, batch_size=100_000):
    """
    Memory-mapped iterator over chunked .bvecs files.
    Iterates through all chunk files in order (chunk_0000.bvecs, chunk_0001.bvecs, etc.)
    and yields batches of vectors, handling cases where batches span multiple files.

    Args:
        chunk_folder: path to folder containing chunk_XXXX.bvecs files
        batch_size: number of vectors to yield per batch (must be > 0)

    Yields:
        numpy array of shape (batch_size, d) or smaller for last batch.
        Batches that lie inside one chunk are read-only views on its memory
        map; batches spanning several chunks are freshly stacked arrays.

    Raises:
        ValueError: if batch_size is not positive, if there are gaps in the
            chunk sequence, or if a chunk file is empty, truncated or does
            not hold a whole number of records
    """
    if batch_size <= 0:
        # A zero batch size would otherwise loop forever on empty batches.
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    chunk_files = _sorted_chunk_paths(chunk_folder)

    # Get dimension from first chunk; it is assumed to hold for all chunks.
    with open(chunk_files[0], 'rb') as f:
        header = f.read(4)
    if len(header) < 4:
        raise ValueError(f"Chunk file too short to hold a bvecs header: {chunk_files[0]}")
    # Plain int so the size arithmetic below cannot overflow an int32.
    dim = int(np.frombuffer(header, dtype='<i4')[0])
    if dim <= 0:
        raise ValueError(f"Invalid vector dimension {dim} in {chunk_files[0]}")
    bytes_per_vec = 4 + dim

    # Pieces of a batch that spans chunk boundaries; stacked once on yield
    # instead of being re-copied every time a chunk is appended.
    pending = []
    pending_size = 0

    for chunk_path in chunk_files:
        file_size = os.path.getsize(chunk_path)
        if file_size == 0 or file_size % bytes_per_vec != 0:
            raise ValueError(
                f"Size of {chunk_path} ({file_size} bytes) is not a positive "
                f"multiple of the record size ({bytes_per_vec} bytes)"
            )
        n_vectors = file_size // bytes_per_vec

        # Memory-map this chunk and skip the dimension prefix of each record
        mm = np.memmap(chunk_path, mode='r', dtype=np.uint8)
        vectors = mm.reshape(n_vectors, bytes_per_vec)[:, 4:]

        start = 0

        # First, handle any buffered data from previous chunks
        if pending:
            needed = batch_size - pending_size
            if needed > n_vectors:
                # Still not enough, accumulate and continue to next chunk
                pending.append(np.array(vectors))
                pending_size += n_vectors
                continue
            # Can complete the buffered batch with this chunk
            pending.append(vectors[:needed])
            yield np.vstack(pending)
            pending = []
            pending_size = 0
            start = needed

        # Now process complete batches from this chunk
        while start + batch_size <= n_vectors:
            yield vectors[start:start + batch_size]
            start += batch_size

        # Keep the tail as an in-memory copy so it does not pin the mapping
        if start < n_vectors:
            pending.append(np.array(vectors[start:]))
            pending_size = n_vectors - start

    if pending:
        yield np.vstack(pending)

0 commit comments

Comments
 (0)