add dataset DINO10B (facebookresearch#4686)

bessa75 · meta-codesync[bot] · commit 08fd6eca7027 · 2025-12-01T16:35:41.000-08:00
Summary: Hi, I am Matthijs Douze's intern Swann Bessa. I made this pull request to add a new dataset we have created recently. It consists of 10 billion patch features from images in the YFCC100M dataset. The features were computed using the facebook/dinov3-vitl16-pretrain-lvd1689m model. I added some bvecs readers and implemented the DatasetDINO10B class. Note: I did not use any additional libraries; I only imported os in the vecs_io.py file. Pull Request resolved: facebookresearch#4686 Reviewed By: mdouze Differential Revision: D87897571 Pulled By: pankajsingh88 fbshipit-source-id: 256e1ecb28c2cbb58af3dfb7c3ffe77536047918
diff --git a/contrib/datasets.py b/contrib/datasets.py
@@ -9,7 +9,7 @@
 import getpass
 
 
-from .vecs_io import fvecs_read, ivecs_read, bvecs_mmap, fvecs_mmap
+from .vecs_io import fvecs_read, ivecs_read, bvecs_mmap, fvecs_mmap, bvecs_iter, bvecs_iter_chunked
 from .exhaustive_search import knn
 
 
@@ -350,6 +350,78 @@ def get_groundtruth(self, k=None):
             gt = gt[:, :k]
         return gt
 
+class DatasetDINO10B(Dataset):
+    """
+    Data from https://dl.fbaipublicfiles.com/large_objects/dino_vitl_10B/
+    The dataset contains 10 billion 1024-d vectors extracted from image patches from the YFCC100M dataset, using a Dino-ViT-L 16 model (facebook/dinov3-vitl16-pretrain-lvd1689m).
+    The dataset is sharded in multiple chunked .bvecs files. Downloading instructions can be obtained with "wget https://dl.fbaipublicfiles.com/large_objects/dino_vitl_10B/README.md".
+    Supported sizes : 100k 200k 500k 1M ... 5B 10B listed in supported_nbs (see __init__).
+
+    Expected layout under <dataset_basedir>/dino_vitl_10B/:
+        chunked_base_10B/          database, as chunk_XXXX.bvecs files
+        queries_clean.bvecs        query vectors
+        train_queries_99M.bvecs    training vectors
+        gts/gts_dino_patch_<nb>_k10.npy   ground truth for a database of size nb
+    """
+    def __init__(self, nb, ignore_supported = False):
+        """
+        Args:
+            nb: number of database vectors to expose (the first nb vectors
+                of the chunked database)
+            ignore_supported: if True, accept an nb that is not listed in
+                supported_nbs (NOTE: the ground-truth path is still derived
+                from nb; presumably no such file exists for unsupported
+                sizes - verify before calling get_groundtruth)
+
+        Raises:
+            ValueError: if nb is not supported (and ignore_supported is
+                False) or if dataset_basedir does not exist
+            AssertionError: if the database directory or the queries file
+                is missing
+        """
+        Dataset.__init__(self)
+        supported_nbs = [100_000, 200_000, 500_000, 1_000_000, 2_000_000, 5_000_000, 10_000_000, 20_000_000, 50_000_000, 100_000_000, 200_000_000, 500_000_000, 1_000_000_000, 2_000_000_000, 5_000_000_000, 10_000_000_000]
+        if nb not in supported_nbs and not ignore_supported:
+            raise ValueError(f"Unsupported dataset size: {nb}, supported values are: {supported_nbs}")
+        if not os.path.exists(dataset_basedir):
+            raise ValueError(f"Provided dataset base directory does not exist: {dataset_basedir}")
+        # os.path.join copes with a dataset_basedir that has no trailing
+        # separator; the final "" keeps basedir itself separator-terminated
+        # so the plain concatenations below remain valid.
+        self.basedir = os.path.join(dataset_basedir, "dino_vitl_10B", "")
+        self.indexdir = self.basedir + "chunked_base_10B"
+        assert os.path.exists(self.indexdir), f"Index path should exist, check your dataset path: {self.indexdir}"
+        self.queriesdir = self.basedir + "queries_clean.bvecs"
+        assert os.path.exists(self.queriesdir), f"Queries path should exist as dataset size {nb} is supported: {self.queriesdir}"
+        self.gtsdir = self.basedir + "gts/" + "gts_dino_patch_" + str(nb) + "_" + "k10.npy"
+        self.train_queriesdir = self.basedir + "train_queries_99M.bvecs"
+        self.nb = nb
+        # vector dimension
+        self.d = 1024
+        # number of query vectors
+        self.nq = 100_000
+        # number of training vectors
+        self.nt = 99_000_000
+        self.metric = "L2"
+
+
+    def get_queries(self):
+        """Get all query vectors as a single array"""
+        queries = bvecs_mmap(self.queriesdir)
+        return sanitize(queries)
+
+    def get_train(self, maxtrain = None):
+        """
+        Get the first maxtrain training vectors as a single array.
+
+        Raises NotImplementedError when maxtrain is None or above
+        10_000_000; use train_iterator to stream the full training set.
+        """
+        if maxtrain is None or maxtrain > 10_000_000:
+            raise NotImplementedError("The training set is potentially too large to fit in RAM (400 GB of data). Please use train_iterator or use maxtrain parameter below 10_000_000 to get the first maxtrain training vectors.")
+        return sanitize(bvecs_mmap(self.train_queriesdir)[:maxtrain])
+
+    def get_database(self):
+        """
+        Get all nb database vectors as a single array.
+
+        Raises NotImplementedError when nb is above 10_000_000; use
+        database_iterator for larger sizes.
+        """
+        if self.nb > 10_000_000:
+            raise NotImplementedError("The dataset is potentially too large to fit in RAM. Please use database_iterator or use a dataset size equal to or below 10_000_000.")
+        # A single batch of size nb is exactly the first nb database vectors.
+        first_batch = next(bvecs_iter_chunked(self.indexdir, batch_size=self.nb))
+        return sanitize(first_batch)
+
+    def database_iterator(self, bs=10_000):
+        """
+        Iterator over the database of size nb, corresponding to the first nb
+        vectors of the chunked .bvecs files, in batches of at most bs vectors.
+        Yields nothing when nb <= 0.
+        """
+        remaining = self.nb
+        if remaining <= 0:
+            return
+        for batch in bvecs_iter_chunked(self.indexdir, batch_size=bs):
+            # Trim the batch that crosses the nb boundary.
+            batch = batch[:remaining]
+            yield sanitize(batch)
+            remaining -= batch.shape[0]
+            # Stop right away so the next batch is never read from disk.
+            if remaining <= 0:
+                break
+
+    def train_iterator(self, bs=10_000):
+        """Iterator over all training vectors in the .bvecs file, in batches of at most bs vectors"""
+        for batch in bvecs_iter(self.train_queriesdir, batch_size=bs):
+            yield sanitize(batch)
+
+    def get_groundtruth(self, k = 10):
+        """
+        Get ground truth from the .npy file, restricted to the first k
+        columns. k=None returns every stored column.
+
+        Raises NotImplementedError when k > 10.
+        """
+        if k is not None and k > 10:
+            raise NotImplementedError("Ground truth files only available for k<=10")
+        gts = np.load(self.gtsdir)
+        # A slice bound of None keeps all columns.
+        gts = gts[:, :k]
+        return gts
+
+    def distance(self):
+        """Name of the distance used by this dataset"""
+        return "euclidean"
 
 def dataset_from_name(dataset='deep1M', download=False):
     """ converts a string describing a dataset to a Dataset object
@@ -385,5 +457,9 @@ def dataset_from_name(dataset='deep1M', download=False):
     elif dataset == "glove":
         return DatasetGlove(download=download)
 
+    elif dataset.startswith("dino"):
+        dbsize = 10_000_000_000 if dataset == "dino10B" else int(dataset[4:])
+        return DatasetDINO10B(nb=dbsize)
+
     else:
         raise RuntimeError("unknown dataset " + dataset)
diff --git a/contrib/vecs_io.py b/contrib/vecs_io.py
@@ -5,6 +5,7 @@
 
 import sys
 import numpy as np
+import os
 
 """
 I/O functions in fvecs, bvecs, ivecs formats
@@ -58,3 +59,120 @@ def ivecs_write(fname, m):
 def fvecs_write(fname, m):
     m = m.astype('float32')
     ivecs_write(fname, m.view('int32'))
+
+def bvecs_iter(filepath, batch_size=100_000):
+    """
+    Memory-mapped iterator - only loads requested slices into RAM
+
+    The file is read as a sequence of fixed-size records, each made of a
+    4-byte little-endian int32 dimension followed by that many uint8
+    components. The dimension is taken from the first record only.
+
+    Args:
+        filepath: path to the .bvecs file
+        batch_size: maximum number of vectors per batch (must be >= 1)
+
+    Yields:
+        uint8 views onto the memory map of shape (batch_size, d), or
+        smaller for the last batch
+
+    Raises:
+        ValueError: if batch_size < 1, if the file is too short to hold a
+            header, if the stored dimension is not positive, or if the file
+            size is not a multiple of the record size. As this is a
+            generator, the error is raised when iteration starts.
+    """
+
+    if batch_size < 1:
+        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
+
+    file_size = os.path.getsize(filepath)
+    with open(filepath, 'rb') as f:
+        header = f.read(4)
+    if len(header) < 4:
+        raise ValueError(f"File too short to contain a bvecs header: {filepath}")
+
+    # Plain Python int, so the size arithmetic below cannot overflow int32
+    dim = int(np.frombuffer(header, dtype='<i4')[0])
+    if dim <= 0:
+        raise ValueError(f"Invalid vector dimension {dim} in {filepath}")
+
+    bytes_per_vec = 4 + dim
+    if file_size % bytes_per_vec != 0:
+        raise ValueError(
+            f"Size of {filepath} ({file_size} bytes) is not a multiple of "
+            f"the record size ({bytes_per_vec} bytes)"
+        )
+    n_vectors = file_size // bytes_per_vec
+
+    mm = np.memmap(filepath, mode='r', dtype=np.uint8)
+    records = mm.reshape(n_vectors, bytes_per_vec)
+
+    for start in range(0, n_vectors, batch_size):
+        # Slicing clamps at n_vectors; columns 0-3 hold the dimension prefix
+        yield records[start:start + batch_size, 4:]
+
+def bvecs_iter_chunked(chunk_folder, batch_size=100_000):
+    """
+    Memory-mapped iterator over chunked .bvecs files.
+    Iterates through all chunk files in numeric order (chunk_0000.bvecs,
+    chunk_0001.bvecs, etc.) and yields batches of vectors, handling cases
+    where batches span multiple files.
+
+    Args:
+        chunk_folder: path to folder containing chunk_XXXX.bvecs files
+        batch_size: number of vectors to yield per batch (must be >= 1)
+
+    Yields:
+        numpy array of shape (batch_size, d) or smaller for last batch.
+        Batches that lie inside one chunk are views onto its memory map;
+        batches that span several chunks are freshly allocated arrays.
+
+    Raises:
+        ValueError: if batch_size < 1, if no chunk file is found, if a chunk
+            filename cannot be parsed, if there are gaps in the chunk
+            sequence, or if a chunk file is malformed (truncated header,
+            non-positive or inconsistent dimension, size not a multiple of
+            the record size). As this is a generator, the error is raised
+            during iteration.
+    """
+
+    if batch_size < 1:
+        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
+
+    # Find all chunk files together with their chunk number
+    numbered_chunks = []
+    with os.scandir(chunk_folder) as entries:
+        for entry in entries:
+            if entry.is_file() and entry.name.startswith("chunk_") and entry.name.endswith(".bvecs"):
+                try:
+                    num_str = entry.name.split('_')[1].split('.')[0]
+                    number = int(num_str)
+                except (IndexError, ValueError):
+                    raise ValueError(f"Invalid chunk filename format: {entry.name}")
+                numbered_chunks.append((number, entry.path))
+
+    if not numbered_chunks:
+        raise ValueError(f"No chunk files found in {chunk_folder}")
+
+    # Order by chunk number: sorting the paths as strings would put
+    # chunk_10 before chunk_2 once the numbers outgrow their zero padding.
+    numbered_chunks.sort()
+    chunk_numbers = [number for number, _ in numbered_chunks]
+
+    # Check for gaps in sequence
+    expected_chunks = list(range(len(chunk_numbers)))
+    if chunk_numbers != expected_chunks:
+        missing = set(expected_chunks) - set(chunk_numbers)
+        raise ValueError(
+            f"Gap detected in chunk sequence! Missing chunks: {sorted(missing)}\n"
+            f"Found chunks: {sorted(chunk_numbers)}\n"
+            f"Expected continuous sequence from 0 to {len(chunk_numbers)-1}"
+        )
+
+    dim = None  # set from the first chunk, enforced on the following ones
+
+    # Pieces of an incomplete batch carried across chunk boundaries. They
+    # are stacked once, when the batch is complete, rather than re-stacked
+    # on every chunk.
+    pending = []
+    pending_size = 0
+
+    for _, chunk_path in numbered_chunks:
+        vectors = _open_bvecs_chunk(chunk_path, expected_dim=dim)
+        n_vectors, dim = vectors.shape
+
+        start = 0
+
+        # First, try to complete the batch started in previous chunks
+        if pending:
+            take = min(batch_size - pending_size, n_vectors)
+            pending.append(vectors[:take])
+            pending_size += take
+            start = take
+            if pending_size < batch_size:
+                # Whole chunk consumed and still not enough: go to the next
+                continue
+            yield np.vstack(pending)
+            pending = []
+            pending_size = 0
+
+        # Now process complete batches from this chunk
+        while start + batch_size <= n_vectors:
+            yield vectors[start:start + batch_size]
+            start += batch_size
+
+        # Keep the tail of the chunk for the next batch
+        if start < n_vectors:
+            pending = [vectors[start:]]
+            pending_size = n_vectors - start
+
+    if pending:
+        yield np.vstack(pending)
+
+
+def _open_bvecs_chunk(path, expected_dim=None):
+    """
+    Memory-map one .bvecs file and return its (n, d) uint8 vector view
+    (dimension prefixes stripped). Raises ValueError if the file is
+    malformed or if its dimension differs from expected_dim (when given).
+    """
+    file_size = os.path.getsize(path)
+    with open(path, 'rb') as f:
+        header = f.read(4)
+    if len(header) < 4:
+        raise ValueError(f"File too short to contain a bvecs header: {path}")
+
+    dim = int(np.frombuffer(header, dtype='<i4')[0])
+    if dim <= 0:
+        raise ValueError(f"Invalid vector dimension {dim} in {path}")
+    if expected_dim is not None and dim != expected_dim:
+        raise ValueError(
+            f"Dimension mismatch in {path}: found {dim}, expected {expected_dim}"
+        )
+
+    bytes_per_vec = 4 + dim
+    if file_size % bytes_per_vec != 0:
+        raise ValueError(
+            f"Size of {path} ({file_size} bytes) is not a multiple of "
+            f"the record size ({bytes_per_vec} bytes)"
+        )
+
+    mm = np.memmap(path, mode='r', dtype=np.uint8)
+    records = mm.reshape(file_size // bytes_per_vec, bytes_per_vec)
+    return records[:, 4:]  # Skip dimension prefix