@@ -267,57 +267,118 @@ pub const TurboQuant = struct {
     }
 
     /// Pre-rotate a query vector: y_q = Π · q. Caller owns the returned slice.
-    /// Call this ONCE per search, then use scanL2Rotated/scanDotRotated per vector.
+    /// Call this ONCE per search, then use buildL2Table/buildDotTable + scanWithTable per vector.
     pub fn rotateQuery(self: *const TurboQuant, allocator: Allocator, query: []const f32) ![]f32 {
         const d: usize = @intCast(self.dims);
         const yq = try allocator.alloc(f32, d);
         matvecMul(self.rotation, query, yq, d);
         return yq;
     }
 
-    /// Fast L2 distance from pre-rotated query to a quantized vector.
-    /// O(d) with no rotation — just codebook lookups + SIMD subtraction.
-    pub fn scanL2Rotated(self: *const TurboQuant, rotated_query: []const f32, quantized: []const u8) f32 {
+    /// Build an ADC (Asymmetric Distance Computation) lookup table for L2.
+    /// dist_table[j * nc + k] = (rotated_query[j] - centroid[k])^2
+    /// Call ONCE per query, then scanWithTable is a trivial table lookup per coordinate.
+    pub fn buildL2Table(self: *const TurboQuant, allocator: Allocator, rotated_query: []const f32) ![]f32 {
         const d: usize = @intCast(self.dims);
-        const bw = self.bit_width;
+        const nc: usize = self.num_centroids;
         const cb = self.codebook;
-        var j: usize = 0;
-        var v_sum: V8 = @splat(0.0);
-        while (j + 8 <= d) : (j += 8) {
-            const yq_v: V8 = rotated_query[j..][0..8].*;
-            var cb_v: V8 = undefined;
-            inline for (0..8) |k| {
-                cb_v[k] = cb[unpackBits(quantized, j + k, bw)];
+        const table = try allocator.alloc(f32, d * nc);
+        for (0..d) |j| {
+            const qj = rotated_query[j];
+            for (0..nc) |k| {
+                const diff = qj - cb[k];
+                table[j * nc + k] = diff * diff;
             }
-            const diff = yq_v - cb_v;
-            v_sum += diff * diff;
         }
-        var sum = @reduce(.Add, v_sum);
-        while (j < d) : (j += 1) {
+        return table;
+    }
+
+    /// Build an ADC lookup table for dot product.
+    /// dot_table[j * nc + k] = rotated_query[j] * centroid[k]
+    pub fn buildDotTable(self: *const TurboQuant, allocator: Allocator, rotated_query: []const f32) ![]f32 {
+        const d: usize = @intCast(self.dims);
+        const nc: usize = self.num_centroids;
+        const cb = self.codebook;
+        const table = try allocator.alloc(f32, d * nc);
+        for (0..d) |j| {
+            const qj = rotated_query[j];
+            for (0..nc) |k| {
+                table[j * nc + k] = qj * cb[k];
+            }
+        }
+        return table;
+    }
+
+    /// Ultra-fast scan: just table lookups per coordinate. O(d) with tiny constant.
+    /// For 4-bit: 2 coords per byte, each is a table lookup + accumulate.
+    pub fn scanWithTable(self: *const TurboQuant, table: []const f32, quantized: []const u8) f32 {
+        const d: usize = @intCast(self.dims);
+        const nc: usize = self.num_centroids;
+        const bw = self.bit_width;
+        var sum: f32 = 0;
+
+        if (bw == 4) {
+            // Fast path for 4-bit: 2 nibbles per byte, one mask/shift each instead of a generic unpackBits call
+            var j: usize = 0;
+            var byte_idx: usize = 0;
+            while (j + 2 <= d) : ({
+                j += 2;
+                byte_idx += 1;
+            }) {
+                const b = quantized[byte_idx];
+                const lo: usize = b & 0x0F;
+                const hi: usize = (b >> 4) & 0x0F;
+                sum += table[j * nc + lo];
+                sum += table[(j + 1) * nc + hi];
+            }
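+            // Odd d: one trailing coordinate remains in the low nibble of the last byte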
+            if (j < d) {
+                sum += table[j * nc + @as(usize, quantized[byte_idx] & 0x0F)];
+            }
+        } else if (bw == 2) {
+            // Fast path for 2-bit: 4 values per byte
+            var j: usize = 0;
+            var byte_idx: usize = 0;
+            while (j + 4 <= d) : ({
+                j += 4;
+                byte_idx += 1;
+            }) {
+                const b = quantized[byte_idx];
+                sum += table[j * nc + @as(usize, b & 0x03)];
+                sum += table[(j + 1) * nc + @as(usize, (b >> 2) & 0x03)];
+                sum += table[(j + 2) * nc + @as(usize, (b >> 4) & 0x03)];
+                sum += table[(j + 3) * nc + @as(usize, (b >> 6) & 0x03)];
+            }
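+            // Fall back to the generic unpacker for the last d % 4 coordinates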
+            while (j < d) : (j += 1) {
+                sum += table[j * nc + @as(usize, unpackBits(quantized, j, bw))];
+            }
+        } else {
+            // Generic path
+            for (0..d) |j| {
+                sum += table[j * nc + @as(usize, unpackBits(quantized, j, bw))];
+            }
+        }
+        return sum;
+    }
+
+    // Keep the old methods for backward compat / single-vector queries
+    pub fn scanL2Rotated(self: *const TurboQuant, rotated_query: []const f32, quantized: []const u8) f32 {
+        const d: usize = @intCast(self.dims);
+        const bw = self.bit_width;
+        const cb = self.codebook;
+        var sum: f32 = 0;
+        for (0..d) |j| {
             const diff = rotated_query[j] - cb[unpackBits(quantized, j, bw)];
             sum += diff * diff;
         }
         return sum;
     }
 
-    /// Fast dot product from pre-rotated query to a quantized vector.
-    /// O(d) with no rotation — just codebook lookups + SIMD multiply.
     pub fn scanDotRotated(self: *const TurboQuant, rotated_query: []const f32, quantized: []const u8) f32 {
         const d: usize = @intCast(self.dims);
         const bw = self.bit_width;
         const cb = self.codebook;
-        var j: usize = 0;
-        var v_sum: V8 = @splat(0.0);
-        while (j + 8 <= d) : (j += 8) {
-            const yq_v: V8 = rotated_query[j..][0..8].*;
-            var cb_v: V8 = undefined;
-            inline for (0..8) |k| {
-                cb_v[k] = cb[unpackBits(quantized, j + k, bw)];
-            }
-            v_sum += yq_v * cb_v;
-        }
-        var sum = @reduce(.Add, v_sum);
-        while (j < d) : (j += 1) {
+        var sum: f32 = 0;
+        for (0..d) |j| {
             sum += rotated_query[j] * cb[unpackBits(quantized, j, bw)];
         }
         return sum;
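
The intended per-query flow: rotate once, build one table, then every candidate scan is pure lookups. `scanWithTable` works with either table, since both reduce to summing `d` table entries. Below is a minimal sketch of that loop; the `rankCandidates` helper and its parameter names are illustrative, not part of this commit:

```zig
const std = @import("std");

/// Hypothetical driver showing how the new API composes (sketch only).
fn rankCandidates(
    tq: *const TurboQuant,
    allocator: std.mem.Allocator,
    query: []const f32,
    codes: []const []const u8, // one packed code slice per candidate
    out_dists: []f32,
) !void {
    // 1. Rotate once per query: y_q = Π · q.
    const yq = try tq.rotateQuery(allocator, query);
    defer allocator.free(yq);

    // 2. Build the ADC table once per query: O(d * nc) work.
    const table = try tq.buildL2Table(allocator, yq);
    defer allocator.free(table);

    // 3. Scan each candidate with pure table lookups: O(d) each,
    //    no rotation and no centroid math in the inner loop.
    for (codes, 0..) |code, i| {
        out_dists[i] = tq.scanWithTable(table, code);
    }
}
```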
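Sizing note: the table holds `d * nc` f32 values. Assuming `nc = 2^bit_width` (the 4-bit fast path masks indices with `0x0F`, i.e. 16 centroids), a hypothetical 768-dim index spends 768 * 16 * 4 bytes = 48 KiB per query at 4 bits, and 12 KiB at 2 bits, so the whole table stays cache-resident across the candidate scan.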