Skip to content

Commit fab71e5

Browse files
committed
Improve: Faster serial Jaccard with SWAR
1 parent ed08bdd commit fab71e5

File tree

2 files changed

+26
-24
lines changed

2 files changed

+26
-24
lines changed

include/simsimd/binary.h

Lines changed: 8 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
*
77
* Contains the following similarity measures:
88
*
9-
* - Bit-level Hamming distance → @c u32 counter
10-
* - Bit-level Jaccard distance (Tanimoto coefficient) → @c f32 ratio
11-
* - Jaccard distance for @c u32 integral MinHash vectors from StringZilla → @c f32 ratio
12-
* - TODO: Weighted Jaccard distance for @c u32 integral Count-Min-Sketch vectors → @c f32 ratio
9+
* - Bit-level Hamming distance → `u32` counter
10+
* - Bit-level Jaccard distance (Tanimoto coefficient) → `f32` ratio
11+
* - Jaccard distance for `u32` integral MinHash vectors from StringZilla → `f32` ratio
12+
* - TODO: Weighted Jaccard distance for `u32` integral Count-Min-Sketch vectors → `f32` ratio
1313
*
1414
* For hardware architectures:
1515
*
@@ -343,26 +343,10 @@ SIMSIMD_INTERNAL void simsimd_jaccard_b128_init_serial(simsimd_jaccard_b128_stat
343343

344344
SIMSIMD_INTERNAL void simsimd_jaccard_b128_update_serial(simsimd_jaccard_b128_state_serial_t *state,
345345
simsimd_b128_vec_t a, simsimd_b128_vec_t b) {
346-
simsimd_u64_t intersection_lo = a.u64s[0] & b.u64s[0];
347-
simsimd_u64_t intersection_hi = a.u64s[1] & b.u64s[1];
348-
349-
// Unrolled lookup table popcount for portability
350-
state->intersection_count += simsimd_popcount_b8((simsimd_b8_t)(intersection_lo));
351-
state->intersection_count += simsimd_popcount_b8((simsimd_b8_t)(intersection_lo >> 8));
352-
state->intersection_count += simsimd_popcount_b8((simsimd_b8_t)(intersection_lo >> 16));
353-
state->intersection_count += simsimd_popcount_b8((simsimd_b8_t)(intersection_lo >> 24));
354-
state->intersection_count += simsimd_popcount_b8((simsimd_b8_t)(intersection_lo >> 32));
355-
state->intersection_count += simsimd_popcount_b8((simsimd_b8_t)(intersection_lo >> 40));
356-
state->intersection_count += simsimd_popcount_b8((simsimd_b8_t)(intersection_lo >> 48));
357-
state->intersection_count += simsimd_popcount_b8((simsimd_b8_t)(intersection_lo >> 56));
358-
state->intersection_count += simsimd_popcount_b8((simsimd_b8_t)(intersection_hi));
359-
state->intersection_count += simsimd_popcount_b8((simsimd_b8_t)(intersection_hi >> 8));
360-
state->intersection_count += simsimd_popcount_b8((simsimd_b8_t)(intersection_hi >> 16));
361-
state->intersection_count += simsimd_popcount_b8((simsimd_b8_t)(intersection_hi >> 24));
362-
state->intersection_count += simsimd_popcount_b8((simsimd_b8_t)(intersection_hi >> 32));
363-
state->intersection_count += simsimd_popcount_b8((simsimd_b8_t)(intersection_hi >> 40));
364-
state->intersection_count += simsimd_popcount_b8((simsimd_b8_t)(intersection_hi >> 48));
365-
state->intersection_count += simsimd_popcount_b8((simsimd_b8_t)(intersection_hi >> 56));
346+
simsimd_u64_t intersection_low = a.u64s[0] & b.u64s[0];
347+
simsimd_u64_t intersection_high = a.u64s[1] & b.u64s[1];
348+
state->intersection_count += _simsimd_u64_popcount(intersection_low);
349+
state->intersection_count += _simsimd_u64_popcount(intersection_high);
366350
}
367351

368352
SIMSIMD_INTERNAL void simsimd_jaccard_b128_finalize_serial(

include/simsimd/types.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1401,6 +1401,24 @@ SIMSIMD_INTERNAL void _simsimd_u64_smul(simsimd_u64_t const *a, simsimd_u64_t co
14011401
else { *r = (hi_lo << 32) + (lo_hi << 32) + lo_lo; } // Combine parts if no overflow
14021402
}
14031403

1404+
/**
1405+
* @brief SWAR population count for 64-bit integers.
1406+
*
1407+
* Classic algorithm from Hacker's Delight using parallel bit summation:
1408+
* - Step 1: Count bits in pairs (2-bit sums)
1409+
* - Step 2: Count bits in nibbles (4-bit sums)
1410+
* - Step 3: Count bits in bytes (8-bit sums)
1411+
* - Step 4: Horizontal sum via multiply - each byte contributes to bits 56-63
1412+
*
1413+
* Cost: ~12 ALU ops, zero memory access (vs 8 table lookups for byte-wise).
1414+
*/
1415+
SIMSIMD_INTERNAL simsimd_u64_t _simsimd_u64_popcount(simsimd_u64_t x) {
1416+
x = x - ((x >> 1) & 0x5555555555555555ull);
1417+
x = (x & 0x3333333333333333ull) + ((x >> 2) & 0x3333333333333333ull);
1418+
x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Full;
1419+
return (x * 0x0101010101010101ull) >> 56;
1420+
}
1421+
14041422
SIMSIMD_INTERNAL void _simsimd_i8_smul(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_i8_t *r) {
14051423
simsimd_i16_t result = (simsimd_i16_t)(*a) * (simsimd_i16_t)(*b);
14061424
*r = (result > 127) ? 127 : (result < -128 ? -128 : (simsimd_i8_t)result);

0 commit comments

Comments
 (0)