lib: specialize radical_inverse() for improved performance

vkoskiv · vkoskiv · commit 2b20e1c51285 · 2025-08-24T00:08:28.000+03:00
I re-read the PBRT section on sampling, and realised my original C port
of their radical inverse implementation misses out on the performance
benefits of the C++ template trick they use.
Recreate the same trick with some X macros; the improvement seems
noticeable. On my desktop (i7-6700k, 4c/8t):

Before:
> hyperfine 'bin/c-ray input/hdr.json --no-sdl -s 128'
Benchmark 1: bin/c-ray input/hdr.json --no-sdl -s 128
  Time (mean ± σ):     27.379 s ±  0.548 s    [User: 189.460 s, System: 0.589 s]
  Range (min … max):   26.518 s … 28.056 s    10 runs

After:
> hyperfine 'bin/c-ray input/hdr.json --no-sdl -s 128'
Benchmark 1: bin/c-ray input/hdr.json --no-sdl -s 128
  Time (mean ± σ):     25.778 s ±  0.592 s    [User: 177.314 s, System: 0.534 s]
  Range (min … max):   24.894 s … 26.677 s    10 runs

On my server (AMD EPYC 9374F, 32c/64t):

Before:
> hyperfine 'bin/c-ray input/hdr.json --no-sdl -s 1024 -j 64'
Benchmark 1: bin/c-ray input/hdr.json --no-sdl -s 1024 -j 64
  Time (mean ± σ):     20.264 s ±  0.142 s    [User: 977.766 s, System: 0.292 s]
  Range (min … max):   20.055 s … 20.551 s    10 runs

After:
> hyperfine 'bin/c-ray input/hdr.json --no-sdl -s 1024 -j 64'
Benchmark 1: bin/c-ray input/hdr.json --no-sdl -s 1024 -j 64
  Time (mean ± σ):     19.989 s ±  0.119 s    [User: 954.135 s, System: 0.275 s]
  Range (min … max):   19.836 s … 20.157 s    10 runs
diff --git a/src/lib/renderer/samplers/common.h b/src/lib/renderer/samplers/common.h
@@ -9,6 +9,7 @@
 #pragma once
 
 #include "../../../includes.h"
+#include <common/cr_assert.h>
 
 // Hash function by Thomas Wang: https://burtleburtle.net/bob/hash/integer.html
 static inline uint32_t hash(uint32_t x) {
@@ -30,20 +31,65 @@ static inline float wrapAdd(float u, float v) {
 	return (u + v < 1.0f) ? u + v : u + v - 1.0f;
 }
 
-// By PBRT authors
-static inline float radicalInverse(int pass, int base) {
-	const float invBase = 1.0f / base;
-	int reversedDigits = 0;
-	float invBaseN = 1.0f;
-	while (pass) {
-		const int next = pass / base;
-		const int digit = pass - base * next;
-		reversedDigits = reversedDigits * base + digit;
-		invBaseN *= invBase;
-		pass = next;
+#define RAD_INV_BASES /* X(switch case index, prime base) entries */ \
+	X(1, 3) \
+	X(2, 5) \
+	X(3, 7) \
+	X(4, 11) \
+	X(5, 13) \
+
+// NOTE: 5 entries in RAD_INV_BASES, +1 for case 0 (reverse_bits_64()) in radical_inverse(). Keep in sync with the list above.
+static const int n_bases = 5 + 1;
+
+static const float one_minus_epsilon = 0x1.fffffep-1; // 1 - 2^-24, the upper clamp applied by radical_inverse_b_*()
+// Per-base specializations of the radical inverse. Baking the base in as a
+// literal gives the compiler a compile-time-constant divisor to optimize,
+// which is the point of the PBRT trick referenced below.
+// https://pbr-book.org/3ed-2018/Sampling_and_Reconstruction/The_Halton_Sampler#RadicalInverseSpecialized
+//
+// Each generated radical_inverse_b_<base>(a) reverses the base-<base> digits
+// of a, scales the result by base^-(digit count) and clamps it to
+// one_minus_epsilon. The parameter is uint64_t to match what
+// radical_inverse() passes in, so large sample indices are not narrowed.
+#define X(idx, base) \
+static inline float radical_inverse_b_##base(uint64_t a) { \
+	const float inv_base = 1.0f / (float)(base); \
+	uint64_t reversed_digits = 0; \
+	float inv_base_n = 1.0f; \
+	while (a) { \
+		const uint64_t next = a / (base); \
+		const uint64_t digit = a - next * (base); \
+		reversed_digits = reversed_digits * (base) + digit; \
+		inv_base_n *= inv_base; \
+		a = next; \
+	} \
+	return min(reversed_digits * inv_base_n, one_minus_epsilon); \
+}
+
+RAD_INV_BASES
+
+#undef X
+
+// Reverses the bit order of a 32-bit word (bit 0 <-> bit 31, bit 1 <-> bit 30, ...).
+// https://pbr-book.org/3ed-2018/Sampling_and_Reconstruction/The_Halton_Sampler#ReverseBits32
+static inline uint32_t reverse_bits_32(uint32_t n) {
+	// Swap ever-smaller neighbouring groups: halves, bytes, nibbles, pairs, single bits.
+	n = (n >> 16) | (n << 16);
+	n = ((n >> 8) & 0x00ff00ff) | ((n << 8) & 0xff00ff00);
+	n = ((n >> 4) & 0x0f0f0f0f) | ((n << 4) & 0xf0f0f0f0);
+	n = ((n >> 2) & 0x33333333) | ((n << 2) & 0xcccccccc);
+	n = ((n >> 1) & 0x55555555) | ((n << 1) & 0xaaaaaaaa);
+	return n;
+}
+
+// Reverses the bit order of a 64-bit word by reversing each 32-bit half and
+// exchanging the two halves.
+// https://pbr-book.org/3ed-2018/Sampling_and_Reconstruction/The_Halton_Sampler#ReverseBits64
+static inline uint64_t reverse_bits_64(uint64_t n) {
+	const uint64_t reversed_low = reverse_bits_32((uint32_t)n);
+	const uint64_t reversed_high = reverse_bits_32((uint32_t)(n >> 32));
+	// The reversed low half becomes the new high half, and vice versa.
+	return reversed_high | (reversed_low << 32);
+}
+
+// Expands every RAD_INV_BASES entry into a switch case that calls its specialized helper.
+#define X(idx, base) case idx: return radical_inverse_b_##base(a);
+// Radical inverse of sample index a in the base selected by base_idx.
+// base_idx 0 is base 2 (computed by reversing the bits of a); base_idx
+// 1..n_bases-1 dispatch to the radical_inverse_b_<base>() helpers.
+// An out-of-range base_idx trips the asserts and yields 0.0f otherwise.
+static inline float radical_inverse(int base_idx, uint64_t a) {
+	// Validate the base selector; the sample index a is unbounded.
+	ASSERT(base_idx >= 0 && base_idx < n_bases);
+	switch (base_idx) {
+		// Clamp like the other bases: the double -> float conversion may round up to 1.0f.
+		case 0: return min((float)(reverse_bits_64(a) * 0x1p-64), one_minus_epsilon);
+		RAD_INV_BASES
+		default: break;
 	}
-	return min(reversedDigits * invBaseN, 0.99999994f);
+	ASSERT_NOT_REACHED();
+	return 0.0f;
 }
+#undef X
 
 static inline float uintToUnitReal(uint32_t v) {
 	// Trick from MTGP: generate an uniformly distributed single precision number in [1,2) and subtract 1
diff --git a/src/lib/renderer/samplers/halton.c b/src/lib/renderer/samplers/halton.c
@@ -13,7 +13,6 @@
 #include <common/cr_assert.h>
 #include <common/vector.h>
 
-static const unsigned int primes[] = {2, 3, 5, 7, 11, 13};
 static const unsigned int primesCount = 6;
 
 void initHalton(haltonSampler *s, int pass, uint32_t seed) {
@@ -24,7 +23,7 @@ void initHalton(haltonSampler *s, int pass, uint32_t seed) {
 
 float getHalton(haltonSampler *s) {
 	// Wrapping around trick by @lycium
-	float v = wrapAdd(radicalInverse(s->currPass, primes[s->currPrime++ % primesCount]), s->rndOffset);
+	float v = wrapAdd(radical_inverse(s->currPrime++ % primesCount, s->currPass), s->rndOffset);
 	ASSERT(v >= 0.0f);
 	ASSERT(v < 1.0f);
 	return v;
diff --git a/src/lib/renderer/samplers/hammersley.c b/src/lib/renderer/samplers/hammersley.c
@@ -13,7 +13,6 @@
 #include <common/vector.h>
 #include <common/cr_assert.h>
 
-static const unsigned int primes[] = {2, 3, 5, 7, 11, 13};
 static const unsigned int primes_count = 6;
 
 void initHammersley(hammersleySampler *s, int pass, int maxPasses, uint32_t seed) {
@@ -28,7 +27,7 @@ float getHammersley(hammersleySampler *s) {
 	// Wrapping around trick by Thomas Ludwig (@lycium)
 	float u;
 	if (s->currPass > 0) {
-		u = radicalInverse(s->currPass, primes[s->currPrime++ % primes_count]);
+		u = radical_inverse(s->currPrime++ % primes_count, s->currPass);
 	} else {
 		u = s->currPass / s->maxPasses;
 	}