Skip to content

Commit 2b20e1c

Browse files
committed
lib: specialize radical_inverse() for improved performance
I re-read the PBRT section on sampling, and realised my original C port of their radical inverse implementation misses on the performance benefits of the C++ template trick they use. Recreate the same trick with some X macros, the improvement seems noticeable. On my desktop (i7-6700k, 4c/8t): Before: > hyperfine 'bin/c-ray input/hdr.json --no-sdl -s 128' Benchmark 1: bin/c-ray input/hdr.json --no-sdl -s 128 Time (mean ± σ): 27.379 s ± 0.548 s [User: 189.460 s, System: 0.589 s] Range (min … max): 26.518 s … 28.056 s 10 runs After: > hyperfine 'bin/c-ray input/hdr.json --no-sdl -s 128' Benchmark 1: bin/c-ray input/hdr.json --no-sdl -s 128 Time (mean ± σ): 25.778 s ± 0.592 s [User: 177.314 s, System: 0.534 s] Range (min … max): 24.894 s … 26.677 s 10 runs On my server (AMD EPYC 9374F, 32c/64t): Before: > hyperfine 'bin/c-ray input/hdr.json --no-sdl -s 1024 -j 64' Benchmark 1: bin/c-ray input/hdr.json --no-sdl -s 1024 -j 64 Time (mean ± σ): 20.264 s ± 0.142 s [User: 977.766 s, System: 0.292 s] Range (min … max): 20.055 s … 20.551 s 10 runs After: > hyperfine 'bin/c-ray input/hdr.json --no-sdl -s 1024 -j 64' Benchmark 1: bin/c-ray input/hdr.json --no-sdl -s 1024 -j 64 Time (mean ± σ): 19.989 s ± 0.119 s [User: 954.135 s, System: 0.275 s] Range (min … max): 19.836 s … 20.157 s 10 runs
1 parent 9a6861f commit 2b20e1c

File tree

3 files changed

+60
-16
lines changed

3 files changed

+60
-16
lines changed

src/lib/renderer/samplers/common.h

Lines changed: 58 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#pragma once
1010

1111
#include "../../../includes.h"
12+
#include <common/cr_assert.h>
1213

1314
// Hash function by Thomas Wang: https://burtleburtle.net/bob/hash/integer.html
1415
static inline uint32_t hash(uint32_t x) {
@@ -30,20 +31,65 @@ static inline float wrapAdd(float u, float v) {
3031
return (u + v < 1.0f) ? u + v : u + v - 1.0f;
3132
}
3233

33-
// By PBRT authors
34-
static inline float radicalInverse(int pass, int base) {
35-
const float invBase = 1.0f / base;
36-
int reversedDigits = 0;
37-
float invBaseN = 1.0f;
38-
while (pass) {
39-
const int next = pass / base;
40-
const int digit = pass - base * next;
41-
reversedDigits = reversedDigits * base + digit;
42-
invBaseN *= invBase;
43-
pass = next;
34+
#define RAD_INV_BASES \
35+
X(1, 3) \
36+
X(2, 5) \
37+
X(3, 7) \
38+
X(4, 11) \
39+
X(5, 13) \
40+
41+
// NOTE: +1 for case 0 reverse_bits_64() in radical_inverse().
42+
static const int n_bases = 5 + 1;
43+
44+
static const float one_minus_epsilon = 0x1.fffffep-1;
45+
// https://pbr-book.org/3ed-2018/Sampling_and_Reconstruction/The_Halton_Sampler#RadicalInverseSpecialized
46+
#define X(idx, base) \
47+
static inline float radical_inverse_b_##base(int a) { \
48+
const float inv_base = 1.0f / (float)base; \
49+
uint64_t reversed_digits = 0; \
50+
float inv_base_n = 1.0f; \
51+
while (a) { \
52+
const uint64_t next = a / base; \
53+
const uint64_t digit = a - next * base; \
54+
reversed_digits = reversed_digits * base + digit; \
55+
inv_base_n *= inv_base; \
56+
a = next; \
57+
} \
58+
return min(reversed_digits * inv_base_n, one_minus_epsilon); \
59+
}
60+
61+
RAD_INV_BASES
62+
63+
#undef X
64+
65+
// https://pbr-book.org/3ed-2018/Sampling_and_Reconstruction/The_Halton_Sampler#ReverseBits32
66+
static inline uint32_t reverse_bits_32(uint32_t n) {
67+
n = (n << 16) | (n >> 16);
68+
n = ((n & 0x00ff00ff) << 8) | ((n & 0xff00ff00) >> 8);
69+
n = ((n & 0x0f0f0f0f) << 4) | ((n & 0xf0f0f0f0) >> 4);
70+
n = ((n & 0x33333333) << 2) | ((n & 0xcccccccc) >> 2);
71+
n = ((n & 0x55555555) << 1) | ((n & 0xaaaaaaaa) >> 1);
72+
return n;
73+
}
74+
75+
// https://pbr-book.org/3ed-2018/Sampling_and_Reconstruction/The_Halton_Sampler#ReverseBits64
76+
static inline uint64_t reverse_bits_64(uint64_t n) {
77+
uint64_t n0 = reverse_bits_32((uint32_t)n);
78+
uint64_t n1 = reverse_bits_32((uint32_t)(n >> 32));
79+
return (n0 << 32) | n1;
80+
}
81+
82+
#define X(idx, base) case idx: return radical_inverse_b_##base(a);
83+
static inline float radical_inverse(int base_idx, uint64_t a) {
84+
ASSERT(a < n_bases);
85+
switch (base_idx) {
86+
case 0: return reverse_bits_64(a) * 0x1p-64;
87+
RAD_INV_BASES
4488
}
45-
return min(reversedDigits * invBaseN, 0.99999994f);
89+
ASSERT_NOT_REACHED();
90+
return 0.0f;
4691
}
92+
#undef X
4793

4894
static inline float uintToUnitReal(uint32_t v) {
4995
// Trick from MTGP: generate an uniformly distributed single precision number in [1,2) and subtract 1

src/lib/renderer/samplers/halton.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
#include <common/cr_assert.h>
1414
#include <common/vector.h>
1515

16-
static const unsigned int primes[] = {2, 3, 5, 7, 11, 13};
1716
static const unsigned int primesCount = 6;
1817

1918
void initHalton(haltonSampler *s, int pass, uint32_t seed) {
@@ -24,7 +23,7 @@ void initHalton(haltonSampler *s, int pass, uint32_t seed) {
2423

2524
float getHalton(haltonSampler *s) {
2625
// Wrapping around trick by @lycium
27-
float v = wrapAdd(radicalInverse(s->currPass, primes[s->currPrime++ % primesCount]), s->rndOffset);
26+
float v = wrapAdd(radical_inverse(s->currPrime++ % primesCount, s->currPass), s->rndOffset);
2827
ASSERT(v >= 0.0f);
2928
ASSERT(v < 1.0f);
3029
return v;

src/lib/renderer/samplers/hammersley.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
#include <common/vector.h>
1414
#include <common/cr_assert.h>
1515

16-
static const unsigned int primes[] = {2, 3, 5, 7, 11, 13};
1716
static const unsigned int primes_count = 6;
1817

1918
void initHammersley(hammersleySampler *s, int pass, int maxPasses, uint32_t seed) {
@@ -28,7 +27,7 @@ float getHammersley(hammersleySampler *s) {
2827
// Wrapping around trick by Thomas Ludwig (@lycium)
2928
float u;
3029
if (s->currPass > 0) {
31-
u = radicalInverse(s->currPass, primes[s->currPrime++ % primes_count]);
30+
u = radical_inverse(s->currPrime++ % primes_count, s->currPass);
3231
} else {
3332
u = s->currPass / s->maxPasses;
3433
}

0 commit comments

Comments
 (0)