Skip to content

Commit 9dc994e

Browse files
committed
Test of hash functions, no gain
1 parent 63265e2 commit 9dc994e

2 files changed

Lines changed: 147 additions & 87 deletions

File tree

test/src/java/com/github/oeuvres/alix/util/CharArrayHashBenchmark.java

Lines changed: 56 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -15,47 +15,56 @@
1515
/**
1616
* Benchmark CharArrayHash vs Lucene CharArrayMap on a large word list.
1717
*
18-
* CLI: --file <path> UTF-8 file, one token per line (optional) --target <N>
19-
* desired unique tokens (default 600000) --mode <hash|map|both> which
20-
* structure(s) to test (default both) --lowercase lowercase tokens before
21-
* insert (optional)
22-
*
23-
* Notes: - Uses Δ(used heap) after GC as a coarse footprint estimator. - For
24-
* reproducible results, fix heap: -Xms4g -Xmx4g -XX:+UseG1GC - Requires your
25-
* CharArrayHash class on classpath.
18+
* Notes:
19+
* - This version avoids "same char[] instance" lookup bias by using separate
20+
* precomputed probe arrays for hits.
21+
* - Build simulates a copy-required token stream: CharArrayHash copies from a reused buffer
22+
* into its slab, CharArrayMap must allocate a stable char[] per key.
23+
* - Uses Δ(used heap) after GC as a coarse footprint estimator.
2624
*/
2725
public class CharArrayHashBenchmark
2826
{
27+
private static final Random RNG = new Random(0xC0FFEE);
2928

3029
public static void main(String[] args) throws Exception
3130
{
3231
final String lexicon = "word.csv";
3332
InputStream is = French.class.getResourceAsStream(lexicon);
3433
Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
35-
List<char[]> tokens = loadTokens(reader);
34+
35+
List<String> tokens = loadTokensAsStrings(reader);
3636
System.out.println("Loaded tokens: " + fmt(tokens.size()));
3737

38-
List<char[]> hits = new ArrayList<>(tokens);
39-
List<char[]> misses = makeMisses(tokens);
38+
// Precompute lookup probes (distinct arrays) so timing excludes allocation.
39+
List<char[]> hitProbes = makeHitProbes(tokens);
40+
List<char[]> missProbes = makeMissesFromHits(hitProbes);
4041

4142
System.out.println("\n=== Lucene CharArrayMap<Integer> ===");
42-
runMap(tokens, hits, misses);
43+
runMap(tokens, hitProbes, missProbes);
44+
4345
System.out.println("\n=== CharArrayHash ===");
44-
runHash(tokens, hits, misses);
46+
runHash(tokens, hitProbes, missProbes);
4547
}
4648

4749
// ---------------------- CharArrayHash ----------------------
4850

49-
private static void runHash(List<char[]> tokens, List<char[]> hits, List<char[]> misses)
51+
private static void runHash(List<String> tokens, List<char[]> hits, List<char[]> misses)
5052
{
5153
forceGC();
5254
long mem0 = usedMem();
5355
long t0 = System.nanoTime();
5456

5557
CharArrayHash h = new CharArrayHash(tokens.size());
58+
59+
// Reused buffer: realistic for token streams (copy-required).
60+
char[] buf = new char[32];
5661
int lastOrd = -1;
57-
for (char[] w : tokens) {
58-
int r = h.add(w, 0, w.length);
62+
63+
for (String s : tokens) {
64+
int len = s.length();
65+
if (buf.length < len) buf = new char[len];
66+
s.getChars(0, len, buf, 0);
67+
int r = h.add(buf, 0, len);
5968
lastOrd = (r >= 0) ? r : (-r - 1);
6069
}
6170
h.trimToSize();
@@ -82,21 +91,26 @@ private static void runHash(List<char[]> tokens, List<char[]> hits, List<char[]>
8291
});
8392

8493
print("CharArrayHash", tokens.size(), buildNs, bestHitNs, bestMissNs, bytes);
85-
if (lastOrd == 42) System.out.print(""); // keep reachable
94+
if (lastOrd == 42) System.out.print("");
8695
}
8796

8897
// ---------------------- CharArrayMap<Integer> ----------------------
8998

90-
private static void runMap(List<char[]> tokens, List<char[]> hits, List<char[]> misses)
99+
private static void runMap(List<String> tokens, List<char[]> hits, List<char[]> misses)
91100
{
92101
forceGC();
93102
long mem0 = usedMem();
94103
long t0 = System.nanoTime();
95104

96105
CharArrayMap<Integer> map = new CharArrayMap<>(tokens.size(), false);
106+
107+
// Copy-required: store stable char[] keys, so allocate per token.
97108
int id = 0;
98-
for (char[] w : tokens) {
99-
map.put(w, id++); // uses char[] API
109+
for (String s : tokens) {
110+
int len = s.length();
111+
char[] k = new char[len];
112+
s.getChars(0, len, k, 0);
113+
map.put(k, id++);
100114
}
101115

102116
long buildNs = System.nanoTime() - t0;
@@ -126,27 +140,33 @@ private static void runMap(List<char[]> tokens, List<char[]> hits, List<char[]>
126140
if (id == 7) System.out.print("");
127141
}
128142

129-
// ---------------------- helpers ----------------------
143+
// ---------------------- data prep ----------------------
130144

131-
private static final Random RNG = new Random(0xC0FFEE);
132-
133-
private static List<char[]> loadTokens(Reader reader) throws IOException
145+
private static List<String> loadTokensAsStrings(Reader reader) throws IOException
134146
{
135-
List<char[]> out = new ArrayList<>(600000);
147+
List<String> out = new ArrayList<>(600000);
136148
CSVReader csvReader = new CSVReader(reader, ',', 1, '"', 16384);
137-
// header line
138149
if (!csvReader.readRow()) throw new IOException("Empty file?");
139150
while (csvReader.readRow()) {
140-
final int wordLength = csvReader.getCell(0).length();
141-
final char[] word = new char[wordLength];
142-
csvReader.getCell(0).getChars(0, wordLength, word, 0);
143-
out.add(word);
151+
out.add(csvReader.getCellAsString(0));
144152
}
145153
Collections.shuffle(out, RNG);
146154
return out;
147155
}
148156

149-
private static List<char[]> makeMisses(List<char[]> hits)
157+
private static List<char[]> makeHitProbes(List<String> tokens)
158+
{
159+
List<char[]> hits = new ArrayList<>(tokens.size());
160+
for (String s : tokens) {
161+
int len = s.length();
162+
char[] w = new char[len];
163+
s.getChars(0, len, w, 0);
164+
hits.add(w);
165+
}
166+
return hits;
167+
}
168+
169+
private static List<char[]> makeMissesFromHits(List<char[]> hits)
150170
{
151171
List<char[]> misses = new ArrayList<>(hits.size());
152172
for (char[] w : hits) {
@@ -158,6 +178,8 @@ private static List<char[]> makeMisses(List<char[]> hits)
158178
return misses;
159179
}
160180

181+
// ---------------------- helpers ----------------------
182+
161183
private static void forceGC()
162184
{
163185
try {
@@ -204,11 +226,9 @@ private static void print(String name, int n, long buildNs, long hitNs, long mis
204226
System.out.println("Name: " + name);
205227
System.out.println("Entries: " + fmt(n));
206228
System.out.println("Build time: " + String.format(Locale.ROOT, "%.2f ms", buildMs));
207-
System.out.println(
208-
"Hit lookup: " + String.format(Locale.ROOT, "%.2f ms", hitMs) + " (" + fmtQps(hitQps) + " qps)");
209-
System.out.println(
210-
"Miss lookup: " + String.format(Locale.ROOT, "%.2f ms", missMs) + " (" + fmtQps(missQps) + " qps)");
211-
System.out.println("Heap Δ (bytes): " + fmt(bytes));
229+
System.out.println("Hit lookup: " + String.format(Locale.ROOT, "%.2f ms", hitMs) + " (" + fmtQps(hitQps) + " qps)");
230+
System.out.println("Miss lookup: " + String.format(Locale.ROOT, "%.2f ms", missMs) + " (" + fmtQps(missQps) + " qps)");
231+
System.out.println("Heap \u0394 (bytes): " + fmt(bytes));
212232
System.out.println("Bytes/entry: " + (Double.isNaN(bpe) ? "n/a" : String.format(Locale.ROOT, "%.2f", bpe)));
213233
}
214234

@@ -223,5 +243,4 @@ private static String fmtQps(double qps)
223243
if (qps >= 1_000) return String.format(Locale.ROOT, "%.1fk", qps);
224244
return String.format(Locale.ROOT, "%.0f", qps);
225245
}
226-
227246
}

util/src/java/com/github/oeuvres/alix/util/CharArrayHash.java

Lines changed: 91 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ public int add(char[] key, int off, int len)
132132
checkBounds(key, off, len);
133133
if (len > MAX_KEY_LENGTH) throw new IllegalArgumentException("key length > 65535: " + len);
134134

135-
final int h = Murmur3.hashChars(key, off, len, SEED);
135+
final int h = murmur3(key, off, len);
136136
int i = h & mask;
137137

138138
for (;;) {
@@ -176,7 +176,7 @@ public int find(char[] key, int off, int len)
176176
checkBounds(key, off, len);
177177
if (len > MAX_KEY_LENGTH) return -1; // cannot exist (see add)
178178

179-
final int h = Murmur3.hashChars(key, off, len, SEED);
179+
final int h = murmur3(key, off, len);
180180
int i = h & mask;
181181

182182
for (;;) {
@@ -293,7 +293,6 @@ public void freeze()
293293
// ---- Internals -----------------------------------------------------------
294294

295295
private static final float LOAD_FACTOR = 0.75f;
296-
private static final int SEED = 0x9747b28c;
297296
private static final int MAX_KEY_LENGTH = 0xFFFF;
298297

299298
// Hash table: stores ords (>=0) or -1 if empty.
@@ -433,58 +432,100 @@ private void rehash(int newCap)
433432
}
434433
}
435434

435+
436436
/**
437-
* Murmur3-32 hashing specialized for {@code char[]} inputs (UTF-16 code units).
437+
* Computes Murmur3-32 for a UTF-16 {@code char[]} slice using the x86_32 mixing constants.
438438
*
439-
* <p>Processes two {@code char}s per 32-bit block (little-endian packing). The tail handles a
440-
* single remaining {@code char} if present. The finalization step mixes in the total number of
441-
* bytes processed (i.e., {@code len << 1}).
439+
* @param a input array of UTF-16 code units
440+
* @param off start offset (inclusive)
441+
* @param len number of {@code char} code units to read
442+
* @return the 32-bit hash value
443+
* @throws IndexOutOfBoundsException if {@code off} or {@code len} are invalid for {@code a}
442444
*/
443-
private static final class Murmur3
445+
static int murmur3(final char[] a, final int off, final int len)
444446
{
445-
/**
446-
* Computes Murmur3-32 for a UTF-16 {@code char[]} slice using the x86_32 mixing constants.
447-
*
448-
* @param a input array of UTF-16 code units
449-
* @param off start offset (inclusive)
450-
* @param len number of {@code char} code units to read
451-
* @param seed 32-bit seed (use a fixed constant for deterministic behavior)
452-
* @return the 32-bit hash value
453-
* @throws IndexOutOfBoundsException if {@code off} or {@code len} are invalid for {@code a}
454-
*/
455-
static int hashChars(final char[] a, final int off, final int len, final int seed)
456-
{
457-
final int n16 = len >>> 1;
458-
final boolean odd = (len & 1) != 0;
459-
int h1 = seed, idx = off;
460-
final int c1 = 0xcc9e2d51, c2 = 0x1b873593;
461-
462-
for (int i = 0; i < (n16 << 1); i += 2) {
463-
int k1 = (a[idx] & 0xFFFF) | ((a[idx + 1] & 0xFFFF) << 16);
464-
idx += 2;
465-
k1 *= c1;
466-
k1 = (k1 << 15) | (k1 >>> 17);
467-
k1 *= c2;
468-
h1 ^= k1;
469-
h1 = (h1 << 13) | (h1 >>> 19);
470-
h1 = h1 * 5 + 0xe6546b64;
471-
}
472-
if (odd) {
473-
int k1 = (a[idx] & 0xFFFF);
474-
k1 *= c1;
475-
k1 = (k1 << 15) | (k1 >>> 17);
476-
k1 *= c2;
477-
h1 ^= k1;
478-
}
479-
final int bytes = len << 1;
480-
h1 ^= bytes;
481-
h1 ^= (h1 >>> 16);
482-
h1 *= 0x85ebca6b;
483-
h1 ^= (h1 >>> 13);
484-
h1 *= 0xc2b2ae35;
485-
h1 ^= (h1 >>> 16);
486-
return h1;
447+
final int seed = 0x9747b28c;
448+
final int n16 = len >>> 1;
449+
final boolean odd = (len & 1) != 0;
450+
int h1 = seed, idx = off;
451+
final int c1 = 0xcc9e2d51, c2 = 0x1b873593;
452+
453+
for (int i = 0; i < (n16 << 1); i += 2) {
454+
int k1 = (a[idx] & 0xFFFF) | ((a[idx + 1] & 0xFFFF) << 16);
455+
idx += 2;
456+
k1 *= c1;
457+
k1 = (k1 << 15) | (k1 >>> 17);
458+
k1 *= c2;
459+
h1 ^= k1;
460+
h1 = (h1 << 13) | (h1 >>> 19);
461+
h1 = h1 * 5 + 0xe6546b64;
487462
}
463+
if (odd) {
464+
int k1 = (a[idx] & 0xFFFF);
465+
k1 *= c1;
466+
k1 = (k1 << 15) | (k1 >>> 17);
467+
k1 *= c2;
468+
h1 ^= k1;
469+
}
470+
final int bytes = len << 1;
471+
h1 ^= bytes;
472+
h1 ^= (h1 >>> 16);
473+
h1 *= 0x85ebca6b;
474+
h1 ^= (h1 >>> 13);
475+
h1 *= 0xc2b2ae35;
476+
h1 ^= (h1 >>> 16);
477+
return h1;
488478
}
489479

480+
/**
481+
* 31-based polynomial hash (String-style) over UTF-16 code units.
482+
*
483+
* Note: the returned value is already smeared (h ^ (h >>> 16)) to improve
484+
* low-bit distribution, so it can be indexed directly with (h & mask) in a power-of-two table.
485+
*/
486+
static int hash31(final char[] a, final int off, final int len)
487+
{
488+
int h = 0;
489+
final int end = off + len;
490+
for (int i = off; i < end; i++) {
491+
h = 31 * h + a[i];
492+
}
493+
// smear like HashMap to improve low bits for power-of-two masking
494+
h ^= (h >>> 16);
495+
return h;
496+
}
497+
498+
/**
499+
* Murmur3 fmix32 finalizer (avalanche).
500+
*
501+
* <p>Mixes all bits of {@code h} so that small differences in input
502+
* produce large differences in output. Useful after a cheap rolling hash
503+
* (e.g., 31-polynomial) when indexing with power-of-two masking.</p>
504+
*/
505+
static int fmix32(int h)
506+
{
507+
h ^= (h >>> 16);
508+
h *= 0x85ebca6b;
509+
h ^= (h >>> 13);
510+
h *= 0xc2b2ae35;
511+
h ^= (h >>> 16);
512+
return h;
513+
}
514+
515+
/**
516+
* Cheap rolling hash (String-style) + fmix32 avalanche.
517+
*
518+
* <p>Good compromise for power-of-two hash tables: fast loop, robust final bits.</p>
519+
*/
520+
static int hash31_fmix32(final char[] a, final int off, final int len)
521+
{
522+
int h = 0;
523+
final int end = off + len;
524+
for (int i = off; i < end; i++) {
525+
h = 31 * h + a[i];
526+
}
527+
// mix in the key length in bytes (len << 1): the 31-polynomial alone cannot tell apart keys that differ only by leading NUL chars
528+
h ^= (len << 1);
529+
return fmix32(h);
530+
}
490531
}

0 commit comments

Comments
 (0)