Skip to content

Commit 63265e2

Browse files
committed
memory optimization
1 parent 2407763 commit 63265e2

2 files changed

Lines changed: 127 additions & 51 deletions

File tree

test/src/java/com/github/oeuvres/alix/util/CharArrayHashBenchmark.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ private static void runHash(List<char[]> tokens, List<char[]> hits, List<char[]>
5858
int r = h.add(w, 0, w.length);
5959
lastOrd = (r >= 0) ? r : (-r - 1);
6060
}
61+
h.trimToSize();
6162

6263
long buildNs = System.nanoTime() - t0;
6364
forceGC();

util/src/java/com/github/oeuvres/alix/util/CharArrayHash.java

Lines changed: 126 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
* Alix, A Lucene Indexer for XML documents.
33
*
44
* Copyright 2009 Pierre Dittgen <pierre@dittgen.org>
5-
* Frédéric Glorieux <frederic.glorieux@fictif.org>
6-
* Copyright 2016 Frédéric Glorieux <frederic.glorieux@fictif.org>
5+
* Frederic Glorieux <frederic.glorieux@fictif.org>
6+
* Copyright 2016 Frederic Glorieux <frederic.glorieux@fictif.org>
77
*
88
* Alix is a java library to index and search XML text documents
99
* with Lucene https://lucene.apache.org/core/
@@ -14,8 +14,8 @@
1414
* https://sf.net/projects/javacrim/
1515
* for a java course at Inalco http://www.er-tim.fr/
1616
* Alix continues the concepts of SDX under another licence
17-
* «Système de Documentation XML»
18-
* 2000-2010 Ministère de la culture et de la communication (France), AJLSM.
17+
* "Systeme de Documentation XML"
18+
* 2000-2010 Ministere de la culture et de la communication (France), AJLSM.
1919
* http://savannah.nongnu.org/projects/sdx/
2020
*
2121
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -46,15 +46,25 @@
4646
*
4747
* <p>Implementation details:
4848
* <ul>
49-
* <li>Open addressing with linear probing; table capacity is always a power of two.</li>
50-
* <li>Load factor ~0.75; automatic rehash on growth.</li>
51-
* <li>Keys are stored contiguously in a growing {@code char} slab; per-ord metadata store offset,
52-
* length, and the 32-bit hash to avoid recomputing during rehash.</li>
53-
* <li>Lookup uses a 16-bit fingerprint (top bits of the 32-bit hash), length check, then a
54-
* {@code memcmp}-style verification against the slab.</li>
49+
* <li>Open addressing with linear probing; table capacity is always a power of two (fast masking).</li>
50+
* <li>Load factor ~0.75; automatic rehash on growth. Ords remain stable across rehash.</li>
51+
* <li>Keys are copied and stored contiguously in a growing {@code char} slab.</li>
52+
* <li>Per-ord metadata store offset, length, and the 32-bit hash. Length is stored as an
53+
* <em>unsigned</em> 16-bit value (supports key lengths 0..65535).</li>
54+
* <li>Lookup uses: (1) slot check, (2) length check, (3) 16-bit fingerprint derived from the
55+
* stored 32-bit hash (top 16 bits), then (4) a {@code memcmp}-style verification against the slab.</li>
5556
* <li>Hash function: Murmur3-32 over UTF-16 code units (robust distribution for short Latin keys).</li>
5657
* </ul>
5758
*
59+
* <p>Space/time trade-offs:
60+
* <ul>
61+
* <li>Compared to keeping a dedicated 16-bit fingerprint array per slot, this implementation saves
62+
* {@code 2 * capacity} bytes, but fingerprint checks may require an additional random load from
63+
* {@code keyHash[ord]} when length matches.</li>
64+
* <li>Use {@link #trimToSize()} after bulk build to reduce slack in metadata arrays, the slab, and
65+
* (whenever a smaller capacity suffices) the hash table itself.</li>
66+
* </ul>
67+
*
5868
* <p>Complexity:
5969
* <ul>
6070
* <li>Average-case insert/find: {@code O(1)} expected.</li>
@@ -74,11 +84,11 @@ public final class CharArrayHash
7484
/**
7585
* Constructs the hash with an expected number of unique keys.
7686
*
77-
* <p>The table's initial capacity is chosen to keep the load factor 0.75 for the expected
87+
* <p>The table's initial capacity is chosen to keep the load factor &lt;= 0.75 for the expected
7888
* number of unique keys. The internal key-slab and per-ord metadata arrays are also sized
7989
* heuristically based on {@code expectedSize} and will grow as needed.
8090
*
81-
* @param expectedSize an estimate of the number of distinct keys to be added (must be 1; values 0 are treated as 1)
91+
* @param expectedSize an estimate of the number of distinct keys to be added (must be &gt;= 1; values &lt;= 0 are treated as 1)
8292
*/
8393
public CharArrayHash(int expectedSize)
8494
{
@@ -87,15 +97,15 @@ public CharArrayHash(int expectedSize)
8797
while (cap < need) cap <<= 1;
8898
table = new int[cap];
8999
Arrays.fill(table, -1); // -1 = empty
90-
fp16 = new short[cap];
91100
mask = cap - 1;
92101

93102
// ord metadata (0-based ords). Reserve slot count = expected + slack.
94103
int metaCap = Math.max(8, expectedSize);
95104
keyOff = new int[metaCap];
96-
keyLen = new int[metaCap];
105+
keyLen = new short[metaCap];
97106
keyHash = new int[metaCap];
98107

108+
// Heuristic: small average token size; the slab grows geometrically.
99109
slab = new char[Math.max(16, expectedSize * 4)];
100110
}
101111

@@ -104,7 +114,7 @@ public CharArrayHash(int expectedSize)
104114
*
105115
* <p>This method implements the {@code BytesRefHash.add} return contract:
106116
* <ul>
107-
* <li>If the key is new, returns its assigned 0-based ordinal (ord 0).</li>
117+
* <li>If the key is new, returns its assigned 0-based ordinal (ord &gt;= 0).</li>
108118
* <li>If the key already exists, returns {@code -(ord) - 1} (a negative encoding of the existing ord).</li>
109119
* </ul>
110120
*
@@ -115,32 +125,37 @@ public CharArrayHash(int expectedSize)
115125
* @param len number of {@code char} code units to read
116126
* @return {@code ord} if the key was added; otherwise {@code -(ord)-1} if the key already existed
117127
* @throws IndexOutOfBoundsException if {@code off} or {@code len} are invalid for {@code key}
128+
* @throws IllegalArgumentException if {@code len} is greater than 65535
118129
*/
119130
public int add(char[] key, int off, int len)
120131
{
121132
checkBounds(key, off, len);
122-
int h = Murmur3.hashChars(key, off, len, SEED);
133+
if (len > MAX_KEY_LENGTH) throw new IllegalArgumentException("key length > 65535: " + len);
134+
135+
final int h = Murmur3.hashChars(key, off, len, SEED);
123136
int i = h & mask;
124-
short f = (short) (h >>> 16);
125137

126138
for (;;) {
127-
int ord = table[i];
139+
final int ord = table[i];
128140
if (ord == -1) { // empty slot -> insert
129-
ord = sizeOrds; // 0-based
141+
final int newOrd = sizeOrds; // 0-based
130142
sizeOrds++;
143+
131144
ensureOrdCapacity(sizeOrds);
132-
int base = appendToSlab(key, off, len);
133-
keyOff[ord] = base;
134-
keyLen[ord] = len;
135-
keyHash[ord] = h;
145+
final int base = appendToSlab(key, off, len);
146+
keyOff[newOrd] = base;
147+
keyLen[newOrd] = (short) len; // stored unsigned
148+
keyHash[newOrd] = h;
136149

137-
table[i] = ord;
138-
fp16[i] = f;
150+
table[i] = newOrd;
139151
if (++occupied > resizeThreshold()) rehash(table.length << 1);
140-
return ord; // new => >= 0
152+
return newOrd; // new => >= 0
141153
}
142154

143-
if (fp16[i] == f && keyLen[ord] == len && equalsAt(ord, key, off, len)) {
155+
// Fast reject chain: length -> 16-bit fingerprint (top hash bits) -> full compare
156+
if (((keyLen[ord] & 0xFFFF) == len)
157+
&& (((keyHash[ord] ^ h) & 0xFFFF_0000) == 0)
158+
&& equalsAt(ord, key, off, len)) {
144159
return -ord - 1; // existing => negative encoding
145160
}
146161
i = (i + 1) & mask; // linear probe
@@ -159,28 +174,39 @@ public int add(char[] key, int off, int len)
159174
public int find(char[] key, int off, int len)
160175
{
161176
checkBounds(key, off, len);
162-
int h = Murmur3.hashChars(key, off, len, SEED);
177+
if (len > MAX_KEY_LENGTH) return -1; // cannot exist (see add)
178+
179+
final int h = Murmur3.hashChars(key, off, len, SEED);
163180
int i = h & mask;
164-
short f = (short) (h >>> 16);
181+
165182
for (;;) {
166-
int ord = table[i];
183+
final int ord = table[i];
167184
if (ord == -1) return -1;
168-
if (fp16[i] == f && keyLen[ord] == len && equalsAt(ord, key, off, len)) return ord;
185+
186+
if (((keyLen[ord] & 0xFFFF) == len)
187+
&& (((keyHash[ord] ^ h) & 0xFFFF_0000) == 0)
188+
&& equalsAt(ord, key, off, len)) {
189+
return ord;
190+
}
169191
i = (i + 1) & mask;
170192
}
171193
}
172194

173195
/**
174196
* Returns the number of unique keys in the hash.
175197
*
176-
* @return the number of assigned ords ( 0)
198+
* @return the number of assigned ords (&gt;= 0)
177199
*/
178200
public int size()
179201
{
180202
return sizeOrds;
181203
}
182204

183-
// Access to stored keys (read-only)
205+
/**
206+
* Returns the underlying character slab storing all keys contiguously.
207+
*
208+
* <p>The valid range is {@code [0 .. slabUsed)}.
209+
*/
184210
public char[] slab()
185211
{
186212
return slab;
@@ -189,7 +215,7 @@ public char[] slab()
189215
/**
190216
* Returns the starting offset of the key identified by the given ordinal within the slab.
191217
*
192-
* @param ord the 0-based ordinal of the key (0 ord &lt; {@link #size()})
218+
* @param ord the 0-based ordinal of the key (0 &lt;= ord &lt; {@link #size()})
193219
* @return the starting offset within {@link #slab()}
194220
* @throws IllegalArgumentException if {@code ord} is out of range
195221
*/
@@ -202,29 +228,82 @@ public int keyOffset(int ord)
202228
/**
203229
* Returns the length (in {@code char} code units) of the key identified by the given ordinal.
204230
*
205-
* @param ord the 0-based ordinal of the key (0 ord &lt; {@link #size()})
231+
* @param ord the 0-based ordinal of the key (0 &lt;= ord &lt; {@link #size()})
206232
* @return the number of {@code char} code units of the key at {@link #keyOffset(int)}
207233
* @throws IllegalArgumentException if {@code ord} is out of range
208234
*/
209235
public int keyLength(int ord)
210236
{
211237
checkOrd(ord);
212-
return keyLen[ord];
238+
return keyLen[ord] & 0xFFFF;
239+
}
240+
241+
/**
242+
* Shrinks internal storage to (approximately) the minimum needed for the current contents.
243+
*
244+
* <p>Intended for bulk-build workflows:
245+
* <ol>
246+
* <li>Create with a reasonable {@code expectedSize}.</li>
247+
* <li>Add all keys.</li>
248+
* <li>Call {@link #trimToSize()} once to reduce memory slack.</li>
249+
* </ol>
250+
*
251+
* <p>This method:
252+
* <ul>
253+
* <li>Trims the slab to {@code slabUsed}.</li>
254+
* <li>Trims per-ord metadata arrays to {@link #size()}.</li>
255+
* <li>Shrinks the hash table, whenever a smaller one suffices, to the smallest power-of-two capacity that can hold
256+
* {@link #size()} entries at the target load factor, rebuilding the table if it shrinks.</li>
257+
* </ul>
258+
*
259+
* <p>After trimming, further {@link #add(char[], int, int)} calls remain valid, but may grow
260+
* arrays again.
261+
*/
262+
public void trimToSize()
263+
{
264+
// 1) shrink slab
265+
if (slab.length != slabUsed) {
266+
slab = Arrays.copyOf(slab, slabUsed);
267+
}
268+
269+
// 2) shrink per-ord metadata
270+
if (keyOff.length != sizeOrds) {
271+
keyOff = Arrays.copyOf(keyOff, sizeOrds);
272+
keyLen = Arrays.copyOf(keyLen, sizeOrds);
273+
keyHash = Arrays.copyOf(keyHash, sizeOrds);
274+
}
275+
276+
// 3) shrink table (if possible)
277+
int need = (int) Math.ceil(sizeOrds / LOAD_FACTOR);
278+
int cap = 1;
279+
while (cap < need) cap <<= 1;
280+
if (cap < table.length) {
281+
rehash(cap);
282+
}
283+
}
284+
285+
/**
286+
* Alias for {@link #trimToSize()} to emphasize the "bulk build then freeze" use-case.
287+
*/
288+
public void freeze()
289+
{
290+
trimToSize();
213291
}
214292

215293
// ---- Internals -----------------------------------------------------------
216294

217295
private static final float LOAD_FACTOR = 0.75f;
218296
private static final int SEED = 0x9747b28c;
297+
private static final int MAX_KEY_LENGTH = 0xFFFF;
219298

220-
// Hash table: stores ords (>=0) or -1 if empty; 16-bit fingerprint for quick
221-
// reject
299+
// Hash table: stores ords (>=0) or -1 if empty.
222300
private int[] table;
223-
private short[] fp16;
224301
private int mask;
225302

226303
// Per-ord metadata (0-based ords)
227-
private int[] keyOff, keyLen, keyHash;
304+
private int[] keyOff;
305+
private short[] keyLen; // unsigned
306+
private int[] keyHash;
228307

229308
// Key storage
230309
private char[] slab;
@@ -251,7 +330,7 @@ private static void checkBounds(char[] a, int off, int len)
251330
* Validates that {@code ord} is a currently assigned ordinal.
252331
*
253332
* @param ord 0-based ordinal
254-
* @throws IllegalArgumentException if {@code ord} is &lt; 0 or {@link #size()}
333+
* @throws IllegalArgumentException if {@code ord} is &lt; 0 or &gt;= {@link #size()}
255334
*/
256335
private void checkOrd(int ord)
257336
{
@@ -321,7 +400,7 @@ private int resizeThreshold()
321400
private void ensureOrdCapacity(int required)
322401
{
323402
if (required <= keyOff.length) return;
324-
int cap = Math.max(required, keyOff.length + (keyOff.length >>> 1) + 16);
403+
final int cap = Math.max(required, keyOff.length + (keyOff.length >>> 1) + 16);
325404
keyOff = Arrays.copyOf(keyOff, cap);
326405
keyLen = Arrays.copyOf(keyLen, cap);
327406
keyHash = Arrays.copyOf(keyHash, cap);
@@ -337,23 +416,19 @@ private void ensureOrdCapacity(int required)
337416
*/
338417
private void rehash(int newCap)
339418
{
340-
int[] oldTable = table;
341-
// short[] oldFp = fp16;
419+
final int[] oldTable = table;
342420
table = new int[newCap];
343421
Arrays.fill(table, -1);
344-
fp16 = new short[newCap];
345422
mask = newCap - 1;
346423
occupied = 0;
347424

348425
for (int i = 0; i < oldTable.length; i++) {
349-
int ord = oldTable[i];
426+
final int ord = oldTable[i];
350427
if (ord < 0) continue;
351-
int h = keyHash[ord];
428+
final int h = keyHash[ord];
352429
int j = h & mask;
353-
short f = (short) (h >>> 16);
354430
while (table[j] != -1) j = (j + 1) & mask;
355431
table[j] = ord;
356-
fp16[j] = f;
357432
occupied++;
358433
}
359434
}
@@ -401,7 +476,7 @@ static int hashChars(final char[] a, final int off, final int len, final int see
401476
k1 *= c2;
402477
h1 ^= k1;
403478
}
404-
int bytes = len << 1;
479+
final int bytes = len << 1;
405480
h1 ^= bytes;
406481
h1 ^= (h1 >>> 16);
407482
h1 *= 0x85ebca6b;
@@ -412,4 +487,4 @@ static int hashChars(final char[] a, final int off, final int len, final int see
412487
}
413488
}
414489

415-
}
490+
}

0 commit comments

Comments
 (0)