22 * Alix, A Lucene Indexer for XML documents.
33 *
44 * Copyright 2009 Pierre Dittgen <pierre@dittgen.org>
5- * Frédéric Glorieux <frederic.glorieux@fictif.org>
6- * Copyright 2016 Frédéric Glorieux <frederic.glorieux@fictif.org>
5+ * Frederic Glorieux <frederic.glorieux@fictif.org>
6+ * Copyright 2016 Frederic Glorieux <frederic.glorieux@fictif.org>
77 *
88 * Alix is a java library to index and search XML text documents
99 * with Lucene https://lucene.apache.org/core/
1414 * https://sf.net/projects/javacrim/
1515 * for a java course at Inalco http://www.er-tim.fr/
1616 * Alix continues the concepts of SDX under another licence
17- * «Système de Documentation XML»
18- * 2000-2010 Ministère de la culture et de la communication (France), AJLSM.
17+ * "Systeme de Documentation XML"
18+ * 2000-2010 Ministere de la culture et de la communication (France), AJLSM.
1919 * http://savannah.nongnu.org/projects/sdx/
2020 *
2121 * Licensed under the Apache License, Version 2.0 (the "License");
4646 *
4747 * <p>Implementation details:
4848 * <ul>
49- * <li>Open addressing with linear probing; table capacity is always a power of two.</li>
50- * <li>Load factor ~0.75; automatic rehash on growth.</li>
51- * <li>Keys are stored contiguously in a growing {@code char} slab; per-ord metadata store offset,
52- * length, and the 32-bit hash to avoid recomputing during rehash.</li>
53- * <li>Lookup uses a 16-bit fingerprint (top bits of the 32-bit hash), length check, then a
54- * {@code memcmp}-style verification against the slab.</li>
49+ * <li>Open addressing with linear probing; table capacity is always a power of two (fast masking).</li>
50+ * <li>Load factor ~0.75; automatic rehash on growth. Ords remain stable across rehash.</li>
51+ * <li>Keys are copied and stored contiguously in a growing {@code char} slab.</li>
52+ * <li>Per-ord metadata store offset, length, and the 32-bit hash. Length is stored as an
53+ * <em>unsigned</em> 16-bit value (supports key lengths 0..65535).</li>
54+ * <li>Lookup uses: (1) slot check, (2) length check, (3) 16-bit fingerprint derived from the
55+ * stored 32-bit hash (top 16 bits), then (4) a {@code memcmp}-style verification against the slab.</li>
5556 * <li>Hash function: Murmur3-32 over UTF-16 code units (robust distribution for short Latin keys).</li>
5657 * </ul>
5758 *
59+ * <p>Space/time trade-offs:
60+ * <ul>
61+ * <li>Compared to keeping a dedicated 16-bit fingerprint array per slot, this implementation saves
62+ * {@code 2 * capacity} bytes, but fingerprint checks may require an additional random load from
63+ * {@code keyHash[ord]} when length matches.</li>
64+ * <li>Use {@link #trimToSize()} after bulk build to reduce slack in metadata arrays, the slab, and
65+ * (optionally) the hash table itself.</li>
66+ * </ul>
67+ *
5868 * <p>Complexity:
5969 * <ul>
6070 * <li>Average-case insert/find: {@code O(1)} expected.</li>
@@ -74,11 +84,11 @@ public final class CharArrayHash
7484 /**
7585 * Constructs the hash with an expected number of unique keys.
7686 *
77- * <p>The table's initial capacity is chosen to keep the load factor ≤ 0.75 for the expected
87+ * <p>The table's initial capacity is chosen to keep the load factor <= 0.75 for the expected
7888 * number of unique keys. The internal key-slab and per-ord metadata arrays are also sized
7989 * heuristically based on {@code expectedSize} and will grow as needed.
8090 *
81- * @param expectedSize an estimate of the number of distinct keys to be added (must be ≥ 1; values ≤ 0 are treated as 1)
91+ * @param expectedSize an estimate of the number of distinct keys to be added (must be >= 1; values <= 0 are treated as 1)
8292 */
8393 public CharArrayHash (int expectedSize )
8494 {
@@ -87,15 +97,15 @@ public CharArrayHash(int expectedSize)
8797 while (cap < need ) cap <<= 1 ;
8898 table = new int [cap ];
8999 Arrays .fill (table , -1 ); // -1 = empty
90- fp16 = new short [cap ];
91100 mask = cap - 1 ;
92101
93102 // ord metadata (0-based ords). Reserve slot count = expected + slack.
94103 int metaCap = Math .max (8 , expectedSize );
95104 keyOff = new int [metaCap ];
96- keyLen = new int [metaCap ];
105+ keyLen = new short [metaCap ];
97106 keyHash = new int [metaCap ];
98107
108+ // Heuristic: small average token size; the slab grows geometrically.
99109 slab = new char [Math .max (16 , expectedSize * 4 )];
100110 }
101111
@@ -104,7 +114,7 @@ public CharArrayHash(int expectedSize)
104114 *
105115 * <p>This method implements the {@code BytesRefHash.add} return contract:
106116 * <ul>
107- * <li>If the key is new, returns its assigned 0-based ordinal (ord ≥ 0).</li>
117+ * <li>If the key is new, returns its assigned 0-based ordinal (ord >= 0).</li>
108118 * <li>If the key already exists, returns {@code -(ord) - 1} (a negative encoding of the existing ord).</li>
109119 * </ul>
110120 *
@@ -115,32 +125,37 @@ public CharArrayHash(int expectedSize)
115125 * @param len number of {@code char} code units to read
116126 * @return {@code ord} if the key was added; otherwise {@code -(ord)-1} if the key already existed
117127 * @throws IndexOutOfBoundsException if {@code off} or {@code len} are invalid for {@code key}
128+ * @throws IllegalArgumentException if {@code len} is greater than 65535
118129 */
119130 public int add (char [] key , int off , int len )
120131 {
121132 checkBounds (key , off , len );
122- int h = Murmur3 .hashChars (key , off , len , SEED );
133+ if (len > MAX_KEY_LENGTH ) throw new IllegalArgumentException ("key length > 65535: " + len );
134+
135+ final int h = Murmur3 .hashChars (key , off , len , SEED );
123136 int i = h & mask ;
124- short f = (short ) (h >>> 16 );
125137
126138 for (;;) {
127- int ord = table [i ];
139+ final int ord = table [i ];
128140 if (ord == -1 ) { // empty slot -> insert
129- ord = sizeOrds ; // 0-based
141+ final int newOrd = sizeOrds ; // 0-based
130142 sizeOrds ++;
143+
131144 ensureOrdCapacity (sizeOrds );
132- int base = appendToSlab (key , off , len );
133- keyOff [ord ] = base ;
134- keyLen [ord ] = len ;
135- keyHash [ord ] = h ;
145+ final int base = appendToSlab (key , off , len );
146+ keyOff [newOrd ] = base ;
147+ keyLen [newOrd ] = ( short ) len ; // stored unsigned
148+ keyHash [newOrd ] = h ;
136149
137- table [i ] = ord ;
138- fp16 [i ] = f ;
150+ table [i ] = newOrd ;
139151 if (++occupied > resizeThreshold ()) rehash (table .length << 1 );
140- return ord ; // new => >= 0
152+ return newOrd ; // new => >= 0
141153 }
142154
143- if (fp16 [i ] == f && keyLen [ord ] == len && equalsAt (ord , key , off , len )) {
155+ // Fast reject chain: length -> 16-bit fingerprint (top hash bits) -> full compare
156+ if (((keyLen [ord ] & 0xFFFF ) == len )
157+ && (((keyHash [ord ] ^ h ) & 0xFFFF_0000 ) == 0 )
158+ && equalsAt (ord , key , off , len )) {
144159 return -ord - 1 ; // existing => negative encoding
145160 }
146161 i = (i + 1 ) & mask ; // linear probe
@@ -159,28 +174,39 @@ public int add(char[] key, int off, int len)
159174 public int find (char [] key , int off , int len )
160175 {
161176 checkBounds (key , off , len );
162- int h = Murmur3 .hashChars (key , off , len , SEED );
177+ if (len > MAX_KEY_LENGTH ) return -1 ; // cannot exist (see add)
178+
179+ final int h = Murmur3 .hashChars (key , off , len , SEED );
163180 int i = h & mask ;
164- short f = ( short ) ( h >>> 16 );
181+
165182 for (;;) {
166- int ord = table [i ];
183+ final int ord = table [i ];
167184 if (ord == -1 ) return -1 ;
168- if (fp16 [i ] == f && keyLen [ord ] == len && equalsAt (ord , key , off , len )) return ord ;
185+
186+ if (((keyLen [ord ] & 0xFFFF ) == len )
187+ && (((keyHash [ord ] ^ h ) & 0xFFFF_0000 ) == 0 )
188+ && equalsAt (ord , key , off , len )) {
189+ return ord ;
190+ }
169191 i = (i + 1 ) & mask ;
170192 }
171193 }
172194
173195 /**
174196 * Returns the number of unique keys in the hash.
175197 *
176- * @return the number of assigned ords (≥ 0)
198+ * @return the number of assigned ords (>= 0)
177199 */
178200 public int size ()
179201 {
180202 return sizeOrds ;
181203 }
182204
183- // Access to stored keys (read-only)
205+ /**
206+ * Returns the underlying character slab storing all keys contiguously.
207+ *
208+ * <p>The valid range is {@code [0 .. slabUsed)}.
209+ */
184210 public char [] slab ()
185211 {
186212 return slab ;
@@ -189,7 +215,7 @@ public char[] slab()
189215 /**
190216 * Returns the starting offset of the key identified by the given ordinal within the slab.
191217 *
192- * @param ord the 0-based ordinal of the key (0 ≤ ord < {@link #size()})
218+ * @param ord the 0-based ordinal of the key (0 <= ord < {@link #size()})
193219 * @return the starting offset within {@link #slab()}
194220 * @throws IllegalArgumentException if {@code ord} is out of range
195221 */
@@ -202,29 +228,82 @@ public int keyOffset(int ord)
202228 /**
203229 * Returns the length (in {@code char} code units) of the key identified by the given ordinal.
204230 *
205- * @param ord the 0-based ordinal of the key (0 ≤ ord < {@link #size()})
231+ * @param ord the 0-based ordinal of the key (0 <= ord < {@link #size()})
206232 * @return the number of {@code char} code units of the key at {@link #keyOffset(int)}
207233 * @throws IllegalArgumentException if {@code ord} is out of range
208234 */
209235 public int keyLength (int ord )
210236 {
211237 checkOrd (ord );
212- return keyLen [ord ];
238+ return keyLen [ord ] & 0xFFFF ;
239+ }
240+
241+ /**
242+ * Shrinks internal storage to (approximately) the minimum needed for the current contents.
243+ *
244+ * <p>Intended for bulk-build workflows:
245+ * <ol>
246+ * <li>Create with a reasonable {@code expectedSize}.</li>
247+ * <li>Add all keys.</li>
248+ * <li>Call {@link #trimToSize()} once to reduce memory slack.</li>
249+ * </ol>
250+ *
251+ * <p>This method:
252+ * <ul>
253+ * <li>Trims the slab to {@code slabUsed}.</li>
254+ * <li>Trims per-ord metadata arrays to {@link #size()}.</li>
255+ * <li>Optionally shrinks the hash table to the smallest power-of-two capacity that can hold
256+ * {@link #size()} entries at the target load factor, rebuilding the table if it shrinks.</li>
257+ * </ul>
258+ *
259+ * <p>After trimming, further {@link #add(char[], int, int)} calls remain valid, but may grow
260+ * arrays again.
261+ */
262+ public void trimToSize ()
263+ {
264+ // 1) shrink slab
265+ if (slab .length != slabUsed ) {
266+ slab = Arrays .copyOf (slab , slabUsed );
267+ }
268+
269+ // 2) shrink per-ord metadata
270+ if (keyOff .length != sizeOrds ) {
271+ keyOff = Arrays .copyOf (keyOff , sizeOrds );
272+ keyLen = Arrays .copyOf (keyLen , sizeOrds );
273+ keyHash = Arrays .copyOf (keyHash , sizeOrds );
274+ }
275+
276+ // 3) shrink table (if possible)
277+ int need = (int ) Math .ceil (sizeOrds / LOAD_FACTOR );
278+ int cap = 1 ;
279+ while (cap < need ) cap <<= 1 ;
280+ if (cap < table .length ) {
281+ rehash (cap );
282+ }
283+ }
284+
285+ /**
286+ * Alias for {@link #trimToSize()} to emphasize the "bulk build then freeze" use-case.
287+ */
288+ public void freeze ()
289+ {
290+ trimToSize ();
213291 }
214292
215293 // ---- Internals -----------------------------------------------------------
216294
217295 private static final float LOAD_FACTOR = 0.75f ;
218296 private static final int SEED = 0x9747b28c ;
297+ private static final int MAX_KEY_LENGTH = 0xFFFF ;
219298
220- // Hash table: stores ords (>=0) or -1 if empty; 16-bit fingerprint for quick
221- // reject
299+ // Hash table: stores ords (>=0) or -1 if empty.
222300 private int [] table ;
223- private short [] fp16 ;
224301 private int mask ;
225302
226303 // Per-ord metadata (0-based ords)
227- private int [] keyOff , keyLen , keyHash ;
304+ private int [] keyOff ;
305+ private short [] keyLen ; // unsigned
306+ private int [] keyHash ;
228307
229308 // Key storage
230309 private char [] slab ;
@@ -251,7 +330,7 @@ private static void checkBounds(char[] a, int off, int len)
251330 * Validates that {@code ord} is a currently assigned ordinal.
252331 *
253332 * @param ord 0-based ordinal
254- * @throws IllegalArgumentException if {@code ord} is < 0 or ≥ {@link #size()}
333+ * @throws IllegalArgumentException if {@code ord} is < 0 or >= {@link #size()}
255334 */
256335 private void checkOrd (int ord )
257336 {
@@ -321,7 +400,7 @@ private int resizeThreshold()
321400 private void ensureOrdCapacity (int required )
322401 {
323402 if (required <= keyOff .length ) return ;
324- int cap = Math .max (required , keyOff .length + (keyOff .length >>> 1 ) + 16 );
403+ final int cap = Math .max (required , keyOff .length + (keyOff .length >>> 1 ) + 16 );
325404 keyOff = Arrays .copyOf (keyOff , cap );
326405 keyLen = Arrays .copyOf (keyLen , cap );
327406 keyHash = Arrays .copyOf (keyHash , cap );
@@ -337,23 +416,19 @@ private void ensureOrdCapacity(int required)
337416 */
338417 private void rehash (int newCap )
339418 {
340- int [] oldTable = table ;
341- // short[] oldFp = fp16;
419+ final int [] oldTable = table ;
342420 table = new int [newCap ];
343421 Arrays .fill (table , -1 );
344- fp16 = new short [newCap ];
345422 mask = newCap - 1 ;
346423 occupied = 0 ;
347424
348425 for (int i = 0 ; i < oldTable .length ; i ++) {
349- int ord = oldTable [i ];
426+ final int ord = oldTable [i ];
350427 if (ord < 0 ) continue ;
351- int h = keyHash [ord ];
428+ final int h = keyHash [ord ];
352429 int j = h & mask ;
353- short f = (short ) (h >>> 16 );
354430 while (table [j ] != -1 ) j = (j + 1 ) & mask ;
355431 table [j ] = ord ;
356- fp16 [j ] = f ;
357432 occupied ++;
358433 }
359434 }
@@ -401,7 +476,7 @@ static int hashChars(final char[] a, final int off, final int len, final int see
401476 k1 *= c2 ;
402477 h1 ^= k1 ;
403478 }
404- int bytes = len << 1 ;
479+ final int bytes = len << 1 ;
405480 h1 ^= bytes ;
406481 h1 ^= (h1 >>> 16 );
407482 h1 *= 0x85ebca6b ;
@@ -412,4 +487,4 @@ static int hashChars(final char[] a, final int off, final int len, final int see
412487 }
413488 }
414489
415- }
490+ }
0 commit comments