1717import java .util .Arrays ;
1818
1919/**
20- * Dependency-free hash dictionary of UTF-16 character sequences, with optional
21- * value association. Both keys and values are stored as char sequences and
22- * share the same ordinal space: a sequence is interned at most once,
23- * regardless of whether it appears as a key, a value, or both. This is a
24- * deliberate divergence from {@link java.util.Map}; it makes self-referential
25- * dictionaries (lemma maps, alias tables) cost the minimum.
20+ * Dependency-free hash dictionary of UTF-16 character sequences with stable
21+ * integer ordinals. Each distinct sequence is interned at most once and
22+ * receives a 0-based ord that does not change for the lifetime of this
23+ * instance.
2624 *
27- * <h2>Semantics</h2>
28- * <ul>
29- * <li>{@link #add(CharSequence) add(...)} interns a sequence and returns
30- * its ordinal. Idempotent.</li>
31- * <li>{@link #put(CharSequence, CharSequence) put(k, v)} interns both
32- * sequences if absent, then associates {@code k}'s ord with {@code v}'s
33- * ord. Replace-on-put: returns the previous value ord, or
34- * {@link #HAS_NO_VALUE} if the key had no association.</li>
35- * <li>{@link #ord(CharSequence) ord(...)} returns the ordinal of an existing
36- * sequence, or {@link #NOT_IN_DIC} if absent. Never inserts.</li>
37- * <li>{@link #valueOrd(CharSequence) valueOrd(...)} returns the ord of the
38- * value associated with a key. Returns {@link #NOT_IN_DIC} if the key
39- * sequence is not in the dictionary, or {@link #HAS_NO_VALUE} if it is
40- * in the dictionary but has no associated value.</li>
41- * <li>{@link #copy(int, char[], int) copy(ord, dst, off)} writes a
42- * sequence's chars into a caller-supplied buffer. Returns the char count
43- * written, or echoes a negative ord unchanged.</li>
44- * </ul>
45- *
46- * <p>Composition pattern for keyed lookup:</p>
47- * <pre>{@code
48- * int len = dic.copy(dic.valueOrd("-ce"), buf, 0);
49- * if (len < 0) {
50- * // -1 (NOT_IN_DIC): key absent, or
51- * // -2 (HAS_NO_VALUE): key present without value
52- * } else {
53- * // buf[0..len) holds the value
54- * }
55- * }</pre>
25+ * <p>This class provides set semantics only. Value associations on top of a
26+ * dictionary belong to companion classes that compose a {@code CharsDic} with
27+ * a parallel array (see {@code CharsMap}, {@code CharsFreq}). Composition
28+ * preserves the property that any sequence ever passed in (whether as a key,
29+ * a value, or a counter target) is interned exactly once and shares the same
30+ * ord space.</p>
5631 *
5732 * <h2>Implementation</h2>
5833 * <ul>
6237 * metadata holds slab offset and length. A 16-bit fingerprint per slot
6338 * rejects most probes before slab comparison.</li>
6439 * <li>Per-ord hashes are retained for rehashing without re-walking the slab.</li>
65- * <li>Per-ord value associations are held in a parallel {@code int[]},
66- * initialised to {@link #HAS_NO_VALUE}.</li>
6740 * <li>Hash function: Murmur3-32 over UTF-16 code units.</li>
6841 * </ul>
6942 *
70- * <p>Memory at <i>n</i> ords (rough): 16 bytes/ord (meta + termHash + values)
71- * plus ~8 bytes/slot in the open-addressing table at 0.75 load, plus the slab
43+ * <p>Memory at <i>n</i> ords (rough): 12 bytes/ord (meta + termHash) plus
44+ * ~8 bytes/slot in the open-addressing table at 0.75 load, plus the slab
7245 * itself (sum of all sequence lengths in chars).</p>
7346 *
7447 * <p>Thread-safety: not thread-safe under mutation. Concurrent reads are safe
7750public final class CharsDic
7851{
7952 /**
80- * Returned by {@link #valueOrd(int)} and related lookups when the key is
81- * present in the dictionary but has no associated value.
82- */
83- public static final int HAS_NO_VALUE = -2 ;
84-
85- /**
86- * Returned by {@link #ord(CharSequence)}, {@link #valueOrd(CharSequence)}
87- * and related lookups when the queried sequence is not in the dictionary.
88- * Also the value of {@link #copy(int, char[], int)} when the supplied ord
89- * is negative.
53+ * Returned by {@link #ord(CharSequence)} and related lookups when the
54+ * queried sequence is not in the dictionary. {@link #copy(int, char[], int)}
55+ * echoes it back unchanged, as it does for any negative ord.
9056 */
9157 public static final int NOT_IN_DIC = -1 ;
9258
@@ -139,9 +105,6 @@ public final class CharsDic
139105 /** Full 32-bit hash per ord, retained for rehashing. */
140106 private int [] termHash ;
141107
142- /** Per-ord associated value ord, initialised to {@link #HAS_NO_VALUE}. */
143- private int [] values ;
144-
145108 /**
146109 * Constructs the dictionary with an expected number of unique sequences.
147110 *
@@ -166,18 +129,12 @@ public CharsDic(int expectedSize)
166129 final int metaCap = Math .max (8 , expectedSize );
167130 meta = new long [metaCap ];
168131 termHash = new int [metaCap ];
169- values = new int [metaCap ];
170- Arrays .fill (values , HAS_NO_VALUE );
171132
172133 slab = new char [Math .max (16 , expectedSize * 4 )];
173134 }
174135
175136 /**
176- * Interns a sequence without setting any associated value.
177- *
178- * <p>Idempotent. If the sequence already has a value association from a
179- * previous {@link #put(CharSequence, CharSequence)}, that association is
180- * preserved.</p>
137+ * Interns a sequence.
181138 *
182139 * @param key source sequence (UTF-16 code units)
183140 * @return the assigned 0-based ord ({@code >= 0})
@@ -193,7 +150,7 @@ public int add(final CharSequence key)
193150 }
194151
195152 /**
196- * Interns a slice of a {@code char[]} without setting any associated value .
153+ * Interns a slice of a {@code char[]}.
197154 *
198155 * @param key source array (UTF-16 code units)
199156 * @param off start offset (inclusive)
@@ -211,10 +168,9 @@ public int add(final char[] key, final int off, final int len)
211168 }
212169
213170 /**
214- * Interns a slice of a {@link CharSequence} without setting any associated
215- * value.
171+ * Interns a slice of a {@link CharSequence}.
216172 *
217- * @param key source character sequence (UTF-16 code units)
173+ * @param key source character sequence
218174 * @param off start offset (inclusive)
219175 * @param len number of code units to read
220176 * @return the assigned 0-based ord ({@code >= 0})
@@ -246,8 +202,7 @@ public String asString(final int ord)
246202 }
247203
248204 /**
249- * Tells whether a sequence is interned, regardless of whether it appears
250- * as a key, a value, or both.
205+ * Tells whether a sequence is interned.
251206 *
252207 * @param key source sequence
253208 * @return true iff the sequence is in the dictionary
@@ -259,8 +214,7 @@ public boolean contains(final CharSequence key)
259214 }
260215
261216 /**
262- * Tells whether a slice of a {@code char[]} is interned, regardless of
263- * whether it appears as a key, a value, or both.
217+ * Tells whether a slice of a {@code char[]} is interned.
264218 *
265219 * @param key source array
266220 * @param off start offset
@@ -275,8 +229,7 @@ public boolean contains(final char[] key, final int off, final int len)
275229 }
276230
277231 /**
278- * Tells whether a slice of a {@link CharSequence} is interned, regardless
279- * of whether it appears as a key, a value, or both.
232+ * Tells whether a slice of a {@link CharSequence} is interned.
280233 *
281234 * @param key source sequence
282235 * @param off start offset
@@ -293,16 +246,10 @@ public boolean contains(final CharSequence key, final int off, final int len)
293246 /**
294247 * Copies the sequence stored at {@code ord} into a destination buffer.
295248 *
296- * <p>If {@code ord} is negative (typically a value returned by
297- * {@link #ord(CharSequence)} or {@link #valueOrd(CharSequence)} on a miss),
298- * the same negative value is returned and {@code dst} is left
299- * untouched. This lets callers compose lookups without an intermediate
300- * branch:</p>
301- *
302- * <pre>{@code
303- * int len = dic.copy(dic.valueOrd(key), buf, 0);
304- * if (len < 0) { ...miss... } else { ...buf[0..len)... }
305- * }</pre>
249+ * <p>If {@code ord} is negative (typically {@link #NOT_IN_DIC} from a
250+ * lookup miss), the same negative value is returned and {@code dst} is
251+ * left untouched. Callers can therefore chain {@code copy(ord(key), dst, 0)}
252+ * and test the single result: a negative value signals the miss.</p>
306253 *
307254 * @param ord ord to read; negative values pass through
308255 * @param dst destination array (must be non-null when {@code ord >= 0})
@@ -458,91 +405,7 @@ public int ord(final CharSequence key, final int off, final int len)
458405 }
459406
460407 /**
461- * Associates the value sequence with the key sequence. Both are interned
462- * if absent. Replace-on-put.
463- *
464- * @param key key sequence
465- * @param value value sequence
466- * @return the previous associated value ord, or {@link #HAS_NO_VALUE} if
467- * the key had no association before this call
468- * @throws NullPointerException if either argument is {@code null}
469- * @throws IllegalArgumentException if either length exceeds 65535
470- */
471- public int put (final CharSequence key , final CharSequence value )
472- {
473- if (key == null ) {
474- throw new NullPointerException ("key" );
475- }
476- if (value == null ) {
477- throw new NullPointerException ("value" );
478- }
479- return put (key , 0 , key .length (), value , 0 , value .length ());
480- }
481-
482- /**
483- * Associates a value-slice sequence with a key-slice sequence. Both are
484- * interned if absent. Replace-on-put.
485- *
486- * @param key key sequence
487- * @param keyOff key start offset
488- * @param keyLen key length
489- * @param value value sequence
490- * @param valueOff value start offset
491- * @param valueLen value length
492- * @return the previous associated value ord, or {@link #HAS_NO_VALUE} if
493- * the key had no association before this call
494- * @throws NullPointerException if either sequence is {@code null}
495- * @throws IndexOutOfBoundsException if any offset/length is invalid
496- * @throws IllegalArgumentException if either length exceeds 65535
497- */
498- public int put (
499- final CharSequence key , final int keyOff , final int keyLen ,
500- final CharSequence value , final int valueOff , final int valueLen )
501- {
502- checkBounds (key , keyOff , keyLen );
503- checkBounds (value , valueOff , valueLen );
504- checkLen (keyLen );
505- checkLen (valueLen );
506- final int kOrd = intern (null , keyOff , keyLen , key );
507- final int vOrd = intern (null , valueOff , valueLen , value );
508- final int prev = values [kOrd ];
509- values [kOrd ] = vOrd ;
510- return prev ;
511- }
512-
513- /**
514- * Associates a value-slice {@code char[]} with a key-slice {@code char[]}.
515- * Both are interned if absent. Replace-on-put.
516- *
517- * @param key key array
518- * @param keyOff key start offset
519- * @param keyLen key length
520- * @param value value array
521- * @param valueOff value start offset
522- * @param valueLen value length
523- * @return the previous associated value ord, or {@link #HAS_NO_VALUE} if
524- * the key had no association before this call
525- * @throws NullPointerException if either array is {@code null}
526- * @throws IndexOutOfBoundsException if any offset/length is invalid
527- * @throws IllegalArgumentException if either length exceeds 65535
528- */
529- public int put (
530- final char [] key , final int keyOff , final int keyLen ,
531- final char [] value , final int valueOff , final int valueLen )
532- {
533- checkBounds (key , keyOff , keyLen );
534- checkBounds (value , valueOff , valueLen );
535- checkLen (keyLen );
536- checkLen (valueLen );
537- final int kOrd = intern (key , keyOff , keyLen , null );
538- final int vOrd = intern (value , valueOff , valueLen , null );
539- final int prev = values [kOrd ];
540- values [kOrd ] = vOrd ;
541- return prev ;
542- }
543-
544- /**
545- * Returns the number of unique sequences interned (keys, values, or both).
408+ * Returns the number of unique sequences interned.
546409 *
547410 * @return number of assigned ords
548411 */
@@ -605,72 +468,9 @@ public void trimToSize()
605468 if (meta .length != sizeOrds ) {
606469 meta = Arrays .copyOf (meta , sizeOrds );
607470 termHash = Arrays .copyOf (termHash , sizeOrds );
608- values = Arrays .copyOf (values , sizeOrds );
609471 }
610472 }
611473
612- /**
613- * Returns the value-ord associated with a key sequence.
614- *
615- * @param key key sequence
616- * @return associated value ord, {@link #NOT_IN_DIC} if the key sequence
617- * is not in the dictionary, or {@link #HAS_NO_VALUE} if it is in
618- * the dictionary but has no associated value
619- * @throws NullPointerException if {@code key} is {@code null}
620- */
621- public int valueOrd (final CharSequence key )
622- {
623- final int o = ord (key );
624- return (o < 0 ) ? o : values [o ];
625- }
626-
627- /**
628- * Returns the value-ord associated with a key by direct ord lookup.
629- *
630- * @param keyOrd key ord
631- * @return associated value ord, or {@link #HAS_NO_VALUE} if no association
632- * @throws IllegalArgumentException if {@code keyOrd} is invalid
633- */
634- public int valueOrd (final int keyOrd )
635- {
636- checkOrd (keyOrd );
637- return values [keyOrd ];
638- }
639-
640- /**
641- * Returns the value-ord associated with a key {@code char[]} slice.
642- *
643- * @param key key array
644- * @param off start offset
645- * @param len number of code units
646- * @return associated value ord, {@link #NOT_IN_DIC} if the key is not in
647- * the dictionary, or {@link #HAS_NO_VALUE} if it has no association
648- * @throws NullPointerException if {@code key} is {@code null}
649- * @throws IndexOutOfBoundsException if {@code off}/{@code len} are invalid
650- */
651- public int valueOrd (final char [] key , final int off , final int len )
652- {
653- final int o = ord (key , off , len );
654- return (o < 0 ) ? o : values [o ];
655- }
656-
657- /**
658- * Returns the value-ord associated with a key {@link CharSequence} slice.
659- *
660- * @param key key sequence
661- * @param off start offset
662- * @param len number of code units
663- * @return associated value ord, {@link #NOT_IN_DIC} if the key is not in
664- * the dictionary, or {@link #HAS_NO_VALUE} if it has no association
665- * @throws NullPointerException if {@code key} is {@code null}
666- * @throws IndexOutOfBoundsException if {@code off}/{@code len} are invalid
667- */
668- public int valueOrd (final CharSequence key , final int off , final int len )
669- {
670- final int o = ord (key , off , len );
671- return (o < 0 ) ? o : values [o ];
672- }
673-
674474 /**
675475 * Appends a {@link CharSequence} slice to the slab.
676476 *
@@ -780,9 +580,6 @@ private void ensureOrdCapacity(final int required)
780580 final int cap = Math .max (required , meta .length + (meta .length >>> 1 ) + 16 );
781581 meta = Arrays .copyOf (meta , cap );
782582 termHash = Arrays .copyOf (termHash , cap );
783- final int oldLen = values .length ;
784- values = Arrays .copyOf (values , cap );
785- Arrays .fill (values , oldLen , cap , HAS_NO_VALUE );
786583 }
787584
788585 /**
@@ -873,8 +670,7 @@ private static int fmix32(int h1)
873670
874671 /**
875672 * Shared insertion core. Exactly one of {@code a} or {@code cs} must be
876- * non-null. Returns the ord whether newly assigned or pre-existing
877- * (B1 ingestion semantics).
673+ * non-null. Returns the ord whether newly assigned or pre-existing.
878674 *
879675 * @param a source array, or {@code null}
880676 * @param off offset in the chosen source
0 commit comments