Skip to content

Commit 083925c

Browse files
committed
first drafts
1 parent 3aee570 commit 083925c

3 files changed

Lines changed: 754 additions & 232 deletions

File tree

util/src/java/com/github/oeuvres/alix/util/CharsDic.java

Lines changed: 28 additions & 232 deletions
Original file line numberDiff line numberDiff line change
@@ -17,42 +17,17 @@
1717
import java.util.Arrays;
1818

1919
/**
20-
* Dependency-free hash dictionary of UTF-16 character sequences, with optional
21-
* value association. Both keys and values are stored as char sequences and
22-
* share the same ordinal space: a sequence is interned at most once,
23-
* regardless of whether it appears as a key, a value, or both. This is a
24-
* deliberate divergence from {@link java.util.Map}; it makes self-referential
25-
* dictionaries (lemma maps, alias tables) cost the minimum.
20+
* Dependency-free hash dictionary of UTF-16 character sequences with stable
21+
* integer ordinals. Each distinct sequence is interned at most once and
22+
* receives a 0-based ord that does not change for the lifetime of this
23+
* instance.
2624
*
27-
* <h2>Semantics</h2>
28-
* <ul>
29-
* <li>{@link #add(CharSequence) add(...)} interns a sequence and returns
30-
* its ordinal. Idempotent.</li>
31-
* <li>{@link #put(CharSequence, CharSequence) put(k, v)} interns both
32-
* sequences if absent, then associates {@code k}'s ord with {@code v}'s
33-
* ord. Replace-on-put: returns the previous value ord, or
34-
* {@link #HAS_NO_VALUE} if the key had no association.</li>
35-
* <li>{@link #ord(CharSequence) ord(...)} returns the ordinal of an existing
36-
* sequence, or {@link #NOT_IN_DIC} if absent. Never inserts.</li>
37-
* <li>{@link #valueOrd(CharSequence) valueOrd(...)} returns the ord of the
38-
* value associated with a key. Returns {@link #NOT_IN_DIC} if the key
39-
* sequence is not in the dictionary, or {@link #HAS_NO_VALUE} if it is
40-
* in the dictionary but has no associated value.</li>
41-
* <li>{@link #copy(int, char[], int) copy(ord, dst, off)} writes a
42-
* sequence's chars into a caller-supplied buffer. Returns the char count
43-
* written, or echoes a negative ord unchanged.</li>
44-
* </ul>
45-
*
46-
* <p>Composition pattern for keyed lookup:</p>
47-
* <pre>{@code
48-
* int len = dic.copy(dic.valueOrd("-ce"), buf, 0);
49-
* if (len < 0) {
50-
* // -1 (NOT_IN_DIC): key absent, or
51-
* // -2 (HAS_NO_VALUE): key present without value
52-
* } else {
53-
* // buf[0..len) holds the value
54-
* }
55-
* }</pre>
25+
* <p>This class provides set semantics only. Value associations on top of a
26+
* dictionary belong to companion classes that compose a {@code CharsDic} with
27+
* a parallel array (see {@code CharsMap}, {@code CharsFreq}). Composition
28+
* preserves the property that any sequence ever passed in (whether as a key,
29+
* a value, or a counter target) is interned exactly once and shares the same
30+
* ord space.</p>
5631
*
5732
* <h2>Implementation</h2>
5833
* <ul>
@@ -62,13 +37,11 @@
6237
* metadata holds slab offset and length. A 16-bit fingerprint per slot
6338
* rejects most probes before slab comparison.</li>
6439
* <li>Per-ord hashes are retained for rehashing without re-walking the slab.</li>
65-
* <li>Per-ord value associations are held in a parallel {@code int[]},
66-
* initialised to {@link #HAS_NO_VALUE}.</li>
6740
* <li>Hash function: Murmur3-32 over UTF-16 code units.</li>
6841
* </ul>
6942
*
70-
* <p>Memory at <i>n</i> ords (rough): 16 bytes/ord (meta + termHash + values)
71-
* plus ~8 bytes/slot in the open-addressing table at 0.75 load, plus the slab
43+
* <p>Memory at <i>n</i> ords (rough): 12 bytes/ord (meta + termHash) plus
44+
* ~8 bytes/slot in the open-addressing table at 0.75 load, plus the slab
7245
* itself (sum of all sequence lengths in chars).</p>
7346
*
7447
* <p>Thread-safety: not thread-safe under mutation. Concurrent reads are safe
@@ -77,16 +50,9 @@
7750
public final class CharsDic
7851
{
7952
/**
80-
* Returned by {@link #valueOrd(int)} and related lookups when the key is
81-
* present in the dictionary but has no associated value.
82-
*/
83-
public static final int HAS_NO_VALUE = -2;
84-
85-
/**
86-
* Returned by {@link #ord(CharSequence)}, {@link #valueOrd(CharSequence)}
87-
* and related lookups when the queried sequence is not in the dictionary.
88-
* Also the value of {@link #copy(int, char[], int)} when the supplied ord
89-
* is negative.
53+
* Returned by {@link #ord(CharSequence)} and related lookups when the
54+
* queried sequence is not in the dictionary. Also returned by
55+
* {@link #copy(int, char[], int)} when this value is passed as the ord (any negative ord is echoed back unchanged).
9056
*/
9157
public static final int NOT_IN_DIC = -1;
9258

@@ -139,9 +105,6 @@ public final class CharsDic
139105
/** Full 32-bit hash per ord, retained for rehashing. */
140106
private int[] termHash;
141107

142-
/** Per-ord associated value ord, initialised to {@link #HAS_NO_VALUE}. */
143-
private int[] values;
144-
145108
/**
146109
* Constructs the dictionary with an expected number of unique sequences.
147110
*
@@ -166,18 +129,12 @@ public CharsDic(int expectedSize)
166129
final int metaCap = Math.max(8, expectedSize);
167130
meta = new long[metaCap];
168131
termHash = new int[metaCap];
169-
values = new int[metaCap];
170-
Arrays.fill(values, HAS_NO_VALUE);
171132

172133
slab = new char[Math.max(16, expectedSize * 4)];
173134
}
174135

175136
/**
176-
* Interns a sequence without setting any associated value.
177-
*
178-
* <p>Idempotent. If the sequence already has a value association from a
179-
* previous {@link #put(CharSequence, CharSequence)}, that association is
180-
* preserved.</p>
137+
* Interns a sequence.
181138
*
182139
* @param key source sequence (UTF-16 code units)
183140
* @return the assigned 0-based ord ({@code >= 0})
@@ -193,7 +150,7 @@ public int add(final CharSequence key)
193150
}
194151

195152
/**
196-
* Interns a slice of a {@code char[]} without setting any associated value.
153+
* Interns a slice of a {@code char[]}.
197154
*
198155
* @param key source array (UTF-16 code units)
199156
* @param off start offset (inclusive)
@@ -211,10 +168,9 @@ public int add(final char[] key, final int off, final int len)
211168
}
212169

213170
/**
214-
* Interns a slice of a {@link CharSequence} without setting any associated
215-
* value.
171+
* Interns a slice of a {@link CharSequence}.
216172
*
217-
* @param key source character sequence (UTF-16 code units)
173+
* @param key source character sequence
218174
* @param off start offset (inclusive)
219175
* @param len number of code units to read
220176
* @return the assigned 0-based ord ({@code >= 0})
@@ -246,8 +202,7 @@ public String asString(final int ord)
246202
}
247203

248204
/**
249-
* Tells whether a sequence is interned, regardless of whether it appears
250-
* as a key, a value, or both.
205+
* Tells whether a sequence is interned.
251206
*
252207
* @param key source sequence
253208
* @return true iff the sequence is in the dictionary
@@ -259,8 +214,7 @@ public boolean contains(final CharSequence key)
259214
}
260215

261216
/**
262-
* Tells whether a slice of a {@code char[]} is interned, regardless of
263-
* whether it appears as a key, a value, or both.
217+
* Tells whether a slice of a {@code char[]} is interned.
264218
*
265219
* @param key source array
266220
* @param off start offset
@@ -275,8 +229,7 @@ public boolean contains(final char[] key, final int off, final int len)
275229
}
276230

277231
/**
278-
* Tells whether a slice of a {@link CharSequence} is interned, regardless
279-
* of whether it appears as a key, a value, or both.
232+
* Tells whether a slice of a {@link CharSequence} is interned.
280233
*
281234
* @param key source sequence
282235
* @param off start offset
@@ -293,16 +246,10 @@ public boolean contains(final CharSequence key, final int off, final int len)
293246
/**
294247
* Copies the sequence stored at {@code ord} into a destination buffer.
295248
*
296-
* <p>If {@code ord} is negative (typically a value returned by
297-
* {@link #ord(CharSequence)} or {@link #valueOrd(CharSequence)} on a miss),
298-
* the same negative value is returned and {@code dst} is left
299-
* untouched. This lets callers compose lookups without an intermediate
300-
* branch:</p>
301-
*
302-
* <pre>{@code
303-
* int len = dic.copy(dic.valueOrd(key), buf, 0);
304-
* if (len < 0) { ...miss... } else { ...buf[0..len)... }
305-
* }</pre>
249+
* <p>If {@code ord} is negative (typically {@link #NOT_IN_DIC} from a
250+
* lookup miss), the same negative value is returned and {@code dst} is
251+
* left untouched. This lets callers compose lookups without an
252+
* intermediate branch.</p>
306253
*
307254
* @param ord ord to read; negative values pass through
308255
* @param dst destination array (must be non-null when {@code ord >= 0})
@@ -458,91 +405,7 @@ public int ord(final CharSequence key, final int off, final int len)
458405
}
459406

460407
/**
461-
* Associates the value sequence with the key sequence. Both are interned
462-
* if absent. Replace-on-put.
463-
*
464-
* @param key key sequence
465-
* @param value value sequence
466-
* @return the previous associated value ord, or {@link #HAS_NO_VALUE} if
467-
* the key had no association before this call
468-
* @throws NullPointerException if either argument is {@code null}
469-
* @throws IllegalArgumentException if either length exceeds 65535
470-
*/
471-
public int put(final CharSequence key, final CharSequence value)
472-
{
473-
if (key == null) {
474-
throw new NullPointerException("key");
475-
}
476-
if (value == null) {
477-
throw new NullPointerException("value");
478-
}
479-
return put(key, 0, key.length(), value, 0, value.length());
480-
}
481-
482-
/**
483-
* Associates a value-slice sequence with a key-slice sequence. Both are
484-
* interned if absent. Replace-on-put.
485-
*
486-
* @param key key sequence
487-
* @param keyOff key start offset
488-
* @param keyLen key length
489-
* @param value value sequence
490-
* @param valueOff value start offset
491-
* @param valueLen value length
492-
* @return the previous associated value ord, or {@link #HAS_NO_VALUE} if
493-
* the key had no association before this call
494-
* @throws NullPointerException if either sequence is {@code null}
495-
* @throws IndexOutOfBoundsException if any offset/length is invalid
496-
* @throws IllegalArgumentException if either length exceeds 65535
497-
*/
498-
public int put(
499-
final CharSequence key, final int keyOff, final int keyLen,
500-
final CharSequence value, final int valueOff, final int valueLen)
501-
{
502-
checkBounds(key, keyOff, keyLen);
503-
checkBounds(value, valueOff, valueLen);
504-
checkLen(keyLen);
505-
checkLen(valueLen);
506-
final int kOrd = intern(null, keyOff, keyLen, key);
507-
final int vOrd = intern(null, valueOff, valueLen, value);
508-
final int prev = values[kOrd];
509-
values[kOrd] = vOrd;
510-
return prev;
511-
}
512-
513-
/**
514-
* Associates a value-slice {@code char[]} with a key-slice {@code char[]}.
515-
* Both are interned if absent. Replace-on-put.
516-
*
517-
* @param key key array
518-
* @param keyOff key start offset
519-
* @param keyLen key length
520-
* @param value value array
521-
* @param valueOff value start offset
522-
* @param valueLen value length
523-
* @return the previous associated value ord, or {@link #HAS_NO_VALUE} if
524-
* the key had no association before this call
525-
* @throws NullPointerException if either array is {@code null}
526-
* @throws IndexOutOfBoundsException if any offset/length is invalid
527-
* @throws IllegalArgumentException if either length exceeds 65535
528-
*/
529-
public int put(
530-
final char[] key, final int keyOff, final int keyLen,
531-
final char[] value, final int valueOff, final int valueLen)
532-
{
533-
checkBounds(key, keyOff, keyLen);
534-
checkBounds(value, valueOff, valueLen);
535-
checkLen(keyLen);
536-
checkLen(valueLen);
537-
final int kOrd = intern(key, keyOff, keyLen, null);
538-
final int vOrd = intern(value, valueOff, valueLen, null);
539-
final int prev = values[kOrd];
540-
values[kOrd] = vOrd;
541-
return prev;
542-
}
543-
544-
/**
545-
* Returns the number of unique sequences interned (keys, values, or both).
408+
* Returns the number of unique sequences interned.
546409
*
547410
* @return number of assigned ords
548411
*/
@@ -605,72 +468,9 @@ public void trimToSize()
605468
if (meta.length != sizeOrds) {
606469
meta = Arrays.copyOf(meta, sizeOrds);
607470
termHash = Arrays.copyOf(termHash, sizeOrds);
608-
values = Arrays.copyOf(values, sizeOrds);
609471
}
610472
}
611473

612-
/**
613-
* Returns the value-ord associated with a key sequence.
614-
*
615-
* @param key key sequence
616-
* @return associated value ord, {@link #NOT_IN_DIC} if the key sequence
617-
* is not in the dictionary, or {@link #HAS_NO_VALUE} if it is in
618-
* the dictionary but has no associated value
619-
* @throws NullPointerException if {@code key} is {@code null}
620-
*/
621-
public int valueOrd(final CharSequence key)
622-
{
623-
final int o = ord(key);
624-
return (o < 0) ? o : values[o];
625-
}
626-
627-
/**
628-
* Returns the value-ord associated with a key by direct ord lookup.
629-
*
630-
* @param keyOrd key ord
631-
* @return associated value ord, or {@link #HAS_NO_VALUE} if no association
632-
* @throws IllegalArgumentException if {@code keyOrd} is invalid
633-
*/
634-
public int valueOrd(final int keyOrd)
635-
{
636-
checkOrd(keyOrd);
637-
return values[keyOrd];
638-
}
639-
640-
/**
641-
* Returns the value-ord associated with a key {@code char[]} slice.
642-
*
643-
* @param key key array
644-
* @param off start offset
645-
* @param len number of code units
646-
* @return associated value ord, {@link #NOT_IN_DIC} if the key is not in
647-
* the dictionary, or {@link #HAS_NO_VALUE} if it has no association
648-
* @throws NullPointerException if {@code key} is {@code null}
649-
* @throws IndexOutOfBoundsException if {@code off}/{@code len} are invalid
650-
*/
651-
public int valueOrd(final char[] key, final int off, final int len)
652-
{
653-
final int o = ord(key, off, len);
654-
return (o < 0) ? o : values[o];
655-
}
656-
657-
/**
658-
* Returns the value-ord associated with a key {@link CharSequence} slice.
659-
*
660-
* @param key key sequence
661-
* @param off start offset
662-
* @param len number of code units
663-
* @return associated value ord, {@link #NOT_IN_DIC} if the key is not in
664-
* the dictionary, or {@link #HAS_NO_VALUE} if it has no association
665-
* @throws NullPointerException if {@code key} is {@code null}
666-
* @throws IndexOutOfBoundsException if {@code off}/{@code len} are invalid
667-
*/
668-
public int valueOrd(final CharSequence key, final int off, final int len)
669-
{
670-
final int o = ord(key, off, len);
671-
return (o < 0) ? o : values[o];
672-
}
673-
674474
/**
675475
* Appends a {@link CharSequence} slice to the slab.
676476
*
@@ -780,9 +580,6 @@ private void ensureOrdCapacity(final int required)
780580
final int cap = Math.max(required, meta.length + (meta.length >>> 1) + 16);
781581
meta = Arrays.copyOf(meta, cap);
782582
termHash = Arrays.copyOf(termHash, cap);
783-
final int oldLen = values.length;
784-
values = Arrays.copyOf(values, cap);
785-
Arrays.fill(values, oldLen, cap, HAS_NO_VALUE);
786583
}
787584

788585
/**
@@ -873,8 +670,7 @@ private static int fmix32(int h1)
873670

874671
/**
875672
* Shared insertion core. Exactly one of {@code a} or {@code cs} must be
876-
* non-null. Returns the ord whether newly assigned or pre-existing
877-
* (B1 ingestion semantics).
673+
* non-null. Returns the ord whether newly assigned or pre-existing.
878674
*
879675
* @param a source array, or {@code null}
880676
* @param off offset in the chosen source

0 commit comments

Comments
 (0)