Skip to content

Commit 2aedc91

Browse files
committed
[1, vocabSize)
1 parent df0cc07 commit 2aedc91

1 file changed

Lines changed: 30 additions & 18 deletions

File tree

common/src/java/com/github/oeuvres/alix/lucene/terms/TermLexicon.java

Lines changed: 30 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,12 @@
4848
* <li><b>{@code <field>.terms.off}</b> — native-endian {@code int} offsets into {@code .dat}</li>
4949
* </ul>
5050
* <p>
51-
* Term ids are dense and stable for the frozen snapshot from which the lexicon was built.
52-
* The id assignment follows the lexicographic iteration order returned by Lucene's merged
53-
* {@link TermsEnum} for the field.
51+
* Term id 0 is reserved and represents the absence of a term (empty position).
52+
* Real term ids start at 1 and are dense and stable for the frozen snapshot from which the
53+
* lexicon was built. The id assignment follows the lexicographic iteration order returned by
54+
* Lucene's merged {@link TermsEnum} for the field. The reserved id is stored as a zero-length
55+
* phantom entry in the {@code .dat}/{@code .off} files so that {@link #term(int) term(0)}
56+
* returns {@code ""} and all sidecar files remain self-consistent.
5457
* </p>
5558
* <p>
5659
* This class implements {@link Closeable}. Closing attempts to release the memory-mapped
@@ -74,7 +77,7 @@ public final class TermLexicon implements Closeable {
7477
/** Indexed field for which this lexicon was built. */
7578
private final String field;
7679

77-
/** Exact immutable mapping from UTF-8 term bytes to dense term ids. */
80+
/** Exact immutable mapping from UTF-8 term bytes to dense term ids in {@code [1, vocabSize)}. */
7881
private final FST<Long> fst;
7982

8083
/** Memory-mapped concatenation of all term bytes in term-id order. */
@@ -97,7 +100,7 @@ public final class TermLexicon implements Closeable {
97100
/** IntBuffer view over {@link #offBuf} for direct int access without byte arithmetic. */
98101
private final IntBuffer off;
99102

100-
/** Number of distinct terms in the field. */
103+
/** Number of entries including the reserved id 0; valid ids span {@code [0, vocabSize)}. */
101104
private final int vocabSize;
102105

103106
/** Maximum tolerated mtime difference between the three lexicon files at open time, in milliseconds. */
@@ -325,7 +328,7 @@ public long fstRamBytesUsed() {
325328
* </p>
326329
*
327330
* @param term canonical indexed term form, must match the analyzer output exactly
328-
* @return dense term id in {@code [0, vocabSize)}, or {@code -1} if the term is absent
331+
* @return dense term id in {@code [1, vocabSize)}, or {@code -1} if the term is absent
329332
* @throws IOException if the FST read fails
330333
* @throws NullPointerException if {@code term} is null
331334
*/
@@ -344,7 +347,7 @@ public int id(final String term) throws IOException {
344347
* </p>
345348
*
346349
* @param term canonical indexed term as UTF-8 bytes
347-
* @return dense term id in {@code [0, vocabSize)}, or {@code -1} if the term is absent
350+
* @return dense term id in {@code [1, vocabSize)}, or {@code -1} if the term is absent
348351
* @throws IOException if the FST read fails
349352
* @throws NullPointerException if {@code term} is null
350353
*/
@@ -377,7 +380,8 @@ public Path indexDir() {
377380
* <ul>
378381
* <li>All three files must exist as regular files.</li>
379382
* <li>File modification times must be within {@value #MTIME_TOLERANCE_MS} ms of each other.</li>
380-
* <li>The offsets file size must be a multiple of 4 bytes and contain at least 2 entries.</li>
383+
* <li>The offsets file size must be a multiple of 4 bytes, and the file must contain at least 3 offsets
384+
* (delimiting the reserved phantom slot plus at least one real term).</li>
381385
* <li>The first offset must be 0 and the last must equal the data file size.</li>
382386
* <li>A bounded monotonicity check is run on the first and last {@value #MONO_CHECK} offsets.</li>
383387
* </ul>
@@ -411,8 +415,8 @@ public static TermLexicon open(final Path indexDir, final String field) throws I
411415
throw new IOException("Invalid offsets file (size not a multiple of 4 bytes): " + offPath);
412416
}
413417
final IntBuffer off = offByteBuf.asIntBuffer();
414-
if (off.capacity() < 2) {
415-
throw new IOException("Invalid offsets file (need at least 2 offsets): " + offPath);
418+
if (off.capacity() < 3) {
419+
throw new IOException("Invalid offsets file (need at least 3 offsets: phantom + one real term): " + offPath);
416420
}
417421

418422
final int first = off.get(0);
@@ -486,13 +490,14 @@ public static TermLexicon openOrBuild(final Path indexDir, final IndexReader rea
486490
/**
487491
* Returns the term string for one dense term id.
488492
* <p>
493+
* {@code term(0)} returns the empty string (reserved absent-term slot).
489494
* Uses a per-thread scratch buffer internally. Suitable for moderate use
490495
* (e.g. resolving 50 term ids for display). For tight loops over the full
491496
* vocabulary, prefer {@link #termBytes(int, BytesRefBuilder)} with a caller-owned buffer.
492497
* </p>
493498
*
494499
* @param termId dense term id in {@code [0, vocabSize)}
495-
* @return decoded UTF-8 term string, never null
500+
* @return decoded UTF-8 term string, never null; empty for the reserved id 0
496501
* @throws IllegalArgumentException if {@code termId} is out of range
497502
*/
498503
public String term(final int termId) {
@@ -532,9 +537,13 @@ public BytesRef termBytes(final int termId, final BytesRefBuilder reuse) {
532537

533538

534539
/**
535-
* Returns the number of distinct terms in the field.
540+
* Returns the number of entries in the lexicon, including the reserved id 0.
541+
* <p>
542+
* Real terms occupy ids {@code [1, vocabSize)}. The count of real terms
543+
* is therefore {@code vocabSize() - 1}.
544+
* </p>
536545
*
537-
* @return vocabulary size (always &gt; 0 for a valid lexicon)
546+
* @return vocabulary size (always &gt; 1 for a valid lexicon)
538547
*/
539548
public int vocabSize() {
540549
return vocabSize;
@@ -544,8 +553,10 @@ public int vocabSize() {
544553
* Builds the three persisted files from the field's merged term dictionary.
545554
* <p>
546555
* Iterates the {@link TermsEnum} in lexicographic order, assigning a dense id
547-
* to each term starting from 0. The FST, concatenated term bytes, and native-endian
548-
* offset array are written to the respective temporary paths.
556+
* to each term starting from 1. Id 0 is reserved as an absent-term sentinel:
557+
* the offsets file begins with a zero-length phantom entry ({@code off[0] == off[1] == 0})
558+
* so that downstream consumers can use 0 to mean "no term at this position".
559+
* The FST stores canonical term ids directly — no post-lookup adjustment is needed.
549560
* </p>
550561
*
551562
* @param terms merged terms for the field
@@ -565,7 +576,7 @@ private static void buildFiles(
565576
new FSTCompiler.Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs).build();
566577
final IntsRefBuilder ints = new IntsRefBuilder();
567578

568-
int id = 0;
579+
int id = 1;
569580
int datPos = 0;
570581

571582
final ByteBuffer offBuf = ByteBuffer.allocate(OFF_BUF_INTS * 4).order(ByteOrder.nativeOrder());
@@ -575,8 +586,9 @@ private static void buildFiles(
575586
FileChannel offCh = FileChannel.open(offPath,
576587
StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE)) {
577588

578-
offBuf.putInt(0);
579-
589+
offBuf.putInt(0); // off[0]: start of phantom empty-term slot
590+
offBuf.putInt(0); // off[1]: end of phantom — zero length
591+
580592
final TermsEnum te = terms.iterator();
581593
BytesRef term;
582594
while ((term = te.next()) != null) {

0 commit comments

Comments
 (0)