4848 * <li><b>{@code <field>.terms.off}</b> — native-endian {@code int} offsets into {@code .dat}</li>
4949 * </ul>
5050 * <p>
51- * Term ids are dense and stable for the frozen snapshot from which the lexicon was built.
52- * The id assignment follows the lexicographic iteration order returned by Lucene's merged
53- * {@link TermsEnum} for the field.
51+ * Term id 0 is reserved and represents the absence of a term (empty position).
52+ * Real term ids start at 1 and are dense and stable for the frozen snapshot from which the
53+ * lexicon was built. The id assignment follows the lexicographic iteration order returned by
54+ * Lucene's merged {@link TermsEnum} for the field. The reserved id is stored as a zero-length
55+ * phantom entry in the {@code .dat}/{@code .off} files so that {@link #term(int) term(0)}
56+ * returns {@code ""} and all sidecar files remain self-consistent.
5457 * </p>
5558 * <p>
5659 * This class implements {@link Closeable}. Closing attempts to release the memory-mapped
@@ -74,7 +77,7 @@ public final class TermLexicon implements Closeable {
7477 /** Indexed field for which this lexicon was built. */
7578 private final String field ;
7679
77- /** Exact immutable mapping from UTF-8 term bytes to dense term ids. */
80+ /** Exact immutable mapping from UTF-8 term bytes to dense term ids in {@code [1, vocabSize)}. */
7881 private final FST <Long > fst ;
7982
8083 /** Memory-mapped concatenation of all term bytes in term-id order. */
@@ -97,7 +100,7 @@ public final class TermLexicon implements Closeable {
97100 /** IntBuffer view over {@link #offBuf} for direct int access without byte arithmetic. */
98101 private final IntBuffer off ;
99102
100- /** Number of distinct terms in the field . */
103+ /** Number of entries including the reserved id 0; valid ids span {@code [0, vocabSize)}. */
101104 private final int vocabSize ;
102105
103106 /** Maximum tolerated mtime difference between the three lexicon files at open time, in milliseconds. */
@@ -325,7 +328,7 @@ public long fstRamBytesUsed() {
325328 * </p>
326329 *
327330 * @param term canonical indexed term form, must match the analyzer output exactly
328- * @return dense term id in {@code [0 , vocabSize)}, or {@code -1} if the term is absent
331+ * @return dense term id in {@code [1, vocabSize)}, or {@code -1} if the term is absent
329332 * @throws IOException if the FST read fails
330333 * @throws NullPointerException if {@code term} is null
331334 */
@@ -344,7 +347,7 @@ public int id(final String term) throws IOException {
344347 * </p>
345348 *
346349 * @param term canonical indexed term as UTF-8 bytes
347- * @return dense term id in {@code [0 , vocabSize)}, or {@code -1} if the term is absent
350+ * @return dense term id in {@code [1, vocabSize)}, or {@code -1} if the term is absent
348351 * @throws IOException if the FST read fails
349352 * @throws NullPointerException if {@code term} is null
350353 */
@@ -377,7 +380,8 @@ public Path indexDir() {
377380 * <ul>
378381 * <li>All three files must exist as regular files.</li>
379382 * <li>File modification times must be within {@value #MTIME_TOLERANCE_MS} ms of each other.</li>
380- * <li>The offsets file size must be a multiple of 4 bytes and contain at least 2 entries.</li>
383+ * <li>The offsets file size must be a multiple of 4 bytes and contain at least 3 entries
384+ * (the reserved phantom slot plus at least one real term).</li>
381385 * <li>The first offset must be 0 and the last must equal the data file size.</li>
382386 * <li>A bounded monotonicity check is run on the first and last {@value #MONO_CHECK} offsets.</li>
383387 * </ul>
@@ -411,8 +415,8 @@ public static TermLexicon open(final Path indexDir, final String field) throws I
411415 throw new IOException ("Invalid offsets file (size not a multiple of 4 bytes): " + offPath );
412416 }
413417 final IntBuffer off = offByteBuf .asIntBuffer ();
414- if (off .capacity () < 2 ) {
415- throw new IOException ("Invalid offsets file (need at least 2 offsets): " + offPath );
418+ if (off .capacity () < 3 ) {
419+ throw new IOException ("Invalid offsets file (need at least 3 offsets: phantom + one real term ): " + offPath );
416420 }
417421
418422 final int first = off .get (0 );
@@ -486,13 +490,14 @@ public static TermLexicon openOrBuild(final Path indexDir, final IndexReader rea
486490 /**
487491 * Returns the term string for one dense term id.
488492 * <p>
493+ * {@code term(0)} returns the empty string (reserved absent-term slot).
489494 * Uses a per-thread scratch buffer internally. Suitable for moderate use
490495 * (e.g. resolving 50 term ids for display). For tight loops over the full
491496 * vocabulary, prefer {@link #termBytes(int, BytesRefBuilder)} with a caller-owned buffer.
492497 * </p>
493498 *
494499 * @param termId dense term id in {@code [0, vocabSize)}
495- * @return decoded UTF-8 term string, never null
500+ * @return decoded UTF-8 term string, never null; empty for the reserved id 0
496501 * @throws IllegalArgumentException if {@code termId} is out of range
497502 */
498503 public String term (final int termId ) {
@@ -532,9 +537,13 @@ public BytesRef termBytes(final int termId, final BytesRefBuilder reuse) {
532537
533538
534539 /**
535- * Returns the number of distinct terms in the field.
540+ * Returns the number of entries in the lexicon, including the reserved id 0.
541+ * <p>
542+ * Real terms occupy ids {@code [1, vocabSize)}. The count of real terms
543+ * is therefore {@code vocabSize() - 1}.
544+ * </p>
536545 *
537- * @return vocabulary size (always > 0 for a valid lexicon)
546+ * @return vocabulary size (always {@code > 1} for a valid lexicon)
538547 */
539548 public int vocabSize () {
540549 return vocabSize ;
@@ -544,8 +553,10 @@ public int vocabSize() {
544553 * Builds the three persisted files from the field's merged term dictionary.
545554 * <p>
546555 * Iterates the {@link TermsEnum} in lexicographic order, assigning a dense id
547- * to each term starting from 0. The FST, concatenated term bytes, and native-endian
548- * offset array are written to the respective temporary paths.
556+ * to each term starting from 1. Id 0 is reserved as an absent-term sentinel:
557+ * the offset file begins with a zero-length phantom entry ({@code off[0] == off[1] == 0})
558+ * so that downstream consumers can use 0 to mean "no term at this position".
559+ * The FST stores canonical term ids directly — no post-lookup adjustment is needed.
549560 * </p>
550561 *
551562 * @param terms merged terms for the field
@@ -565,7 +576,7 @@ private static void buildFiles(
565576 new FSTCompiler .Builder <Long >(FST .INPUT_TYPE .BYTE1 , outputs ).build ();
566577 final IntsRefBuilder ints = new IntsRefBuilder ();
567578
568- int id = 0 ;
579+ int id = 1 ;
569580 int datPos = 0 ;
570581
571582 final ByteBuffer offBuf = ByteBuffer .allocate (OFF_BUF_INTS * 4 ).order (ByteOrder .nativeOrder ());
@@ -575,8 +586,9 @@ private static void buildFiles(
575586 FileChannel offCh = FileChannel .open (offPath ,
576587 StandardOpenOption .CREATE_NEW , StandardOpenOption .WRITE )) {
577588
578- offBuf .putInt (0 );
579-
589+ offBuf .putInt (0 ); // off[0]: start of phantom empty-term slot
590+ offBuf .putInt (0 ); // off[1]: end of phantom — zero length
591+
580592 final TermsEnum te = terms .iterator ();
581593 BytesRef term ;
582594 while ((term = te .next ()) != null ) {
0 commit comments