2020import java .io .OutputStream ;
2121import java .nio .ByteBuffer ;
2222import java .nio .ByteOrder ;
23- import java .nio .LongBuffer ;
23+ import java .nio .IntBuffer ;
2424import java .nio .channels .FileChannel ;
2525import java .nio .file .FileAlreadyExistsException ;
2626import java .nio .file .Files ;
3535/**
3636 * Immutable lookup table for one indexed field of one frozen Lucene directory.
3737 * <p>
38- * The lexicon is stored in three files located directly in the Lucene directory:
38+ * The lexicon is persisted in three files located directly in the Lucene directory:
3939 * </p>
4040 * <ul>
4141 * <li><b>{@code <field>.terms.fst}</b>: exact lookup {@code term -> termId}</li>
4242 * <li><b>{@code <field>.terms.dat}</b>: concatenated UTF-8 bytes of all terms in {@code termId} order</li>
43- * <li><b>{@code <field>.terms.off}</b>: big-endian {@code long } offsets into {@code .dat}</li>
43+ * <li><b>{@code <field>.terms.off}</b>: big-endian {@code int} offsets into {@code .dat}</li>
4444 * </ul>
4545 * <p>
4646 * Term ids are dense and stable for the frozen snapshot from which the lexicon was built.
4747 * The id assignment is the lexicographic iteration order returned by Lucene's merged
4848 * {@link TermsEnum} for the field.
4949 * </p>
5050 * <p>
51- * This class exposes only two lifecycle operations:
51+ * This class intentionally exposes only two lifecycle operations:
5252 * </p>
5353 * <ul>
5454 * <li>{@link #write(Path, String)} or {@link #write(Path, IndexReader, String)} create the three files once</li>
5555 * <li>{@link #open(Path, String)} opens the persisted lexicon for lookup</li>
5656 * </ul>
5757 * <p>
58- * The string-based lookup assumes that the caller already provides the field's canonical indexed form.
58+ * String lookup assumes that the caller already provides the field's canonical indexed form.
5959 * No analysis, normalization, stemming or lower-casing is applied here.
6060 * </p>
61+ * <p>
62+ * This KISS implementation memory-maps the whole {@code .dat} file in one {@link ByteBuffer}.
63+ * Consequently, it supports only lexicons whose {@code .dat} length fits in a signed 32-bit integer.
64+ * </p>
6165 */
6266public final class TermLexicon {
6367 /** Lucene directory that contains both the index and the {@code <field>.terms.*} files. */
@@ -79,7 +83,7 @@ public final class TermLexicon {
7983 * the term bytes are stored in {@code dat[off[i] .. off[i+1])}.
8084 * </p>
8185 */
82- private final LongBuffer off ;
86+ private final IntBuffer off ;
8387
8488 /** Number of distinct terms in the field. */
8589 private final int vocabSize ;
@@ -100,15 +104,15 @@ public final class TermLexicon {
100104 * @param indexDir Lucene directory that contains the lexicon files
101105 * @param field indexed field name
102106 * @param fst exact mapping {@code term -> termId}
103- * @param dat memory-mapped term bytes file
107+ * @param dat memory-mapped term-bytes file
104108 * @param off memory-mapped offsets file
105109 */
106110 private TermLexicon (
107111 final Path indexDir ,
108112 final String field ,
109113 final FST <Long > fst ,
110114 final ByteBuffer dat ,
111- final LongBuffer off
115+ final IntBuffer off
112116 ) {
113117 this .indexDir = indexDir ;
114118 this .field = field ;
@@ -118,6 +122,24 @@ private TermLexicon(
118122 this .vocabSize = off .capacity () - 1 ;
119123 }
120124
125+ /**
126+ * Returns {@code true} if the three persisted files for {@code field} exist as regular files.
127+ * <p>
128+ * This is a cheap presence test only. It does not validate sizes, mtimes or file contents.
129+ * </p>
130+ *
131+ * @param indexDir Lucene directory
132+ * @param field indexed field name
133+ * @return {@code true} if {@code .fst}, {@code .dat} and {@code .off} are present
134+ */
135+ public static boolean exists (final Path indexDir , final String field ) {
136+ Objects .requireNonNull (indexDir , "indexDir" );
137+ Objects .requireNonNull (field , "field" );
138+ return Files .isRegularFile (fstPath (indexDir , field ))
139+ && Files .isRegularFile (datPath (indexDir , field ))
140+ && Files .isRegularFile (offPath (indexDir , field ));
141+ }
142+
121143 /**
122144 * Builds the lexicon files for one field using the latest committed state of the Lucene directory.
123145 *
@@ -208,18 +230,18 @@ public static TermLexicon open(final Path indexDir, final String field) throws I
208230
209231 final ByteBuffer dat = mapReadOnly (datPath );
210232 final ByteBuffer offBytes = mapReadOnly (offPath ).order (ByteOrder .BIG_ENDIAN );
211- if ((offBytes .remaining () & 7 ) != 0 ) {
212- throw new IOException ("Invalid offsets file (size is not a multiple of 8 bytes): " + offPath );
233+ if ((offBytes .remaining () & 3 ) != 0 ) {
234+ throw new IOException ("Invalid offsets file (size is not a multiple of 4 bytes): " + offPath );
213235 }
214- final LongBuffer off = offBytes .asLongBuffer ();
236+ final IntBuffer off = offBytes .asIntBuffer ();
215237 if (off .capacity () < 2 ) {
216238 throw new IOException ("Invalid offsets file (need at least 2 offsets): " + offPath );
217239 }
218240
219- final long datLength = dat .capacity ();
220- final long first = off .get (0 );
221- final long last = off .get (off .capacity () - 1 );
222- if (first != 0L ) {
241+ final int datLength = dat .capacity ();
242+ final int first = off .get (0 );
243+ final int last = off .get (off .capacity () - 1 );
244+ if (first != 0 ) {
223245 throw new IOException ("Invalid offsets file, off[0] != 0: " + offPath );
224246 }
225247 if (last != datLength ) {
@@ -260,6 +282,18 @@ public int vocabSize() {
260282 return vocabSize ;
261283 }
262284
285+ /**
286+ * Returns the in-memory size of the loaded FST, in bytes.
287+ * <p>
288+ * This does not include the memory-mapped {@code .dat} and {@code .off} files.
289+ * </p>
290+ *
291+ * @return FST heap usage in bytes
292+ */
293+ public long fstRamBytesUsed () {
294+ return fst .ramBytesUsed ();
295+ }
296+
263297 /**
264298 * Looks up a canonical indexed term represented as a Java string.
265299 *
@@ -299,16 +333,16 @@ public BytesRef termBytes(final int termId, final BytesRefBuilder reuse) {
299333 checkTermId (termId );
300334 Objects .requireNonNull (reuse , "reuse" );
301335
302- final long start = off .get (termId );
303- final long end = off .get (termId + 1 );
304- final int length = toIntExact ( end - start ) ;
336+ final int start = off .get (termId );
337+ final int end = off .get (termId + 1 );
338+ final int length = end - start ;
305339
306340 reuse .grow (length );
307341 final byte [] dst = reuse .bytes ();
308342
309343 final ByteBuffer dup = dat .duplicate ();
310- dup .position (toIntExact ( start ) );
311- dup .limit (toIntExact ( end ) );
344+ dup .position (start );
345+ dup .limit (end );
312346 dup .get (dst , 0 , length );
313347
314348 reuse .setLength (length );
@@ -360,24 +394,32 @@ private static void buildFiles(
360394 final FSTCompiler <Long > compiler = new FSTCompiler .Builder <Long >(FST .INPUT_TYPE .BYTE1 , outputs ).build ();
361395 final IntsRefBuilder ints = new IntsRefBuilder ();
362396
363- long id = 0L ;
364- long datPos = 0L ;
397+ int id = 0 ;
398+ int datPos = 0 ;
365399
366400 try (OutputStream datOs = new BufferedOutputStream (Files .newOutputStream (datPath , StandardOpenOption .CREATE_NEW ));
367401 DataOutputStream offOut = new DataOutputStream (
368402 new BufferedOutputStream (Files .newOutputStream (offPath , StandardOpenOption .CREATE_NEW ))
369403 )) {
370404
371- offOut .writeLong ( 0L );
405+ offOut .writeInt ( 0 );
372406
373407 final TermsEnum te = terms .iterator ();
374408 BytesRef term ;
375409 while ((term = te .next ()) != null ) {
376- compiler .add (Util .toIntsRef (term , ints ), id ++);
410+ if (id == Integer .MAX_VALUE ) {
411+ throw new IOException ("Too many terms for int term ids in field lexicon" );
412+ }
413+ if (datPos > Integer .MAX_VALUE - term .length ) {
414+ throw new IOException ("Term bytes file would exceed 2GB; this implementation uses 32-bit offsets" );
415+ }
416+
417+ compiler .add (Util .toIntsRef (term , ints ), (long ) id );
377418
378419 datOs .write (term .bytes , term .offset , term .length );
379420 datPos += term .length ;
380- offOut .writeLong (datPos );
421+ offOut .writeInt (datPos );
422+ id ++;
381423 }
382424 }
383425
@@ -440,7 +482,11 @@ private static Path tmpPath(final Path path) {
440482 * @throws IOException if the move fails
441483 */
442484 private static void moveTemp (final Path source , final Path target ) throws IOException {
443- Files .move (source , target , StandardCopyOption .ATOMIC_MOVE );
485+ try {
486+ Files .move (source , target , StandardCopyOption .ATOMIC_MOVE );
487+ } catch (IOException e ) {
488+ Files .move (source , target );
489+ }
444490 }
445491
446492 /**
@@ -524,14 +570,14 @@ private static void checkMtimeCoherence(final Path... paths) throws IOException
524570 * @param offPath offsets file path, used only in error messages
525571 * @throws IOException if checked offsets are not monotonic
526572 */
527- private static void monotonicityCheck (final LongBuffer off , final Path offPath ) throws IOException {
573+ private static void monotonicityCheck (final IntBuffer off , final Path offPath ) throws IOException {
528574 final int n = off .capacity ();
529575 final int head = Math .min (MONO_CHECK , n );
530576 final int tailStart = Math .max (0 , n - MONO_CHECK );
531577
532- long prev = off .get (0 );
578+ int prev = off .get (0 );
533579 for (int i = 1 ; i < head ; i ++) {
534- final long cur = off .get (i );
580+ final int cur = off .get (i );
535581 if (cur < prev ) {
536582 throw new IOException ("Invalid offsets file, offsets decrease at head index " + i + ": " + offPath );
537583 }
@@ -540,7 +586,7 @@ private static void monotonicityCheck(final LongBuffer off, final Path offPath)
540586
541587 prev = off .get (tailStart );
542588 for (int i = tailStart + 1 ; i < n ; i ++) {
543- final long cur = off .get (i );
589+ final int cur = off .get (i );
544590 if (cur < prev ) {
545591 throw new IOException ("Invalid offsets file, offsets decrease at tail index " + i + ": " + offPath );
546592 }
0 commit comments