1+ package com .github .oeuvres .alix .lucene ;
2+
3+ import org .apache .lucene .index .*;
4+ import org .apache .lucene .store .*;
5+ import org .apache .lucene .util .*;
6+ import org .apache .lucene .util .fst .*;
7+
8+ import java .io .Closeable ;
9+ import java .io .IOException ;
10+ import java .nio .ByteBuffer ;
11+ import java .nio .ByteOrder ;
12+ import java .nio .LongBuffer ;
13+ import java .nio .channels .FileChannel ;
14+ import java .nio .file .*;
15+ import java .util .Arrays ;
16+
17+ import static java .lang .Math .toIntExact ;
18+
/**
 * Snapshot lexicon for a single field, persisted as three files inside a frozen Lucene directory:
 *
 * <pre>{@code
 * <field>.terms.fst : term (BytesRef) -> termId (dense 0..V-1, lexicographic order)
 * <field>.terms.dat : concatenated UTF-8 bytes of the terms, in termId order
 * <field>.terms.off : big-endian long offsets into .dat; length = V+1; off[V] == datLength
 * }</pre>
 *
 * Notes:
 * <ul>
 * <li>termId is defined by the snapshot build: it is the iteration order of the TermsEnum for the field.</li>
 * <li>getId(String) assumes the String is already in the indexed term form for the field.</li>
 * </ul>
 */
30+ public final class TermLexicon implements Closeable
31+ {
32+ private final Directory dir ; // used to open the FST
33+ private final FST <Long > fst ;
34+
35+ private final ByteBuffer dat ; // mmap .dat
36+ private final LongBuffer off ; // mmap .off (big-endian longs)
37+
38+ private final int vocabSize ;
39+
40+ private static final long MTIME_TOLERANCE_MS = 5_000 ;
41+ private static final int MONO_CHECK = 1024 ;
42+
43+ private static final ThreadLocal <BytesRefBuilder > TL_BRB = ThreadLocal .withInitial (BytesRefBuilder ::new );
44+
/**
 * Wraps already-opened resources; called by {@code open(Path, String)}.
 *
 * @param dir Lucene directory to be closed together with this lexicon
 * @param fst term bytes to termId mapping
 * @param dat buffer over the term bytes; only a read-only view of it is retained
 * @param off buffer over the V + 1 offsets; only a read-only view of it is retained
 */
private TermLexicon(Directory dir, FST<Long> fst, ByteBuffer dat, LongBuffer off)
{
    // V + 1 offsets delimit V terms.
    this.vocabSize = off.capacity() - 1;
    this.off = off.asReadOnlyBuffer();
    this.dat = dat.asReadOnlyBuffer();
    this.fst = fst;
    this.dir = dir;
}
53+
54+ public int vocabSize ()
55+ {
56+ return vocabSize ;
57+ }
58+
59+ /** Open lexicon files for {@code field} from a frozen index directory. */
60+ public static TermLexicon open (Path indexDir , String field ) throws IOException
61+ {
62+ final String fstName = field + ".terms.fst" ;
63+ final String datName = field + ".terms.dat" ;
64+ final String offName = field + ".terms.off" ;
65+
66+ ensureExists (indexDir , fstName );
67+ ensureExists (indexDir , datName );
68+ ensureExists (indexDir , offName );
69+ checkMtimeCoherence (indexDir , fstName , datName , offName );
70+
71+ // mmap data files (Path-based mmap is fine in a frozen directory)
72+ ByteBuffer dat = mapReadOnly (indexDir .resolve (datName ));
73+ ByteBuffer offBB = mapReadOnly (indexDir .resolve (offName )).order (ByteOrder .BIG_ENDIAN );
74+
75+ if ((offBB .remaining () & 7 ) != 0 ) {
76+ throw new IOException ("Invalid " + offName + ": size not multiple of 8 bytes" );
77+ }
78+ LongBuffer off = offBB .asLongBuffer ();
79+ if (off .capacity () < 2 ) {
80+ throw new IOException ("Invalid " + offName + ": need at least 2 offsets (V+1)" );
81+ }
82+
83+ // Basic structural checks tying off/dat together
84+ long datLen = dat .capacity ();
85+ long first = off .get (0 );
86+ long last = off .get (off .capacity () - 1 );
87+ if (first != 0L ) {
88+ throw new IOException ("Invalid " + offName + ": off[0] must be 0, got " + first );
89+ }
90+ if (last != datLen ) {
91+ throw new IOException ("Mismatch: " + offName + " last offset=" + last +
92+ " but " + datName + " length=" + datLen );
93+ }
94+
95+ // Bounded monotonicity check (cheap, catches common corruption)
96+ monotonicityCheck (off , offName );
97+
98+ // Load FST via Lucene Directory APIs
99+ Directory d = FSDirectory .open (indexDir );
100+ try (IndexInput in = d .openInput (fstName , IOContext .READONCE )) {
101+ PositiveIntOutputs outputs = PositiveIntOutputs .getSingleton ();
102+ FST <Long > fst = new FST <>(in , outputs );
103+ return new TermLexicon (d , fst , dat , off );
104+ } catch (IOException e ) {
105+ d .close ();
106+ throw e ;
107+ }
108+ }
109+
110+ /** term(String) -> termId, or -1 if absent. */
111+ public int getId (String termForm ) throws IOException
112+ {
113+ BytesRefBuilder brb = TL_BRB .get ();
114+ brb .copyChars (termForm ); // UTF-16 -> UTF-8 BytesRef
115+ Long v = Util .get (fst , brb .get ());
116+ return v == null ? -1 : toIntExact (v );
117+ }
118+
119+ /** term(BytesRef) -> termId, or -1 if absent. */
120+ public int getId (BytesRef termBytes ) throws IOException
121+ {
122+ Long v = Util .get (fst , termBytes );
123+ return v == null ? -1 : toIntExact (v );
124+ }
125+
126+ /** termId -> term bytes copied into {@code dst} (no per-call allocation). */
127+ public BytesRef getTermBytes (int termId , BytesRefBuilder dst )
128+ {
129+ checkTermId (termId );
130+ long start = off .get (termId );
131+ long end = off .get (termId + 1 );
132+ int len = toIntExact (end - start );
133+
134+ dst .grow (len );
135+ byte [] arr = dst .bytes ();
136+
137+ ByteBuffer dup = dat .duplicate ();
138+ dup .position (toIntExact (start ));
139+ dup .get (arr , 0 , len );
140+
141+ dst .setLength (len );
142+ return dst .get ();
143+ }
144+
145+ /** Convenience: termId -> String (allocates). */
146+ public String getTermString (int termId )
147+ {
148+ BytesRefBuilder brb = new BytesRefBuilder ();
149+ return getTermBytes (termId , brb ).utf8ToString ();
150+ }
151+
152+ private void checkTermId (int termId )
153+ {
154+ if (termId < 0 || termId >= vocabSize ) {
155+ throw new IllegalArgumentException ("termId out of range: " + termId + " (vocabSize=" + vocabSize + ")" );
156+ }
157+ }
158+
159+ @ Override
160+ public void close () throws IOException
161+ {
162+ dir .close (); // mmaps are managed by the OS/JVM; close Lucene directory handles
163+ }
164+
165+ // -------------------- Builder --------------------
166+
167+ public static final class Builder
168+ {
169+ private Builder ()
170+ {
171+ }
172+
173+ /**
174+ * Build <field>.terms.{fst,dat,off} into the given Lucene directory.
175+ * Fails if any destination file already exists.
176+ *
177+ * The reader must represent the frozen snapshot you want to serve (DirectoryReader.open(IndexCommit)).
178+ */
179+ public static void buildInto (Path indexDir , IndexReader snapshotReader , String field ) throws IOException
180+ {
181+ final String fstName = field + ".terms.fst" ;
182+ final String datName = field + ".terms.dat" ;
183+ final String offName = field + ".terms.off" ;
184+
185+ try (Directory outDir = FSDirectory .open (indexDir )) {
186+ ensureNotExists (outDir , fstName );
187+ ensureNotExists (outDir , datName );
188+ ensureNotExists (outDir , offName );
189+
190+ Terms terms = MultiTerms .getTerms (snapshotReader , field );
191+ if (terms == null ) {
192+ throw new IllegalArgumentException ("Field not found in reader: " + field );
193+ }
194+
195+ // Temp files + rename to avoid leaving partial final files.
196+ final String tmpFst = fstName + ".tmp" ;
197+ final String tmpDat = datName + ".tmp" ;
198+ final String tmpOff = offName + ".tmp" ;
199+ ensureNotExists (outDir , tmpFst );
200+ ensureNotExists (outDir , tmpDat );
201+ ensureNotExists (outDir , tmpOff );
202+
203+ try {
204+ buildFiles (outDir , terms , tmpDat , tmpOff , tmpFst );
205+ outDir .rename (tmpDat , datName );
206+ outDir .rename (tmpOff , offName );
207+ outDir .rename (tmpFst , fstName );
208+ } catch (IOException e ) {
209+ safeDelete (outDir , tmpDat );
210+ safeDelete (outDir , tmpOff );
211+ safeDelete (outDir , tmpFst );
212+ throw e ;
213+ }
214+ }
215+ }
216+
217+ private static void buildFiles (
218+ Directory outDir ,
219+ Terms terms ,
220+ String datFile ,
221+ String offFile ,
222+ String fstFile ) throws IOException
223+ {
224+ PositiveIntOutputs outputs = PositiveIntOutputs .getSingleton ();
225+ Builder <Long > fstBuilder = new Builder <>(FST .INPUT_TYPE .BYTE1 , outputs );
226+ IntsRefBuilder ints = new IntsRefBuilder ();
227+
228+ long id = 0 ;
229+ long datPos = 0 ;
230+
231+ try (IndexOutput datOut = outDir .createOutput (datFile , IOContext .DEFAULT );
232+ IndexOutput offOut = outDir .createOutput (offFile , IOContext .DEFAULT ))
233+ {
234+
235+ // First offset always 0
236+ offOut .writeLong (0L );
237+
238+ TermsEnum te = terms .iterator ();
239+ BytesRef term ;
240+ while ((term = te .next ()) != null ) {
241+ // FST: term -> id
242+ fstBuilder .add (Util .toIntsRef (term , ints ), id );
243+
244+ // dat: term bytes
245+ datOut .writeBytes (term .bytes , term .offset , term .length );
246+ datPos += term .length ;
247+
248+ // off: next offset
249+ offOut .writeLong (datPos );
250+
251+ id ++;
252+ }
253+ }
254+
255+ FST <Long > fst = fstBuilder .finish ();
256+ try (IndexOutput fstOut = outDir .createOutput (fstFile , IOContext .DEFAULT )) {
257+ fst .save (fstOut );
258+ }
259+ }
260+
261+ private static void safeDelete (Directory dir , String name )
262+ {
263+ try {
264+ if (fileExists (dir , name ))
265+ dir .deleteFile (name );
266+ } catch (Exception ignored ) {
267+ }
268+ }
269+ }
270+
271+ // -------------------- Startup checks / IO helpers --------------------
272+
273+ private static ByteBuffer mapReadOnly (Path p ) throws IOException
274+ {
275+ try (FileChannel ch = FileChannel .open (p , StandardOpenOption .READ )) {
276+ return ch .map (FileChannel .MapMode .READ_ONLY , 0 , ch .size ());
277+ }
278+ }
279+
280+ private static void ensureExists (Path dir , String file ) throws IOException
281+ {
282+ Path p = dir .resolve (file );
283+ if (!Files .isRegularFile (p ))
284+ throw new NoSuchFileException (p .toString ());
285+ }
286+
287+ private static void checkMtimeCoherence (Path dir , String ... files ) throws IOException
288+ {
289+ long min = Long .MAX_VALUE ;
290+ long max = Long .MIN_VALUE ;
291+ for (String f : files ) {
292+ long t = Files .getLastModifiedTime (dir .resolve (f )).toMillis ();
293+ min = Math .min (min , t );
294+ max = Math .max (max , t );
295+ }
296+ if ((max - min ) > MTIME_TOLERANCE_MS ) {
297+ throw new IOException ("Lexicon files mtimes differ by " + (max - min ) + "ms; " +
298+ "possible partial copy or mixed versions: " + Arrays .toString (files ));
299+ }
300+ }
301+
302+ private static void monotonicityCheck (LongBuffer off , String offName ) throws IOException
303+ {
304+ int n = off .capacity ();
305+ int head = Math .min (MONO_CHECK , n );
306+ int tailStart = Math .max (0 , n - MONO_CHECK );
307+
308+ // Head check
309+ long prev = off .get (0 );
310+ for (int i = 1 ; i < head ; i ++) {
311+ long cur = off .get (i );
312+ if (cur < prev )
313+ throw new IOException ("Invalid " + offName + ": offsets decrease at i=" + i );
314+ prev = cur ;
315+ }
316+ // Tail check
317+ prev = off .get (tailStart );
318+ for (int i = tailStart + 1 ; i < n ; i ++) {
319+ long cur = off .get (i );
320+ if (cur < prev )
321+ throw new IOException ("Invalid " + offName + ": offsets decrease near end at i=" + i );
322+ prev = cur ;
323+ }
324+ }
325+
326+ private static void ensureNotExists (Directory dir , String name ) throws IOException
327+ {
328+ if (fileExists (dir , name ))
329+ throw new FileAlreadyExistsException (name );
330+ }
331+
332+ private static boolean fileExists (Directory dir , String name ) throws IOException
333+ {
334+ for (String s : dir .listAll ()) {
335+ if (s .equals (name ))
336+ return true ;
337+ }
338+ return false ;
339+ }
340+ }
0 commit comments