Skip to content

Commit f61a2bb

Browse files
committed
Record field lexicon, first draft
1 parent 91c3cc6 commit f61a2bb

3 files changed

Lines changed: 352 additions & 8 deletions

File tree

Lines changed: 340 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,340 @@
1+
package com.github.oeuvres.alix.lucene;
2+
3+
import org.apache.lucene.index.*;
4+
import org.apache.lucene.store.*;
5+
import org.apache.lucene.util.*;
6+
import org.apache.lucene.util.fst.*;
7+
8+
import java.io.Closeable;
9+
import java.io.IOException;
10+
import java.nio.ByteBuffer;
11+
import java.nio.ByteOrder;
12+
import java.nio.LongBuffer;
13+
import java.nio.channels.FileChannel;
14+
import java.nio.file.*;
15+
import java.util.Arrays;
16+
17+
import static java.lang.Math.toIntExact;
18+
19+
/**
 * Snapshot lexicon for a single field, persisted inside a frozen Lucene directory:
 *
 * <pre>
 * &lt;field&gt;.terms.fst : term(BytesRef) -&gt; termId (dense 0..V-1, lexicographic order)
 * &lt;field&gt;.terms.dat : concatenated UTF-8 bytes of terms in termId order
 * &lt;field&gt;.terms.off : big-endian long offsets into .dat; length = V+1; off[V] == datLength
 * </pre>
 *
 * Notes:
 * <ul>
 * <li>termId is defined by the snapshot build: iteration order of TermsEnum for the field.</li>
 * <li>getId(String) assumes the String is already in the indexed term form for the field.</li>
 * </ul>
 */
30+
public final class TermLexicon implements Closeable
31+
{
32+
private final Directory dir; // used to open the FST
33+
private final FST<Long> fst;
34+
35+
private final ByteBuffer dat; // mmap .dat
36+
private final LongBuffer off; // mmap .off (big-endian longs)
37+
38+
private final int vocabSize;
39+
40+
private static final long MTIME_TOLERANCE_MS = 5_000;
41+
private static final int MONO_CHECK = 1024;
42+
43+
private static final ThreadLocal<BytesRefBuilder> TL_BRB = ThreadLocal.withInitial(BytesRefBuilder::new);
44+
45+
/**
 * Wraps the resources prepared by {@code open}. Both buffers are kept as
 * read-only views; the vocabulary size is the number of offsets minus one.
 *
 * @param dir Lucene directory to release on {@code close()}
 * @param fst term to termId mapping
 * @param dat mapped term bytes
 * @param off mapped offsets into {@code dat}
 */
private TermLexicon(Directory dir, FST<Long> fst, ByteBuffer dat, LongBuffer off)
{
    this.vocabSize = off.capacity() - 1;
    this.off = off.asReadOnlyBuffer();
    this.dat = dat.asReadOnlyBuffer();
    this.fst = fst;
    this.dir = dir;
}
53+
54+
public int vocabSize()
55+
{
56+
return vocabSize;
57+
}
58+
59+
/** Open lexicon files for {@code field} from a frozen index directory. */
60+
public static TermLexicon open(Path indexDir, String field) throws IOException
61+
{
62+
final String fstName = field + ".terms.fst";
63+
final String datName = field + ".terms.dat";
64+
final String offName = field + ".terms.off";
65+
66+
ensureExists(indexDir, fstName);
67+
ensureExists(indexDir, datName);
68+
ensureExists(indexDir, offName);
69+
checkMtimeCoherence(indexDir, fstName, datName, offName);
70+
71+
// mmap data files (Path-based mmap is fine in a frozen directory)
72+
ByteBuffer dat = mapReadOnly(indexDir.resolve(datName));
73+
ByteBuffer offBB = mapReadOnly(indexDir.resolve(offName)).order(ByteOrder.BIG_ENDIAN);
74+
75+
if ((offBB.remaining() & 7) != 0) {
76+
throw new IOException("Invalid " + offName + ": size not multiple of 8 bytes");
77+
}
78+
LongBuffer off = offBB.asLongBuffer();
79+
if (off.capacity() < 2) {
80+
throw new IOException("Invalid " + offName + ": need at least 2 offsets (V+1)");
81+
}
82+
83+
// Basic structural checks tying off/dat together
84+
long datLen = dat.capacity();
85+
long first = off.get(0);
86+
long last = off.get(off.capacity() - 1);
87+
if (first != 0L) {
88+
throw new IOException("Invalid " + offName + ": off[0] must be 0, got " + first);
89+
}
90+
if (last != datLen) {
91+
throw new IOException("Mismatch: " + offName + " last offset=" + last +
92+
" but " + datName + " length=" + datLen);
93+
}
94+
95+
// Bounded monotonicity check (cheap, catches common corruption)
96+
monotonicityCheck(off, offName);
97+
98+
// Load FST via Lucene Directory APIs
99+
Directory d = FSDirectory.open(indexDir);
100+
try (IndexInput in = d.openInput(fstName, IOContext.READONCE)) {
101+
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
102+
FST<Long> fst = new FST<>(in, outputs);
103+
return new TermLexicon(d, fst, dat, off);
104+
} catch (IOException e) {
105+
d.close();
106+
throw e;
107+
}
108+
}
109+
110+
/** term(String) -> termId, or -1 if absent. */
111+
public int getId(String termForm) throws IOException
112+
{
113+
BytesRefBuilder brb = TL_BRB.get();
114+
brb.copyChars(termForm); // UTF-16 -> UTF-8 BytesRef
115+
Long v = Util.get(fst, brb.get());
116+
return v == null ? -1 : toIntExact(v);
117+
}
118+
119+
/** term(BytesRef) -> termId, or -1 if absent. */
120+
public int getId(BytesRef termBytes) throws IOException
121+
{
122+
Long v = Util.get(fst, termBytes);
123+
return v == null ? -1 : toIntExact(v);
124+
}
125+
126+
/** termId -> term bytes copied into {@code dst} (no per-call allocation). */
127+
public BytesRef getTermBytes(int termId, BytesRefBuilder dst)
128+
{
129+
checkTermId(termId);
130+
long start = off.get(termId);
131+
long end = off.get(termId + 1);
132+
int len = toIntExact(end - start);
133+
134+
dst.grow(len);
135+
byte[] arr = dst.bytes();
136+
137+
ByteBuffer dup = dat.duplicate();
138+
dup.position(toIntExact(start));
139+
dup.get(arr, 0, len);
140+
141+
dst.setLength(len);
142+
return dst.get();
143+
}
144+
145+
/** Convenience: termId -> String (allocates). */
146+
public String getTermString(int termId)
147+
{
148+
BytesRefBuilder brb = new BytesRefBuilder();
149+
return getTermBytes(termId, brb).utf8ToString();
150+
}
151+
152+
private void checkTermId(int termId)
153+
{
154+
if (termId < 0 || termId >= vocabSize) {
155+
throw new IllegalArgumentException("termId out of range: " + termId + " (vocabSize=" + vocabSize + ")");
156+
}
157+
}
158+
159+
@Override
160+
public void close() throws IOException
161+
{
162+
dir.close(); // mmaps are managed by the OS/JVM; close Lucene directory handles
163+
}
164+
165+
// -------------------- Builder --------------------
166+
167+
public static final class Builder
168+
{
169+
private Builder()
170+
{
171+
}
172+
173+
/**
174+
* Build <field>.terms.{fst,dat,off} into the given Lucene directory.
175+
* Fails if any destination file already exists.
176+
*
177+
* The reader must represent the frozen snapshot you want to serve (DirectoryReader.open(IndexCommit)).
178+
*/
179+
public static void buildInto(Path indexDir, IndexReader snapshotReader, String field) throws IOException
180+
{
181+
final String fstName = field + ".terms.fst";
182+
final String datName = field + ".terms.dat";
183+
final String offName = field + ".terms.off";
184+
185+
try (Directory outDir = FSDirectory.open(indexDir)) {
186+
ensureNotExists(outDir, fstName);
187+
ensureNotExists(outDir, datName);
188+
ensureNotExists(outDir, offName);
189+
190+
Terms terms = MultiTerms.getTerms(snapshotReader, field);
191+
if (terms == null) {
192+
throw new IllegalArgumentException("Field not found in reader: " + field);
193+
}
194+
195+
// Temp files + rename to avoid leaving partial final files.
196+
final String tmpFst = fstName + ".tmp";
197+
final String tmpDat = datName + ".tmp";
198+
final String tmpOff = offName + ".tmp";
199+
ensureNotExists(outDir, tmpFst);
200+
ensureNotExists(outDir, tmpDat);
201+
ensureNotExists(outDir, tmpOff);
202+
203+
try {
204+
buildFiles(outDir, terms, tmpDat, tmpOff, tmpFst);
205+
outDir.rename(tmpDat, datName);
206+
outDir.rename(tmpOff, offName);
207+
outDir.rename(tmpFst, fstName);
208+
} catch (IOException e) {
209+
safeDelete(outDir, tmpDat);
210+
safeDelete(outDir, tmpOff);
211+
safeDelete(outDir, tmpFst);
212+
throw e;
213+
}
214+
}
215+
}
216+
217+
private static void buildFiles(
218+
Directory outDir,
219+
Terms terms,
220+
String datFile,
221+
String offFile,
222+
String fstFile) throws IOException
223+
{
224+
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
225+
Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
226+
IntsRefBuilder ints = new IntsRefBuilder();
227+
228+
long id = 0;
229+
long datPos = 0;
230+
231+
try (IndexOutput datOut = outDir.createOutput(datFile, IOContext.DEFAULT);
232+
IndexOutput offOut = outDir.createOutput(offFile, IOContext.DEFAULT))
233+
{
234+
235+
// First offset always 0
236+
offOut.writeLong(0L);
237+
238+
TermsEnum te = terms.iterator();
239+
BytesRef term;
240+
while ((term = te.next()) != null) {
241+
// FST: term -> id
242+
fstBuilder.add(Util.toIntsRef(term, ints), id);
243+
244+
// dat: term bytes
245+
datOut.writeBytes(term.bytes, term.offset, term.length);
246+
datPos += term.length;
247+
248+
// off: next offset
249+
offOut.writeLong(datPos);
250+
251+
id++;
252+
}
253+
}
254+
255+
FST<Long> fst = fstBuilder.finish();
256+
try (IndexOutput fstOut = outDir.createOutput(fstFile, IOContext.DEFAULT)) {
257+
fst.save(fstOut);
258+
}
259+
}
260+
261+
private static void safeDelete(Directory dir, String name)
262+
{
263+
try {
264+
if (fileExists(dir, name))
265+
dir.deleteFile(name);
266+
} catch (Exception ignored) {
267+
}
268+
}
269+
}
270+
271+
// -------------------- Startup checks / IO helpers --------------------
272+
273+
private static ByteBuffer mapReadOnly(Path p) throws IOException
274+
{
275+
try (FileChannel ch = FileChannel.open(p, StandardOpenOption.READ)) {
276+
return ch.map(FileChannel.MapMode.READ_ONLY, 0, ch.size());
277+
}
278+
}
279+
280+
private static void ensureExists(Path dir, String file) throws IOException
281+
{
282+
Path p = dir.resolve(file);
283+
if (!Files.isRegularFile(p))
284+
throw new NoSuchFileException(p.toString());
285+
}
286+
287+
private static void checkMtimeCoherence(Path dir, String... files) throws IOException
288+
{
289+
long min = Long.MAX_VALUE;
290+
long max = Long.MIN_VALUE;
291+
for (String f : files) {
292+
long t = Files.getLastModifiedTime(dir.resolve(f)).toMillis();
293+
min = Math.min(min, t);
294+
max = Math.max(max, t);
295+
}
296+
if ((max - min) > MTIME_TOLERANCE_MS) {
297+
throw new IOException("Lexicon files mtimes differ by " + (max - min) + "ms; " +
298+
"possible partial copy or mixed versions: " + Arrays.toString(files));
299+
}
300+
}
301+
302+
private static void monotonicityCheck(LongBuffer off, String offName) throws IOException
303+
{
304+
int n = off.capacity();
305+
int head = Math.min(MONO_CHECK, n);
306+
int tailStart = Math.max(0, n - MONO_CHECK);
307+
308+
// Head check
309+
long prev = off.get(0);
310+
for (int i = 1; i < head; i++) {
311+
long cur = off.get(i);
312+
if (cur < prev)
313+
throw new IOException("Invalid " + offName + ": offsets decrease at i=" + i);
314+
prev = cur;
315+
}
316+
// Tail check
317+
prev = off.get(tailStart);
318+
for (int i = tailStart + 1; i < n; i++) {
319+
long cur = off.get(i);
320+
if (cur < prev)
321+
throw new IOException("Invalid " + offName + ": offsets decrease near end at i=" + i);
322+
prev = cur;
323+
}
324+
}
325+
326+
private static void ensureNotExists(Directory dir, String name) throws IOException
327+
{
328+
if (fileExists(dir, name))
329+
throw new FileAlreadyExistsException(name);
330+
}
331+
332+
private static boolean fileExists(Directory dir, String name) throws IOException
333+
{
334+
for (String s : dir.listAll()) {
335+
if (s.equals(name))
336+
return true;
337+
}
338+
return false;
339+
}
340+
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
/**
2+
*
3+
*/
4+
/**
5+
*
6+
*/
7+
package com.github.oeuvres.alix.lucene;

search/src/java/com/github/oeuvres/alix/lucene/search/FieldText.java

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,8 @@
5555

5656
import static com.github.oeuvres.alix.common.Upos.*;
5757

58-
import com.github.oeuvres.alix.common.Tag;
58+
import com.github.oeuvres.alix.common.Upos;
5959
import com.github.oeuvres.alix.common.TagFilter;
60-
import com.github.oeuvres.alix.fr.TagFr;
6160
import com.github.oeuvres.alix.lucene.index.BytesDic;
6261
import com.github.oeuvres.alix.util.Chain;
6362
import com.github.oeuvres.alix.util.Char;
@@ -86,8 +85,6 @@ public class FieldText extends FieldCharsAbstract
8685
protected int[] formId4tagNo;
8786
/** formId4isLoc.get(formId) == true: form is a locution. */
8887
private BitSet formId4isLoc;
89-
/** Tag set TODO parameter */
90-
private final Tag tag = TagFr.VERB;
9188

9289
/**
9390
* Build the dictionaries and stats. Each form indexed for the field will be
@@ -214,16 +211,16 @@ public int compareTo(FormRecord o)
214211
char c = chain.charAt(0);
215212
if (Char.isPunctuation(c)) {
216213
if (c == '§') {
217-
formId4tagNo[formId] = PUNsection.code;
214+
formId4tagNo[formId] = PUNCTsection.code;
218215
}
219216
else if (c == '¶') {
220-
formId4tagNo[formId] = PUNpara.code;
217+
formId4tagNo[formId] = PUNCTpara.code;
221218
}
222219
else if (c == '.' || c == '…' || c == '?' || c == '!' ) {
223-
formId4tagNo[formId] = PUNsent.code;
220+
formId4tagNo[formId] = PUNCTsent.code;
224221
}
225222
else {
226-
formId4tagNo[formId] = PUN.code;
223+
formId4tagNo[formId] = PUNCT.code;
227224
}
228225
punRecord.set(formId);
229226
continue;

0 commit comments

Comments
 (0)