Skip to content

Commit f45b2a5

Browse files
committed
It works
1 parent a359a31 commit f45b2a5

3 files changed

Lines changed: 81 additions & 42 deletions

File tree

analysis/src/java/com/github/oeuvres/alix/lucene/analysis/fr/FrenchAnalyzer.java

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,6 @@
4040
import org.apache.lucene.analysis.TokenStream;
4141
import org.apache.lucene.analysis.Tokenizer;
4242
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
43-
import org.apache.lucene.analysis.standard.StandardAnalyzer;
44-
import org.apache.lucene.document.FieldType;
4543

4644
import com.github.oeuvres.alix.lucene.analysis.FinalCleanupFilter;
4745
import com.github.oeuvres.alix.lucene.analysis.LemmaFilter;
@@ -52,7 +50,6 @@
5250
import com.github.oeuvres.alix.lucene.analysis.SentenceStartLowerCaseFilter;
5351

5452
import opennlp.tools.postag.POSModel;
55-
import opennlp.tools.postag.POSTaggerME;
5653

5754
/**
5855
* Analysis scenario for French in Alix. The linguistic features of Alix are

common/src/java/com/github/oeuvres/alix/lucene/terms/TermLexicon.java

Lines changed: 76 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
import java.io.OutputStream;
2121
import java.nio.ByteBuffer;
2222
import java.nio.ByteOrder;
23-
import java.nio.LongBuffer;
23+
import java.nio.IntBuffer;
2424
import java.nio.channels.FileChannel;
2525
import java.nio.file.FileAlreadyExistsException;
2626
import java.nio.file.Files;
@@ -35,29 +35,33 @@
3535
/**
3636
* Immutable lookup table for one indexed field of one frozen Lucene directory.
3737
* <p>
38-
* The lexicon is stored in three files located directly in the Lucene directory:
38+
* The lexicon is persisted in three files located directly in the Lucene directory:
3939
* </p>
4040
* <ul>
4141
* <li><b>{@code &lt;field&gt;.terms.fst}</b>: exact lookup {@code term -> termId}</li>
4242
* <li><b>{@code &lt;field&gt;.terms.dat}</b>: concatenated UTF-8 bytes of all terms in {@code termId} order</li>
43-
* <li><b>{@code &lt;field&gt;.terms.off}</b>: big-endian {@code long} offsets into {@code .dat}</li>
43+
* <li><b>{@code &lt;field&gt;.terms.off}</b>: big-endian {@code int} offsets into {@code .dat}</li>
4444
* </ul>
4545
* <p>
4646
* Term ids are dense and stable for the frozen snapshot from which the lexicon was built.
4747
* The id assignment is the lexicographic iteration order returned by Lucene's merged
4848
* {@link TermsEnum} for the field.
4949
* </p>
5050
* <p>
51-
* This class exposes only two lifecycle operations:
51+
* This class intentionally exposes only two lifecycle operations:
5252
* </p>
5353
* <ul>
5454
* <li>{@link #write(Path, String)} or {@link #write(Path, IndexReader, String)} create the three files once</li>
5555
* <li>{@link #open(Path, String)} opens the persisted lexicon for lookup</li>
5656
* </ul>
5757
* <p>
58-
* The string-based lookup assumes that the caller already provides the field's canonical indexed form.
58+
* String lookup assumes that the caller already provides the field's canonical indexed form.
5959
* No analysis, normalization, stemming or lower-casing is applied here.
6060
* </p>
61+
* <p>
62+
* This KISS implementation memory-maps the whole {@code .dat} file in one {@link ByteBuffer}.
63+
* Consequently, it supports only lexicons whose {@code .dat} length fits in a signed 32-bit integer.
64+
* </p>
6165
*/
6266
public final class TermLexicon {
6367
/** Lucene directory that contains both the index and the {@code <field>.terms.*} files. */
@@ -79,7 +83,7 @@ public final class TermLexicon {
7983
* the term bytes are stored in {@code dat[off[i] .. off[i+1])}.
8084
* </p>
8185
*/
82-
private final LongBuffer off;
86+
private final IntBuffer off;
8387

8488
/** Number of distinct terms in the field. */
8589
private final int vocabSize;
@@ -100,15 +104,15 @@ public final class TermLexicon {
100104
* @param indexDir Lucene directory that contains the lexicon files
101105
* @param field indexed field name
102106
* @param fst exact mapping {@code term -> termId}
103-
* @param dat memory-mapped term bytes file
107+
* @param dat memory-mapped term-bytes file
104108
* @param off memory-mapped offsets file
105109
*/
106110
private TermLexicon(
107111
final Path indexDir,
108112
final String field,
109113
final FST<Long> fst,
110114
final ByteBuffer dat,
111-
final LongBuffer off
115+
final IntBuffer off
112116
) {
113117
this.indexDir = indexDir;
114118
this.field = field;
@@ -118,6 +122,24 @@ private TermLexicon(
118122
this.vocabSize = off.capacity() - 1;
119123
}
120124

125+
/**
126+
* Returns {@code true} if the three persisted files for {@code field} exist as regular files.
127+
* <p>
128+
* This is a cheap presence test only. It does not validate sizes, mtimes or file contents.
129+
* </p>
130+
*
131+
* @param indexDir Lucene directory
132+
* @param field indexed field name
133+
* @return {@code true} if {@code .fst}, {@code .dat} and {@code .off} are present
134+
*/
135+
public static boolean exists(final Path indexDir, final String field) {
136+
Objects.requireNonNull(indexDir, "indexDir");
137+
Objects.requireNonNull(field, "field");
138+
return Files.isRegularFile(fstPath(indexDir, field))
139+
&& Files.isRegularFile(datPath(indexDir, field))
140+
&& Files.isRegularFile(offPath(indexDir, field));
141+
}
142+
121143
/**
122144
* Builds the lexicon files for one field using the latest committed state of the Lucene directory.
123145
*
@@ -208,18 +230,18 @@ public static TermLexicon open(final Path indexDir, final String field) throws I
208230

209231
final ByteBuffer dat = mapReadOnly(datPath);
210232
final ByteBuffer offBytes = mapReadOnly(offPath).order(ByteOrder.BIG_ENDIAN);
211-
if ((offBytes.remaining() & 7) != 0) {
212-
throw new IOException("Invalid offsets file (size is not a multiple of 8 bytes): " + offPath);
233+
if ((offBytes.remaining() & 3) != 0) {
234+
throw new IOException("Invalid offsets file (size is not a multiple of 4 bytes): " + offPath);
213235
}
214-
final LongBuffer off = offBytes.asLongBuffer();
236+
final IntBuffer off = offBytes.asIntBuffer();
215237
if (off.capacity() < 2) {
216238
throw new IOException("Invalid offsets file (need at least 2 offsets): " + offPath);
217239
}
218240

219-
final long datLength = dat.capacity();
220-
final long first = off.get(0);
221-
final long last = off.get(off.capacity() - 1);
222-
if (first != 0L) {
241+
final int datLength = dat.capacity();
242+
final int first = off.get(0);
243+
final int last = off.get(off.capacity() - 1);
244+
if (first != 0) {
223245
throw new IOException("Invalid offsets file, off[0] != 0: " + offPath);
224246
}
225247
if (last != datLength) {
@@ -260,6 +282,18 @@ public int vocabSize() {
260282
return vocabSize;
261283
}
262284

285+
/**
286+
* Returns the in-memory size of the loaded FST, in bytes.
287+
* <p>
288+
* This does not include the memory-mapped {@code .dat} and {@code .off} files.
289+
* </p>
290+
*
291+
* @return FST heap usage in bytes
292+
*/
293+
public long fstRamBytesUsed() {
294+
return fst.ramBytesUsed();
295+
}
296+
263297
/**
264298
* Looks up a canonical indexed term represented as a Java string.
265299
*
@@ -299,16 +333,16 @@ public BytesRef termBytes(final int termId, final BytesRefBuilder reuse) {
299333
checkTermId(termId);
300334
Objects.requireNonNull(reuse, "reuse");
301335

302-
final long start = off.get(termId);
303-
final long end = off.get(termId + 1);
304-
final int length = toIntExact(end - start);
336+
final int start = off.get(termId);
337+
final int end = off.get(termId + 1);
338+
final int length = end - start;
305339

306340
reuse.grow(length);
307341
final byte[] dst = reuse.bytes();
308342

309343
final ByteBuffer dup = dat.duplicate();
310-
dup.position(toIntExact(start));
311-
dup.limit(toIntExact(end));
344+
dup.position(start);
345+
dup.limit(end);
312346
dup.get(dst, 0, length);
313347

314348
reuse.setLength(length);
@@ -360,24 +394,32 @@ private static void buildFiles(
360394
final FSTCompiler<Long> compiler = new FSTCompiler.Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs).build();
361395
final IntsRefBuilder ints = new IntsRefBuilder();
362396

363-
long id = 0L;
364-
long datPos = 0L;
397+
int id = 0;
398+
int datPos = 0;
365399

366400
try (OutputStream datOs = new BufferedOutputStream(Files.newOutputStream(datPath, StandardOpenOption.CREATE_NEW));
367401
DataOutputStream offOut = new DataOutputStream(
368402
new BufferedOutputStream(Files.newOutputStream(offPath, StandardOpenOption.CREATE_NEW))
369403
)) {
370404

371-
offOut.writeLong(0L);
405+
offOut.writeInt(0);
372406

373407
final TermsEnum te = terms.iterator();
374408
BytesRef term;
375409
while ((term = te.next()) != null) {
376-
compiler.add(Util.toIntsRef(term, ints), id++);
410+
if (id == Integer.MAX_VALUE) {
411+
throw new IOException("Too many terms for int term ids in field lexicon");
412+
}
413+
if (datPos > Integer.MAX_VALUE - term.length) {
414+
throw new IOException("Term bytes file would exceed 2GB; this implementation uses 32-bit offsets");
415+
}
416+
417+
compiler.add(Util.toIntsRef(term, ints), (long) id);
377418

378419
datOs.write(term.bytes, term.offset, term.length);
379420
datPos += term.length;
380-
offOut.writeLong(datPos);
421+
offOut.writeInt(datPos);
422+
id++;
381423
}
382424
}
383425

@@ -440,7 +482,11 @@ private static Path tmpPath(final Path path) {
440482
* @throws IOException if the move fails
441483
*/
442484
private static void moveTemp(final Path source, final Path target) throws IOException {
443-
Files.move(source, target, StandardCopyOption.ATOMIC_MOVE);
485+
try {
486+
Files.move(source, target, StandardCopyOption.ATOMIC_MOVE);
487+
} catch (IOException e) {
488+
Files.move(source, target);
489+
}
444490
}
445491

446492
/**
@@ -524,14 +570,14 @@ private static void checkMtimeCoherence(final Path... paths) throws IOException
524570
* @param offPath offsets file path, used only in error messages
525571
* @throws IOException if checked offsets are not monotonic
526572
*/
527-
private static void monotonicityCheck(final LongBuffer off, final Path offPath) throws IOException {
573+
private static void monotonicityCheck(final IntBuffer off, final Path offPath) throws IOException {
528574
final int n = off.capacity();
529575
final int head = Math.min(MONO_CHECK, n);
530576
final int tailStart = Math.max(0, n - MONO_CHECK);
531577

532-
long prev = off.get(0);
578+
int prev = off.get(0);
533579
for (int i = 1; i < head; i++) {
534-
final long cur = off.get(i);
580+
final int cur = off.get(i);
535581
if (cur < prev) {
536582
throw new IOException("Invalid offsets file, offsets decrease at head index " + i + ": " + offPath);
537583
}
@@ -540,7 +586,7 @@ private static void monotonicityCheck(final LongBuffer off, final Path offPath)
540586

541587
prev = off.get(tailStart);
542588
for (int i = tailStart + 1; i < n; i++) {
543-
final long cur = off.get(i);
589+
final int cur = off.get(i);
544590
if (cur < prev) {
545591
throw new IOException("Invalid offsets file, offsets decrease at tail index " + i + ": " + offPath);
546592
}

test/src/test/java/com/github/oeuvres/alix/lucene/terms/TermLexiconDemo.java

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,14 @@ private TermLexiconDemo() {
1919
}
2020

2121
public static void main(String[] args) throws Exception {
22-
if (args.length != 3) {
23-
System.err.println("Usage: TermLexiconDemo <indexDir> <field> <term>");
24-
System.exit(1);
25-
}
2622

27-
final Path indexDir = Path.of(args[0]);
28-
final String field = args[1];
29-
final String queryTerm = args[2];
23+
final Path indexDir = Path.of("D:\\code\\piaget-labo\\lucene\\test");
24+
final String field = "text";
25+
final String queryTerm = "juste";
3026

3127
// 1) Build the lexicon once if missing.
32-
if (!java.nio.file.Files.exists(indexDir.resolve(field + ".terms.fst"))) {
33-
TermLexicon.Builder.build(indexDir, field);
28+
if (!TermLexicon.exists(indexDir, field)) {
29+
TermLexicon.write(indexDir, field);
3430
}
3531

3632
// 2) Open the lexicon and do the two core lookups.

0 commit comments

Comments
 (0)