Skip to content

Commit 41eaf60

Browse files
committed
tested, OK
1 parent f294a24 commit 41eaf60

1 file changed

Lines changed: 61 additions & 50 deletions

File tree

  • common/src/java/com/github/oeuvres/alix/lucene/terms

common/src/java/com/github/oeuvres/alix/lucene/terms/TermRail.java

Lines changed: 61 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
package com.github.oeuvres.alix.lucene.terms;
22

3+
import org.apache.lucene.index.FieldInfo;
4+
import org.apache.lucene.index.FieldInfos;
35
import org.apache.lucene.index.Fields;
46
import org.apache.lucene.index.IndexReader;
57
import org.apache.lucene.index.MultiTerms;
@@ -153,38 +155,37 @@ private TermRail(
153155
}
154156

155157
/**
156-
* Builds the rail files for one field from postings and a matching lexicon.
157-
* <p>
158-
* Preconditions:
159-
* </p>
160-
* <ul>
161-
* <li>the field must exist in the supplied reader</li>
162-
* <li>the field must expose positions in postings</li>
163-
* <li>the lexicon must come from the same field and index snapshot</li>
164-
* <li>the target rail files must not already exist</li>
165-
* </ul>
166-
* <p>
167-
* Build fails fast if:
168-
* </p>
158+
* Builds a term-id rail for the given field and writes it to disk.
159+
*
160+
* <p>Produces two files under {@code dataDir}:</p>
169161
* <ul>
170-
* <li>the field has no terms</li>
171-
* <li>positions are unavailable</li>
172-
* <li>pass 1 sees zero positions</li>
173-
* <li>a postings term is absent from the lexicon</li>
174-
* <li>pass 2 disagrees with pass 1</li>
175-
* <li>two postings target the same {@code (docId, position)} slot</li>
162+
* <li><b>offsets</b> ({@link #offPath offPath(dataDir, field)}) —
163+
* a {@code long[maxDoc + 1]} array of byte offsets into the data file.
164+
* {@code offsets[docId]} is the byte position where the rail for
165+
* {@code docId} begins; the rail length in ints is
166+
* {@code (offsets[docId + 1] - offsets[docId]) / Integer.BYTES}.
167+
* Deleted documents and documents without the field have
168+
* {@code offsets[docId] == offsets[docId + 1]}.</li>
169+
* <li><b>data</b> ({@link #datPath datPath(dataDir, field)}) —
170+
* a flat {@code int[]} rail where each slot holds the
171+
* {@link TermLexicon} id of the term at that position,
172+
* or {@code 0} for unfilled positions (position gaps).
173+
* Files may exceed 2&nbsp;GB.</li>
176174
* </ul>
177-
* <p>
178-
* On failure, temporary files are cleaned up. Because final targets are required
179-
* to be absent before build starts, any final files created during a failed write
180-
* are also removed.
181-
* </p>
182175
*
183-
* @param dataDir Lucene directory that will receive the rail files
184-
* @param reader snapshot reader
185-
* @param field indexed field name
186-
* @param lexicon lexicon built from the same field and snapshot
187-
* @throws IOException if build fails or output files already exist
176+
* <p>Both files are written to temporary paths first and atomically
177+
* renamed on success. On failure, all temporary and final files are
178+
* cleaned up.</p>
179+
*
180+
* @param dataDir directory for the output files.
181+
* @param reader index reader (must have term vectors with positions
182+
* for {@code field}; otherwise an {@code IllegalArgumentException} is thrown).
183+
* @param field indexed field name.
184+
* @param lexicon FST lexicon mapping terms to integer ids;
185+
* id {@code 0} should be reserved as a sentinel
186+
* (no term at position).
187+
* @param report progress reporter; may be {@code null}.
188+
* @throws IOException if an output file already exists or an I/O error occurs during reading or writing.
188189
*/
189190
public static void build(
190191
final Path dataDir,
@@ -198,38 +199,46 @@ public static void build(
198199
Objects.requireNonNull(field, "field");
199200
Objects.requireNonNull(lexicon, "lexicon");
200201
if (report == null) report = Report.ReportNull.INSTANCE;
201-
202+
final FieldInfo fi = FieldInfos.getMergedFieldInfos(reader).fieldInfo(field);
203+
if (fi == null || !fi.hasTermVectors()) {
204+
throw new IllegalArgumentException("field \"" + field + "\" has no term vectors");
205+
}
206+
202207
final Path offFinal = offPath(dataDir, field);
203208
ensureAbsent(offFinal);
204209
final Path offTmp = tmpPath(offFinal);
205210
deleteIfExists(offTmp);
206-
211+
207212
final int maxDoc = reader.maxDoc();
208213
final BitSet liveDocs = FieldStats.liveDocs(reader);
209-
210-
// get fresh doc width (position max + 1), establish offsets
211-
int[] docWidths = FieldStats.docWidths(reader, field, report);
212-
int widthMax = -1;
213-
final long[] offsets = new long[docWidths.length + 1];
214-
long totalSlots = 0L;
214+
215+
final int[] docWidths = FieldStats.docWidths(reader, field, report);
216+
int widthMax = 0;
217+
final long[] offsets = new long[maxDoc + 1];
218+
long totalBytes = 0L;
215219
for (int docId = 0; docId < maxDoc; docId++) {
216-
final int width = (docWidths[docId] < 0 || !liveDocs.get(docId)) ? 0 : docWidths[docId];
217-
if (width > widthMax) widthMax = width;
218-
offsets[docId] = totalSlots;
219-
totalSlots += width * Integer.BYTES;
220+
if (docWidths[docId] <= 0 || !liveDocs.get(docId)) {
221+
docWidths[docId] = 0; // zero in-place so the write loop skips this doc
222+
}
223+
offsets[docId] = totalBytes;
224+
if (docWidths[docId] > widthMax) {
225+
widthMax = docWidths[docId];
226+
}
227+
totalBytes += (long) docWidths[docId] * Integer.BYTES;
220228
}
221-
offsets[maxDoc] = totalSlots;
229+
offsets[maxDoc] = totalBytes;
230+
222231
try (NumWriter offsetsWriter = NumWriter.open(offTmp, (long) offsets.length * Long.BYTES)) {
223232
offsetsWriter.put(0L, offsets, 0, offsets.length);
224233
}
225-
234+
226235
final Path datFinal = datPath(dataDir, field);
227236
ensureAbsent(datFinal);
228237
final Path datTmp = tmpPath(datFinal);
229238
deleteIfExists(datTmp);
230239

231-
try (NumWriter railWriter = NumWriter.open(datTmp, totalSlots)) {
232-
int[] rail = new int[widthMax];
240+
try (NumWriter railWriter = NumWriter.open(datTmp, totalBytes)) {
241+
final int[] rail = new int[widthMax];
233242
final TermVectors termVectors = reader.termVectors();
234243
for (int docId = 0; docId < maxDoc; docId++) {
235244
final int docWidth = docWidths[docId];
@@ -244,8 +253,12 @@ public static void build(
244253
if (terms == null) {
245254
continue;
246255
}
247-
// zero the region — guards against position gaps and stale data
248-
Arrays.fill(rail, 0, rail.length, 0);
256+
if (!terms.hasPositions()) {
257+
throw new IllegalArgumentException(
258+
"field \"" + field + "\" term vectors for docId=" + docId + " have no positions"
259+
);
260+
}
261+
Arrays.fill(rail, 0, docWidth, 0);
249262

250263
final TermsEnum termsEnum = terms.iterator();
251264
BytesRef term;
@@ -261,10 +274,8 @@ public static void build(
261274

262275
railWriter.put(offsets[docId], rail, 0, docWidth);
263276
}
264-
265-
266277
}
267-
278+
268279
try {
269280
moveTemp(datTmp, datFinal);
270281
moveTemp(offTmp, offFinal);

0 commit comments

Comments
 (0)