11package com .github .oeuvres .alix .lucene .terms ;
22
3+ import org .apache .lucene .index .FieldInfo ;
4+ import org .apache .lucene .index .FieldInfos ;
35import org .apache .lucene .index .Fields ;
46import org .apache .lucene .index .IndexReader ;
57import org .apache .lucene .index .MultiTerms ;
@@ -153,38 +155,37 @@ private TermRail(
153155 }
154156
155157 /**
156- * Builds the rail files for one field from postings and a matching lexicon.
157- * <p>
158- * Preconditions:
159- * </p>
160- * <ul>
161- * <li>the field must exist in the supplied reader</li>
162- * <li>the field must expose positions in postings</li>
163- * <li>the lexicon must come from the same field and index snapshot</li>
164- * <li>the target rail files must not already exist</li>
165- * </ul>
166- * <p>
167- * Build fails fast if:
168- * </p>
158+ * Builds a term-id rail for the given field and writes it to disk.
159+ *
160+ * <p>Produces two files under {@code dataDir}:</p>
169161 * <ul>
170- * <li>the field has no terms</li>
171- * <li>positions are unavailable</li>
172- * <li>pass 1 sees zero positions</li>
173- * <li>a postings term is absent from the lexicon</li>
174- * <li>pass 2 disagrees with pass 1</li>
175- * <li>two postings target the same {@code (docId, position)} slot</li>
162+ * <li><b>offsets</b> ({@link #offPath offPath(dataDir, field)}) —
163+ * a {@code long[maxDoc + 1]} array of byte offsets into the data file.
164+ * {@code offsets[docId]} is the byte position where the rail for
165+ * {@code docId} begins; the rail length in ints is
166+ * {@code (offsets[docId + 1] - offsets[docId]) / Integer.BYTES}.
167+ * Deleted documents and documents without the field have
168+ * {@code offsets[docId] == offsets[docId + 1]}.</li>
169+ * <li><b>data</b> ({@link #datPath datPath(dataDir, field)}) —
170+ * a flat {@code int[]} rail where each slot holds the
171+ * {@link TermLexicon} id of the term at that position,
172+ * or {@code 0} for unfilled positions (position gaps).
173+ * Files may exceed 2 GB.</li>
176174 * </ul>
177- * <p>
178- * On failure, temporary files are cleaned up. Because final targets are required
179- * to be absent before build starts, any final files created during a failed write
180- * are also removed.
181- * </p>
182175 *
183- * @param dataDir Lucene directory that will receive the rail files
184- * @param reader snapshot reader
185- * @param field indexed field name
186- * @param lexicon lexicon built from the same field and snapshot
187- * @throws IOException if build fails or output files already exist
176+ * <p>Both files are written to temporary paths first and atomically
177+ * renamed on success. On failure, all temporary and final files are
178+ * cleaned up.</p>
179+ *
180+ * @param dataDir directory for the output files.
181+ * @param reader index reader (must have term vectors with positions
182+ * for {@code field}).
183+ * @param field indexed field name.
184+ * @param lexicon FST lexicon mapping terms to integer ids;
185+ * id {@code 0} should be reserved as a sentinel
186+ * (no term at position).
187+ * @param report progress reporter; may be {@code null}.
188+ * @throws IOException if an I/O error occurs during reading or writing, or if an output file already exists.
188189 */
189190 public static void build (
190191 final Path dataDir ,
@@ -198,38 +199,46 @@ public static void build(
198199 Objects .requireNonNull (field , "field" );
199200 Objects .requireNonNull (lexicon , "lexicon" );
200201 if (report == null ) report = Report .ReportNull .INSTANCE ;
201-
202+ final FieldInfo fi = FieldInfos .getMergedFieldInfos (reader ).fieldInfo (field );
203+ if (fi == null || !fi .hasTermVectors ()) {
204+ throw new IllegalArgumentException ("field \" " + field + "\" has no term vectors" );
205+ }
206+
202207 final Path offFinal = offPath (dataDir , field );
203208 ensureAbsent (offFinal );
204209 final Path offTmp = tmpPath (offFinal );
205210 deleteIfExists (offTmp );
206-
211+
207212 final int maxDoc = reader .maxDoc ();
208213 final BitSet liveDocs = FieldStats .liveDocs (reader );
209-
210- // get fresh doc width (position max + 1), establish offsets
211- int [] docWidths = FieldStats .docWidths (reader , field , report );
212- int widthMax = -1 ;
213- final long [] offsets = new long [docWidths .length + 1 ];
214- long totalSlots = 0L ;
214+
215+ final int [] docWidths = FieldStats .docWidths (reader , field , report );
216+ int widthMax = 0 ;
217+ final long [] offsets = new long [maxDoc + 1 ];
218+ long totalBytes = 0L ;
215219 for (int docId = 0 ; docId < maxDoc ; docId ++) {
216- final int width = (docWidths [docId ] < 0 || !liveDocs .get (docId )) ? 0 : docWidths [docId ];
217- if (width > widthMax ) widthMax = width ;
218- offsets [docId ] = totalSlots ;
219- totalSlots += width * Integer .BYTES ;
220+ if (docWidths [docId ] <= 0 || !liveDocs .get (docId )) {
221+ docWidths [docId ] = 0 ; // zero in-place so the write loop skips this doc
222+ }
223+ offsets [docId ] = totalBytes ;
224+ if (docWidths [docId ] > widthMax ) {
225+ widthMax = docWidths [docId ];
226+ }
227+ totalBytes += (long ) docWidths [docId ] * Integer .BYTES ;
220228 }
221- offsets [maxDoc ] = totalSlots ;
229+ offsets [maxDoc ] = totalBytes ;
230+
222231 try (NumWriter offsetsWriter = NumWriter .open (offTmp , (long ) offsets .length * Long .BYTES )) {
223232 offsetsWriter .put (0L , offsets , 0 , offsets .length );
224233 }
225-
234+
226235 final Path datFinal = datPath (dataDir , field );
227236 ensureAbsent (datFinal );
228237 final Path datTmp = tmpPath (datFinal );
229238 deleteIfExists (datTmp );
230239
231- try (NumWriter railWriter = NumWriter .open (datTmp , totalSlots )) {
232- int [] rail = new int [widthMax ];
240+ try (NumWriter railWriter = NumWriter .open (datTmp , totalBytes )) {
241+ final int [] rail = new int [widthMax ];
233242 final TermVectors termVectors = reader .termVectors ();
234243 for (int docId = 0 ; docId < maxDoc ; docId ++) {
235244 final int docWidth = docWidths [docId ];
@@ -244,8 +253,12 @@ public static void build(
244253 if (terms == null ) {
245254 continue ;
246255 }
247- // zero the region — guards against position gaps and stale data
248- Arrays .fill (rail , 0 , rail .length , 0 );
256+ if (!terms .hasPositions ()) {
257+ throw new IllegalArgumentException (
258+ "field \" " + field + "\" term vectors for docId=" + docId + " have no positions"
259+ );
260+ }
261+ Arrays .fill (rail , 0 , docWidth , 0 );
249262
250263 final TermsEnum termsEnum = terms .iterator ();
251264 BytesRef term ;
@@ -261,10 +274,8 @@ public static void build(
261274
262275 railWriter .put (offsets [docId ], rail , 0 , docWidth );
263276 }
264-
265-
266277 }
267-
278+
268279 try {
269280 moveTemp (datTmp , datFinal );
270281 moveTemp (offTmp , offFinal );
0 commit comments