Skip to content

Commit 051ea58

Browse files
committed
Scored spans should work, let’s now test
1 parent 0d175f6 commit 051ea58

2 files changed

Lines changed: 269 additions & 0 deletions

File tree

common/src/java/com/github/oeuvres/alix/lucene/spans/OffsetsCollector.java

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ public final class OffsetsCollector implements SpanCollector {
6969
/** Number of leaves collected in the current span. */
7070
private int size;
7171

72+
/** Ordinal of this span within its document (0-based), set by the walker or visitor. */
73+
private int ord;
74+
7275
/**
7376
* Constructs a collector with a default initial capacity of 4 leaf terms.
7477
* Suitable for use as a pre-allocated slot in {@link com.github.oeuvres.alix.util.TopSlot}.
@@ -125,6 +128,7 @@ public void copyTo(final OffsetsCollector dest) {
125128
System.arraycopy(data, 0, dest.data, 0, needed);
126129
}
127130
dest.size = size;
131+
dest.ord = ord;
128132
}
129133

130134
/**
@@ -136,6 +140,26 @@ public void copyTo(final OffsetsCollector dest) {
136140
public int endOffset(final int i) {
137141
return data[i * STRIDE + 2];
138142
}
143+
144+
/**
145+
* Returns the ordinal of this span within its document (0-based).
146+
* Set externally by the walker or visitor during span enumeration;
147+
* {@code -1} if not set.
148+
*/
149+
public int ord() {
150+
return ord;
151+
}
152+
153+
/**
154+
* Sets the span ordinal. Called by the walker or visitor during the
155+
* {@code nextStartPosition()} loop so that the ordinal is available to
156+
* the listener via {@link #ord()}.
157+
*
158+
* @param ord 0-based ordinal of this span in the document
159+
*/
160+
public void ord(final int ord) {
161+
this.ord = ord;
162+
}
139163

140164
/**
141165
* Returns the token position of the {@code i}-th collected leaf.
@@ -156,6 +180,7 @@ public int position(final int i) {
156180
@Override
157181
public void reset() {
158182
size = 0;
183+
ord = -1;
159184
}
160185

161186
/**
Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
package com.github.oeuvres.alix.lucene.spans;
2+
3+
import java.io.IOException;
4+
import java.util.Arrays;
5+
import java.util.Objects;
6+
7+
import org.apache.lucene.index.LeafReaderContext;
8+
import org.apache.lucene.queries.spans.SpanQuery;
9+
import org.apache.lucene.queries.spans.SpanWeight;
10+
import org.apache.lucene.queries.spans.Spans;
11+
import org.apache.lucene.search.IndexSearcher;
12+
import org.apache.lucene.search.ScoreMode;
13+
14+
import com.github.oeuvres.alix.lucene.ResultsListener;
15+
import com.github.oeuvres.alix.lucene.terms.FieldStats;
16+
import com.github.oeuvres.alix.lucene.terms.TermRail;
17+
import com.github.oeuvres.alix.util.TopSlot;
18+
19+
/**
20+
* Visits individual documents for a {@link SpanQuery}, scores spans by passage
21+
* informativeness, and streams the top spans to a {@link ResultsListener}.
22+
*
23+
* <p>The caller controls the outer loop — typically over a {@code ScoreDoc[]}
24+
* returned by {@link org.apache.lucene.search.IndexSearcher#search}:</p>
25+
* <pre>{@code
26+
* TopDocs topDocs = searcher.search(spanQuery, 20);
27+
* SpanVisitor visitor = new SpanVisitor(searcher, spanQuery, listener,
28+
* fieldStats, termRail, 3, 15);
29+
* listener.start(spanQuery, null, topDocs.scoreDocs.length);
30+
* for (ScoreDoc sd : topDocs.scoreDocs) {
31+
* listener.startDoc(sd.doc, sd.score);
32+
* visitor.visit(sd.doc, sd.score);
33+
* listener.endDoc(visitor.spanTotal());
34+
* }
35+
* listener.end(true);
36+
* }</pre>
37+
*
38+
* <h2>Span selection</h2>
39+
*
40+
* <p>All spans in the document are enumerated. Each span is scored by summing the
41+
* corpus-level {@link FieldStats#termWeight} of distinct terms in a window of
42+
* {@link #windowRadius} token positions around the span. Term deduplication within
43+
* each window is done in O(1) per term using a stamp array — no per-span reset
44+
* is needed. The top {@code topSpans} spans by this score are emitted to the
45+
* listener in descending passage score order (most informative first).</p>
46+
*
47+
* <h2>Span ordinal</h2>
48+
*
49+
* <p>Each emitted {@link OffsetsCollector} carries its 0-based ordinal within the
50+
* document via {@link OffsetsCollector#spanOrd()}. This ordinal counts all spans
51+
* in the document, not only the emitted ones, and can be used to build a stable
52+
* link back to the span's position in the original document.</p>
53+
*/
54+
public final class SpanVisitor {
55+
56+
private final IndexSearcher searcher;
57+
private final SpanQuery spanQuery;
58+
private final ResultsListener listener;
59+
private final FieldStats fieldStats;
60+
private final TermRail termRail;
61+
62+
/** Token radius around the span used for passage scoring. */
63+
public final int windowRadius;
64+
65+
/**
66+
* Cached per-leaf {@link Spans} instances, indexed by leaf ordinal.
67+
* Reused when the next visited docId is strictly greater than the last
68+
* visited docId in the same leaf. Rebuilt otherwise, since
69+
* {@link Spans#advance} is forward-only.
70+
*/
71+
private final Spans[] leafSpans;
72+
73+
/**
74+
* Last global docId visited per leaf, indexed by leaf ordinal.
75+
* Initialised to {@code -1}. Used to decide whether to advance or rebuild.
76+
*/
77+
private final int[] leafLastDocId;
78+
79+
/** The {@link SpanWeight} used to obtain per-leaf {@link Spans}. */
80+
private final SpanWeight spanWeight;
81+
82+
/** Pre-allocated top-k container for span scores within one document. */
83+
private final TopSlot<OffsetsCollector> top;
84+
85+
/** Reusable collector for the current span during enumeration. */
86+
private final OffsetsCollector collector = new OffsetsCollector(8);
87+
88+
/**
89+
* Stamp array for O(1) distinct-term deduplication during passage scoring.
90+
* {@code termStamp[termId] == spanOrd} means the term has already been
91+
* counted for the current span window. Stamped with the 0-based span ordinal,
92+
* which increases monotonically across all {@link #visit} calls — no reset
93+
* of the array is ever needed.
94+
*/
95+
private final int[] termStamp;
96+
97+
/**
98+
* Total number of spans found in the most recently visited document,
99+
* including spans not emitted because they did not score in the top-k.
100+
*/
101+
private int spanTotal;
102+
103+
/**
104+
* Creates a span visitor.
105+
*
106+
* @param searcher used for query planning and leaf access
107+
* @param spanQuery span query to enumerate; rewritten at construction
108+
* @param listener consumer of streamed span results
109+
* @param fieldStats corpus statistics providing term weights; must have
110+
* {@link FieldStats#buildWeights} already called
111+
* @param termRail position index for the same field and snapshot
112+
* @param topSpans maximum number of spans to emit per document
113+
* @param windowRadius token radius around each span for passage scoring
114+
* @throws IOException if query rewriting fails
115+
*/
116+
public SpanVisitor(
117+
final IndexSearcher searcher,
118+
final SpanQuery spanQuery,
119+
final ResultsListener listener,
120+
final FieldStats fieldStats,
121+
final TermRail termRail,
122+
final int topSpans,
123+
final int windowRadius) throws IOException {
124+
this.searcher = Objects.requireNonNull(searcher, "searcher");
125+
this.spanQuery = (SpanQuery) searcher.rewrite(
126+
Objects.requireNonNull(spanQuery, "spanQuery"));
127+
this.listener = Objects.requireNonNull(listener, "listener");
128+
this.fieldStats = Objects.requireNonNull(fieldStats, "fieldStats");
129+
this.termRail = Objects.requireNonNull(termRail, "termRail");
130+
this.windowRadius = Math.max(0, windowRadius);
131+
132+
this.spanWeight = (SpanWeight) this.spanQuery.createWeight(
133+
searcher, ScoreMode.COMPLETE_NO_SCORES, 1f);
134+
135+
this.leafSpans = new Spans[searcher.getLeafContexts().size()];
136+
this.leafLastDocId = new int[searcher.getLeafContexts().size()];
137+
java.util.Arrays.fill(this.leafLastDocId, -1);
138+
139+
this.top = new TopSlot<>(OffsetsCollector::new, Math.max(1, topSpans));
140+
141+
this.termStamp = new int[fieldStats.vocabSize()];
142+
Arrays.fill(termStamp, -1);
143+
}
144+
145+
/**
146+
* Returns the total number of spans found in the most recently visited document,
147+
* including those not emitted. Useful for the caller to pass to
148+
* {@link ResultsListener#endDoc(int)}.
149+
*
150+
* @return total span count for the last {@link #visit} call
151+
*/
152+
public int spanTotal() {
153+
return spanTotal;
154+
}
155+
156+
/**
157+
* Visits one document: enumerates all its spans, scores them by passage
158+
* informativeness, and emits the top spans to the listener.
159+
*
160+
* <p>The caller must have already called {@link ResultsListener#startDoc}
161+
* before this method, and must call {@link ResultsListener#endDoc} after.</p>
162+
*
163+
* @param docId global Lucene doc id
164+
* @param score BM25 score assigned by the search phase, passed through for
165+
* any listener use (e.g. relevance display)
166+
* @throws IOException if index access or listener output fails
167+
*/
168+
public void visit(final int docId, final float score) throws IOException {
169+
spanTotal = 0;
170+
top.clear();
171+
172+
final double[] weights = fieldStats.termWeightsRef();
173+
174+
// Find the leaf containing this docId, advance Spans to it.
175+
Spans spans = null;
176+
for (final LeafReaderContext ctx : searcher.getLeafContexts()) {
177+
final int localDocId = docId - ctx.docBase;
178+
if (localDocId < 0 || localDocId >= ctx.reader().maxDoc()) continue;
179+
180+
// Reuse cached Spans if docId is strictly greater than the last
181+
// visited docId in this leaf (advance is possible). Otherwise discard
182+
// and rebuild — relevance order does not guarantee ascending docIds.
183+
if (leafSpans[ctx.ord] == null || docId <= leafLastDocId[ctx.ord]) {
184+
leafSpans[ctx.ord] = spanWeight.getSpans(ctx, SpanWeight.Postings.OFFSETS);
185+
}
186+
spans = leafSpans[ctx.ord];
187+
if (spans == null) return;
188+
if (spans.advance(localDocId) != localDocId) return;
189+
leafLastDocId[ctx.ord] = docId;
190+
break;
191+
}
192+
if (spans == null) return;
193+
194+
// Enumerate all spans, score each, keep top-k.
195+
while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
196+
collector.reset();
197+
spans.collect(collector);
198+
collector.sort();
199+
collector.ord(spanTotal);
200+
201+
final double spanScore = scoreSpan(docId, spanTotal, collector, weights);
202+
final OffsetsCollector slot = top.insert(spanScore);
203+
if (slot != null) {
204+
collector.copyTo(slot);
205+
}
206+
spanTotal++;
207+
}
208+
209+
// Emit top spans to listener in descending passage score order.
210+
for (TopSlot.Entry<OffsetsCollector> e : top) {
211+
listener.span(e.value());
212+
}
213+
}
214+
215+
/**
216+
* Scores one span by summing the corpus-level term weights of distinct terms
217+
* in a window of {@link #windowRadius} positions around the span.
218+
*
219+
* <p>Deduplication uses {@link #termStamp}: a term whose stamp equals the
220+
* current {@code spanOrd} has already been counted for this span's window.
221+
* Because {@code spanOrd} increases monotonically and never resets,
222+
* stamps from any previous span are automatically stale — no array reset
223+
* is needed between spans or between documents.</p>
224+
*/
225+
private double scoreSpan(
226+
final int docId,
227+
final int spanOrd,
228+
final OffsetsCollector col,
229+
final double[] weights) throws IOException {
230+
final int posLo = Math.max(0, col.position(0) - windowRadius);
231+
final int posHi = col.position(col.size() - 1) + windowRadius;
232+
final double[] acc = {0d};
233+
234+
termRail.scanWindow(docId, posLo, posHi, termId -> {
235+
if (termId > 0
236+
&& termId < termStamp.length
237+
&& termStamp[termId] != spanOrd) {
238+
termStamp[termId] = spanOrd;
239+
acc[0] += weights[termId];
240+
}
241+
});
242+
return acc[0];
243+
}
244+
}

0 commit comments

Comments
 (0)