Skip to content

Commit b72ed8e

Browse files
committed
Coocs should be right
1 parent 639163a commit b72ed8e

4 files changed

Lines changed: 319 additions & 4 deletions

File tree

Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
package com.github.oeuvres.alix.lucene.spans;
2+
3+
import java.io.IOException;
4+
import java.util.BitSet;
5+
import java.util.Objects;
6+
7+
import com.github.oeuvres.alix.lucene.terms.FieldStats;
8+
import com.github.oeuvres.alix.lucene.terms.FocusBuffers;
9+
import com.github.oeuvres.alix.lucene.terms.TermRail;
10+
11+
/**
12+
* {@link SpanListener} that accumulates per-term cooccurrence counts in a fixed-width window around each pivot match.
13+
*
14+
* <p>
15+
* For each match {@code [start, end)} delivered by {@link SpanWalker}, the listener marks the context positions {@code [max(0, start - left), start)} and {@code [end, min(docWidth, end + right))} in a per-document bitset. After the document is exhausted, the marked positions are resolved to term ids via {@link TermRail#scanPositions} and counts are written into a {@link FocusBuffers} obtained from a {@link com.github.oeuvres.alix.lucene.terms.TopTerms} instance.
16+
* </p>
17+
*
18+
* <h2>Pivot self-exclusion</h2>
19+
*
20+
* <p>
21+
* Pivot positions of every match in the document are tracked in a separate bitset and removed from the window mask before the rail scan. With many pivot matches close together (large slop, or {@code SpanOr} of co-occurring terms), pivot positions naturally land in another match's window; without this exclusion they would inflate the cooccurrence counts and tilt the focus token denominator, biasing every score.
22+
* </p>
23+
*
24+
* <h2>Per-document deduplication for document frequency</h2>
25+
*
26+
* <p>
27+
* A vocabulary-sized bitset records which term ids have already been counted in the current document, so {@link FocusBuffers#termDocs} is incremented at most once per (term, document) pair while {@link FocusBuffers#termFreq} is incremented per occurrence.
28+
* </p>
29+
*
30+
* <h2>Lifecycle</h2>
31+
*
32+
* <p>
33+
* The listener is bound to a {@link FocusBuffers} via {@link #bindTo(FocusBuffers)} before the walk starts. {@link com.github.oeuvres.alix.lucene.terms.TopTerms#coocs} performs the binding, runs the walk, and reads back {@link #coocTokens()} and {@link #coocDocsTotal()}.
34+
* </p>
35+
*
36+
* <p>
37+
* This class is not thread-safe and is single-use per walk: a fresh instance, or a fresh {@link #bindTo(FocusBuffers)}, is required for each cooccurrence query.
38+
* </p>
39+
*/
40+
public final class CoocListener implements SpanListener
41+
{
42+
/** Number of documents that contributed at least one window position resolving to a real term. */
43+
private int coocDocsTotal;
44+
45+
/** Total non-gap positions visited across all documents. */
46+
private long coocTokens;
47+
48+
/** Field statistics for the pivot field; used for vocabulary size and max document width. */
49+
private final FieldStats fieldStats;
50+
51+
/** Number of context positions to read on the right of each match. */
52+
private final int left;
53+
54+
/** Pivot positions in the current document, accumulated across all matches. */
55+
private final BitSet pivotMask;
56+
57+
/** Forward positional rail for the pivot field. */
58+
private final TermRail rail;
59+
60+
/** Number of context positions to read on the left of each match. */
61+
private final int right;
62+
63+
/** Per-document set of term ids already counted toward {@link FocusBuffers#termDocs}. */
64+
private final BitSet termSeen;
65+
66+
/** Bound focus buffers; {@code null} until {@link #bindTo(FocusBuffers)} is called. */
67+
private FocusBuffers buffers;
68+
69+
/** Whether the current document contributed at least one cooc position. */
70+
private boolean docContributed;
71+
72+
/** Window positions in the current document, accumulated across all matches. */
73+
private final BitSet windowMask;
74+
75+
/**
76+
* Constructs a cooccurrence listener.
77+
*
78+
* @param fieldStats field statistics for the pivot field
79+
* @param rail forward positional rail for the same field
80+
* @param left context width on the left of each match, in positions; must be {@code >= 0}
81+
* @param right context width on the right of each match, in positions; must be {@code >= 0}
82+
* @throws IllegalArgumentException if {@code left} or {@code right} is negative, or if both are zero
83+
* @throws NullPointerException if {@code fieldStats} or {@code rail} is {@code null}
84+
*/
85+
public CoocListener(
86+
final FieldStats fieldStats,
87+
final TermRail rail,
88+
final int left,
89+
final int right)
90+
{
91+
this.fieldStats = Objects.requireNonNull(fieldStats, "fieldStats");
92+
this.rail = Objects.requireNonNull(rail, "rail");
93+
if (left < 0 || right < 0) {
94+
throw new IllegalArgumentException("left and right must be >= 0; got left=" + left + ", right=" + right);
95+
}
96+
if (left == 0 && right == 0) {
97+
throw new IllegalArgumentException("left and right cannot both be 0");
98+
}
99+
this.left = left;
100+
this.right = right;
101+
this.windowMask = new BitSet(fieldStats.maxWidth());
102+
this.pivotMask = new BitSet(fieldStats.maxWidth());
103+
this.termSeen = new BitSet(fieldStats.vocabSize());
104+
}
105+
106+
/**
107+
* Binds this listener to a {@link FocusBuffers} obtained from a {@link com.github.oeuvres.alix.lucene.terms.TopTerms} instance. Must be called before the walk starts.
108+
*
109+
* @param buffers focus buffers to write into
110+
* @throws NullPointerException if {@code buffers} is {@code null}
111+
* @throws IllegalArgumentException if buffer lengths do not match {@code fieldStats.vocabSize()}
112+
*/
113+
public void bindTo(final FocusBuffers buffers)
114+
{
115+
Objects.requireNonNull(buffers, "buffers");
116+
final int vocab = fieldStats.vocabSize();
117+
if (buffers.termFreq().length != vocab || buffers.termDocs().length != vocab) {
118+
throw new IllegalArgumentException(
119+
"buffer length mismatch: vocabSize=" + vocab
120+
+ ", termFreq.length=" + buffers.termFreq().length
121+
+ ", termDocs.length=" + buffers.termDocs().length);
122+
}
123+
this.buffers = buffers;
124+
}
125+
126+
/**
127+
* Returns the number of documents that contributed at least one cooccurrence position.
128+
*
129+
* @return focus document count
130+
*/
131+
public int coocDocsTotal()
132+
{
133+
return coocDocsTotal;
134+
}
135+
136+
/**
137+
* Returns the total non-gap positions visited across all documents. Used as the focus-side denominator in keyness scoring.
138+
*
139+
* @return total cooccurrence token count
140+
*/
141+
public long coocTokens()
142+
{
143+
return coocTokens;
144+
}
145+
146+
@Override
147+
public void endDoc(final int spanCount) throws IOException
148+
{
149+
if (buffers == null) {
150+
throw new IllegalStateException("CoocListener not bound; call bindTo(FocusBuffers) before the walk");
151+
}
152+
windowMask.andNot(pivotMask);
153+
if (windowMask.isEmpty()) return;
154+
155+
final long[] termFreq = buffers.termFreq();
156+
final int[] termDocs = buffers.termDocs();
157+
final int docId = lastDocId;
158+
159+
rail.scanPositions(docId, windowMask, termId -> {
160+
termFreq[termId]++;
161+
coocTokens++;
162+
if (!termSeen.get(termId)) {
163+
termSeen.set(termId);
164+
termDocs[termId]++;
165+
}
166+
docContributed = true;
167+
});
168+
169+
if (docContributed) coocDocsTotal++;
170+
}
171+
172+
@Override
173+
public boolean span(final SpanMatch match) throws IOException
174+
{
175+
final int start = match.startPosition();
176+
final int end = match.endPosition();
177+
// left context: [max(0, start - left), start)
178+
if (left > 0 && start > 0) {
179+
windowMask.set(Math.max(0, start - left), start);
180+
}
181+
// right context: [end, end + right) — clamped by scanPositions
182+
if (right > 0) {
183+
windowMask.set(end, end + right);
184+
}
185+
// pivot positions: [start, end)
186+
pivotMask.set(start, end);
187+
return true;
188+
}
189+
190+
@Override
191+
public void start() throws IOException
192+
{
193+
if (buffers == null) {
194+
throw new IllegalStateException("CoocListener not bound; call bindTo(FocusBuffers) before the walk");
195+
}
196+
coocTokens = 0L;
197+
coocDocsTotal = 0;
198+
}
199+
200+
@Override
201+
public void startDoc(final int docId) throws IOException
202+
{
203+
this.lastDocId = docId;
204+
windowMask.clear();
205+
pivotMask.clear();
206+
termSeen.clear();
207+
docContributed = false;
208+
}
209+
210+
/** Last docId seen, captured in {@link #startDoc(int)} and consumed in {@link #endDoc(int)}. */
211+
private int lastDocId = -1;
212+
}

common/src/java/com/github/oeuvres/alix/lucene/terms/TopTerms.java

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
import org.apache.lucene.util.FixedBitSet;
1616

1717
import com.github.oeuvres.alix.lucene.Partition;
18+
import com.github.oeuvres.alix.lucene.spans.CoocListener;
19+
import com.github.oeuvres.alix.lucene.spans.SpanWalker;
1820
import com.github.oeuvres.alix.util.TopArray;
1921

2022
/**
@@ -117,6 +119,38 @@ public TopTerms(final FieldStats fieldStats, final TermLexicon lexicon)
117119
this.lexicon = Objects.requireNonNull(lexicon, "lexicon");
118120
}
119121

122+
/**
123+
* Streams a span walk through the listener and collects per-term cooccurrence counts directly into this instance's focus buffers.
124+
*
125+
* <p>
126+
* The listener is bound to this instance's focus arrays via {@link #focusBuffers()}, the walker is then driven from doc id {@code 0} to exhaustion, and the focus scalars (token total, doc total) are read back from the listener at the end. After this call, {@link #focusScore(KeynessScorer, int)} can be invoked exactly as after {@link #focus(IndexReader, FixedBitSet)}, with the cooccurrence counts in place of the doc-subset counts.
127+
* </p>
128+
*
129+
* <p>
130+
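* The walker must be configured with the pivot {@link org.apache.lucene.queries.spans.SpanQuery} and any document-side filter, and must already have been constructed with this same {@code listener}: this method only binds the listener to the focus buffers and then calls {@code walker.walk(0)}. The listener must be a {@link com.github.oeuvres.alix.lucene.spans.CoocListener} bound to the same field as this instance.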
131+
* </p>
132+
*
133+
* @param listener cooccurrence listener that writes into the focus buffers
134+
* @param walker span walker that drives the listener
135+
* @return this instance
136+
* @throws IOException on walker I/O failure
137+
* @throws NullPointerException if {@code listener} or {@code walker} is {@code null}
138+
*/
139+
public TopTerms coocs(
140+
final CoocListener listener,
141+
final SpanWalker walker) throws IOException
142+
{
143+
Objects.requireNonNull(listener, "listener");
144+
Objects.requireNonNull(walker, "walker");
145+
prepareFocus();
146+
listener.bindTo(focusBuffers());
147+
walker.walk(0);
148+
setFocusTotals(listener.coocTokens(), listener.coocDocsTotal());
149+
activeCounts = focusTermFreq;
150+
return this;
151+
}
152+
153+
120154
/**
121155
* Populates focus statistics from a document subset.
122156
*
@@ -763,6 +797,14 @@ public TopTerms partScore(
763797
return this;
764798
}
765799

800+
/**
801+
* Allocates or zeros the focus buffers and resets the focus scalars. Idempotent. Called automatically by {@link #coocs} on every run, and by {@link #focusBuffers()} only when the focus buffers have not been allocated yet.
802+
*/
803+
public void prepareFocus()
804+
{
805+
initFocus();
806+
}
807+
766808
/**
767809
* Ranks terms by a caller-supplied score vector.
768810
*
@@ -815,6 +857,18 @@ public TopTerms ranking(final double[] weights, final int topK)
815857
return this;
816858
}
817859

860+
/**
861+
* Sets the focus token total and focus document total. Used by external collectors after they have populated the focus buffers via {@link #focusBuffers()}.
862+
*
863+
* @param focusTokens total non-gap positions counted across the focus
864+
* @param focusDocs number of documents that contributed at least one focus position
865+
*/
866+
public void setFocusTotals(final long focusTokens, final int focusDocs)
867+
{
868+
this.focusTokens = focusTokens;
869+
this.focusDocs = focusDocs;
870+
}
871+
818872
/**
819873
* Returns the current number of ranked terms.
820874
*
@@ -848,7 +902,7 @@ private void buildRank(final TopArray top, final double[] scoreVec)
848902
termScores = scoreVec;
849903
hilites = null;
850904
}
851-
905+
852906
/**
853907
* Checks that a document-id bitset can be safely addressed with reader
854908
* global document ids.
@@ -955,6 +1009,19 @@ private Terms requireTerms(final IndexReader reader, final String action) throws
9551009
return terms;
9561010
}
9571011

1012+
/**
1013+
* Returns the focus accumulation buffers of this instance, allocating them on first call. Used by external collectors that write counts directly into the buffers; see {@link #coocs}.
1014+
*
1015+
* @return aliased buffers; index {@code 0} is the absent-term sentinel and must not be written
1016+
*/
1017+
FocusBuffers focusBuffers()
1018+
{
1019+
if (focusTermFreq == null) {
1020+
prepareFocus();
1021+
}
1022+
return new FocusBuffers(focusTermFreq, focusTermDocs);
1023+
}
1024+
9581025
/**
9591026
* Package-private setter used by specialized ranking producers.
9601027
*
@@ -1116,4 +1183,23 @@ public TermEntry next()
11161183
return new TermEntry(rank, rank2termId[rank]);
11171184
}
11181185
}
1186+
1187+
/**
1188+
* Handle to a {@link TopTerms} instance's focus accumulation buffers.
1189+
*
1190+
* <p>
1191+
* Returned by {@link TopTerms#focusBuffers()} and used by external collectors (such as cooccurrence listeners) that need to write per-term frequency and document-frequency counts directly into the {@link TopTerms} arrays without an intermediate copy. The arrays are aliased, not copied: writes into them are immediately visible to {@link TopTerms#focusScore}.
1192+
* </p>
1193+
*
1194+
* <p>
1195+
* Both arrays are indexed by dense term id and have length {@code FieldStats.vocabSize()}. Index {@code 0} is the absent-term sentinel and must not be written.
1196+
* </p>
1197+
*
1198+
* @param termFreq per-term occurrence count buffer
1199+
* @param termDocs per-term document-frequency buffer
1200+
*/
1201+
record FocusBuffers(long[] termFreq, int[] termDocs)
{
    /**
     * Rejects {@code null} arrays at construction time, so a broken handle fails here rather than
     * later when a collector reads {@code termFreq().length}. The arrays are stored as given
     * (aliased, not copied), so writes through this handle reach the caller's arrays.
     *
     * @throws NullPointerException if {@code termFreq} or {@code termDocs} is {@code null}
     */
    FocusBuffers {
        Objects.requireNonNull(termFreq, "termFreq");
        Objects.requireNonNull(termDocs, "termDocs");
    }
}
1204+
11191205
}

web/src/main/java/com/github/oeuvres/alix/web/OpTerms.java

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
import com.github.oeuvres.alix.lucene.FlucText;
1919
import com.github.oeuvres.alix.lucene.LuceneIndex;
2020
import com.github.oeuvres.alix.lucene.Partition;
21+
import com.github.oeuvres.alix.lucene.spans.CoocListener;
22+
import com.github.oeuvres.alix.lucene.spans.SpanWalker;
2123
import com.github.oeuvres.alix.lucene.terms.KeynessScorer;
2224
import com.github.oeuvres.alix.lucene.terms.PartScorer;
2325
import com.github.oeuvres.alix.lucene.terms.TermScorer;
@@ -157,9 +159,22 @@ else if ("chi2".equals(scorerName)) {
157159
}
158160
// coocs, with or without doc filter TODO
159161
else {
160-
pars.response().setStatus(501);
161-
meta.put("error", "Co-occurrence mode not yet implemented");
162-
return null;
162+
final int ctx = pars.getInt(CTX, CTX_RANGE, CTX_DEFAULT, CTX);
163+
final int left = pars.getInt(CTX_LEFT, CTX_RANGE, ctx, CTX_LEFT);
164+
final int right = pars.getInt(CTX_RIGHT, CTX_RANGE, ctx, CTX_RIGHT);
165+
final CoocListener listener = new CoocListener(
166+
ftext.fieldStats(),
167+
ftext.termRail(),
168+
left,
169+
right);
170+
final SpanWalker walker = new SpanWalker(
171+
index.searcher(),
172+
spanQuery,
173+
filterQuery,
174+
listener);
175+
topTerms.coocs(listener, walker);
176+
// return topTerms.focusScore(new KeynessScorer.LMI(), topK);
177+
return topTerms;
163178
}
164179
}
165180

web/src/main/java/com/github/oeuvres/alix/web/Pars.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@ private Pars() {}
66
public static final String BM25 = "bm25";
77
public static final String CTX = "ctx";
88
public static final int CTX_DEFAULT = 10;
9+
public static final String CTX_LEFT = "ctxleft";
910
public static final int[] CTX_RANGE = {0, 30};
11+
public static final String CTX_RIGHT = "ctxright";
1012
public static final String DATE = "date";
1113
public static final String DOCS = "docs";
1214
public static final int DOCS_DEFAULT = 100;

0 commit comments

Comments
 (0)