|
| 1 | +package com.github.oeuvres.alix.lucene.spans; |
| 2 | + |
| 3 | +import java.io.IOException; |
| 4 | +import java.util.BitSet; |
| 5 | +import java.util.Objects; |
| 6 | + |
| 7 | +import com.github.oeuvres.alix.lucene.terms.FieldStats; |
| 8 | +import com.github.oeuvres.alix.lucene.terms.FocusBuffers; |
| 9 | +import com.github.oeuvres.alix.lucene.terms.TermRail; |
| 10 | + |
| 11 | +/** |
| 12 | + * {@link SpanListener} that accumulates per-term cooccurrence counts in a fixed-width window around each pivot match. |
| 13 | + * |
| 14 | + * <p> |
| 15 | + * For each match {@code [start, end)} delivered by {@link SpanWalker}, the listener marks the context positions {@code [max(0, start - left), start)} and {@code [end, min(docWidth, end + right))} in a per-document bitset. After the document is exhausted, the marked positions are resolved to term ids via {@link TermRail#scanPositions} and counts are written into a {@link FocusBuffers} obtained from a {@link com.github.oeuvres.alix.lucene.terms.TopTerms} instance. |
| 16 | + * </p> |
| 17 | + * |
| 18 | + * <h2>Pivot self-exclusion</h2> |
| 19 | + * |
| 20 | + * <p> |
| 21 | + * Pivot positions of every match in the document are tracked in a separate bitset and removed from the window mask before the rail scan. With many pivot matches close together (large slop, or {@code SpanOr} of co-occurring terms), pivot positions naturally land in another match's window; without this exclusion they would inflate the cooccurrence counts and tilt the focus token denominator, biasing every score. |
| 22 | + * </p> |
| 23 | + * |
| 24 | + * <h2>Per-document deduplication for document frequency</h2> |
| 25 | + * |
| 26 | + * <p> |
| 27 | + * A vocabulary-sized bitset records which term ids have already been counted in the current document, so {@link FocusBuffers#termDocs} is incremented at most once per (term, document) pair while {@link FocusBuffers#termFreq} is incremented per occurrence. |
| 28 | + * </p> |
| 29 | + * |
| 30 | + * <h2>Lifecycle</h2> |
| 31 | + * |
| 32 | + * <p> |
| 33 | + * The listener is bound to a {@link FocusBuffers} via {@link #bindTo(FocusBuffers)} before the walk starts. {@link com.github.oeuvres.alix.lucene.terms.TopTerms#coocs} performs the binding, runs the walk, and reads back {@link #coocTokens()} and {@link #coocDocsTotal()}. |
| 34 | + * </p> |
| 35 | + * |
| 36 | + * <p> |
| 37 | + * This class is not thread-safe and is single-use per walk: a fresh instance, or a fresh {@link #bindTo(FocusBuffers)}, is required for each cooccurrence query. |
| 38 | + * </p> |
| 39 | + */ |
| 40 | +public final class CoocListener implements SpanListener |
| 41 | +{ |
| 42 | + /** Number of documents that contributed at least one window position resolving to a real term. */ |
| 43 | + private int coocDocsTotal; |
| 44 | + |
| 45 | + /** Total non-gap positions visited across all documents. */ |
| 46 | + private long coocTokens; |
| 47 | + |
| 48 | + /** Field statistics for the pivot field; used for vocabulary size and max document width. */ |
| 49 | + private final FieldStats fieldStats; |
| 50 | + |
| 51 | + /** Number of context positions to read on the right of each match. */ |
| 52 | + private final int left; |
| 53 | + |
| 54 | + /** Pivot positions in the current document, accumulated across all matches. */ |
| 55 | + private final BitSet pivotMask; |
| 56 | + |
| 57 | + /** Forward positional rail for the pivot field. */ |
| 58 | + private final TermRail rail; |
| 59 | + |
| 60 | + /** Number of context positions to read on the left of each match. */ |
| 61 | + private final int right; |
| 62 | + |
| 63 | + /** Per-document set of term ids already counted toward {@link FocusBuffers#termDocs}. */ |
| 64 | + private final BitSet termSeen; |
| 65 | + |
| 66 | + /** Bound focus buffers; {@code null} until {@link #bindTo(FocusBuffers)} is called. */ |
| 67 | + private FocusBuffers buffers; |
| 68 | + |
| 69 | + /** Whether the current document contributed at least one cooc position. */ |
| 70 | + private boolean docContributed; |
| 71 | + |
| 72 | + /** Window positions in the current document, accumulated across all matches. */ |
| 73 | + private final BitSet windowMask; |
| 74 | + |
| 75 | + /** |
| 76 | + * Constructs a cooccurrence listener. |
| 77 | + * |
| 78 | + * @param fieldStats field statistics for the pivot field |
| 79 | + * @param rail forward positional rail for the same field |
| 80 | + * @param left context width on the left of each match, in positions; must be {@code >= 0} |
| 81 | + * @param right context width on the right of each match, in positions; must be {@code >= 0} |
| 82 | + * @throws IllegalArgumentException if {@code left} or {@code right} is negative, or if both are zero |
| 83 | + * @throws NullPointerException if {@code fieldStats} or {@code rail} is {@code null} |
| 84 | + */ |
| 85 | + public CoocListener( |
| 86 | + final FieldStats fieldStats, |
| 87 | + final TermRail rail, |
| 88 | + final int left, |
| 89 | + final int right) |
| 90 | + { |
| 91 | + this.fieldStats = Objects.requireNonNull(fieldStats, "fieldStats"); |
| 92 | + this.rail = Objects.requireNonNull(rail, "rail"); |
| 93 | + if (left < 0 || right < 0) { |
| 94 | + throw new IllegalArgumentException("left and right must be >= 0; got left=" + left + ", right=" + right); |
| 95 | + } |
| 96 | + if (left == 0 && right == 0) { |
| 97 | + throw new IllegalArgumentException("left and right cannot both be 0"); |
| 98 | + } |
| 99 | + this.left = left; |
| 100 | + this.right = right; |
| 101 | + this.windowMask = new BitSet(fieldStats.maxWidth()); |
| 102 | + this.pivotMask = new BitSet(fieldStats.maxWidth()); |
| 103 | + this.termSeen = new BitSet(fieldStats.vocabSize()); |
| 104 | + } |
| 105 | + |
| 106 | + /** |
| 107 | + * Binds this listener to a {@link FocusBuffers} obtained from a {@link com.github.oeuvres.alix.lucene.terms.TopTerms} instance. Must be called before the walk starts. |
| 108 | + * |
| 109 | + * @param buffers focus buffers to write into |
| 110 | + * @throws NullPointerException if {@code buffers} is {@code null} |
| 111 | + * @throws IllegalArgumentException if buffer lengths do not match {@code fieldStats.vocabSize()} |
| 112 | + */ |
| 113 | + public void bindTo(final FocusBuffers buffers) |
| 114 | + { |
| 115 | + Objects.requireNonNull(buffers, "buffers"); |
| 116 | + final int vocab = fieldStats.vocabSize(); |
| 117 | + if (buffers.termFreq().length != vocab || buffers.termDocs().length != vocab) { |
| 118 | + throw new IllegalArgumentException( |
| 119 | + "buffer length mismatch: vocabSize=" + vocab |
| 120 | + + ", termFreq.length=" + buffers.termFreq().length |
| 121 | + + ", termDocs.length=" + buffers.termDocs().length); |
| 122 | + } |
| 123 | + this.buffers = buffers; |
| 124 | + } |
| 125 | + |
| 126 | + /** |
| 127 | + * Returns the number of documents that contributed at least one cooccurrence position. |
| 128 | + * |
| 129 | + * @return focus document count |
| 130 | + */ |
| 131 | + public int coocDocsTotal() |
| 132 | + { |
| 133 | + return coocDocsTotal; |
| 134 | + } |
| 135 | + |
| 136 | + /** |
| 137 | + * Returns the total non-gap positions visited across all documents. Used as the focus-side denominator in keyness scoring. |
| 138 | + * |
| 139 | + * @return total cooccurrence token count |
| 140 | + */ |
| 141 | + public long coocTokens() |
| 142 | + { |
| 143 | + return coocTokens; |
| 144 | + } |
| 145 | + |
| 146 | + @Override |
| 147 | + public void endDoc(final int spanCount) throws IOException |
| 148 | + { |
| 149 | + if (buffers == null) { |
| 150 | + throw new IllegalStateException("CoocListener not bound; call bindTo(FocusBuffers) before the walk"); |
| 151 | + } |
| 152 | + windowMask.andNot(pivotMask); |
| 153 | + if (windowMask.isEmpty()) return; |
| 154 | + |
| 155 | + final long[] termFreq = buffers.termFreq(); |
| 156 | + final int[] termDocs = buffers.termDocs(); |
| 157 | + final int docId = lastDocId; |
| 158 | + |
| 159 | + rail.scanPositions(docId, windowMask, termId -> { |
| 160 | + termFreq[termId]++; |
| 161 | + coocTokens++; |
| 162 | + if (!termSeen.get(termId)) { |
| 163 | + termSeen.set(termId); |
| 164 | + termDocs[termId]++; |
| 165 | + } |
| 166 | + docContributed = true; |
| 167 | + }); |
| 168 | + |
| 169 | + if (docContributed) coocDocsTotal++; |
| 170 | + } |
| 171 | + |
| 172 | + @Override |
| 173 | + public boolean span(final SpanMatch match) throws IOException |
| 174 | + { |
| 175 | + final int start = match.startPosition(); |
| 176 | + final int end = match.endPosition(); |
| 177 | + // left context: [max(0, start - left), start) |
| 178 | + if (left > 0 && start > 0) { |
| 179 | + windowMask.set(Math.max(0, start - left), start); |
| 180 | + } |
| 181 | + // right context: [end, end + right) — clamped by scanPositions |
| 182 | + if (right > 0) { |
| 183 | + windowMask.set(end, end + right); |
| 184 | + } |
| 185 | + // pivot positions: [start, end) |
| 186 | + pivotMask.set(start, end); |
| 187 | + return true; |
| 188 | + } |
| 189 | + |
| 190 | + @Override |
| 191 | + public void start() throws IOException |
| 192 | + { |
| 193 | + if (buffers == null) { |
| 194 | + throw new IllegalStateException("CoocListener not bound; call bindTo(FocusBuffers) before the walk"); |
| 195 | + } |
| 196 | + coocTokens = 0L; |
| 197 | + coocDocsTotal = 0; |
| 198 | + } |
| 199 | + |
| 200 | + @Override |
| 201 | + public void startDoc(final int docId) throws IOException |
| 202 | + { |
| 203 | + this.lastDocId = docId; |
| 204 | + windowMask.clear(); |
| 205 | + pivotMask.clear(); |
| 206 | + termSeen.clear(); |
| 207 | + docContributed = false; |
| 208 | + } |
| 209 | + |
| 210 | + /** Last docId seen, captured in {@link #startDoc(int)} and consumed in {@link #endDoc(int)}. */ |
| 211 | + private int lastDocId = -1; |
| 212 | +} |
0 commit comments