Coocs should be right

glorieux-f · glorieux-f · commit b72ed8ed4732 · 2026-05-04T10:22:49.000+02:00
diff --git a/common/src/java/com/github/oeuvres/alix/lucene/spans/CoocListener.java b/common/src/java/com/github/oeuvres/alix/lucene/spans/CoocListener.java
@@ -0,0 +1,212 @@
+package com.github.oeuvres.alix.lucene.spans;
+
+import java.io.IOException;
+import java.util.BitSet;
+import java.util.Objects;
+
+import com.github.oeuvres.alix.lucene.terms.FieldStats;
+import com.github.oeuvres.alix.lucene.terms.FocusBuffers;
+import com.github.oeuvres.alix.lucene.terms.TermRail;
+
+/**
+ * {@link SpanListener} that accumulates per-term cooccurrence counts in a fixed-width window around each pivot match.
+ *
+ * <p>
+ * For each match {@code [start, end)} delivered by {@link SpanWalker}, the listener marks the context positions {@code [max(0, start - left), start)} and {@code [end, min(docWidth, end + right))} in a per-document bitset. After the document is exhausted, the marked positions are resolved to term ids via {@link TermRail#scanPositions} and counts are written into a {@link FocusBuffers} obtained from a {@link com.github.oeuvres.alix.lucene.terms.TopTerms} instance.
+ * </p>
+ *
+ * <h2>Pivot self-exclusion</h2>
+ *
+ * <p>
+ * Pivot positions of every match in the document are tracked in a separate bitset and removed from the window mask before the rail scan. With many pivot matches close together (large slop, or {@code SpanOr} of co-occurring terms), pivot positions naturally land in another match's window; without this exclusion they would inflate the cooccurrence counts and tilt the focus token denominator, biasing every score.
+ * </p>
+ *
+ * <h2>Per-document deduplication for document frequency</h2>
+ *
+ * <p>
+ * A vocabulary-sized bitset records which term ids have already been counted in the current document, so {@link FocusBuffers#termDocs} is incremented at most once per (term, document) pair while {@link FocusBuffers#termFreq} is incremented per occurrence.
+ * </p>
+ *
+ * <h2>Lifecycle</h2>
+ *
+ * <p>
+ * The listener is bound to a {@link FocusBuffers} via {@link #bindTo(FocusBuffers)} before the walk starts. {@link com.github.oeuvres.alix.lucene.terms.TopTerms#coocs} performs the binding, runs the walk, and reads back {@link #coocTokens()} and {@link #coocDocsTotal()}.
+ * </p>
+ *
+ * <p>
+ * This class is not thread-safe and is single-use per walk: a fresh instance, or a fresh {@link #bindTo(FocusBuffers)}, is required for each cooccurrence query.
+ * </p>
+ */
+public final class CoocListener implements SpanListener
+{
+    /** Number of documents that contributed at least one window position resolving to a real term. */
+    private int coocDocsTotal;
+
+    /** Total non-gap positions visited across all documents. */
+    private long coocTokens;
+
+    /** Field statistics for the pivot field; used for vocabulary size and max document width. */
+    private final FieldStats fieldStats;
+
+    /** Number of context positions to read on the left of each match. */
+    private final int left;
+
+    /** Pivot positions in the current document, accumulated across all matches. */
+    private final BitSet pivotMask;
+
+    /** Forward positional rail for the pivot field. */
+    private final TermRail rail;
+
+    /** Number of context positions to read on the right of each match. */
+    private final int right;
+
+    /** Per-document set of term ids already counted toward {@link FocusBuffers#termDocs}. */
+    private final BitSet termSeen;
+
+    /** Bound focus buffers; {@code null} until {@link #bindTo(FocusBuffers)} is called. */
+    private FocusBuffers buffers;
+
+    /** Whether the current document contributed at least one cooc position. */
+    private boolean docContributed;
+
+    /** Window positions in the current document, accumulated across all matches. */
+    private final BitSet windowMask;
+
+    /**
+     * Constructs a cooccurrence listener.
+     *
+     * @param fieldStats field statistics for the pivot field
+     * @param rail       forward positional rail for the same field
+     * @param left       context width on the left of each match, in positions; must be {@code >= 0}
+     * @param right      context width on the right of each match, in positions; must be {@code >= 0}
+     * @throws IllegalArgumentException if {@code left} or {@code right} is negative, or if both are zero
+     * @throws NullPointerException     if {@code fieldStats} or {@code rail} is {@code null}
+     */
+    public CoocListener(
+        final FieldStats fieldStats,
+        final TermRail rail,
+        final int left,
+        final int right)
+    {
+        this.fieldStats = Objects.requireNonNull(fieldStats, "fieldStats");
+        this.rail = Objects.requireNonNull(rail, "rail");
+        if (left < 0 || right < 0) {
+            throw new IllegalArgumentException("left and right must be >= 0; got left=" + left + ", right=" + right);
+        }
+        if (left == 0 && right == 0) {
+            throw new IllegalArgumentException("left and right cannot both be 0");
+        }
+        this.left = left;
+        this.right = right;
+        this.windowMask = new BitSet(fieldStats.maxWidth());
+        this.pivotMask = new BitSet(fieldStats.maxWidth());
+        this.termSeen = new BitSet(fieldStats.vocabSize());
+    }
+
+    /**
+     * Binds this listener to a {@link FocusBuffers} obtained from a {@link com.github.oeuvres.alix.lucene.terms.TopTerms} instance. Must be called before the walk starts.
+     *
+     * @param buffers focus buffers to write into
+     * @throws NullPointerException     if {@code buffers} is {@code null}
+     * @throws IllegalArgumentException if buffer lengths do not match {@code fieldStats.vocabSize()}
+     */
+    public void bindTo(final FocusBuffers buffers)
+    {
+        Objects.requireNonNull(buffers, "buffers");
+        final int vocab = fieldStats.vocabSize();
+        if (buffers.termFreq().length != vocab || buffers.termDocs().length != vocab) {
+            throw new IllegalArgumentException(
+                "buffer length mismatch: vocabSize=" + vocab
+                + ", termFreq.length=" + buffers.termFreq().length
+                + ", termDocs.length=" + buffers.termDocs().length);
+        }
+        this.buffers = buffers;
+    }
+
+    /**
+     * Returns the number of documents that contributed at least one cooccurrence position.
+     *
+     * @return focus document count
+     */
+    public int coocDocsTotal()
+    {
+        return coocDocsTotal;
+    }
+
+    /**
+     * Returns the total non-gap positions visited across all documents. Used as the focus-side denominator in keyness scoring.
+     *
+     * @return total cooccurrence token count
+     */
+    public long coocTokens()
+    {
+        return coocTokens;
+    }
+
+    @Override
+    public void endDoc(final int spanCount) throws IOException
+    {
+        if (buffers == null) {
+            throw new IllegalStateException("CoocListener not bound; call bindTo(FocusBuffers) before the walk");
+        }
+        windowMask.andNot(pivotMask);
+        if (windowMask.isEmpty()) return;
+
+        final long[] termFreq = buffers.termFreq();
+        final int[] termDocs = buffers.termDocs();
+        final int docId = lastDocId;
+
+        rail.scanPositions(docId, windowMask, termId -> {
+            termFreq[termId]++;
+            coocTokens++;
+            if (!termSeen.get(termId)) {
+                termSeen.set(termId);
+                termDocs[termId]++;
+            }
+            docContributed = true;
+        });
+
+        if (docContributed) coocDocsTotal++;
+    }
+
+    @Override
+    public boolean span(final SpanMatch match) throws IOException
+    {
+        final int start = match.startPosition();
+        final int end = match.endPosition();
+        // left context: [max(0, start - left), start)
+        if (left > 0 && start > 0) {
+            windowMask.set(Math.max(0, start - left), start);
+        }
+        // right context: [end, end + right) — clamped by scanPositions
+        if (right > 0) {
+            windowMask.set(end, end + right);
+        }
+        // pivot positions: [start, end)
+        pivotMask.set(start, end);
+        return true;
+    }
+
+    @Override
+    public void start() throws IOException
+    {
+        if (buffers == null) {
+            throw new IllegalStateException("CoocListener not bound; call bindTo(FocusBuffers) before the walk");
+        }
+        coocTokens = 0L;
+        coocDocsTotal = 0;
+    }
+
+    @Override
+    public void startDoc(final int docId) throws IOException
+    {
+        this.lastDocId = docId;
+        windowMask.clear();
+        pivotMask.clear();
+        termSeen.clear();
+        docContributed = false;
+    }
+
+    /** Last docId seen, captured in {@link #startDoc(int)} and consumed in {@link #endDoc(int)}. */
+    private int lastDocId = -1;
+}
diff --git a/common/src/java/com/github/oeuvres/alix/lucene/terms/TopTerms.java b/common/src/java/com/github/oeuvres/alix/lucene/terms/TopTerms.java
@@ -15,6 +15,8 @@
 import org.apache.lucene.util.FixedBitSet;
 
 import com.github.oeuvres.alix.lucene.Partition;
+import com.github.oeuvres.alix.lucene.spans.CoocListener;
+import com.github.oeuvres.alix.lucene.spans.SpanWalker;
 import com.github.oeuvres.alix.util.TopArray;
 
 /**
@@ -117,6 +119,38 @@ public TopTerms(final FieldStats fieldStats, final TermLexicon lexicon)
         this.lexicon = Objects.requireNonNull(lexicon, "lexicon");
     }
     
+    /**
+     * Streams a span walk through the listener and collects per-term cooccurrence counts directly into this instance's focus buffers.
+     *
+     * <p>
+     * The listener is bound to this instance's focus arrays via {@link #focusBuffers()}, the walker is then driven from doc id {@code 0} to exhaustion, and the focus scalars (token total, doc total) are read back from the listener at the end. After this call, {@link #focusScore(KeynessScorer, int)} can be invoked exactly as after {@link #focus(IndexReader, FixedBitSet)}, with the cooccurrence counts in place of the doc-subset counts.
+     * </p>
+     *
+     * <p>
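+     * The walker must be configured with the pivot {@link org.apache.lucene.queries.spans.SpanQuery}, any document-side filter, and this same {@code listener}: this method only binds the listener to the focus buffers and reads its totals back after the walk, it does not attach the listener to the walker. The listener must be a {@link com.github.oeuvres.alix.lucene.spans.CoocListener} built on the same field as this instance.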
+     * </p>
+     *
+     * @param listener cooccurrence listener that writes into the focus buffers
+     * @param walker   span walker that drives the listener
+     * @return this instance
+     * @throws IOException          on walker I/O failure
+     * @throws NullPointerException if {@code listener} or {@code walker} is {@code null}
+     */
+    public TopTerms coocs(
+        final CoocListener listener,
+        final SpanWalker walker) throws IOException
+    {
+        Objects.requireNonNull(listener, "listener");
+        Objects.requireNonNull(walker, "walker");
+        prepareFocus();
+        listener.bindTo(focusBuffers());
+        walker.walk(0);
+        setFocusTotals(listener.coocTokens(), listener.coocDocsTotal());
+        activeCounts = focusTermFreq;
+        return this;
+    }
+
+    
     /**
      * Populates focus statistics from a document subset.
      *
@@ -763,6 +797,14 @@ public TopTerms partScore(
         return this;
     }
     
+    /**
+     * Allocates or zeros the focus buffers and resets the focus scalars. Idempotent. Called by {@link #coocs} on every call, and by {@link #focusBuffers()} only when the focus buffers have not been allocated yet.
+     */
+    public void prepareFocus()
+    {
+        initFocus();
+    }
+
     /**
      * Ranks terms by a caller-supplied score vector.
      *
@@ -815,6 +857,18 @@ public TopTerms ranking(final double[] weights, final int topK)
         return this;
     }
     
+    /**
+     * Sets the focus token total and focus document total. Used by external collectors after they have populated the focus buffers via {@link #focusBuffers()}.
+     *
+     * @param focusTokens total non-gap positions counted across the focus
+     * @param focusDocs   number of documents that contributed at least one focus position
+     */
+    public void setFocusTotals(final long focusTokens, final int focusDocs)
+    {
+        this.focusTokens = focusTokens;
+        this.focusDocs = focusDocs;
+    }
+
     /**
      * Returns the current number of ranked terms.
      *
@@ -848,7 +902,7 @@ private void buildRank(final TopArray top, final double[] scoreVec)
         termScores = scoreVec;
         hilites = null;
     }
-    
+
     /**
      * Checks that a document-id bitset can be safely addressed with reader
      * global document ids.
@@ -955,6 +1009,19 @@ private Terms requireTerms(final IndexReader reader, final String action) throws
         return terms;
     }
     
+    /**
+     * Returns the focus accumulation buffers of this instance, allocating them on first call. Used by external collectors that write counts directly into the buffers; see {@link #coocs}.
+     *
+     * @return aliased buffers; index {@code 0} is the absent-term sentinel and must not be written
+     */
+    FocusBuffers focusBuffers()
+    {
+        if (focusTermFreq == null) {
+            prepareFocus();
+        }
+        return new FocusBuffers(focusTermFreq, focusTermDocs);
+    }
+
     /**
      * Package-private setter used by specialized ranking producers.
      *
@@ -1116,4 +1183,23 @@ public TermEntry next()
             return new TermEntry(rank, rank2termId[rank]);
         }
     }
+    
+    /**
+     * Handle to a {@link TopTerms} instance's focus accumulation buffers.
+     *
+     * <p>
+     * Returned by {@link TopTerms#focusBuffers()} and used by external collectors (such as cooccurrence listeners) that need to write per-term frequency and document-frequency counts directly into the {@link TopTerms} arrays without an intermediate copy. The arrays are aliased, not copied: writes into them are immediately visible to {@link TopTerms#focusScore}.
+     * </p>
+     *
+     * <p>
+     * Both arrays are indexed by dense term id and have length {@code FieldStats.vocabSize()}. Index {@code 0} is the absent-term sentinel and must not be written.
+     * </p>
+     *
+     * @param termFreq per-term occurrence count buffer
+     * @param termDocs per-term document-frequency buffer
+     */
+    record FocusBuffers(long[] termFreq, int[] termDocs)
+    {
+    }
+
 }
diff --git a/web/src/main/java/com/github/oeuvres/alix/web/OpTerms.java b/web/src/main/java/com/github/oeuvres/alix/web/OpTerms.java
@@ -18,6 +18,8 @@
 import com.github.oeuvres.alix.lucene.FlucText;
 import com.github.oeuvres.alix.lucene.LuceneIndex;
 import com.github.oeuvres.alix.lucene.Partition;
+import com.github.oeuvres.alix.lucene.spans.CoocListener;
+import com.github.oeuvres.alix.lucene.spans.SpanWalker;
 import com.github.oeuvres.alix.lucene.terms.KeynessScorer;
 import com.github.oeuvres.alix.lucene.terms.PartScorer;
 import com.github.oeuvres.alix.lucene.terms.TermScorer;
@@ -157,9 +159,22 @@ else if ("chi2".equals(scorerName)) {
         }
         // coocs, with or without doc filter TODO
         else {
-            pars.response().setStatus(501);
-            meta.put("error", "Co-occurrence mode not yet implemented");
-            return null;
+            final int ctx = pars.getInt(CTX, CTX_RANGE, CTX_DEFAULT, CTX);
+            final int left = pars.getInt(CTX_LEFT, CTX_RANGE, ctx, CTX_LEFT);
+            final int right = pars.getInt(CTX_RIGHT, CTX_RANGE, ctx, CTX_RIGHT);
+            final CoocListener listener = new CoocListener(
+                ftext.fieldStats(),
+                ftext.termRail(),
+                left,
+                right);
+            final SpanWalker walker = new SpanWalker(
+                index.searcher(),
+                spanQuery,
+                filterQuery,
+                listener);
+            topTerms.coocs(listener, walker);
+            // return topTerms.focusScore(new KeynessScorer.LMI(), topK);
+            return topTerms;
         }
     }
     
diff --git a/web/src/main/java/com/github/oeuvres/alix/web/Pars.java b/web/src/main/java/com/github/oeuvres/alix/web/Pars.java
@@ -6,7 +6,9 @@ private Pars() {}
     public static final String BM25            = "bm25";
     public static final String CTX             = "ctx";
     public static final int    CTX_DEFAULT     = 10;
+    public static final String CTX_LEFT        = "ctxleft";
     public static final int[]  CTX_RANGE       = {0, 30};
+    public static final String CTX_RIGHT       = "ctxright";
     public static final String DATE            = "date";
     public static final String DOCS            = "docs";
     public static final int    DOCS_DEFAULT    = 100;