oeuvres
diff --git a/‎common/src/java/com/github/oeuvres/alix/lucene/spans/CoocListener.java‎
Lines changed: 25 additions & 2 deletions b/‎common/src/java/com/github/oeuvres/alix/lucene/spans/CoocListener.java‎
Lines changed: 25 additions & 2 deletions
diff --git a/‎common/src/java/com/github/oeuvres/alix/lucene/spans/SpanListener.java‎
Lines changed: 41 additions & 24 deletions b/‎common/src/java/com/github/oeuvres/alix/lucene/spans/SpanListener.java‎
Lines changed: 41 additions & 24 deletions
diff --git a/‎common/src/java/com/github/oeuvres/alix/lucene/spans/SpanWalker.java‎
Lines changed: 11 additions & 4 deletions b/‎common/src/java/com/github/oeuvres/alix/lucene/spans/SpanWalker.java‎
Lines changed: 11 additions & 4 deletions
diff --git a/‎common/src/java/com/github/oeuvres/alix/lucene/terms/TermLexicon.java‎
Lines changed: 37 additions & 0 deletions b/‎common/src/java/com/github/oeuvres/alix/lucene/terms/TermLexicon.java‎
Lines changed: 37 additions & 0 deletions
@@ -1,6 +1,7 @@
 package com.github.oeuvres.alix.lucene.spans;
 
 import java.io.IOException;
+import java.util.Arrays;
 import java.util.BitSet;
 import java.util.Objects;
 
@@ -89,6 +90,10 @@ public final class CoocListener implements SpanListener
     /** Window positions in the current document, accumulated across all matches. */
     private final BitSet windowMask;
 
+    private static final int[] EMPTY_INT = new int[0];
+    /** Term ids of the pivot (query) terms, skipped when counting co-occurrences; searched with {@link Arrays#binarySearch(int[], int)}. */
+    private int[] pivotIds = EMPTY_INT;
+    
     /**
      * Constructs a cooccurrence listener.
      *
@@ -122,12 +127,28 @@ public CoocListener(
         this.termSeen = new BitSet(fieldStats.vocabSize());
     }
 
+    /**
+     * Provide the set of pivot term ids that should be
+     * excluded from co-occurrence accounting. Called in
+     * {@link }
+     * 
+     * @throws NullPointerException if {@code buffers} is {@code null}
+     */
+    protected void setPivotIds(final int[] pivotIds)
+    {
+        Objects.requireNonNull(pivotIds, "pivotIds");
+        this.pivotIds = pivotIds;
+    }
+    
     /**
      * Binds this listener to a {@link FocusBuffers} obtained from a
-     * {@link com.github.oeuvres.alix.lucene.terms.TopTerms} instance. Must be called before the
+     * {@link TopTerms} instance. Must be called before the
      * walk starts.
      *
-     * @param buffers focus buffers to write into
+     * @param buffers      focus buffers to write into
+     * @param pivotTermIds term ids of the query terms; positions resolving to one of these ids
+     *                     are excluded from both {@code termFreq}/{@code termDocs} and
+     *                     {@code coocTokens}. May be empty but not {@code null}.
      * @throws NullPointerException     if {@code buffers} is {@code null}
      * @throws IllegalArgumentException if buffer lengths do not match
      *                                  {@code fieldStats.vocabSize()}
@@ -181,6 +202,8 @@ public void endDoc(final int spanCount) throws IOException
         final int docId = lastDocId;
 
         rail.scanPositions(docId, windowMask, termId -> {
+            if (Arrays.binarySearch(pivotIds, termId) >= 0)
+                return;
             termFreq[termId]++;
             coocTokens++;
             if (!termSeen.get(termId)) {
 
@@ -5,64 +5,81 @@
 /**
  * Receives streamed span-match events from a {@link SpanWalker}.
  *
- * <p>Lifecycle, per call to {@link SpanWalker#walk(int)}:</p>
+ * <p>
+ * Lifecycle, per call to {@link SpanWalker#walk(int)}:
+ * </p>
  * <ol>
- *   <li>{@link #start()}</li>
- *   <li>For each matching document, in natural index order:
- *     <ol>
- *       <li>{@link #wantsMoreDocs()} — return {@code false} to stop the walk</li>
- *       <li>{@link #startDoc(int)}</li>
- *       <li>{@link #span(SpanMatch)} — once per match; return {@code false} to skip the remaining matches of the current document</li>
- *       <li>{@link #endDoc(int)}</li>
- *     </ol>
- *   </li>
- *   <li>{@link #end(boolean)}</li>
+ * <li>{@link #start()}</li>
+ * <li>For each matching document, in natural index order:
+ * <ol>
+ * <li>{@link #wantsMoreDocs()} — return {@code false} to stop the walk</li>
+ * <li>{@link #startDoc(int)}</li>
+ * <li>{@link #span(SpanMatch)} — once per match; return {@code false} to skip the remaining matches
+ * of the current document</li>
+ * <li>{@link #endDoc(int)}</li>
+ * </ol>
+ * </li>
+ * <li>{@link #end(boolean)}</li>
  * </ol>
  *
- * <p>The contract is shared across output formats (HTML, JSON, term aggregation…) and across walker types.</p>
+ * <p>
+ * The contract is shared across output formats (HTML, JSON, term aggregation…) and across walker
+ * types.
+ * </p>
  */
 public interface SpanListener
 {
     /**
      * Called once at the end of every walk, after the last document.
      *
-     * @param exhausted {@code true} if the index was traversed to its end; {@code false} if {@link #wantsMoreDocs()} cut the walk short
+     * @param exhausted {@code true} if the index was traversed to its end; {@code false} if
+     *                  {@link #wantsMoreDocs()} cut the walk short
      */
-    default void end(final boolean exhausted) throws IOException {}
-
+    default void end(final boolean exhausted) throws IOException
+    {
+    }
+    
     /**
-     * Called after all matches of the current document have been delivered to {@link #span(SpanMatch)}, or after {@code span} returned {@code false}.
+     * Called after all matches of the current document have been delivered to
+     * {@link #span(SpanMatch)}, or after {@code span} returned {@code false}.
      *
-     * @param spanCount total number of matches in the document, including those skipped after {@code span} returned {@code false}
+     * @param spanCount total number of matches in the document, including those skipped after
+     *                  {@code span} returned {@code false}
      */
     void endDoc(int spanCount) throws IOException;
-
+    
     /**
      * Called once per match of the current document, in ascending start-position order.
      *
-     * <p>The {@link SpanMatch} is reused across calls; do not retain a reference.</p>
+     * <p>
+     * The {@link SpanMatch} is reused across calls; do not retain a reference.
+     * </p>
      *
      * @param collector match data (term offsets, ordinal within the document) for the current span
-     * @return {@code true} to continue receiving matches in the current document; {@code false} to skip the remainder
+     * @return {@code true} to continue receiving matches in the current document; {@code false} to
+     *         skip the remainder
      */
     boolean span(SpanMatch collector) throws IOException;
-
+    
     /**
      * Called once at the start of every walk, before any document is visited.
      */
-    default void start() throws IOException {}
+    default void start() throws IOException
+    {
+    }
 
     /**
      * Called before the matches of a document are delivered.
      *
      * @param docId global Lucene document id
      */
     void startDoc(int docId) throws IOException;
-
+    
     /**
      * Polled before each candidate document is opened.
      *
-     * @return {@code true} to continue, {@code false} to stop the walk; {@link #end(boolean) end(false)} is then called
+     * @return {@code true} to continue, {@code false} to stop the walk; {@link #end(boolean)
+     *         end(false)} is then called
      */
     default boolean wantsMoreDocs()
     {
 
@@ -16,6 +16,8 @@
 import org.apache.lucene.search.Scorer;
 import org.apache.lucene.search.Weight;
 
+import com.github.oeuvres.alix.lucene.terms.TermLexicon;
+
 /**
  * Streams the matches of a {@link SpanQuery}, optionally intersected with a non-scoring filter, to
  * a {@link SpanListener}, in natural index order.
@@ -70,15 +72,20 @@ public final class SpanWalker
      *                              {@code null}
      */
     public SpanWalker(
-            final IndexSearcher searcher,
-            final SpanQuery spanQuery,
-            final Query filterQuery,
-            final SpanListener listener) throws IOException
+        final IndexSearcher searcher,
+        final TermLexicon lexicon,
+        final SpanQuery spanQuery,
+        final Query filterQuery,
+        final SpanListener listener) throws IOException
     {
         this.searcher = Objects.requireNonNull(searcher, "searcher");
         Objects.requireNonNull(spanQuery, "spanQuery");
         this.listener = Objects.requireNonNull(listener, "listener");
         this.spanQuery = (SpanQuery) searcher.rewrite(spanQuery);
+        if (listener instanceof CoocListener) {
+            int[] pivotIds = lexicon.termIds(spanQuery);
+            ((CoocListener)listener).setPivotIds(pivotIds);
+        }
         this.filterQuery = (filterQuery == null) ? null : searcher.rewrite(filterQuery);
     }
 
 
@@ -2,8 +2,12 @@
 
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiTerms;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryVisitor;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.IntsRefBuilder;
@@ -13,11 +17,13 @@
 import org.apache.lucene.util.fst.Util;
 
 import com.github.oeuvres.alix.util.IOUtil;
+import com.github.oeuvres.alix.util.IntList;
 
 import java.io.BufferedOutputStream;
 import java.io.Closeable;
 import java.io.IOException;
 import java.io.OutputStream;
+import java.io.UncheckedIOException;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.nio.IntBuffer;
@@ -460,6 +466,37 @@ public static TermLexicon openOrBuild(final IndexReader reader, final Path sideD
         return open(sideDir, field);
     }
 
+    /**
+     * Resolves the terms of a query to their term ids in this lexicon. Terms that the lexicon
+     * does not know (e.g. because they were never indexed) are silently dropped.
+     *
+     * <p>
+     * No field filtering is applied: every term reported by the query is looked up by its
+     * bytes, whatever its field. The caller is expected to pass a query that targets the
+     * field of this lexicon.
+     * </p>
+     *
+     * @param query a {@link Query} already rewritten with {@link IndexSearcher#rewrite(Query)}
+     * @return distinct term ids, sorted in ascending order; empty if no term is known
+     * @throws NullPointerException if {@code query} is {@code null}
+     * @throws UncheckedIOException if a lexicon lookup fails with an {@link IOException}
+     */
+    public int[] termIds(final Query query)
+    {
+        final IntList ids = new IntList();
+        query.visit(new QueryVisitor()
+        {
+            @Override
+            public void consumeTerms(final Query q, final Term... ts)
+            {
+                for (final Term t : ts) {
+                    // Terms are not filtered by field here; see the method documentation.
+                    try {
+                        final int id = id(t.bytes());
+                        // A negative id means the lexicon does not know the term.
+                        if (id >= 0) {
+                            ids.push(id);
+                        }
+                    } catch (IOException e) {
+                        throw new UncheckedIOException(e);
+                    }
+                }
+            }
+        });
+        // Enforce the documented contract (sorted, distinct): ids are collected in visit
+        // order, and the same term may be reported more than once.
+        return java.util.Arrays.stream(ids.toArray()).sorted().distinct().toArray();
+    }
+
     /**
      * Returns the number of entries in the lexicon, including the reserved id 0.
      * <p>