Skip to content

Commit f313af1

Browse files
committed
Exclude pivotsId from cooc counts
1 parent aa9f21e commit f313af1

9 files changed

Lines changed: 954 additions & 199 deletions

File tree

common/src/java/com/github/oeuvres/alix/lucene/spans/CoocListener.java

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package com.github.oeuvres.alix.lucene.spans;
22

33
import java.io.IOException;
4+
import java.util.Arrays;
45
import java.util.BitSet;
56
import java.util.Objects;
67

@@ -89,6 +90,10 @@ public final class CoocListener implements SpanListener
8990
/** Window positions in the current document, accumulated across all matches. */
9091
private final BitSet windowMask;
9192

93+
private static final int[] EMPTY_INT = new int[0];
94+
/** Term ids of the pivots; their occurrences are excluded from co-occurrence counts. */
95+
private int[] pivotIds = EMPTY_INT;
96+
9297
/**
9398
* Constructs a cooccurrence listener.
9499
*
@@ -122,12 +127,28 @@ public CoocListener(
122127
this.termSeen = new BitSet(fieldStats.vocabSize());
123128
}
124129

130+
/**
131+
* Provides the pivot term ids to exclude from co-occurrence accounting; called by the
132+
* {@link SpanWalker} constructor. A sorted, distinct copy is stored because {@code endDoc}
133+
* looks ids up with {@link Arrays#binarySearch(int[], int)}, which requires sorted input.
134+
* @param pivotIds term ids of the pivots, in any order; may be empty
135+
* @throws NullPointerException if {@code pivotIds} is {@code null}
136+
*/
137+
protected void setPivotIds(final int[] pivotIds)
138+
{
139+
Objects.requireNonNull(pivotIds, "pivotIds");
140+
this.pivotIds = Arrays.stream(pivotIds).sorted().distinct().toArray(); // defensive copy, never aliases the caller's array
141+
}
142+
125143
/**
126144
* Binds this listener to a {@link FocusBuffers} obtained from a
127-
* {@link com.github.oeuvres.alix.lucene.terms.TopTerms} instance. Must be called before the
145+
* {@link TopTerms} instance. Must be called before the
128146
* walk starts.
129147
*
130-
* @param buffers focus buffers to write into
148+
* @param buffers focus buffers to write into
149+
* @param pivotTermIds term ids of the query terms; positions resolving to one of these ids
150+
* are excluded from both {@code termFreq}/{@code termDocs} and
151+
* {@code coocTokens}. May be empty but not {@code null}.
131152
* @throws NullPointerException if {@code buffers} is {@code null}
132153
* @throws IllegalArgumentException if buffer lengths do not match
133154
* {@code fieldStats.vocabSize()}
@@ -181,6 +202,8 @@ public void endDoc(final int spanCount) throws IOException
181202
final int docId = lastDocId;
182203

183204
rail.scanPositions(docId, windowMask, termId -> {
205+
if (Arrays.binarySearch(pivotIds, termId) >= 0)
206+
return;
184207
termFreq[termId]++;
185208
coocTokens++;
186209
if (!termSeen.get(termId)) {

common/src/java/com/github/oeuvres/alix/lucene/spans/SpanListener.java

Lines changed: 41 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,64 +5,81 @@
55
/**
66
* Receives streamed span-match events from a {@link SpanWalker}.
77
*
8-
* <p>Lifecycle, per call to {@link SpanWalker#walk(int)}:</p>
8+
* <p>
9+
* Lifecycle, per call to {@link SpanWalker#walk(int)}:
10+
* </p>
911
* <ol>
10-
* <li>{@link #start()}</li>
11-
* <li>For each matching document, in natural index order:
12-
* <ol>
13-
* <li>{@link #wantsMoreDocs()} — return {@code false} to stop the walk</li>
14-
* <li>{@link #startDoc(int)}</li>
15-
* <li>{@link #span(SpanMatch)} — once per match; return {@code false} to skip the remaining matches of the current document</li>
16-
* <li>{@link #endDoc(int)}</li>
17-
* </ol>
18-
* </li>
19-
* <li>{@link #end(boolean)}</li>
12+
* <li>{@link #start()}</li>
13+
* <li>For each matching document, in natural index order:
14+
* <ol>
15+
* <li>{@link #wantsMoreDocs()} — return {@code false} to stop the walk</li>
16+
* <li>{@link #startDoc(int)}</li>
17+
* <li>{@link #span(SpanMatch)} — once per match; return {@code false} to skip the remaining matches
18+
* of the current document</li>
19+
* <li>{@link #endDoc(int)}</li>
20+
* </ol>
21+
* </li>
22+
* <li>{@link #end(boolean)}</li>
2023
* </ol>
2124
*
22-
* <p>The contract is shared across output formats (HTML, JSON, term aggregation…) and across walker types.</p>
25+
* <p>
26+
* The contract is shared across output formats (HTML, JSON, term aggregation…) and across walker
27+
* types.
28+
* </p>
2329
*/
2430
public interface SpanListener
2531
{
2632
/**
2733
* Called once at the end of every walk, after the last document.
2834
*
29-
* @param exhausted {@code true} if the index was traversed to its end; {@code false} if {@link #wantsMoreDocs()} cut the walk short
35+
* @param exhausted {@code true} if the index was traversed to its end; {@code false} if
36+
* {@link #wantsMoreDocs()} cut the walk short
3037
*/
31-
default void end(final boolean exhausted) throws IOException {}
32-
38+
default void end(final boolean exhausted) throws IOException
39+
{
40+
}
41+
3342
/**
34-
* Called after all matches of the current document have been delivered to {@link #span(SpanMatch)}, or after {@code span} returned {@code false}.
43+
* Called after all matches of the current document have been delivered to
44+
* {@link #span(SpanMatch)}, or after {@code span} returned {@code false}.
3545
*
36-
* @param spanCount total number of matches in the document, including those skipped after {@code span} returned {@code false}
46+
* @param spanCount total number of matches in the document, including those skipped after
47+
* {@code span} returned {@code false}
3748
*/
3849
void endDoc(int spanCount) throws IOException;
39-
50+
4051
/**
4152
* Called once per match of the current document, in ascending start-position order.
4253
*
43-
* <p>The {@link SpanMatch} is reused across calls; do not retain a reference.</p>
54+
* <p>
55+
* The {@link SpanMatch} is reused across calls; do not retain a reference.
56+
* </p>
4457
*
4558
* @param collector match data (term offsets, ordinal within the document) for the current span
46-
* @return {@code true} to continue receiving matches in the current document; {@code false} to skip the remainder
59+
* @return {@code true} to continue receiving matches in the current document; {@code false} to
60+
* skip the remainder
4761
*/
4862
boolean span(SpanMatch collector) throws IOException;
49-
63+
5064
/**
5165
* Called once at the start of every walk, before any document is visited.
5266
*/
53-
default void start() throws IOException {}
67+
default void start() throws IOException
68+
{
69+
}
5470

5571
/**
5672
* Called before the matches of a document are delivered.
5773
*
5874
* @param docId global Lucene document id
5975
*/
6076
void startDoc(int docId) throws IOException;
61-
77+
6278
/**
6379
* Polled before each candidate document is opened.
6480
*
65-
* @return {@code true} to continue, {@code false} to stop the walk; {@link #end(boolean) end(false)} is then called
81+
* @return {@code true} to continue, {@code false} to stop the walk; {@link #end(boolean)
82+
* end(false)} is then called
6683
*/
6784
default boolean wantsMoreDocs()
6885
{

common/src/java/com/github/oeuvres/alix/lucene/spans/SpanWalker.java

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
import org.apache.lucene.search.Scorer;
1717
import org.apache.lucene.search.Weight;
1818

19+
import com.github.oeuvres.alix.lucene.terms.TermLexicon;
20+
1921
/**
2022
* Streams the matches of a {@link SpanQuery}, optionally intersected with a non-scoring filter, to
2123
* a {@link SpanListener}, in natural index order.
@@ -70,15 +72,20 @@ public final class SpanWalker
7072
* {@code null}
7173
*/
7274
public SpanWalker(
73-
final IndexSearcher searcher,
74-
final SpanQuery spanQuery,
75-
final Query filterQuery,
76-
final SpanListener listener) throws IOException
75+
final IndexSearcher searcher,
76+
final TermLexicon lexicon,
77+
final SpanQuery spanQuery,
78+
final Query filterQuery,
79+
final SpanListener listener) throws IOException
7780
{
7881
this.searcher = Objects.requireNonNull(searcher, "searcher");
7982
Objects.requireNonNull(spanQuery, "spanQuery");
8083
this.listener = Objects.requireNonNull(listener, "listener");
8184
this.spanQuery = (SpanQuery) searcher.rewrite(spanQuery);
85+
if (listener instanceof CoocListener) { // the lexicon is only needed to resolve pivot ids for a CoocListener
86+
int[] pivotIds = Objects.requireNonNull(lexicon, "lexicon").termIds(this.spanQuery); // termIds expects the rewritten query
87+
((CoocListener)listener).setPivotIds(pivotIds);
88+
}
8289
this.filterQuery = (filterQuery == null) ? null : searcher.rewrite(filterQuery);
8390
}
8491

common/src/java/com/github/oeuvres/alix/lucene/terms/TermLexicon.java

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,12 @@
22

33
import org.apache.lucene.index.IndexReader;
44
import org.apache.lucene.index.MultiTerms;
5+
import org.apache.lucene.index.Term;
56
import org.apache.lucene.index.Terms;
67
import org.apache.lucene.index.TermsEnum;
8+
import org.apache.lucene.search.IndexSearcher;
9+
import org.apache.lucene.search.Query;
10+
import org.apache.lucene.search.QueryVisitor;
711
import org.apache.lucene.util.BytesRef;
812
import org.apache.lucene.util.BytesRefBuilder;
913
import org.apache.lucene.util.IntsRefBuilder;
@@ -13,11 +17,13 @@
1317
import org.apache.lucene.util.fst.Util;
1418

1519
import com.github.oeuvres.alix.util.IOUtil;
20+
import com.github.oeuvres.alix.util.IntList;
1621

1722
import java.io.BufferedOutputStream;
1823
import java.io.Closeable;
1924
import java.io.IOException;
2025
import java.io.OutputStream;
26+
import java.io.UncheckedIOException;
2127
import java.nio.ByteBuffer;
2228
import java.nio.ByteOrder;
2329
import java.nio.IntBuffer;
@@ -460,6 +466,37 @@ public static TermLexicon openOrBuild(final IndexReader reader, final Path sideD
460466
return open(sideDir, field);
461467
}
462468

469+
/**
470+
* Resolves the terms of a query to their termIds in this lexicon. No field filter is applied:
471+
* every term exposed by the query is looked up. Terms that the lexicon does not know (e.g.
472+
* because they were never indexed) are silently dropped.
473+
*
474+
* @param query a {@link Query} already rewritten with {@link IndexSearcher#rewrite(Query)}
475+
* @return distinct termIds, sorted ascending
476+
* @throws UncheckedIOException if a lexicon lookup raises an {@link IOException}
477+
*/
478+
public int[] termIds(final Query query)
479+
{
480+
final IntList ids = new IntList();
481+
query.visit(new QueryVisitor()
482+
{
483+
@Override
484+
public void consumeTerms(final Query q, final Term... ts)
485+
{
486+
for (final Term t : ts) {
487+
// no field filter here: the caller is expected to pass a query on this lexicon's field
488+
try {
489+
final int id = id(t.bytes());
490+
if (id >= 0) ids.push(id);
491+
} catch (IOException e) {
492+
throw new UncheckedIOException(e);
493+
}
494+
}
495+
}
496+
});
497+
return java.util.Arrays.stream(ids.toArray()).sorted().distinct().toArray(); // visit order is arbitrary and may repeat terms
498+
}
499+
463500
/**
464501
* Returns the number of entries in the lexicon, including the reserved id 0.
465502
* <p>

0 commit comments

Comments
 (0)