Skip to content

Commit aa5a3ea

Browse files
committed
TopTerms big refactor
1 parent 75745d1 commit aa5a3ea

9 files changed

Lines changed: 481 additions & 582 deletions

File tree

analysis/src/resources/com/github/oeuvres/alix/xml/alix.xsl

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,8 +199,7 @@
199199
<xsl:if test="/*/@cert">
200200
<alix:field name="cert" type="category" value="{normalize-space(/*/@cert)}"/>
201201
</xsl:if>
202-
<alix:field name="title" type="category" value="{normalize-space($doctitle)}"/>
203-
202+
<alix:field name="title" type="store" value="{normalize-space($doctitle)}"/>
204203
<xsl:for-each select="/tei:TEI/tei:teiHeader/tei:fileDesc/tei:notesStmt">
205204
<xsl:call-template name="note-bibl"/>
206205
</xsl:for-each>

common/src/java/com/github/oeuvres/alix/lucene/fluc/FlucString.java

Lines changed: 46 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,11 @@
33
import java.io.IOException;
44
import java.util.ArrayList;
55
import java.util.Arrays;
6+
import java.util.Comparator;
7+
import java.util.LinkedHashMap;
68
import java.util.List;
9+
import java.util.Map;
10+
import java.util.PriorityQueue;
711

812
import org.apache.lucene.index.FieldInfo;
913
import org.apache.lucene.index.IndexReader;
@@ -49,6 +53,7 @@
4953
*/
5054
public abstract class FlucString extends Fluc
5155
{
56+
private static final int TOP_LABELS = 20;
5257
/**
5358
* Sorted label dictionary: {@code sortedLabels[labelId]} is the string
5459
* label for that id. Order is {@link String#compareTo(String)};
@@ -61,23 +66,23 @@ public abstract class FlucString extends Fluc
6166
* {@code sortedLabels[labelId]}.
6267
*/
6368
protected final int[] labelId4docs;
64-
69+
6570
/**
6671
* A label and its corpus-level document count, used during construction.
6772
*
68-
* @param label string label as read from the inverted index
69-
* @param docs document frequency across all segments
73+
* @param label string label as read from the inverted index
74+
* @param docs document frequency across all segments
7075
*/
7176
private record LabelDocs(String label, int docs)
72-
implements Comparable<LabelDocs>
77+
implements Comparable<LabelDocs>
7378
{
7479
@Override
7580
public int compareTo(final LabelDocs other)
7681
{
7782
return this.label.compareTo(other.label);
7883
}
7984
}
80-
85+
8186
/**
8287
* Builds the sorted label dictionary and corpus doc counts from the
8388
* keyword inverted index via {@link MultiTerms}.
@@ -88,9 +93,9 @@ public int compareTo(final LabelDocs other)
8893
* @throws IllegalArgumentException if the field has no inverted index
8994
*/
9095
protected FlucString(
91-
final FieldInfo fi,
92-
final IndexReader reader
93-
) throws IOException {
96+
final FieldInfo fi,
97+
final IndexReader reader) throws IOException
98+
{
9499
super(fi, probeStoredViaPostings(reader, fi.name), reader.getDocCount(fi.name));
95100
final List<LabelDocs> list = new ArrayList<>();
96101
final Terms terms = MultiTerms.getTerms(reader, fi.name);
@@ -109,8 +114,29 @@ protected FlucString(
109114
sortedLabels[i] = sorted[i].label();
110115
labelId4docs[i] = sorted[i].docs();
111116
}
117+
118+
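// Publish the TOP_LABELS most frequent labels (by corpus doc count, descending) under description "topLabels".
// NOTE(review): PriorityQueue rejects an initial capacity below 1, so k == 0 (a field with no labels) would throw here — confirm or guard.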
119+
final int k = Math.min(TOP_LABELS, sortedLabels.length);
120+
final PriorityQueue<Long> heap = new PriorityQueue<>(k);
121+
for (int id = 0; id < labelId4docs.length; id++) {
122+
final long packed = ((long) labelId4docs[id] << 32) | (id & 0xFFFFFFFFL);
123+
if (heap.size() < k) {
124+
heap.add(packed);
125+
} else if (packed > heap.peek()) {
126+
heap.poll();
127+
heap.add(packed);
128+
}
129+
}
130+
final Long[] toSort = heap.toArray(new Long[0]);
131+
Arrays.sort(toSort, Comparator.reverseOrder());
132+
final Map<String, Integer> top = new LinkedHashMap<>(k);
133+
for (Long packed : toSort) {
134+
final int id = (int) (packed & 0xFFFFFFFFL);
135+
top.put(sortedLabels[id], (int) (packed >>> 32));
136+
}
137+
description.put("topLabels", top);
112138
}
113-
139+
114140
/**
115141
* Number of distinct labels in this field.
116142
*
@@ -120,7 +146,7 @@ public int labelCount()
120146
{
121147
return sortedLabels.length;
122148
}
123-
149+
124150
/**
125151
* String label for a labelId.
126152
*
@@ -131,7 +157,7 @@ public String label(final int labelId)
131157
{
132158
return sortedLabels[labelId];
133159
}
134-
160+
135161
/**
136162
* LabelId for a string label, or {@code -1} if not found.
137163
* Uses binary search on the sorted dictionary.
@@ -155,7 +181,7 @@ public int labelId(final String label)
155181
* @return first labelId, or {@code -1}
156182
*/
157183
abstract public int docLabel(final int docId);
158-
184+
159185
/**
160186
* Full-corpus document count for a labelId.
161187
*
@@ -166,7 +192,7 @@ public int docs(final int labelId)
166192
{
167193
return labelId4docs[labelId];
168194
}
169-
195+
170196
/**
171197
* Full-corpus document count by labelId.
172198
* Returns a defensive copy of the precomputed array.
@@ -178,7 +204,7 @@ public int[] countByLabel()
178204
{
179205
return labelId4docs.clone();
180206
}
181-
207+
182208
/**
183209
* Filtered document count by labelId.
184210
* Implemented by subclasses, which hold the per-document value vectors.
@@ -188,7 +214,7 @@ public int[] countByLabel()
188214
* @throws IOException on Lucene I/O errors
189215
*/
190216
public abstract int[] countByLabel(BitSet docFilter) throws IOException;
191-
217+
192218
/**
193219
* For each labelId, the rank in {@code topDocs} of its first
194220
* representative document, or {@link Integer#MIN_VALUE} if absent.
@@ -198,8 +224,10 @@ public int[] countByLabel()
198224
* @return array of length {@link #labelCount()}, indexed by labelId
199225
*/
200226
public abstract int[] nos(TopDocs topDocs);
201-
227+
202228
@Override
203-
public void close() { }
204-
229+
public void close()
230+
{
231+
}
232+
205233
}

common/src/java/com/github/oeuvres/alix/lucene/spans/CoocListener.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import com.github.oeuvres.alix.lucene.terms.FieldStats;
88
import com.github.oeuvres.alix.lucene.terms.TermRail;
99
import com.github.oeuvres.alix.lucene.terms.TopTerms;
10-
import com.github.oeuvres.alix.lucene.terms.TopTerms.FocusBuffers;
10+
import com.github.oeuvres.alix.lucene.terms.TopTerms.Buffers;
1111

1212
/**
1313
* {@link SpanListener} that accumulates per-term cooccurrence counts in a fixed-width window around
@@ -82,7 +82,7 @@ public final class CoocListener implements SpanListener
8282
private final BitSet termSeen;
8383

8484
/** Bound focus buffers; {@code null} until {@link #bindTo(Buffers)} is called. */
85-
private TopTerms.FocusBuffers buffers;
85+
private Buffers buffers;
8686

8787
/** Whether the current document contributed at least one cooc position. */
8888
private boolean docContributed;
@@ -133,7 +133,7 @@ public CoocListener(
133133
* @throws IllegalArgumentException if buffer lengths do not match
134134
* {@code fieldStats.vocabSize()}
135135
*/
136-
public void bindTo(final FocusBuffers buffers)
136+
public void bindTo(final Buffers buffers)
137137
{
138138
Objects.requireNonNull(buffers, "buffers");
139139
final int vocab = fieldStats.vocabSize();

common/src/java/com/github/oeuvres/alix/lucene/spans/SpanWalker.java

Lines changed: 49 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -17,48 +17,57 @@
1717
import org.apache.lucene.search.Weight;
1818

1919
/**
20-
* Streams the matches of a {@link SpanQuery}, optionally intersected with a non-scoring filter, to a {@link SpanListener}, in natural index order.
20+
* Streams the matches of a {@link SpanQuery}, optionally intersected with a non-scoring filter, to
21+
* a {@link SpanListener}, in natural index order.
2122
*
2223
* <p>
23-
* No matches are retained in memory; per-match state is owned by the listener. If the index is sorted, natural order is the index sort order.
24+
* No matches are retained in memory; per-match state is owned by the listener. If the index is
25+
* sorted, natural order is the index sort order.
2426
* </p>
2527
*
2628
* <h2>Pagination</h2>
2729
*
2830
* <p>
29-
* {@link #walk(int)} accepts a global docId cursor (inclusive). The first call passes {@code 0}; subsequent calls pass the value returned by the previous call, which is the first unprocessed docId, or {@code -1} when the walk has completed.
31+
* {@link #walk(int)} accepts a global docId cursor (inclusive). The first call passes {@code 0};
32+
* subsequent calls pass the value returned by the previous call, which is the first unprocessed
33+
* docId, or {@code -1} when the walk has completed.
3034
* </p>
3135
*
3236
* <h2>Filter query</h2>
3337
*
3438
* <p>
35-
* Filter constraints (dates, tags…) belong in {@code filterQuery}. The walker leapfrogs the span and filter iterators per leaf, advancing whichever lags. With an {@link IndexSearcher} backed by an {@code LRUQueryCache}, filter results are cached across requests.
39+
* Filter constraints (dates, tags…) belong in {@code filterQuery}. The walker leapfrogs the span
40+
* and filter iterators per leaf, advancing whichever lags. With an {@link IndexSearcher} backed by
41+
* an {@code LRUQueryCache}, filter results are cached across requests.
3642
* </p>
3743
*
3844
* <h2>Document count</h2>
3945
*
4046
* <p>
41-
* {@link #hits()} performs a separate full-count scan and is independent of {@link #walk(int)}. Call it on demand only.
47+
* {@link #hits()} performs a separate full-count scan and is independent of {@link #walk(int)}.
48+
* Call it on demand only.
4249
* </p>
4350
*/
4451
public final class SpanWalker
4552
{
4653
private static final int INITIAL_LEAF_CAPACITY = 8;
47-
54+
4855
private final IndexSearcher searcher;
4956
private final SpanQuery spanQuery;
5057
private final Query filterQuery;
5158
private final SpanListener listener;
52-
59+
5360
/**
54-
* Creates a walker bound to a query, an optional filter, and a listener. Both queries are rewritten once, here.
61+
* Creates a walker bound to a query, an optional filter, and a listener. Both queries are
62+
* rewritten once, here.
5563
*
5664
* @param searcher used for query rewrite and leaf access
5765
* @param spanQuery span query to enumerate
5866
* @param filterQuery non-scoring filter, or {@code null}
5967
* @param listener consumer of streamed matches
6068
* @throws IOException on rewrite failure
61-
* @throws NullPointerException if {@code searcher}, {@code spanQuery}, or {@code listener} is {@code null}
69+
* @throws NullPointerException if {@code searcher}, {@code spanQuery}, or {@code listener} is
70+
* {@code null}
6271
*/
6372
public SpanWalker(
6473
final IndexSearcher searcher,
@@ -72,9 +81,10 @@ public SpanWalker(
7281
this.spanQuery = (SpanQuery) searcher.rewrite(spanQuery);
7382
this.filterQuery = (filterQuery == null) ? null : searcher.rewrite(filterQuery);
7483
}
75-
84+
7685
/**
77-
* Counts the documents matched by the span query intersected with the optional filter. Performs a full scan; does not interact with {@link #walk(int)}.
86+
* Counts the documents matched by the span query intersected with the optional filter. Performs
87+
* a full scan; does not interact with {@link #walk(int)}.
7888
*
7989
* @return matching document count
8090
* @throws IOException on I/O failure
@@ -89,16 +99,22 @@ public int hits() throws IOException
8999
.build();
90100
return searcher.count(countQuery);
91101
}
92-
102+
93103
/**
94104
* Streams matches to the listener starting at the given global docId (inclusive).
95105
*
96106
* <p>
97-
* Each matching document is visited once, in natural order; for each document the matches are delivered in ascending start-position order. The listener may stop the walk at any document boundary by returning {@code false} from {@link SpanListener#wantsMoreDocs()}.
107+
* Each matching document is visited once, in natural order; for each document the matches are
108+
* delivered in ascending start-position order. The listener may stop the walk at any document
109+
* boundary by returning {@code false} from {@link SpanListener#wantsMoreDocs()}.
98110
* </p>
99111
*
100112
* <p>
101-
* Per match the walker performs the canonical {@link SpanMatch} lifecycle: {@link SpanMatch#reset()}, {@link SpanMatch#range(int, int)} from {@link Spans#startPosition()} / {@link Spans#endPosition()}, {@link SpanMatch#ord(int)}, then {@link Spans#collect(org.apache.lucene.queries.spans.SpanCollector)} and {@link SpanMatch#sort()}.
113+
* Per match the walker performs the canonical {@link SpanMatch} lifecycle:
114+
* {@link SpanMatch#reset()}, {@link SpanMatch#range(int, int)} from
115+
* {@link Spans#startPosition()} / {@link Spans#endPosition()}, {@link SpanMatch#ord(int)}, then
116+
* {@link Spans#collect(org.apache.lucene.queries.spans.SpanCollector)} and
117+
* {@link SpanMatch#sort()}.
102118
* </p>
103119
*
104120
* @param docStart first global docId to visit, inclusive; pass {@code 0} on the first call
@@ -112,46 +128,51 @@ public int walk(final int docStart) throws IOException
112128
? null
113129
: searcher.createWeight(filterQuery, ScoreMode.COMPLETE_NO_SCORES, 1f);
114130
final SpanMatch match = new SpanMatch(INITIAL_LEAF_CAPACITY);
115-
131+
116132
int nextCursor = -1;
117133
boolean exhausted = true;
118134
listener.start();
119-
135+
120136
outer: for (final LeafReaderContext ctx : searcher.getLeafContexts()) {
121-
if (ctx.docBase + ctx.reader().maxDoc() <= docStart) continue;
122-
137+
if (ctx.docBase + ctx.reader().maxDoc() <= docStart)
138+
continue;
139+
123140
final Spans spans = spanWeight.getSpans(ctx, SpanWeight.Postings.OFFSETS);
124-
if (spans == null) continue;
125-
141+
if (spans == null)
142+
continue;
143+
126144
DocIdSetIterator filterIt = null;
127145
if (filterWeight != null) {
128146
final Scorer filterScorer = filterWeight.scorer(ctx);
129-
if (filterScorer == null) continue;
147+
if (filterScorer == null)
148+
continue;
130149
filterIt = filterScorer.iterator();
131150
}
132-
151+
133152
final int localStart = Math.max(0, docStart - ctx.docBase);
134153
int localDocId = (localStart == 0) ? spans.nextDoc() : spans.advance(localStart);
135154
int filterDoc = (filterIt == null)
136155
? DocIdSetIterator.NO_MORE_DOCS
137156
: (localStart == 0 ? filterIt.nextDoc() : filterIt.advance(localStart));
138-
157+
139158
while (localDocId != DocIdSetIterator.NO_MORE_DOCS) {
140159
if (filterIt != null) {
141-
if (filterDoc < localDocId) filterDoc = filterIt.advance(localDocId);
142-
if (filterDoc == DocIdSetIterator.NO_MORE_DOCS) break;
160+
if (filterDoc < localDocId)
161+
filterDoc = filterIt.advance(localDocId);
162+
if (filterDoc == DocIdSetIterator.NO_MORE_DOCS)
163+
break;
143164
if (filterDoc > localDocId) {
144165
localDocId = spans.advance(filterDoc);
145166
continue;
146167
}
147168
}
148-
169+
149170
if (!listener.wantsMoreDocs()) {
150171
nextCursor = ctx.docBase + localDocId;
151172
exhausted = false;
152173
break outer;
153174
}
154-
175+
155176
listener.startDoc(ctx.docBase + localDocId);
156177
int spanCount = 0;
157178
boolean wantsMore = true;
@@ -170,7 +191,7 @@ public int walk(final int docStart) throws IOException
170191
localDocId = spans.nextDoc();
171192
}
172193
}
173-
194+
174195
listener.end(exhausted);
175196
return nextCursor;
176197
}

0 commit comments

Comments
 (0)