Skip to content

Commit 81cca1a

Browse files
committed
Facet and Category
1 parent 94b9dbe commit 81cca1a

5 files changed

Lines changed: 406 additions & 14 deletions

File tree

common/src/java/com/github/oeuvres/alix/lucene/Fluc.java

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,7 @@
1414
import org.apache.lucene.index.FieldInfo;
1515
import org.apache.lucene.index.IndexOptions;
1616
import org.apache.lucene.index.IndexReader;
17-
import org.apache.lucene.index.LeafReader;
1817
import org.apache.lucene.index.LeafReaderContext;
19-
import org.apache.lucene.index.PostingsEnum;
20-
import org.apache.lucene.index.Terms;
21-
import org.apache.lucene.index.TermsEnum;
22-
import org.apache.lucene.search.DocIdSetIterator;
2318

2419
/**
2520
* Field of a Lucene index, with type-specific cached resources.
@@ -247,18 +242,17 @@ public static Map<String, Fluc> inferFields(
247242
final Map<String, Fluc> map = new TreeMap<>();
248243
for (FieldInfo fi : infoMap.values()) {
249244
final boolean isIndexed = fi.getIndexOptions() != IndexOptions.NONE;
250-
final boolean hasDocValues = fi.getDocValuesType() != DocValuesType.NONE;
251245
final boolean hasPoints = fi.getPointDimensionCount() > 0;
252246
final boolean hasPositions = fi.getIndexOptions().compareTo(
253247
IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
254248

255249

256250
final Fluc fluc;
257251
if (hasPositions) {
258-
fluc = new FlucText(reader, fi, sideDir);
252+
fluc = new FlucText(fi, reader, sideDir);
259253
}
260254
else if (hasPoints) {
261-
fluc = new FlucNum(reader, fi);
255+
fluc = new FlucNum(fi, reader);
262256
}
263257
else if (fi.getDocValuesType() == DocValuesType.SORTED) {
264258
fluc = new FlucCategory(fi, reader);
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
package com.github.oeuvres.alix.lucene;
2+
3+
import java.io.IOException;
4+
import java.util.Arrays;
5+
6+
import org.apache.lucene.index.FieldInfo;
7+
import org.apache.lucene.index.IndexReader;
8+
import org.apache.lucene.index.LeafReaderContext;
9+
import org.apache.lucene.index.SortedDocValues;
10+
import org.apache.lucene.search.DocIdSetIterator;
11+
import org.apache.lucene.search.SortField;
12+
import org.apache.lucene.search.TopDocs;
13+
import org.apache.lucene.util.BitSet;
14+
import org.apache.lucene.util.Bits;
15+
import org.apache.lucene.util.BytesRef;
16+
17+
/**
18+
* Single-valued string field: per-document label lookup, filtered
19+
* aggregation, and sort support.
20+
*
21+
* <p>
22+
* Extends {@link FlucString}, reusing its sorted label dictionary and
23+
* corpus doc counts, and materializes a flat per-document label vector
24+
* from {@code SortedDocValues} in one O(docs) pass.
25+
* {@code docId4labelId[docId]} stores the labelId for that document,
26+
* or {@code -1} if the document carries no value for this field.
27+
* </p>
28+
*
29+
* <p>
30+
* Because each document carries at most one label, this class also
31+
* exposes a {@link SortField} for use in Lucene result ordering —
32+
* not possible with multi-valued {@link FlucFacet}.
33+
* </p>
34+
*
35+
* <p>
36+
* Registered eagerly by {@link Fluc#inferFields} when
37+
* {@code DocValuesType.SORTED} is detected.
38+
* </p>
39+
*
40+
* <h2>Thread safety</h2>
41+
* <p>
42+
* All state is immutable after construction. Methods are safe for
43+
* concurrent access without synchronization.
44+
* </p>
45+
*/
46+
public final class FlucCategory extends FlucString
47+
{
48+
/**
49+
* Per-document label id: {@code docId4labelId[docId] = labelId},
50+
* or {@code -1} if the document carries no value for this field.
51+
*/
52+
private final int[] docId4labelId;
53+
54+
/**
55+
* Delegates dictionary construction to {@link FlucString}, then
56+
* materializes the per-document label vector from
57+
* {@code SortedDocValues} in one O(docs) pass.
58+
*
59+
* <p>
60+
* For each segment leaf, a local ordinal-to-labelId mapping is built
61+
* by looking up each local ordinal string in the globally sorted
62+
* {@link #sortedLabels} array. This mapping is then applied while
63+
* iterating live documents.
64+
* </p>
65+
*
66+
* @param reader frozen index reader
67+
* @param fi field metadata
68+
* @throws IOException on Lucene I/O errors
69+
* @throws IllegalArgumentException if the field is not a
70+
* {@code SortedDocValues} field
71+
*/
72+
public FlucCategory(
73+
final FieldInfo fi,
74+
final IndexReader reader
75+
) throws IOException {
76+
super(reader, fi);
77+
final int[] labelIds = new int[reader.maxDoc()];
78+
Arrays.fill(labelIds, -1);
79+
for (LeafReaderContext ctx : reader.leaves()) {
80+
final SortedDocValues sdv = ctx.reader().getSortedDocValues(fi.name);
81+
if (sdv == null) continue;
82+
final Bits liveDocs = ctx.reader().getLiveDocs();
83+
final int docBase = ctx.docBase;
84+
final int localCount = sdv.getValueCount();
85+
final int[] localOrd2labelId = new int[localCount];
86+
for (int ord = 0; ord < localCount; ord++) {
87+
final BytesRef bytes = sdv.lookupOrd(ord);
88+
localOrd2labelId[ord] = Arrays.binarySearch(
89+
sortedLabels, bytes.utf8ToString());
90+
}
91+
int docLeaf;
92+
while ((docLeaf = sdv.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
93+
if (liveDocs != null && !liveDocs.get(docLeaf)) continue;
94+
labelIds[docBase + docLeaf] = localOrd2labelId[sdv.ordValue()];
95+
}
96+
}
97+
this.docId4labelId = labelIds;
98+
}
99+
100+
/**
101+
* LabelId for one document, or {@code -1} if the document carries
102+
* no value for this field.
103+
*
104+
* @param docId internal Lucene document id
105+
* @return labelId &ge; 0, or {@code -1}
106+
*/
107+
public int docLabel(final int docId)
108+
{
109+
return docId4labelId[docId];
110+
}
111+
112+
/**
113+
* String label for one document, or {@code null} if the document
114+
* carries no value for this field.
115+
*
116+
* @param docId internal Lucene document id
117+
* @return string label, or {@code null}
118+
*/
119+
public String docLabelString(final int docId)
120+
{
121+
final int labelId = docId4labelId[docId];
122+
return labelId < 0 ? null : sortedLabels[labelId];
123+
}
124+
125+
/**
126+
* Filtered document count by labelId.
127+
* Element {@code i} holds the count of documents in {@code docFilter}
128+
* whose label equals {@code label(i)}.
129+
*
130+
* @param docFilter set of Lucene internal document ids
131+
* @return counts array of length {@link #labelCount()}
132+
*/
133+
@Override
134+
public int[] countByLabel(final BitSet docFilter)
135+
{
136+
final int[] counts = new int[labelCount()];
137+
for (int docId = docFilter.nextSetBit(0);
138+
docId != DocIdSetIterator.NO_MORE_DOCS;
139+
docId = docFilter.nextSetBit(docId + 1)) {
140+
final int labelId = docId4labelId[docId];
141+
if (labelId < 0) continue;
142+
counts[labelId]++;
143+
}
144+
return counts;
145+
}
146+
147+
/**
148+
* For each labelId, the rank in {@code topDocs} of its first
149+
* representative document, or {@link Integer#MIN_VALUE} if absent.
150+
* Useful for navigation: jump to the first result for a given category.
151+
*
152+
* @param topDocs ordered search results
153+
* @return array of length {@link #labelCount()}, indexed by labelId
154+
*/
155+
@Override
156+
public int[] nos(final TopDocs topDocs)
157+
{
158+
final int[] nos = new int[labelCount()];
159+
Arrays.fill(nos, Integer.MIN_VALUE);
160+
for (int n = 0; n < topDocs.scoreDocs.length; n++) {
161+
final int labelId = docId4labelId[topDocs.scoreDocs[n].doc];
162+
if (labelId < 0) continue;
163+
if (nos[labelId] != Integer.MIN_VALUE) continue;
164+
nos[labelId] = n;
165+
}
166+
return nos;
167+
}
168+
169+
/**
170+
* A {@link SortField} for ordering search results by this field.
171+
* Only possible for single-valued fields; not available on
172+
* {@link FlucFacet}.
173+
*
174+
* @param reverse {@code true} for descending order
175+
* @return sort field backed by the {@code SortedDocValues} column
176+
*/
177+
public SortField sortField(final boolean reverse)
178+
{
179+
return new SortField(name(), SortField.Type.STRING, reverse);
180+
}
181+
}

0 commit comments

Comments
 (0)