1+ package com .github .oeuvres .alix .lucene ;
2+
3+ import java .io .IOException ;
4+ import java .util .Arrays ;
5+
6+ import org .apache .lucene .index .FieldInfo ;
7+ import org .apache .lucene .index .IndexReader ;
8+ import org .apache .lucene .index .LeafReaderContext ;
9+ import org .apache .lucene .index .SortedDocValues ;
10+ import org .apache .lucene .search .DocIdSetIterator ;
11+ import org .apache .lucene .search .SortField ;
12+ import org .apache .lucene .search .TopDocs ;
13+ import org .apache .lucene .util .BitSet ;
14+ import org .apache .lucene .util .Bits ;
15+ import org .apache .lucene .util .BytesRef ;
16+
17+ /**
18+ * Single-valued string field: per-document label lookup, filtered
19+ * aggregation, and sort support.
20+ *
21+ * <p>
22+ * Extends {@link FlucString}, reusing its sorted label dictionary and
23+ * corpus doc counts, and materializes a flat per-document label vector
24+ * from {@code SortedDocValues} in one O(docs) pass.
25+ * {@code docId4labelId[docId]} stores the labelId for that document,
26+ * or {@code -1} if the document carries no value for this field.
27+ * </p>
28+ *
29+ * <p>
30+ * Because each document carries at most one label, this class also
31+ * exposes a {@link SortField} for use in Lucene result ordering —
32+ * not possible with multi-valued {@link FlucFacet}.
33+ * </p>
34+ *
35+ * <p>
36+ * Registered eagerly by {@link Fluc#inferFields} when
37+ * {@code DocValuesType.SORTED} is detected.
38+ * </p>
39+ *
40+ * <h2>Thread safety</h2>
41+ * <p>
42+ * All state is immutable after construction. Methods are safe for
43+ * concurrent access without synchronization.
44+ * </p>
45+ */
46+ public final class FlucCategory extends FlucString
47+ {
48+ /**
49+ * Per-document label id: {@code docId4labelId[docId] = labelId},
50+ * or {@code -1} if the document carries no value for this field.
51+ */
52+ private final int [] docId4labelId ;
53+
54+ /**
55+ * Delegates dictionary construction to {@link FlucString}, then
56+ * materializes the per-document label vector from
57+ * {@code SortedDocValues} in one O(docs) pass.
58+ *
59+ * <p>
60+ * For each segment leaf, a local ordinal-to-labelId mapping is built
61+ * by looking up each local ordinal string in the globally sorted
62+ * {@link #sortedLabels} array. This mapping is then applied while
63+ * iterating live documents.
64+ * </p>
65+ *
66+ * @param reader frozen index reader
67+ * @param fi field metadata
68+ * @throws IOException on Lucene I/O errors
69+ * @throws IllegalArgumentException if the field is not a
70+ * {@code SortedDocValues} field
71+ */
72+ public FlucCategory (
73+ final FieldInfo fi ,
74+ final IndexReader reader
75+ ) throws IOException {
76+ super (reader , fi );
77+ final int [] labelIds = new int [reader .maxDoc ()];
78+ Arrays .fill (labelIds , -1 );
79+ for (LeafReaderContext ctx : reader .leaves ()) {
80+ final SortedDocValues sdv = ctx .reader ().getSortedDocValues (fi .name );
81+ if (sdv == null ) continue ;
82+ final Bits liveDocs = ctx .reader ().getLiveDocs ();
83+ final int docBase = ctx .docBase ;
84+ final int localCount = sdv .getValueCount ();
85+ final int [] localOrd2labelId = new int [localCount ];
86+ for (int ord = 0 ; ord < localCount ; ord ++) {
87+ final BytesRef bytes = sdv .lookupOrd (ord );
88+ localOrd2labelId [ord ] = Arrays .binarySearch (
89+ sortedLabels , bytes .utf8ToString ());
90+ }
91+ int docLeaf ;
92+ while ((docLeaf = sdv .nextDoc ()) != DocIdSetIterator .NO_MORE_DOCS ) {
93+ if (liveDocs != null && !liveDocs .get (docLeaf )) continue ;
94+ labelIds [docBase + docLeaf ] = localOrd2labelId [sdv .ordValue ()];
95+ }
96+ }
97+ this .docId4labelId = labelIds ;
98+ }
99+
100+ /**
101+ * LabelId for one document, or {@code -1} if the document carries
102+ * no value for this field.
103+ *
104+ * @param docId internal Lucene document id
105+ * @return labelId ≥ 0, or {@code -1}
106+ */
107+ public int docLabel (final int docId )
108+ {
109+ return docId4labelId [docId ];
110+ }
111+
112+ /**
113+ * String label for one document, or {@code null} if the document
114+ * carries no value for this field.
115+ *
116+ * @param docId internal Lucene document id
117+ * @return string label, or {@code null}
118+ */
119+ public String docLabelString (final int docId )
120+ {
121+ final int labelId = docId4labelId [docId ];
122+ return labelId < 0 ? null : sortedLabels [labelId ];
123+ }
124+
125+ /**
126+ * Filtered document count by labelId.
127+ * Element {@code i} holds the count of documents in {@code docFilter}
128+ * whose label equals {@code label(i)}.
129+ *
130+ * @param docFilter set of Lucene internal document ids
131+ * @return counts array of length {@link #labelCount()}
132+ */
133+ @ Override
134+ public int [] countByLabel (final BitSet docFilter )
135+ {
136+ final int [] counts = new int [labelCount ()];
137+ for (int docId = docFilter .nextSetBit (0 );
138+ docId != DocIdSetIterator .NO_MORE_DOCS ;
139+ docId = docFilter .nextSetBit (docId + 1 )) {
140+ final int labelId = docId4labelId [docId ];
141+ if (labelId < 0 ) continue ;
142+ counts [labelId ]++;
143+ }
144+ return counts ;
145+ }
146+
147+ /**
148+ * For each labelId, the rank in {@code topDocs} of its first
149+ * representative document, or {@link Integer#MIN_VALUE} if absent.
150+ * Useful for navigation: jump to the first result for a given category.
151+ *
152+ * @param topDocs ordered search results
153+ * @return array of length {@link #labelCount()}, indexed by labelId
154+ */
155+ @ Override
156+ public int [] nos (final TopDocs topDocs )
157+ {
158+ final int [] nos = new int [labelCount ()];
159+ Arrays .fill (nos , Integer .MIN_VALUE );
160+ for (int n = 0 ; n < topDocs .scoreDocs .length ; n ++) {
161+ final int labelId = docId4labelId [topDocs .scoreDocs [n ].doc ];
162+ if (labelId < 0 ) continue ;
163+ if (nos [labelId ] != Integer .MIN_VALUE ) continue ;
164+ nos [labelId ] = n ;
165+ }
166+ return nos ;
167+ }
168+
169+ /**
170+ * A {@link SortField} for ordering search results by this field.
171+ * Only possible for single-valued fields; not available on
172+ * {@link FlucFacet}.
173+ *
174+ * @param reverse {@code true} for descending order
175+ * @return sort field backed by the {@code SortedDocValues} column
176+ */
177+ public SortField sortField (final boolean reverse )
178+ {
179+ return new SortField (name (), SortField .Type .STRING , reverse );
180+ }
181+ }
0 commit comments