oeuvres
diff --git a/‎analysis/src/resources/com/github/oeuvres/alix/xml/alix.xsl‎
Lines changed: 1 addition & 2 deletions b/‎analysis/src/resources/com/github/oeuvres/alix/xml/alix.xsl‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎common/src/java/com/github/oeuvres/alix/lucene/fluc/FlucString.java‎
Lines changed: 46 additions & 18 deletions b/‎common/src/java/com/github/oeuvres/alix/lucene/fluc/FlucString.java‎
Lines changed: 46 additions & 18 deletions
diff --git a/‎common/src/java/com/github/oeuvres/alix/lucene/spans/CoocListener.java‎
Lines changed: 3 additions & 3 deletions b/‎common/src/java/com/github/oeuvres/alix/lucene/spans/CoocListener.java‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎common/src/java/com/github/oeuvres/alix/lucene/spans/SpanWalker.java‎
Lines changed: 49 additions & 28 deletions b/‎common/src/java/com/github/oeuvres/alix/lucene/spans/SpanWalker.java‎
Lines changed: 49 additions & 28 deletions
@@ -199,8 +199,7 @@
     <xsl:if test="/*/@cert">
       <alix:field name="cert" type="category" value="{normalize-space(/*/@cert)}"/>
     </xsl:if>
-    <alix:field name="title" type="category" value="{normalize-space($doctitle)}"/>
-
+    <alix:field name="title" type="store" value="{normalize-space($doctitle)}"/>
     <xsl:for-each select="/tei:TEI/tei:teiHeader/tei:fileDesc/tei:notesStmt">
       <xsl:call-template name="note-bibl"/>
     </xsl:for-each>
 
@@ -3,7 +3,11 @@
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Comparator;
+import java.util.LinkedHashMap;
 import java.util.List;
+import java.util.Map;
+import java.util.PriorityQueue;
 
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexReader;
@@ -49,6 +53,7 @@
  */
 public abstract class FlucString extends Fluc
 {
+    private static final int TOP_LABELS = 20;
     /**
      * Sorted label dictionary: {@code sortedLabels[labelId]} is the string
      * label for that id. Order is {@link String#compareTo(String)};
@@ -61,23 +66,23 @@ public abstract class FlucString extends Fluc
      * {@code sortedLabels[labelId]}.
      */
     protected final int[] labelId4docs;
-
+    
     /**
      * A label and its corpus-level document count, used during construction.
      *
-     * @param label  string label as read from the inverted index
-     * @param docs   document frequency across all segments
+     * @param label string label as read from the inverted index
+     * @param docs  document frequency across all segments
      */
     private record LabelDocs(String label, int docs)
-        implements Comparable<LabelDocs>
+            implements Comparable<LabelDocs>
     {
         @Override
         public int compareTo(final LabelDocs other)
         {
             return this.label.compareTo(other.label);
         }
     }
-
+    
     /**
      * Builds the sorted label dictionary and corpus doc counts from the
      * keyword inverted index via {@link MultiTerms}.
@@ -88,9 +93,9 @@ public int compareTo(final LabelDocs other)
      * @throws IllegalArgumentException if the field has no inverted index
      */
     protected FlucString(
-        final FieldInfo fi,
-        final IndexReader reader
-    ) throws IOException {
+            final FieldInfo fi,
+            final IndexReader reader) throws IOException
+    {
         super(fi, probeStoredViaPostings(reader, fi.name), reader.getDocCount(fi.name));
         final List<LabelDocs> list = new ArrayList<>();
         final Terms terms = MultiTerms.getTerms(reader, fi.name);
@@ -109,8 +114,29 @@ protected FlucString(
             sortedLabels[i] = sorted[i].label();
             labelId4docs[i] = sorted[i].docs();
         }
+        
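+        // Keep the TOP_LABELS most frequent labels (descending document count) and publish them
+        // in the description under "topLabels". Each entry is packed as (docs << 32 | labelId)
+        // so that natural long ordering ranks by docs first, then by labelId.
+        // NOTE(review): PriorityQueue rejects an initial capacity below 1, so k == 0 (an empty
+        // label dictionary) would throw IllegalArgumentException here; confirm this cannot occur.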
+        final int k = Math.min(TOP_LABELS, sortedLabels.length);
+        final PriorityQueue<Long> heap = new PriorityQueue<>(k);
+        for (int id = 0; id < labelId4docs.length; id++) {
+            final long packed = ((long) labelId4docs[id] << 32) | (id & 0xFFFFFFFFL);
+            if (heap.size() < k) {
+                heap.add(packed);
+            } else if (packed > heap.peek()) {
+                heap.poll();
+                heap.add(packed);
+            }
+        }
+        final Long[] toSort = heap.toArray(new Long[0]);
+        Arrays.sort(toSort, Comparator.reverseOrder());
+        final Map<String, Integer> top = new LinkedHashMap<>(k);
+        for (Long packed : toSort) {
+            final int id = (int) (packed & 0xFFFFFFFFL);
+            top.put(sortedLabels[id], (int) (packed >>> 32));
+        }
+        description.put("topLabels", top);
     }
-
+    
     /**
      * Number of distinct labels in this field.
      *
@@ -120,7 +146,7 @@ public int labelCount()
     {
         return sortedLabels.length;
     }
-
+    
     /**
      * String label for a labelId.
      *
@@ -131,7 +157,7 @@ public String label(final int labelId)
     {
         return sortedLabels[labelId];
     }
-
+    
     /**
      * LabelId for a string label, or {@code -1} if not found.
      * Uses binary search on the sorted dictionary.
@@ -155,7 +181,7 @@ public int labelId(final String label)
      * @return first labelId, or {@code -1}
      */
     abstract public int docLabel(final int docId);
-
+    
     /**
      * Full-corpus document count for a labelId.
      *
@@ -166,7 +192,7 @@ public int docs(final int labelId)
     {
         return labelId4docs[labelId];
     }
-
+    
     /**
      * Full-corpus document count by labelId.
      * Returns a defensive copy of the precomputed array.
@@ -178,7 +204,7 @@ public int[] countByLabel()
     {
         return labelId4docs.clone();
     }
-
+    
     /**
      * Filtered document count by labelId.
      * Implemented by subclasses, which hold the per-document value vectors.
@@ -188,7 +214,7 @@ public int[] countByLabel()
      * @throws IOException on Lucene I/O errors
      */
     public abstract int[] countByLabel(BitSet docFilter) throws IOException;
-
+    
     /**
      * For each labelId, the rank in {@code topDocs} of its first
      * representative document, or {@link Integer#MIN_VALUE} if absent.
@@ -198,8 +224,10 @@ public int[] countByLabel()
      * @return array of length {@link #labelCount()}, indexed by labelId
      */
     public abstract int[] nos(TopDocs topDocs);
-
+    
     @Override
-    public void close() { }
-
+    public void close()
+    {
+    }
+    
 }
@@ -7,7 +7,7 @@
 import com.github.oeuvres.alix.lucene.terms.FieldStats;
 import com.github.oeuvres.alix.lucene.terms.TermRail;
 import com.github.oeuvres.alix.lucene.terms.TopTerms;
-import com.github.oeuvres.alix.lucene.terms.TopTerms.FocusBuffers;
+import com.github.oeuvres.alix.lucene.terms.TopTerms.Buffers;
 
 /**
  * {@link SpanListener} that accumulates per-term cooccurrence counts in a fixed-width window around
@@ -82,7 +82,7 @@ public final class CoocListener implements SpanListener
     private final BitSet termSeen;
 
     /** Bound focus buffers; {@code null} until {@link #bindTo(Buffers)} is called. */
-    private TopTerms.FocusBuffers buffers;
+    private Buffers buffers;
 
     /** Whether the current document contributed at least one cooc position. */
     private boolean docContributed;
@@ -133,7 +133,7 @@ public CoocListener(
      * @throws IllegalArgumentException if buffer lengths do not match
      *                                  {@code fieldStats.vocabSize()}
      */
-    public void bindTo(final FocusBuffers buffers)
+    public void bindTo(final Buffers buffers)
     {
         Objects.requireNonNull(buffers, "buffers");
         final int vocab = fieldStats.vocabSize();
 
@@ -17,48 +17,57 @@
 import org.apache.lucene.search.Weight;
 
 /**
- * Streams the matches of a {@link SpanQuery}, optionally intersected with a non-scoring filter, to a {@link SpanListener}, in natural index order.
+ * Streams the matches of a {@link SpanQuery}, optionally intersected with a non-scoring filter, to
+ * a {@link SpanListener}, in natural index order.
  *
  * <p>
- * No matches are retained in memory; per-match state is owned by the listener. If the index is sorted, natural order is the index sort order.
+ * No matches are retained in memory; per-match state is owned by the listener. If the index is
+ * sorted, natural order is the index sort order.
  * </p>
  *
  * <h2>Pagination</h2>
  *
  * <p>
- * {@link #walk(int)} accepts a global docId cursor (inclusive). The first call passes {@code 0}; subsequent calls pass the value returned by the previous call, which is the first unprocessed docId, or {@code -1} when the walk has completed.
+ * {@link #walk(int)} accepts a global docId cursor (inclusive). The first call passes {@code 0};
+ * subsequent calls pass the value returned by the previous call, which is the first unprocessed
+ * docId, or {@code -1} when the walk has completed.
  * </p>
  *
  * <h2>Filter query</h2>
  *
  * <p>
- * Filter constraints (dates, tags…) belong in {@code filterQuery}. The walker leapfrogs the span and filter iterators per leaf, advancing whichever lags. With an {@link IndexSearcher} backed by an {@code LRUQueryCache}, filter results are cached across requests.
+ * Filter constraints (dates, tags…) belong in {@code filterQuery}. The walker leapfrogs the span
+ * and filter iterators per leaf, advancing whichever lags. With an {@link IndexSearcher} backed by
+ * an {@code LRUQueryCache}, filter results are cached across requests.
  * </p>
  *
  * <h2>Document count</h2>
  *
  * <p>
- * {@link #hits()} performs a separate full-count scan and is independent of {@link #walk(int)}. Call it on demand only.
+ * {@link #hits()} performs a separate full-count scan and is independent of {@link #walk(int)}.
+ * Call it on demand only.
  * </p>
  */
 public final class SpanWalker
 {
     private static final int INITIAL_LEAF_CAPACITY = 8;
-
+    
     private final IndexSearcher searcher;
     private final SpanQuery spanQuery;
     private final Query filterQuery;
     private final SpanListener listener;
-
+    
     /**
-     * Creates a walker bound to a query, an optional filter, and a listener. Both queries are rewritten once, here.
+     * Creates a walker bound to a query, an optional filter, and a listener. Both queries are
+     * rewritten once, here.
      *
      * @param searcher    used for query rewrite and leaf access
      * @param spanQuery   span query to enumerate
      * @param filterQuery non-scoring filter, or {@code null}
      * @param listener    consumer of streamed matches
      * @throws IOException          on rewrite failure
-     * @throws NullPointerException if {@code searcher}, {@code spanQuery}, or {@code listener} is {@code null}
+     * @throws NullPointerException if {@code searcher}, {@code spanQuery}, or {@code listener} is
+     *                              {@code null}
      */
     public SpanWalker(
             final IndexSearcher searcher,
@@ -72,9 +81,10 @@ public SpanWalker(
         this.spanQuery = (SpanQuery) searcher.rewrite(spanQuery);
         this.filterQuery = (filterQuery == null) ? null : searcher.rewrite(filterQuery);
     }
-
+    
     /**
-     * Counts the documents matched by the span query intersected with the optional filter. Performs a full scan; does not interact with {@link #walk(int)}.
+     * Counts the documents matched by the span query intersected with the optional filter. Performs
+     * a full scan; does not interact with {@link #walk(int)}.
      *
      * @return matching document count
      * @throws IOException on I/O failure
@@ -89,16 +99,22 @@ public int hits() throws IOException
                         .build();
         return searcher.count(countQuery);
     }
-
+    
     /**
      * Streams matches to the listener starting at the given global docId (inclusive).
      *
      * <p>
-     * Each matching document is visited once, in natural order; for each document the matches are delivered in ascending start-position order. The listener may stop the walk at any document boundary by returning {@code false} from {@link SpanListener#wantsMoreDocs()}.
+     * Each matching document is visited once, in natural order; for each document the matches are
+     * delivered in ascending start-position order. The listener may stop the walk at any document
+     * boundary by returning {@code false} from {@link SpanListener#wantsMoreDocs()}.
      * </p>
      *
      * <p>
-     * Per match the walker performs the canonical {@link SpanMatch} lifecycle: {@link SpanMatch#reset()}, {@link SpanMatch#range(int, int)} from {@link Spans#startPosition()} / {@link Spans#endPosition()}, {@link SpanMatch#ord(int)}, then {@link Spans#collect(org.apache.lucene.queries.spans.SpanCollector)} and {@link SpanMatch#sort()}.
+     * Per match the walker performs the canonical {@link SpanMatch} lifecycle:
+     * {@link SpanMatch#reset()}, {@link SpanMatch#range(int, int)} from
+     * {@link Spans#startPosition()} / {@link Spans#endPosition()}, {@link SpanMatch#ord(int)}, then
+     * {@link Spans#collect(org.apache.lucene.queries.spans.SpanCollector)} and
+     * {@link SpanMatch#sort()}.
      * </p>
      *
      * @param docStart first global docId to visit, inclusive; pass {@code 0} on the first call
@@ -112,46 +128,51 @@ public int walk(final int docStart) throws IOException
                 ? null
                 : searcher.createWeight(filterQuery, ScoreMode.COMPLETE_NO_SCORES, 1f);
         final SpanMatch match = new SpanMatch(INITIAL_LEAF_CAPACITY);
-
+        
         int nextCursor = -1;
         boolean exhausted = true;
         listener.start();
-
+        
         outer: for (final LeafReaderContext ctx : searcher.getLeafContexts()) {
-            if (ctx.docBase + ctx.reader().maxDoc() <= docStart) continue;
-
+            if (ctx.docBase + ctx.reader().maxDoc() <= docStart)
+                continue;
+            
             final Spans spans = spanWeight.getSpans(ctx, SpanWeight.Postings.OFFSETS);
-            if (spans == null) continue;
-
+            if (spans == null)
+                continue;
+            
             DocIdSetIterator filterIt = null;
             if (filterWeight != null) {
                 final Scorer filterScorer = filterWeight.scorer(ctx);
-                if (filterScorer == null) continue;
+                if (filterScorer == null)
+                    continue;
                 filterIt = filterScorer.iterator();
             }
-
+            
             final int localStart = Math.max(0, docStart - ctx.docBase);
             int localDocId = (localStart == 0) ? spans.nextDoc() : spans.advance(localStart);
             int filterDoc = (filterIt == null)
                     ? DocIdSetIterator.NO_MORE_DOCS
                     : (localStart == 0 ? filterIt.nextDoc() : filterIt.advance(localStart));
-
+            
             while (localDocId != DocIdSetIterator.NO_MORE_DOCS) {
                 if (filterIt != null) {
-                    if (filterDoc < localDocId) filterDoc = filterIt.advance(localDocId);
-                    if (filterDoc == DocIdSetIterator.NO_MORE_DOCS) break;
+                    if (filterDoc < localDocId)
+                        filterDoc = filterIt.advance(localDocId);
+                    if (filterDoc == DocIdSetIterator.NO_MORE_DOCS)
+                        break;
                     if (filterDoc > localDocId) {
                         localDocId = spans.advance(filterDoc);
                         continue;
                     }
                 }
-
+                
                 if (!listener.wantsMoreDocs()) {
                     nextCursor = ctx.docBase + localDocId;
                     exhausted = false;
                     break outer;
                 }
-
+                
                 listener.startDoc(ctx.docBase + localDocId);
                 int spanCount = 0;
                 boolean wantsMore = true;
@@ -170,7 +191,7 @@ public int walk(final int docStart) throws IOException
                 localDocId = spans.nextDoc();
             }
         }
-
+        
         listener.end(exhausted);
         return nextCursor;
     }