form() for json API

glorieux-f · glorieux-f · commit 75745d1566ae · 2026-05-05T15:59:43.000+02:00
diff --git a/common/src/java/com/github/oeuvres/alix/lucene/terms/TermLexicon.java b/common/src/java/com/github/oeuvres/alix/lucene/terms/TermLexicon.java
@@ -46,7 +46,7 @@
  * Real term ids start at 1 and are dense and stable for the frozen snapshot from which the
  * lexicon was built. The id assignment follows the lexicographic iteration order returned by
  * Lucene's merged {@link TermsEnum} for the field. The reserved id is stored as a zero-length
- * phantom entry in the {@code .dat}/{@code .off} files so that {@link #term(int) term(0)}
+ * phantom entry in the {@code .dat}/{@code .off} files so that {@link #form(int) form(0)}
  * returns {@code ""} and all sidecar files remain self-consistent.
  * </p>
  * <p>
@@ -107,7 +107,7 @@ public final class TermLexicon implements Closeable {
     private static final ThreadLocal<BytesRefBuilder> TL_TERM_BYTES =
         ThreadLocal.withInitial(BytesRefBuilder::new);
 
-    /** Per-thread reusable scratch buffer for {@link #term(int)} to avoid allocation per call. */
+    /** Per-thread reusable scratch buffer for {@link #form(int)} to avoid allocation per call. */
     private static final ThreadLocal<BytesRefBuilder> TL_TERM_STRING =
         ThreadLocal.withInitial(BytesRefBuilder::new);
 
@@ -254,6 +254,54 @@ public String field() {
         return field;
     }
 
+    /**
+     * Returns the form (term string) for one dense term id.
+     * <p>
+     * {@code form(0)} returns the empty string (reserved absent-term slot).
+     * Uses a per-thread scratch buffer internally. Suitable for moderate use
+     * (e.g. resolving 50 term ids for display). For tight loops over the full
+     * vocabulary, prefer {@link #formBytes(int, BytesRefBuilder)} with a caller-owned buffer.
+     * </p>
+     *
+     * @param termId dense term id in {@code [0, vocabSize)}
+     * @return decoded UTF-8 term string, never null; empty for the reserved id 0
+     * @throws IllegalArgumentException if {@code termId} is out of range
+     */
+    public String form(final int termId) {
+        return formBytes(termId, TL_TERM_STRING.get()).utf8ToString();
+    }
+
+    /**
+     * Copies the raw UTF-8 bytes of one term into a caller-provided reusable buffer.
+     * <p>
+     * Reusing one buffer avoids allocation when called in a loop. The bytes are read through a
+     * duplicate of the memory-mapped {@code .dat} buffer, so its own position and limit are untouched.
+     * </p>
+     *
+     * @param termId dense term id in {@code [0, vocabSize)}
+     * @param reuse  destination buffer that will receive the term bytes;
+     *               grown automatically if needed
+     * @return {@code reuse.get()} after the copy, valid until the next call on the same buffer
+     * @throws IllegalArgumentException if {@code termId} is out of range
+     * @throws NullPointerException     if {@code reuse} is null
+     */
+    public BytesRef formBytes(final int termId, final BytesRefBuilder reuse) {
+        checkTermId(termId);
+        Objects.requireNonNull(reuse, "reuse");
+    
+        final int start = off.get(termId);
+        final int end = off.get(termId + 1);
+        final int length = end - start;
+    
+        reuse.grow(length);
+        final ByteBuffer dup = dat.duplicate();
+        dup.position(start);
+        dup.limit(end);
+        dup.get(reuse.bytes(), 0, length);
+        reuse.setLength(length);
+        return reuse.get();
+    }
+
     /**
      * Returns the in-memory heap usage of the loaded FST, in bytes.
      * <p>
@@ -412,55 +460,6 @@ public static TermLexicon openOrBuild(final IndexReader reader, final Path sideD
         return open(sideDir, field);
     }
 
-    /**
-     * Returns the term string for one dense term id.
-     * <p>
-     * {@code term(0)} returns the empty string (reserved absent-term slot).
-     * Uses a per-thread scratch buffer internally. Suitable for moderate use
-     * (e.g. resolving 50 term ids for display). For tight loops over the full
-     * vocabulary, prefer {@link #termBytes(int, BytesRefBuilder)} with a caller-owned buffer.
-     * </p>
-     *
-     * @param termId dense term id in {@code [0, vocabSize)}
-     * @return decoded UTF-8 term string, never null; empty for the reserved id 0
-     * @throws IllegalArgumentException if {@code termId} is out of range
-     */
-    public String term(final int termId) {
-        return termBytes(termId, TL_TERM_STRING.get()).utf8ToString();
-    }
-
-    /**
-     * Copies the raw UTF-8 bytes of one term into a caller-provided reusable buffer.
-     * <p>
-     * This avoids allocation when called in a loop. The bytes are read directly
-     * from the memory-mapped {@code .dat} buffer.
-     * </p>
-     *
-     * @param termId dense term id in {@code [0, vocabSize)}
-     * @param reuse  destination buffer that will receive the term bytes;
-     *               grown automatically if needed
-     * @return {@code reuse.get()} after the copy, valid until the next call on the same buffer
-     * @throws IllegalArgumentException if {@code termId} is out of range
-     * @throws NullPointerException     if {@code reuse} is null
-     */
-    public BytesRef termBytes(final int termId, final BytesRefBuilder reuse) {
-        checkTermId(termId);
-        Objects.requireNonNull(reuse, "reuse");
-
-        final int start = off.get(termId);
-        final int end = off.get(termId + 1);
-        final int length = end - start;
-
-        reuse.grow(length);
-        final ByteBuffer dup = dat.duplicate();
-        dup.position(start);
-        dup.limit(end);
-        dup.get(reuse.bytes(), 0, length);
-        reuse.setLength(length);
-        return reuse.get();
-    }
-
-    
     /**
      * Returns the number of entries in the lexicon, including the reserved id 0.
      * <p>
diff --git a/common/src/java/com/github/oeuvres/alix/lucene/terms/TermSuggest.java b/common/src/java/com/github/oeuvres/alix/lucene/terms/TermSuggest.java
@@ -105,7 +105,7 @@ public TermSuggest(
         for (int termId = 0; termId < vocabSize; termId++) {
             sb.append(SEP);
             offsets[termId] = sb.length();
-            sb.append(Char.toAscii(lexicon.term(termId)));
+            sb.append(Char.toAscii(lexicon.form(termId)));
         }
         sb.append(SEP);
         offsets[vocabSize] = sb.length();
@@ -205,7 +205,7 @@ public TopTerms suggest(
         int rank = 0;
         for (TopArray.IdScore entry : top) {
             final int termId = entry.id();
-            final String term = lexicon.term(termId);
+            final String term = lexicon.form(termId);
             final String termFolded = ascii.substring(
                 offsets[termId],
                 offsets[termId + 1] - 1
diff --git a/common/src/java/com/github/oeuvres/alix/lucene/terms/TopTerms.java b/common/src/java/com/github/oeuvres/alix/lucene/terms/TopTerms.java
@@ -980,6 +980,26 @@ public final class TermEntry
             this.termId = termId;
         }
         
+        /**
+         * Returns the current document count for this term, read from {@code termDocs}.
+         *
+         * @return current count of documents containing this term
+         */
+        public long docs()
+        {
+            return termDocs[termId];
+        }
+
+        /**
+         * Returns the global document count for this term, as reported by {@code fieldStats}.
+         *
+         * @return global count of documents containing this term
+         */
+        public long fieldDocs()
+        {
+            return fieldStats.termDocs(termId);
+        }
+
         /**
          * Returns the full-field occurrence count.
          *
@@ -1030,9 +1050,9 @@ public double score()
          *
          * @return display term
          */
-        public String term()
+        public String form()
         {
-            return lexicon.term(termId);
+            return lexicon.form(termId);
         }
         
         /**
diff --git a/test/src/main/java/com/github/oeuvres/alix/lucene/terms/CoocDemo.java b/test/src/main/java/com/github/oeuvres/alix/lucene/terms/CoocDemo.java
@@ -234,7 +234,7 @@ public static void main(String[] args) throws Exception {
                 final int limit = Math.min(topN, entries.size());
                 for (int i = 0; i < limit; i++) {
                     final Entry e = entries.get(i);
-                    System.out.printf("  %-28s %8d %8.4f%n", lex.term(e.termId()), e.count(), e.score());
+                    System.out.printf("  %-28s %8d %8.4f%n", lex.form(e.termId()), e.count(), e.score());
                 }
                 if (entries.isEmpty()) {
                     System.out.println("  (no co-occurrents above minCount=" + minCount + ")");
diff --git a/test/src/main/java/com/github/oeuvres/alix/lucene/terms/TermLexiconDemo.java b/test/src/main/java/com/github/oeuvres/alix/lucene/terms/TermLexiconDemo.java
@@ -42,10 +42,10 @@ public static void main(String[] args) throws Exception {
                 return;
             }
     
-            System.out.println("term(" + termId + ") = " + lexicon.term(termId));
+            System.out.println("term(" + termId + ") = " + lexicon.form(termId));
 
 
-            final BytesRef termBytes = new BytesRef(lexicon.term(termId));
+            final BytesRef termBytes = new BytesRef(lexicon.form(termId));
             final PostingsEnum postings = MultiTerms.getTermPostingsEnum(
                 reader,
                 field,
diff --git a/test/src/main/java/com/github/oeuvres/alix/lucene/terms/ThemeTermsDemo.java b/test/src/main/java/com/github/oeuvres/alix/lucene/terms/ThemeTermsDemo.java
@@ -81,7 +81,7 @@ public static void main(final String[] args) throws Exception {
                 fieldStats.termWeights(luceneReader, scorer);
                 TopTerms top = TopTerms.theme(fieldStats, lexicon, topK);
                 for(TermEntry term: top) {
-                    System.out.print(term.term() + ", ");
+                    System.out.print(term.form() + ", ");
                 }
             }
 
diff --git a/web/src/main/java/com/github/oeuvres/alix/web/OpSuggest.java b/web/src/main/java/com/github/oeuvres/alix/web/OpSuggest.java
@@ -132,9 +132,12 @@ protected void json(
                 for (TermEntry term : topTerms) {
                     jw.beginObject();
                     jw.name("rank").value(rank++);
-                    jw.name("term").value(term.term());
+                    jw.name("term").value(term.form());
                     jw.name("html").value(term.hilite());
-                    jw.name("count").value(term.freq());
+                    jw.name("docs").value(term.docs());
+                    jw.name("freq").value(term.freq());
+                    jw.name("fieldDocs").value(term.fieldDocs());
+                    jw.name("fieldFreq").value(term.fieldFreq());
                     // score has no sense here
                     jw.endObject();
                 }
diff --git a/web/src/main/java/com/github/oeuvres/alix/web/OpTerms.java b/web/src/main/java/com/github/oeuvres/alix/web/OpTerms.java
@@ -139,12 +139,7 @@ else if ("part4".equals(scorerName)) {
                 topTerms.focus(index.reader(), focusDocs);
                 return topTerms.rank(new KeynessScorer.LogLikelihood(), topK);
             }
-            else if ("rsj".equals(scorerName)) {
-                return topTerms.focus(index.reader(), focusDocs, new IdfTermScorer.BM25(idfExp, IdfTermScorer.BM25.Mode.RSJ), topK);
-            }
-            else if ("irdf".equals(scorerName)) {
-                return topTerms.focus(index.reader(), focusDocs, new IdfTermScorer.BM25(idfExp, IdfTermScorer.BM25.Mode.IRDF), topK);
-            }
+
             else if ("chi2".equals(scorerName)) {
                 topTerms.focus(index.reader(), focusDocs);
                 return topTerms.rank(new KeynessScorer.Chi2(), topK);
@@ -260,7 +255,7 @@ protected void html(LuceneIndex index, HttpServletRequest request, HttpServletRe
             for (TermEntry term : topTerms) {
                 writer.append("  <tr>\n")
                   .append("    <th class=\"term\">%d</th>\n".formatted(rank++))
-                  .append("    <td class=\"term\">%s</td>\n".formatted(term.term()))
+                  .append("    <td class=\"term\">%s</td>\n".formatted(term.form()))
                   .append("    <td class=\"count\" align=\"right\">%d</td>\n".formatted(term.freq()))
                   .append("    <td class=\"score\" align=\"right\">%f</td>\n".formatted(term.score()))
                   .append("  </tr>\n");
@@ -302,9 +297,12 @@ protected void json(
                 for (TermEntry term : topTerms) {
                     jw.beginObject();
                     jw.name("rank").value(rank++);
-                    jw.name("term").value(term.term());
-                    jw.name("count").value(term.freq());
+                    jw.name("form").value(term.form());
+                    jw.name("docs").value(term.docs());
+                    jw.name("freq").value(term.freq());
                     jw.name("score").value(term.score());
+                    jw.name("fieldDocs").value(term.fieldDocs());
+                    jw.name("fieldFreq").value(term.fieldFreq());
                     jw.endObject();
                 }
                 jw.endArray();

Original file line number	Diff line number	Diff line change
`@@ -234,7 +234,7 @@ public static void main(String[] args) throws Exception {`
`234`	`234`	`final int limit = Math.min(topN, entries.size());`
`235`	`235`	`for (int i = 0; i < limit; i++) {`
`236`	`236`	`final Entry e = entries.get(i);`
`237`		`- System.out.printf(" %-28s %8d %8.4f%n", lex.term(e.termId()), e.count(), e.score());`
	`237`	`+ System.out.printf(" %-28s %8d %8.4f%n", lex.form(e.termId()), e.count(), e.score());`
`238`	`238`	`}`
`239`	`239`	`if (entries.isEmpty()) {`
`240`	`240`	`System.out.println(" (no co-occurrents above minCount=" + minCount + ")");`
Original file line number	Diff line number	Diff line change
`@@ -81,7 +81,7 @@ public static void main(final String[] args) throws Exception {`
`81`	`81`	`fieldStats.termWeights(luceneReader, scorer);`
`82`	`82`	`TopTerms top = TopTerms.theme(fieldStats, lexicon, topK);`
`83`	`83`	`for(TermEntry term: top) {`
`84`		`- System.out.print(term.term() + ", ");`
	`84`	`+ System.out.print(term.form() + ", ");`
`85`	`85`	`}`
`86`	`86`	`}`
`87`	`87`