oeuvres
diff --git a/‎common/src/java/com/github/oeuvres/alix/lucene/terms/TermScorer.java‎
Lines changed: 161 additions & 123 deletions b/‎common/src/java/com/github/oeuvres/alix/lucene/terms/TermScorer.java‎
Lines changed: 161 additions & 123 deletions
@@ -1,229 +1,267 @@
 package com.github.oeuvres.alix.lucene.terms;
 
 /**
- * Local scorer for one term on one part.
+ * Local scorer for one term across documents (or parts).
  *
  * <p>Intended lifecycle:</p>
  * <ol>
- *   <li>prepare one scorer instance for one term with corpus-level statistics,</li>
- *   <li>call {@link #score(long, long)} for each part,</li>
- *   <li>aggregate local part scores outside this class.</li>
+ *   <li>call {@link #corpus(long, int)} once with corpus-level statistics,</li>
+ *   <li>for each term:
+ *     <ol>
+ *       <li>call {@link #term(long, int)} — resets the accumulator,</li>
+ *       <li>call {@link #collect(long, long)} for each document/part,</li>
+ *       <li>call {@link #result()} to obtain the aggregated score.</li>
+ *     </ol>
+ *   </li>
  * </ol>
  *
- * <p>This class is stateful. One instance must not be reused concurrently
- * for different terms.</p>
+ * <p>{@link #score(long, long)} remains available for callers who need
+ * the raw local score without accumulation.</p>
+ *
+ * <p>This class is stateful. One instance must not be reused concurrently.</p>
  */
 public abstract class TermScorer {
-    /**
-     * Aggregation rule used to reduce local part scores to one score per term.
-     */
-    public enum Aggregation {
-        /** Sum local scores over all parts. */
-        SUM,
-
-        /** Sum only positive local scores. */
-        SUM_POSITIVE,
 
-        /** Maximum local score over all parts. */
-        MAX,
+    // =========================================================================
+    // Corpus-level state (set once)
+    // =========================================================================
 
-        /** Maximum positive local score; negative local scores are ignored. */
-        MAX_POSITIVE,
-
-        /** Arithmetic mean of local scores over all parts. */
-        MEAN
-    }
     /** Total token count of the full corpus/field. */
     protected long corpusTokens;
-    
-    /** */
-    protected int corpusPartCount;
-    
 
-    /** Cached idf-like value derived from corpus statistics. */
-    protected double corpusIdf;
+    /** Number of scoring units (documents or parts). */
+    protected int corpusPartCount;
 
-    /** Average token count of one part. */
+    /** Average token count per scoring unit. */
     protected double partTokensAvg;
-    
-    /** Total occurrences of the current term in the full corpus/field. */
+
+    // =========================================================================
+    // Term-level state (reset per term)
+    // =========================================================================
+
+    /** Total occurrences of the current term in the corpus. */
     protected long corpusTermFreq;
 
-    /** Number of corpus documents containing the current term. */
+    /** Number of scoring units containing the current term. */
     protected int corpusTermDocs;
 
-    /** Global relative frequency of the current term in the corpus. */
+    /** Relative frequency of the current term: corpusTermFreq / corpusTokens. */
     protected double corpusTermRate;
 
+    /** Cached IDF-like factor, computed per term by subclasses that need it. */
+    protected double corpusIdf;
+
+    // =========================================================================
+    // Accumulator state (reset per term, updated per collect)
+    // =========================================================================
+
+    /** Running accumulator. Semantics depend on the subclass. */
+    protected double acc;
+
+    /** Number of scoring units observed via {@link #collect}. */
+    protected int collectCount;
+
+    // =========================================================================
+    // Corpus-level setup
+    // =========================================================================
 
     /**
-     * Prepare this scorer for one term.
+     * Set corpus-level statistics. Must be called once before any
+     * {@link #term(long, int)} call.
      *
-     * @param corpusTermFreq total occurrences of the term in the corpus
-     * @param corpusTermDocs number of corpus documents containing the term
-     * @param corpusTokens total token count in the corpus
-     * @param corpusDocs total live document count in the corpus
-     * @param avgPartTokens average token count of one part
+     * @param corpusTokens    total token count in the corpus
+     * @param corpusPartCount number of scoring units (documents or parts)
      */
-    public final void corpus(
-        final long corpusTokens,
-        final int corpusPartCount
-    ) {
+    public final void corpus(final long corpusTokens, final int corpusPartCount) {
         this.corpusTokens = corpusTokens;
         this.corpusPartCount = corpusPartCount;
-        this.partTokensAvg = (double) corpusTokens / (double) corpusPartCount;
-
-        this.corpusTermRate = 0d;
-        this.corpusIdf = 0d;
-
-        configure();
+        this.partTokensAvg = (corpusPartCount > 0)
+            ? (double) corpusTokens / (double) corpusPartCount
+            : 0d;
     }
 
+    // =========================================================================
+    // Term-level setup (resets accumulator)
+    // =========================================================================
+
     /**
-     * Initialize the global term rate of the current term:
-     * corpusTermFreq / corpusTokens.
+     * Prepare this scorer for a new term. Resets the accumulator.
+     *
+     * <p>Subclasses that compute per-term derived values (e.g. IDF) should
+     * override this method, call {@code super.term()} first, then set
+     * their derived fields.</p>
+     *
+     * @param corpusTermFreq total occurrences of the term in the corpus
+     * @param corpusTermDocs number of scoring units containing the term
      */
-    public void term(
-        final long corpusTermFreq,
-        final int corpusTermDocs
-    ) {
+    public void term(final long corpusTermFreq, final int corpusTermDocs) {
         this.corpusTermFreq = corpusTermFreq;
         this.corpusTermDocs = corpusTermDocs;
-        if (corpusTokens <= 0L) {
-            this.corpusTermRate = 0d;
-            return;
-        }
-        this.corpusTermRate = (double) corpusTermFreq / (double) corpusTokens;
+        this.corpusTermRate = (corpusTokens > 0L)
+            ? (double) corpusTermFreq / (double) corpusTokens
+            : 0d;
+        this.corpusIdf = 0d;
+        this.acc = accInit();
+        this.collectCount = 0;
     }
 
+    // =========================================================================
+    // Accumulation protocol: accInit / collect / result
+    // =========================================================================
+
     /**
-     * Optional hook after corpusTermRate and corpusIdf have been initialized.
+     * Initial accumulator value before any observation.
+     * Default is {@code 0.0} (suitable for sum-based aggregation).
+     *
+     * @return seed value for the accumulator
      */
-    protected void configure() {
-        // no-op
+    protected double accInit() {
+        return 0d;
     }
 
     /**
-     * Score one part for the prepared term.
+     * Finalize and return the aggregated score for the current term.
      *
-     * @param partTermFreq occurrences of the term in the part
-     * @param partTokens total token count of the part
-     * @return local score for that part
+     * <p>Default returns the raw accumulator (= sum).
+     * Subclasses may override for mean, clamping, etc.</p>
+     *
+     * @return aggregated corpus-level score for the current term
      */
-    public abstract double score(final long partTermFreq, final long partTokens);
+    public double result() {
+        return acc;
+    }
+
+    // =========================================================================
+    // Pure local score (no side effect on accumulator)
+    // =========================================================================
 
     /**
-     * Signed G-style contribution against the global corpus expectation.
+     * Compute the local score for one document/part and fold it into the
+     * accumulator.
      *
-     * <p>Local expectation in one part:</p>
-     * <pre>
-     * partExpectedTermFreq = corpusTermRate * partTokens
-     * </pre>
+     * @param partTermFreq occurrences of the term in the document/part
+     * @param partTokens   total token count of the document/part
+     */
+    public abstract double score(final long partTermFreq, final long partTokens);
+
+    // =========================================================================
+    // Concrete scorers
+    // =========================================================================
+
+    /**
+     * Signed G-test contribution against the corpus expectation.
      *
-     * <p>Score:</p>
      * <pre>
-     * 2 * partTermFreq * ln(partTermFreq / partExpectedTermFreq)
+     * score = 2 × partTermFreq × ln(partTermFreq / expectedFreq)
      * </pre>
      *
-     * <p>Positive when the term is over-represented in the part,
-     * negative when under-represented.</p>
+     * <p>Positive when over-represented, negative when under-represented.
+     * Default aggregation: sum of positive contributions only.</p>
      */
-    public static final class G extends TermScorer {
+    public static class G extends TermScorer {
+
+        /**
+         * Only accumulate positive contributions (over-represented parts).
+         * Negative G values indicate under-representation; including them
+         * in the sum would dilute the keyword signal.
+         */
         @Override
         public double score(final long partTermFreq, final long partTokens) {
-            if (partTokens <= 0L || corpusTermRate <= 0d) {
+
+            if (partTokens <= 0L || corpusTermRate <= 0d || partTermFreq <= 0L) {
                 return 0d;
             }
-
-            final double partExpectedTermFreq = corpusTermRate * (double) partTokens;
-
-            if (partExpectedTermFreq <= 0d || partTermFreq <= 0L) {
+            final double expected = corpusTermRate * (double) partTokens;
+            if (expected <= 0d) {
                 return 0d;
             }
-
-            return 2d * (double) partTermFreq
-                * Math.log((double) partTermFreq / partExpectedTermFreq);
+            final double local =  2d * (double) partTermFreq * Math.log((double) partTermFreq / expected);
+            acc += local;
+            collectCount++;
+            return local;
         }
     }
 
     /**
      * Count-form Jaccard coefficient.
      *
-     * <p>This is not an expectation scorer. It treats:</p>
      * <pre>
-     * intersection = partTermFreq
-     * union        = partTokens + corpusTermFreq - partTermFreq
+     * score = partTermFreq / (partTokens + corpusTermFreq - partTermFreq)
      * </pre>
      *
-     * <p>Result is in [0, 1] when inputs are coherent.</p>
+     * <p>Default aggregation: sum.</p>
      */
-    public static final class Jaccard extends TermScorer {
+    public static class Jaccard extends TermScorer {
+
         @Override
         public double score(final long partTermFreq, final long partTokens) {
             if (partTermFreq <= 0L || partTokens <= 0L || corpusTermFreq <= 0L) {
                 return 0d;
             }
-
             final long union = partTokens + corpusTermFreq - partTermFreq;
             if (union <= 0L) {
                 return 0d;
             }
-
-            return (double) partTermFreq / (double) union;
+            final double local = partTermFreq / (double) union;
+            acc += local;
+            collectCount++;
+            return local;
         }
     }
 
     /**
-     * BM25-like local score on one part.
+     * BM25-style scorer.
      *
-     * <p>Length normalization uses avgPartTokens.</p>
+     * <pre>
+     * score = IDF × tf × (k1 + 1) / (tf + k1 × (1 - b + b × dl / avgdl))
+     * </pre>
+     *
+     * <p>IDF is computed per term in {@link #term(long, int)}.
+     * Default aggregation: sum (the "summed BM25" corpus keyword score).</p>
      */
-    public static final class BM25 extends TermScorer {
-        private final double k1;
-        private final double b;
+    public static class BM25 extends TermScorer {
 
+        /** Default IR parameters: k1=1.2, poor effect with aggregation */
+        private final double k1 = 1.2d;
+        /** Default IR parameters: b=0.75, poor effect with aggregation */
+        private final double b = 0.75d;
+        private final double idfExp;
+        
         public BM25() {
-            this(1.2d, 0.75d);
+            this(1);
         }
 
-        public BM25(final double k1, final double b) {
-            if (k1 < 0d) {
-                throw new IllegalArgumentException("k1 must be >= 0, got " + k1);
-            }
-            if (b < 0d || b > 1d) {
-                throw new IllegalArgumentException("b must be in [0,1], got " + b);
-            }
-            this.k1 = k1;
-            this.b = b;
+        /**
+         * @param k1 term frequency saturation (≥ 0). Lower = faster saturation.
+         */
+        public BM25(final double idfExp) {
+            this.idfExp = idfExp;
         }
 
         @Override
-        public final void term(
-            final long corpusTermFreq,
-            final int corpusTermDocs
-        ) {
+        public void term(final long corpusTermFreq, final int corpusTermDocs) {
             super.term(corpusTermFreq, corpusTermDocs);
             if (corpusPartCount <= 0) {
                 this.corpusIdf = 0d;
                 return;
             }
-
-            this.corpusIdf = Math.log(
-                1.0d + ((double) corpusPartCount - (double) corpusTermDocs + 0.5d)
-                    / ((double) corpusTermDocs + 0.5d)
-            );
+            final double n = corpusPartCount;
+            final double df = corpusTermDocs;
+            double rawIdf = Math.log(1.0d + (n - df + 0.5d) / (df + 0.5d));
+            this.corpusIdf = Math.pow(rawIdf, idfExp);
         }
 
         @Override
         public double score(final long partTermFreq, final long partTokens) {
             if (partTermFreq <= 0L || partTokens <= 0L || partTokensAvg <= 0d || corpusIdf <= 0d) {
                 return 0d;
             }
-
             final double tf = (double) partTermFreq;
             final double norm = k1 * (1d - b + b * ((double) partTokens / partTokensAvg));
-
-            return corpusIdf * (tf * (k1 + 1d)) / (tf + norm);
+            final double local = corpusIdf * (tf * (k1 + 1d)) / (tf + norm);
+            acc += local;
+            collectCount++;
+            return local;
         }
     }
-}
+}