11package com .github .oeuvres .alix .lucene .terms ;
22
3+ import java .util .Arrays ;
4+
35/**
46 * Computes a keyness score for one term from a partitioned corpus,
57 * scoring the focus part against the other parts.
68 *
79 * <p>
8- * All counts are raw token occurrences, not document frequencies. Inputs
9- * are aligned by part id: {@code partTermFreq[p]} occurrences in part
10- * {@code p}, with {@code partTokens[p]} total tokens in that part.
10+ * All counts are raw token occurrences except where doc counts are
11+ * explicitly named. Inputs are aligned by part id: {@code partTermFreq[p]}
12+ * occurrences in part {@code p}, with {@code partTokens[p]} total tokens
13+ * in that part.
1114 * </p>
1215 *
1316 * <p>
1417 * Implementations assume {@code partTermFreq.length == partTokens.length},
15- * {@code 0 <= partTermFreq[p] <= partTokens[p]}, and a valid {@code focusPart}.
16- * Caller bugs (length mismatch, out-of-range index, tf > tokens) throw
17- * {@link IllegalArgumentException}; valid but degenerate inputs (zero focus
18- * tokens, no usable comparison part) return {@link Double#NaN}.
18+ * a valid {@code focusPart}, and {@code 0 <= partTermFreq[p] <= partTokens[p]}
19+ * for every part. Caller bugs throw {@link IllegalArgumentException};
20+ * valid but degenerate inputs return {@link Double#NaN}.
21+ * </p>
22+ *
23+ * <p>
24+ * The {@code focusTermDocs} and {@code focusDocs} arguments support
25+ * focus-internal weighting and dispersion checks. Scorers that don't use
26+ * doc structure (e.g. {@link Pearson}) ignore them.
1927 * </p>
2028 *
2129 * <p>
22- * Counts are aggregated as {@code double} for chi-square arithmetic.
23- * Realistic text-corpus counts stay well below 2^53, so the cast is harmless.
30+ * Counts aggregate as {@code double} for chi-square arithmetic. Realistic
31+ * text-corpus counts stay well below 2^53; the cast is harmless.
2432 * </p>
2533 */
2634public interface PartScorer {
2735
2836 /**
29- * @param partTermFreq term occurrences per part for the current term
30- * @param partTokens total token count per part
31- * @param focusPart part id to score
37+ * @param partTermFreq term occurrences per part for the current term
38+ * @param partTokens total token count per part
39+ * @param focusPart part id to score
40+ * @param focusTermDocs number of focus-part documents containing the
41+ * term (0 if unknown or unused)
42+ * @param focusDocs total number of documents in the focus part
43+ * (0 if unknown or unused)
3244 * @return signed score; positive when the term is over-represented in
3345 * {@code focusPart}; {@link Double#NaN} when no signal is computable
3446 */
35- double score (long [] partTermFreq , long [] partTokens , int focusPart );
47+ double score (
48+ long [] partTermFreq ,
49+ long [] partTokens ,
50+ int focusPart ,
51+ int focusTermDocs ,
52+ int focusDocs
53+ );
3654
3755 /**
3856 * Pearson chi-square goodness-of-fit of the term's per-part counts
@@ -65,7 +83,9 @@ class Pearson implements PartScorer {
6583 public double score (
6684 final long [] partTermFreq ,
6785 final long [] partTokens ,
68- final int focusPart
86+ final int focusPart ,
87+ final int focusTermDocs ,
88+ final int focusDocs
6989 ) {
7090 checkInputs (partTermFreq , partTokens , focusPart );
7191
@@ -97,52 +117,123 @@ public double score(
97117 }
98118
99119 /**
100- * Minimum signed pairwise G² between the focus part and each other
101- * part taken individually.
120+ * Configurable pairwise log-likelihood scorer for the focus part.
102121 *
103122 * <p>
104- * Conservative dominance criterion: the focus must out-rank every
105- * non-focus part to receive a positive score. <strong>Anti-robust to
106- * outliers</strong>: a single non-focus part with a high local rate
107- * flips the score regardless of behaviour elsewhere. Pairs with the
108- * other part holding fewer than {@code minPartTokens} tokens are
109- * skipped to keep noisy small parts out of the minimum.
123+ * Base statistic: signed 2×2 G² between the focus part and each
124+ * non-focus part taken individually. Returns the {@code k}-th worst
125+ * (smallest) pairwise G². With {@code k=1} this is strict dominance —
126+ * the focus must out-rank every other part. With larger {@code k} the
127+ * scorer tolerates a few outlier non-focus parts; setting {@code k} to
128+ * the median position gives a robust dominance criterion that ignores
129+ * a single bursty non-focus part.
130+ * </p>
131+ *
132+ * <p>
133+ * Optional focus-document weighting multiplies the pairwise score by
134+ * {@code log(1 + focusTermDocs)}, surfacing terms that occur in many
135+ * focus documents (typological terms, recurring motifs) rather than
136+ * terms whose elevated rate comes from a few high-frequency documents.
137+ * Log-damped like BM25's IDF, but rising with document frequency.
138+ * </p>
139+ *
140+ * <p>
141+ * Optional focus-dispersion demotion divides the score by a penalty
142+ * when the term occurs in only a small fraction of focus documents.
143+ * Penalty is {@code (focusDocs / focusTermDocs)^a}, with {@code a} the
144+ * dispersion exponent (0 = no penalty, 1 = inverse coverage, 0.5 a
145+ * gentler middle). This catches terms that look characteristic only
146+ * because of one or two outlier documents inside the focus.
110147 * </p>
111148 */
112149 class LogLikelihood implements PartScorer {
113- /** Default minimum tokens for a part to enter the pairwise comparison. */
150+
151+ /** Default minimum tokens for a non-focus part to enter the comparison. */
114152 public static final long DEFAULT_MIN_PART_TOKENS = 1000L ;
115153
154+ /** Aggregation strategy across the per-part pairwise G² values. */
155+ public enum Aggregation {
156+ /** Strict dominance: smallest pairwise G² wins. */
157+ MIN ,
158+ /** Robust dominance: lower-median pairwise G² (k = (n+1)/2, floored). */
159+ MEDIAN ,
160+ /** Configurable: k-th worst pairwise G², with k set explicitly. */
161+ KTH_WORST
162+ }
163+
116164 private final long minPartTokens ;
165+ private final Aggregation aggregation ;
166+ private final int kthWorst ;
167+ private final boolean docFreqWeight ;
168+ private final double dispersionExponent ;
117169
118- /** Uses {@link #DEFAULT_MIN_PART_TOKENS}. */
119- public LogLikelihood () { this (DEFAULT_MIN_PART_TOKENS ); }
170+ /** Strict dominance, no doc-weighting, no dispersion penalty. */
171+ public LogLikelihood () {
172+ this (DEFAULT_MIN_PART_TOKENS , Aggregation .MIN , 1 , false , 0d );
173+ }
120174
121175 /**
122- * @param minPartTokens minimum tokens for a non-focus part to be
123- * considered; must be {@code >= 0}
176+ * Full configuration.
177+ *
178+ * @param minPartTokens minimum tokens for a non-focus part to
179+ * enter the comparison; must be >= 0
180+ * @param aggregation pairwise aggregation strategy
181+ * @param kthWorst with {@link Aggregation#KTH_WORST}, the
182+ * 1-based rank (>= 1) of pairwise G² to return,
183+ * capped at the usable part count; ignored otherwise
184+ * @param docFreqWeight multiply score by log(1 + focusTermDocs)
185+ * @param dispersionExponent exponent {@code a} in the focus-coverage
186+ * penalty {@code (focusDocs/focusTermDocs)^a};
187+ * 0 disables; 0.5 gentle; 1 inverse coverage
124188 */
125- public LogLikelihood (final long minPartTokens ) {
189+ public LogLikelihood (
190+ final long minPartTokens ,
191+ final Aggregation aggregation ,
192+ final int kthWorst ,
193+ final boolean docFreqWeight ,
194+ final double dispersionExponent
195+ ) {
126196 if (minPartTokens < 0L ) {
127- throw new IllegalArgumentException ("minPartTokens < 0: " + minPartTokens );
197+ throw new IllegalArgumentException (
198+ "minPartTokens < 0: " + minPartTokens );
199+ }
200+ if (aggregation == null ) {
201+ throw new IllegalArgumentException ("aggregation is null" );
202+ }
203+ if (aggregation == Aggregation .KTH_WORST && kthWorst < 1 ) {
204+ throw new IllegalArgumentException (
205+ "kthWorst must be >= 1, got " + kthWorst );
206+ }
207+ if (dispersionExponent < 0d || Double .isNaN (dispersionExponent )) {
208+ throw new IllegalArgumentException (
209+ "dispersionExponent must be >= 0, got " + dispersionExponent );
128210 }
129211 this .minPartTokens = minPartTokens ;
212+ this .aggregation = aggregation ;
213+ this .kthWorst = kthWorst ;
214+ this .docFreqWeight = docFreqWeight ;
215+ this .dispersionExponent = dispersionExponent ;
130216 }
131217
132218 @ Override
133219 public double score (
134220 final long [] partTermFreq ,
135221 final long [] partTokens ,
136- final int focusPart
222+ final int focusPart ,
223+ final int focusTermDocs ,
224+ final int focusDocs
137225 ) {
138226 checkInputs (partTermFreq , partTokens , focusPart );
139227
140228 final long focusTermFreq = partTermFreq [focusPart ];
141229 final long focusTokens = partTokens [focusPart ];
142230 if (focusTokens <= 0L ) return Double .NaN ;
143231
144- double minG2 = Double .POSITIVE_INFINITY ;
145- boolean seen = false ;
232+ // Collect signed pairwise G² values against every usable non-focus part.
233+ // Capacity is partCount - 1; actual fill may be smaller after the
234+ // minPartTokens filter.
235+ final double [] pairwise = new double [partTokens .length - 1 ];
236+ int n = 0 ;
146237 for (int p = 0 ; p < partTokens .length ; p ++) {
147238 if (p == focusPart ) continue ;
148239 if (partTokens [p ] < minPartTokens ) continue ;
@@ -151,13 +242,58 @@ public double score(
151242 partTermFreq [p ], partTokens [p ]
152243 );
153244 if (Double .isNaN (g2 )) continue ;
154- if (g2 < minG2 ) minG2 = g2 ;
155- seen = true ;
245+ pairwise [n ++] = g2 ;
246+ }
247+ if (n == 0 ) return Double .NaN ;
248+
249+ // Aggregate. For MIN, a single linear scan beats sorting; for MEDIAN
250+ // and KTH_WORST, sort once and index. Sort cost is O(n log n) over
251+ // partCount values — partCount is small (dozens), this is cheap.
252+ double base ;
253+ switch (aggregation ) {
254+ case MIN :
255+ base = pairwise [0 ];
256+ for (int i = 1 ; i < n ; i ++) {
257+ if (pairwise [i ] < base ) base = pairwise [i ];
258+ }
259+ break ;
260+ case MEDIAN :
261+ Arrays .sort (pairwise , 0 , n );
262+ // (n+1)/2 in 1-based ranks → index (n-1)/2 in 0-based
263+ base = pairwise [(n - 1 ) / 2 ];
264+ break ;
265+ case KTH_WORST :
266+ Arrays .sort (pairwise , 0 , n );
267+ // k=1 → smallest → index 0; k=2 → next → index 1; ...
268+ // Cap at n if caller asked for a higher k than available.
269+ final int idx = Math .min (kthWorst , n ) - 1 ;
270+ base = pairwise [idx ];
271+ break ;
272+ default :
273+ throw new IllegalStateException (
274+ "unhandled aggregation: " + aggregation );
156275 }
157- return seen ? minG2 : Double .NaN ;
276+
277+ // Optional doc-frequency weight. Skipped for d=0 (unknown/unused), so
278+ // the score is kept rather than zeroed; log(1 + d) grows slowly with d.
279+ if (docFreqWeight && focusTermDocs > 0 ) {
280+ base *= Math .log (1d + focusTermDocs );
281+ }
282+
283+ // Optional focus-internal dispersion penalty. coverage = fraction
284+ // of focus docs that contain the term. Penalty divides by
285+ // (1/coverage)^a; equivalent to multiplying by coverage^a. With
286+ // a=0 the penalty is 1 (no effect). With a=1 a term in 10% of
287+ // focus docs is demoted by 10x.
288+ if (dispersionExponent > 0d && focusDocs > 0 && focusTermDocs > 0 ) {
289+ final double coverage = (double ) focusTermDocs / (double ) focusDocs ;
290+ base *= Math .pow (coverage , dispersionExponent );
291+ }
292+
293+ return base ;
158294 }
159295 }
160-
296+
161297 /**
162298 * Validates per-part vector shapes and per-cell invariants. Called
163299 * once per term during ranking; kept to a single pass.
0 commit comments