Skip to content

Commit d598fda

Browse files
committed
Test a part scorer with more parameters
1 parent b26eff0 commit d598fda

7 files changed

Lines changed: 220 additions & 59 deletions

File tree

common/src/java/com/github/oeuvres/alix/lucene/terms/PartScorer.java

Lines changed: 172 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,56 @@
11
package com.github.oeuvres.alix.lucene.terms;
22

3+
import java.util.Arrays;
4+
35
/**
46
* Computes a keyness score for one term from a partitioned corpus,
57
* scoring the focus part against the other parts.
68
*
79
* <p>
8-
* All counts are raw token occurrences, not document frequencies. Inputs
9-
* are aligned by part id: {@code partTermFreq[p]} occurrences in part
10-
* {@code p}, with {@code partTokens[p]} total tokens in that part.
10+
* All counts are raw token occurrences except where doc counts are
11+
* explicitly named. Inputs are aligned by part id: {@code partTermFreq[p]}
12+
* occurrences in part {@code p}, with {@code partTokens[p]} total tokens
13+
* in that part.
1114
* </p>
1215
*
1316
* <p>
1417
* Implementations assume {@code partTermFreq.length == partTokens.length},
15-
* {@code 0 <= partTermFreq[p] <= partTokens[p]}, and a valid {@code focusPart}.
16-
* Caller bugs (length mismatch, out-of-range index, tf > tokens) throw
17-
* {@link IllegalArgumentException}; valid but degenerate inputs (zero focus
18-
* tokens, no usable comparison part) return {@link Double#NaN}.
18+
* a valid {@code focusPart}, and {@code 0 <= partTermFreq[p] <= partTokens[p]}
19+
* for every part. Caller bugs throw {@link IllegalArgumentException};
20+
* valid but degenerate inputs return {@link Double#NaN}.
21+
* </p>
22+
*
23+
* <p>
24+
* The {@code focusTermDocs} and {@code focusDocs} arguments support
25+
* focus-internal weighting and dispersion checks. Scorers that don't use
26+
* doc structure (e.g. {@link Pearson}) ignore them.
1927
* </p>
2028
*
2129
* <p>
22-
* Counts are aggregated as {@code double} for chi-square arithmetic.
23-
* Realistic text-corpus counts stay well below 2^53, so the cast is harmless.
30+
* Counts aggregate as {@code double} for chi-square arithmetic. Realistic
31+
* text-corpus counts stay well below 2^53; the cast is harmless.
2432
* </p>
2533
*/
2634
public interface PartScorer {
2735

2836
/**
29-
* @param partTermFreq term occurrences per part for the current term
30-
* @param partTokens total token count per part
31-
* @param focusPart part id to score
37+
* @param partTermFreq term occurrences per part for the current term
38+
* @param partTokens total token count per part
39+
* @param focusPart part id to score
40+
* @param focusTermDocs number of focus-part documents containing the
41+
* term (0 if unknown or unused)
42+
* @param focusDocs total number of documents in the focus part
43+
* (0 if unknown or unused)
3244
* @return signed score; positive when the term is over-represented in
3345
* {@code focusPart}; {@link Double#NaN} when no signal is computable
3446
*/
35-
double score(long[] partTermFreq, long[] partTokens, int focusPart);
47+
double score(
48+
long[] partTermFreq,
49+
long[] partTokens,
50+
int focusPart,
51+
int focusTermDocs,
52+
int focusDocs
53+
);
3654

3755
/**
3856
* Pearson chi-square goodness-of-fit of the term's per-part counts
@@ -65,7 +83,9 @@ class Pearson implements PartScorer {
6583
public double score(
6684
final long[] partTermFreq,
6785
final long[] partTokens,
68-
final int focusPart
86+
final int focusPart,
87+
final int focusTermDocs,
88+
final int focusDocs
6989
) {
7090
checkInputs(partTermFreq, partTokens, focusPart);
7191

@@ -97,52 +117,123 @@ public double score(
97117
}
98118

99119
/**
100-
* Minimum signed pairwise G² between the focus part and each other
101-
* part taken individually.
120+
* Configurable pairwise log-likelihood scorer for the focus part.
102121
*
103122
* <p>
104-
* Conservative dominance criterion: the focus must out-rank every
105-
* non-focus part to receive a positive score. <strong>Anti-robust to
106-
* outliers</strong>: a single non-focus part with a high local rate
107-
* flips the score regardless of behaviour elsewhere. Pairs with the
108-
* other part holding fewer than {@code minPartTokens} tokens are
109-
* skipped to keep noisy small parts out of the minimum.
123+
* Base statistic: signed 2×2 G² between the focus part and every other
124+
* non-focus part taken individually. Returns the {@code k}-th worst
125+
* (smallest) pairwise G². With {@code k=1} this is strict dominance —
126+
* the focus must out-rank every other part. With larger {@code k} the
127+
* scorer tolerates a few outlier non-focus parts; setting {@code k} to
128+
* the median position gives a robust dominance criterion that ignores
129+
* a single bursty non-focus year.
130+
* </p>
131+
*
132+
* <p>
133+
* Optional focus-document weighting multiplies the pairwise score by
134+
* {@code log(1 + focusTermDocs)}, surfacing terms that occur in many
135+
* focus documents (typological terms, recurring motifs) rather than
136+
* terms whose elevated rate comes from a few high-frequency documents.
137+
* Uses logarithmic damping like BM25's IDF term, but in the opposite direction: wider focus coverage raises the score.
138+
* </p>
139+
*
140+
* <p>
141+
* Optional focus-dispersion demotion divides the score by a penalty
142+
* when the term occurs in only a small fraction of focus documents.
143+
* Penalty is {@code (focusDocs / focusTermDocs)^a}, with {@code a} the
144+
* dispersion exponent (0 = no penalty, 1 = inverse coverage, 0.5 a
145+
* gentler middle). This catches terms that look characteristic only
146+
* because of one or two outlier documents inside the focus.
110147
* </p>
111148
*/
112149
class LogLikelihood implements PartScorer {
113-
/** Default minimum tokens for a part to enter the pairwise comparison. */
150+
151+
/** Default minimum tokens for a non-focus part to enter the comparison. */
114152
public static final long DEFAULT_MIN_PART_TOKENS = 1000L;
115153

154+
/** Aggregation strategy across the per-part pairwise G² values. */
155+
public enum Aggregation {
156+
/** Strict dominance: smallest pairwise G² wins. */
157+
MIN,
158+
/** Robust dominance: median pairwise G² (k = (n+1)/2; the lower median when n is even). */
159+
MEDIAN,
160+
/** Configurable: k-th worst pairwise G², with k set explicitly. */
161+
KTH_WORST
162+
}
163+
116164
private final long minPartTokens;
165+
private final Aggregation aggregation;
166+
private final int kthWorst;
167+
private final boolean docFreqWeight;
168+
private final double dispersionExponent;
117169

118-
/** Uses {@link #DEFAULT_MIN_PART_TOKENS}. */
119-
public LogLikelihood() { this(DEFAULT_MIN_PART_TOKENS); }
170+
/** Strict dominance, no doc-weighting, no dispersion penalty. */
171+
public LogLikelihood() {
172+
this(DEFAULT_MIN_PART_TOKENS, Aggregation.MIN, 1, false, 0d);
173+
}
120174

121175
/**
122-
* @param minPartTokens minimum tokens for a non-focus part to be
123-
* considered; must be {@code >= 0}
176+
* Full configuration.
177+
*
178+
* @param minPartTokens minimum tokens for a non-focus part to
179+
* enter the comparison; must be >= 0
180+
* @param aggregation pairwise aggregation strategy
181+
* @param kthWorst with {@link Aggregation#KTH_WORST}, the
182+
* 1-based rank of pairwise G² to return, capped at
183+
* the number of usable parts; ignored otherwise
184+
* @param docFreqWeight multiply score by log(1 + focusTermDocs)
185+
* @param dispersionExponent exponent {@code a} in the focus-coverage
186+
* penalty {@code (focusDocs/focusTermDocs)^a};
187+
* 0 disables; 0.5 gentle; 1 inverse coverage
124188
*/
125-
public LogLikelihood(final long minPartTokens) {
189+
public LogLikelihood(
190+
final long minPartTokens,
191+
final Aggregation aggregation,
192+
final int kthWorst,
193+
final boolean docFreqWeight,
194+
final double dispersionExponent
195+
) {
126196
if (minPartTokens < 0L) {
127-
throw new IllegalArgumentException("minPartTokens < 0: " + minPartTokens);
197+
throw new IllegalArgumentException(
198+
"minPartTokens < 0: " + minPartTokens);
199+
}
200+
if (aggregation == null) {
201+
throw new IllegalArgumentException("aggregation is null");
202+
}
203+
if (aggregation == Aggregation.KTH_WORST && kthWorst < 1) {
204+
throw new IllegalArgumentException(
205+
"kthWorst must be >= 1, got " + kthWorst);
206+
}
207+
if (dispersionExponent < 0d || Double.isNaN(dispersionExponent)) {
208+
throw new IllegalArgumentException(
209+
"dispersionExponent must be >= 0, got " + dispersionExponent);
128210
}
129211
this.minPartTokens = minPartTokens;
212+
this.aggregation = aggregation;
213+
this.kthWorst = kthWorst;
214+
this.docFreqWeight = docFreqWeight;
215+
this.dispersionExponent = dispersionExponent;
130216
}
131217

132218
@Override
133219
public double score(
134220
final long[] partTermFreq,
135221
final long[] partTokens,
136-
final int focusPart
222+
final int focusPart,
223+
final int focusTermDocs,
224+
final int focusDocs
137225
) {
138226
checkInputs(partTermFreq, partTokens, focusPart);
139227

140228
final long focusTermFreq = partTermFreq[focusPart];
141229
final long focusTokens = partTokens[focusPart];
142230
if (focusTokens <= 0L) return Double.NaN;
143231

144-
double minG2 = Double.POSITIVE_INFINITY;
145-
boolean seen = false;
232+
// Collect signed pairwise G² values against every usable non-focus part.
233+
// Capacity is partCount - 1; actual fill may be smaller after the
234+
// minPartTokens filter.
235+
final double[] pairwise = new double[partTokens.length - 1];
236+
int n = 0;
146237
for (int p = 0; p < partTokens.length; p++) {
147238
if (p == focusPart) continue;
148239
if (partTokens[p] < minPartTokens) continue;
@@ -151,13 +242,58 @@ public double score(
151242
partTermFreq[p], partTokens[p]
152243
);
153244
if (Double.isNaN(g2)) continue;
154-
if (g2 < minG2) minG2 = g2;
155-
seen = true;
245+
pairwise[n++] = g2;
246+
}
247+
if (n == 0) return Double.NaN;
248+
249+
// Aggregate. For MIN, a single linear scan beats sorting; for MEDIAN
250+
// and KTH_WORST, sort once and index. Sort cost is O(n log n) over
251+
// partCount values — partCount is small (dozens), this is cheap.
252+
double base;
253+
switch (aggregation) {
254+
case MIN:
255+
base = pairwise[0];
256+
for (int i = 1; i < n; i++) {
257+
if (pairwise[i] < base) base = pairwise[i];
258+
}
259+
break;
260+
case MEDIAN:
261+
Arrays.sort(pairwise, 0, n);
262+
// (n+1)/2 in 1-based ranks → index (n-1)/2 in 0-based
263+
base = pairwise[(n - 1) / 2];
264+
break;
265+
case KTH_WORST:
266+
Arrays.sort(pairwise, 0, n);
267+
// k=1 → smallest → index 0; k=2 → next → index 1; ...
268+
// Cap at n if caller asked for a higher k than available.
269+
final int idx = Math.min(kthWorst, n) - 1;
270+
base = pairwise[idx];
271+
break;
272+
default:
273+
throw new IllegalStateException(
274+
"unhandled aggregation: " + aggregation);
156275
}
157-
return seen ? minG2 : Double.NaN;
276+
277+
// Optional doc-frequency weight. log(1 + d) gives 0 for d=0 and
278+
// grows slowly — same shape as IDF damping in BM25.
279+
if (docFreqWeight && focusTermDocs > 0) {
280+
base *= Math.log(1d + focusTermDocs);
281+
}
282+
283+
// Optional focus-internal dispersion penalty. coverage = fraction
284+
// of focus docs that contain the term. Penalty divides by
285+
// (1/coverage)^a; equivalent to multiplying by coverage^a. With
286+
// a=0 the penalty is 1 (no effect). With a=1 a term in 10% of
287+
// focus docs is demoted by 10x.
288+
if (dispersionExponent > 0d && focusDocs > 0 && focusTermDocs > 0) {
289+
final double coverage = (double) focusTermDocs / (double) focusDocs;
290+
base *= Math.pow(coverage, dispersionExponent);
291+
}
292+
293+
return base;
158294
}
159295
}
160-
296+
161297
/**
162298
* Validates per-part vector shapes and per-cell invariants. Called
163299
* once per term during ranking; kept to a single pass.

common/src/java/com/github/oeuvres/alix/lucene/terms/TopTerms.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -518,7 +518,13 @@ public TopTerms partScore(
518518
if (focusFreq > 0) {
519519
focusTermFreq[termId] = focusFreq;
520520
focusTermDocs[termId] = focusDocsForTerm;
521-
final double s = scorer.score(partTermFreq, partTokens, focusPart);
521+
final double s = scorer.score(
522+
partTermFreq,
523+
partTokens,
524+
focusPart,
525+
focusDocsForTerm,
526+
partition.docs(focusPart)
527+
);
522528
if (!Double.isNaN(s)) {
523529
termScores[termId] = s;
524530
top.push(termId, s);

fr/src/resources/com/github/oeuvres/alix/fr/word.csv

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -104934,10 +104934,10 @@ cornichonneries,NOUN,cornichonnerie
104934104934
cornichons,NOUN,cornichon
104935104935
cornier,NOUN,cornier
104936104936
cornier,ADJ,cornier
104937-
cornière,NOUN,cornier
104938-
cornière,ADJ,cornier
104939-
cornières,NOUN,cornier
104940-
cornières,ADJ,cornier
104937+
cornière,NOUN,cornière
104938+
cornière,ADJ,cornière
104939+
cornières,NOUN,cornière
104940+
cornières,ADJ,cornière
104941104941
corniers,NOUN,cornier
104942104942
corniers,ADJ,cornier
104943104943
corniez,VERB,corner

web/src/main/java/com/github/oeuvres/alix/web/AlixServlet.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ protected void doGet(
105105
final HttpServletResponse response
106106
) throws IOException
107107
{
108+
response.setContentType("text/html; charset=UTF-8");
108109
final String pathInfo = (request.getPathInfo() != null) ? request.getPathInfo() : "/";
109110
final String[] segments = pathInfo.split("/");
110111

web/src/main/java/com/github/oeuvres/alix/web/Op.java

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -77,24 +77,24 @@ public boolean offer(
7777
* @param index the target index
7878
* @param format the requested format extension, or {@code null}
7979
* for the default full HTML page
80-
* @param req servlet request
81-
* @param resp servlet response
80+
* @param request servlet request
81+
* @param response servlet response
8282
*/
8383
public final void dispatch(
8484
final LuceneIndex index,
8585
final String format,
86-
final HttpServletRequest req,
87-
final HttpServletResponse resp) throws IOException
86+
final HttpServletRequest request,
87+
final HttpServletResponse response) throws IOException
8888
{
8989
if (format == null) {
90-
page(index, req, resp);
90+
page(index, request, response);
9191
} else
9292
switch (format) {
93-
case "json" -> json(index, req, resp);
94-
case "html" -> html(index, req, resp);
95-
case "jsonl" -> jsonl(index, req, resp);
96-
case "csv" -> csv(index, req, resp);
97-
default -> AlixServlet.jsonError(resp, 406,
93+
case "json" -> json(index, request, response);
94+
case "html" -> html(index, request, response);
95+
case "jsonl" -> jsonl(index, request, response);
96+
case "csv" -> csv(index, request, response);
97+
default -> AlixServlet.jsonError(response, 406,
9898
getClass().getSimpleName() + ": unsupported format: " + format);
9999
}
100100
}

web/src/main/java/com/github/oeuvres/alix/web/OpResults.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ protected void page(LuceneIndex index, HttpServletRequest request, HttpServletRe
3939
throws IOException
4040
{
4141
final HttpPars pars = new HttpPars(request, response);
42-
response.setContentType("text/html; charset=UTF-8");
4342
Writer writer = response.getWriter();
4443
writer.write("""
4544
<!DOCTYPE html>

0 commit comments

Comments
 (0)