Skip to content

Commit b26eff0

Browse files
committed
PartScorer works, but doesn’t do interesting things yet
1 parent bfed214 commit b26eff0

6 files changed

Lines changed: 496 additions & 262 deletions

File tree

common/src/java/com/github/oeuvres/alix/lucene/FlucNum.java

Lines changed: 76 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -384,21 +384,9 @@ public int numBytes()
384384
* </pre>
385385
*
386386
* <p>
387-
* The returned {@link Partition} is query-specific. Documents are assigned
388-
* only if all of the following are true:
389-
* </p>
390-
*
391-
* <ul>
392-
* <li>the document has a value for this numeric field;</li>
393-
* <li>the document is present in {@code acceptedDocs}, unless
394-
* {@code acceptedDocs == null};</li>
395-
* <li>the document's value belongs to a retained part.</li>
396-
* </ul>
397-
*
398-
* <p>
399387
* Documents outside the partition are assigned {@link Partition#NO_PART}.
400-
* Valid part ids are chronological/value order. The focus part is the part
401-
* corresponding to {@code [start, end]} and is available through
388+
* Valid part ids are in chronological/value order. The focus part is the
389+
* part corresponding to {@code [start, end]} and is available through
402390
* {@link Partition#focusPart()}.
403391
* </p>
404392
*
@@ -424,11 +412,11 @@ public int numBytes()
424412
public Partition partition(
425413
final int start,
426414
final int end,
427-
final PartialExtremityPolicy policy,
415+
PartialExtremityPolicy policy,
428416
final FixedBitSet acceptedDocs
429417
) throws IOException {
430418
if (policy == null) {
431-
throw new NullPointerException("policy");
419+
policy = PartialExtremityPolicy.ABSORB;
432420
}
433421
if (start > end) {
434422
throw new IllegalArgumentException(
@@ -445,69 +433,72 @@ public Partition partition(
445433
+ " < reader.maxDoc()=" + maxDoc);
446434
}
447435

448-
if (end < min || start > max) {
436+
// After cacheDense() we know min/max round-trip exactly through int.
437+
final int intMin = (int) min;
438+
final int intMax = (int) max;
439+
440+
if (end < intMin || start > intMax) {
449441
throw new IllegalArgumentException(
450442
"Focus interval [" + start + ',' + end
451443
+ "] does not overlap field range ["
452-
+ min + ',' + max + ']');
444+
+ intMin + ',' + intMax + ']');
453445
}
454446

455-
final int width = end - start + 1;
456-
if (width <= 0L) {
447+
// Compute width in long to detect overflow when start/end span the int
448+
// range. start <= end has already been validated, so widthLong >= 1.
449+
final long widthLong = (long) end - (long) start + 1L;
450+
if (widthLong > Integer.MAX_VALUE) {
457451
throw new IllegalArgumentException(
458-
"Invalid focus width for interval [" + start + ',' + end + ']');
452+
"Focus width too large: " + widthLong);
459453
}
454+
final int width = (int) widthLong;
460455

461-
final long kMin = Math.floorDiv((long) min - (long) start, width);
462-
final long kMax = Math.floorDiv((long) max - (long) start, width);
463-
464-
if (kMin > 0L || kMax < 0L) {
465-
throw new IllegalArgumentException(
466-
"Focus interval [" + start + ',' + end
467-
+ "] is not represented in field range ["
468-
+ min + ',' + max + ']');
469-
}
456+
// Anchored part index k: the focus is at k = 0, parts before are k < 0,
457+
// parts after are k > 0. Each k stands for the inclusive value range
458+
// [start + k*width, start + (k+1)*width - 1].
459+
// The overlap test above guarantees kMin <= 0 <= kMax.
460+
final long kMin = Math.floorDiv((long) intMin - (long) start, width);
461+
final long kMax = Math.floorDiv((long) intMax - (long) start, width);
470462

471463
final long rawCountLong = kMax - kMin + 1L;
472-
if (rawCountLong < 1L || rawCountLong > Integer.MAX_VALUE) {
464+
if (rawCountLong > Integer.MAX_VALUE) {
473465
throw new IllegalArgumentException(
474-
"Invalid raw part count: " + rawCountLong);
466+
"Too many raw parts: " + rawCountLong);
475467
}
476-
477468
final int rawCount = (int) rawCountLong;
478-
final long[] groupStart = new long[rawCount];
479-
final long[] groupEnd = new long[rawCount];
480469

481-
for (int i = 0; i < rawCount; i++) {
482-
final long k = kMin + i;
483-
groupStart[i] = k;
484-
groupEnd[i] = k;
485-
}
486-
487-
int first = 0;
488-
int last = rawCount - 1;
489-
490-
final boolean leftPartial = rawPartStart(start, width, kMin) < min;
491-
final boolean rightPartial = rawPartEnd(start, width, kMax) > max;
470+
// A leftmost / rightmost raw part is "partial" when the corpus does not
471+
// cover its full anchored width.
472+
final boolean leftPartial =
473+
((long) start + kMin * width) < intMin;
474+
final boolean rightPartial =
475+
((long) start + (kMax + 1L) * width - 1L) > intMax;
476+
477+
// Active raw range. Indices are into [0, rawCount); raw index i
478+
// corresponds to k = kMin + i. firstRaw/lastRaw narrow under DROP and
479+
// ABSORB; the absorbed raw is then redirected back into its neighbour
480+
// through mergeLeft/mergeRight.
481+
int firstRaw = 0;
482+
int lastRaw = rawCount - 1;
483+
boolean mergeLeft = false;
484+
boolean mergeRight = false;
492485

493486
if (leftPartial && kMin < 0L) {
494487
switch (policy) {
495488
case KEEP:
496489
break;
497490
case DROP:
498-
first++;
491+
firstRaw++;
499492
break;
500493
case ABSORB:
501-
if (kMin + 1L == 0L) {
502-
// Never absorb a partial extremity into the focus part.
503-
first++;
504-
} else {
505-
groupStart[1] = groupStart[0];
506-
first++;
494+
// Refuse to grow the focus part: when the only neighbour of
495+
// the partial extremity is the focus (kMin == -1), fall back
496+
// to DROP rather than absorb.
497+
firstRaw++;
498+
if (kMin != -1L) {
499+
mergeLeft = true;
507500
}
508501
break;
509-
default:
510-
throw new IllegalStateException("Unhandled policy: " + policy);
511502
}
512503
}
513504

@@ -516,23 +507,18 @@ public Partition partition(
516507
case KEEP:
517508
break;
518509
case DROP:
519-
last--;
510+
lastRaw--;
520511
break;
521512
case ABSORB:
522-
if (kMax - 1L == 0L) {
523-
// Never absorb a partial extremity into the focus part.
524-
last--;
525-
} else {
526-
groupEnd[last - 1] = groupEnd[last];
527-
last--;
513+
lastRaw--;
514+
if (kMax != 1L) {
515+
mergeRight = true;
528516
}
529517
break;
530-
default:
531-
throw new IllegalStateException("Unhandled policy: " + policy);
532518
}
533519
}
534520

535-
final int partCount = last - first + 1;
521+
final int partCount = lastRaw - firstRaw + 1;
536522
if (partCount < 1) {
537523
throw new IllegalArgumentException("No part left after applying " + policy);
538524
}
@@ -541,46 +527,53 @@ public Partition partition(
541527
"Too many parts for byte partition: " + partCount);
542528
}
543529

544-
int focusPart = Partition.NO_FOCUS;
530+
// Map each raw index to its part id. Indices outside [firstRaw, lastRaw]
531+
// are NO_PART unless absorbed into a neighbour. The focus part is the
532+
// one that contains k = 0.
545533
final int[] rawToPart = new int[rawCount];
546534
Arrays.fill(rawToPart, Partition.NO_PART);
547535

548-
for (int part = 0; part < partCount; part++) {
549-
final int groupIndex = first + part;
550-
final long from = groupStart[groupIndex];
551-
final long to = groupEnd[groupIndex];
552-
553-
if (from <= 0L && 0L <= to) {
536+
int focusPart = Partition.NO_FOCUS;
537+
for (int i = firstRaw; i <= lastRaw; i++) {
538+
final int part = i - firstRaw;
539+
rawToPart[i] = part;
540+
if ((kMin + i) == 0L) {
554541
focusPart = part;
555542
}
556-
557-
for (long k = from; k <= to; k++) {
558-
rawToPart[(int) (k - kMin)] = part;
559-
}
543+
}
544+
if (mergeLeft) {
545+
rawToPart[firstRaw - 1] = 0;
546+
}
547+
if (mergeRight) {
548+
rawToPart[lastRaw + 1] = partCount - 1;
560549
}
561550

562551
if (focusPart == Partition.NO_FOCUS) {
552+
// Unreachable: kMin <= 0 <= kMax and the focus part is never dropped
553+
// or absorbed. Defensive only.
563554
throw new IllegalStateException(
564-
"Focus part disappeared while building partition.");
555+
"Focus part missing while building partition.");
565556
}
566557

558+
// Assign documents. When acceptedDocs is null we walk only docs that
559+
// have a value; otherwise we walk the filter and skip docs without a
560+
// value.
567561
final Partition partition = new Partition(maxDoc, partCount, focusPart);
568-
569562
final FixedBitSet docsToScan =
570563
acceptedDocs == null ? densint.docHasValue : acceptedDocs;
564+
final int scanLimit = Math.min(docsToScan.length(), maxDoc);
571565

572566
for (int docId = docsToScan.nextSetBit(0);
573-
docId != DocIdSetIterator.NO_MORE_DOCS;
567+
docId != DocIdSetIterator.NO_MORE_DOCS && docId < scanLimit;
574568
docId = docsToScan.nextSetBit(docId + 1)) {
575569

576570
if (!densint.docHasValue.get(docId)) {
577571
continue;
578572
}
579573

580574
final int value = densint.docValues[docId];
581-
final long k = Math.floorDiv((long) value - (long) start, width);
582-
final long rawIndexLong = k - kMin;
583-
575+
final long rawIndexLong =
576+
Math.floorDiv((long) value - (long) start, (long) width) - kMin;
584577
if (rawIndexLong < 0L || rawIndexLong >= rawCount) {
585578
continue;
586579
}
@@ -595,7 +588,7 @@ public Partition partition(
595588

596589
return partition;
597590
}
598-
591+
599592
/**
600593
* Returns a human-readable point type label.
601594
*
@@ -741,50 +734,6 @@ private static boolean probeStored(
741734
return false;
742735
}
743736

744-
/**
745-
* Returns the inclusive end value of one raw anchored part.
746-
*
747-
* <p>
748-
* Raw part {@code k == 0} is the focus interval
749-
* {@code [start, start + width - 1]}. Negative parts precede it; positive
750-
* parts follow it.
751-
* </p>
752-
*
753-
* @param start focus start value
754-
* @param width part width
755-
* @param k raw anchored part index
756-
* @return inclusive raw part end
757-
*/
758-
private static long rawPartEnd(
759-
final int start,
760-
final long width,
761-
final long k
762-
) {
763-
return (long) start + (k + 1L) * width - 1L;
764-
}
765-
766-
/**
767-
* Returns the inclusive start value of one raw anchored part.
768-
*
769-
* <p>
770-
* Raw part {@code k == 0} is the focus interval
771-
* {@code [start, start + width - 1]}. Negative parts precede it; positive
772-
* parts follow it.
773-
* </p>
774-
*
775-
* @param start focus start value
776-
* @param width part width
777-
* @param k raw anchored part index
778-
* @return inclusive raw part start
779-
*/
780-
private static long rawPartStart(
781-
final int start,
782-
final long width,
783-
final long k
784-
) {
785-
return (long) start + k * width;
786-
}
787-
788737
/**
789738
* Policy for incomplete extremity parts when a fixed-width value partition is
790739
* anchored on a focus interval.

common/src/java/com/github/oeuvres/alix/lucene/terms/KeynessScorer.java

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,87 @@ public double score(
8585
return ppmFocus / ppmOther;
8686
}
8787
}
88+
89+
/**
90+
* Signed Pearson chi-square X² (Pearson 1900), 2×2 contingency.
91+
*
92+
* <p>
93+
* Same null model as {@link LogLikelihood} — independence of "term" and
94+
* "focus", expected counts from row/column marginals — but using the
95+
* Pearson statistic
96+
* </p>
97+
*
98+
* <pre>{@code
99+
* X² = Σ (O − E)² / E
100+
* }</pre>
101+
*
102+
* <p>
103+
* instead of the LL G². Sign convention matches {@code LogLikelihood}:
104+
* positive when the term's focus rate meets or exceeds its other rate.
105+
* </p>
106+
*
107+
* <p>
108+
* Pearson and LL agree closely for moderate counts but diverge on the
109+
* rare-term tail. Pearson's variance estimate {@code E} (a single
110+
* cell's expectation) reacts more harshly than LL's log-ratio when an
111+
* observed count is very far from a small expected count, which tends
112+
* to push very rare terms to the top of a Pearson ranking. LL's
113+
* {@code O · log(O/E)} is gentler in the same regime. This is the
114+
* usual reason corpus-linguistics work prefers LL over X² for keyness;
115+
* Dunning (1993) makes the argument explicit.
116+
* </p>
117+
*
118+
* <p>
119+
* Returns {@code 0} when either token total is non-positive (this check
120+
* runs first). Otherwise returns {@link Double#NaN} on invalid inputs
121+
* (a negative term count, or a term count exceeding its token total).
122+
* </p>
123+
*/
124+
class Chi2 implements KeynessScorer {
125+
@Override
126+
public double score(
127+
final long focusTermCount,
128+
final long focusTokens,
129+
final long otherTermCount,
130+
final long otherTokens
131+
) {
132+
if (focusTokens <= 0L || otherTokens <= 0L) return 0d;
133+
if (focusTermCount < 0L || otherTermCount < 0L) return Double.NaN;
134+
if (focusTermCount > focusTokens || otherTermCount > otherTokens) return Double.NaN;
135+
136+
final long focusNonTermCount = focusTokens - focusTermCount;
137+
final long otherNonTermCount = otherTokens - otherTermCount;
138+
139+
final long allTokens = focusTokens + otherTokens;
140+
final long allTermCount = focusTermCount + otherTermCount;
141+
final long allNonTermCount = focusNonTermCount + otherNonTermCount;
142+
143+
final double expectedFocusTerm = (double) focusTokens * allTermCount / allTokens;
144+
final double expectedOtherTerm = (double) otherTokens * allTermCount / allTokens;
145+
final double expectedFocusNonTerm = (double) focusTokens * allNonTermCount / allTokens;
146+
final double expectedOtherNonTerm = (double) otherTokens * allNonTermCount / allTokens;
147+
148+
double x2 = 0d;
149+
x2 += cell(focusTermCount, expectedFocusTerm);
150+
x2 += cell(otherTermCount, expectedOtherTerm);
151+
x2 += cell(focusNonTermCount, expectedFocusNonTerm);
152+
x2 += cell(otherNonTermCount, expectedOtherNonTerm);
153+
154+
return ((double) focusTermCount / focusTokens
155+
>= (double) otherTermCount / otherTokens) ? x2 : -x2;
156+
}
157+
158+
/**
159+
* One cell of the Pearson sum. Returns 0 when expected is non-positive
160+
* (degenerate marginal — corresponding row or column is empty).
161+
*/
162+
private static double cell(final long observed, final double expected)
163+
{
164+
if (expected <= 0d) return 0d;
165+
final double d = observed - expected;
166+
return (d * d) / expected;
167+
}
168+
}
88169

89170
/**
90171
* Log-Likelihood G² (Dunning 1993), for use as a significance pre-filter,

0 commit comments

Comments
 (0)