Skip to content

Commit 3527856

Browse files
committed
Fix KWIC capping
The properties krill.match.max.token and krill.context.max.token, and the corresponding variables and parameters such as maxTokenMatchSize, were introduced to configure the maximum visible token length of search hits with context ("KWICs") and exports, in order to comply with copyright and license restrictions, which are very important. However, the implementation was flawed and apparently based on a misunderstanding between linguists, lawyers and programmers. The only point that matters legally is the total number of tokens shown in a KWIC snippet (left context + match + right context). If an actual match is larger than krill.kwic.max.token, it must be cut down to krill.kwic.max.token; otherwise, the remaining token budget should be distributed between left and right context, either equally or in such a way that the total number of capped words is minimized. Change-Id: Ib0afd476fcd84144d4d9db18839ed8b9952f92e3
1 parent cd3fb7e commit 3527856

File tree

11 files changed

+436
-103
lines changed

11 files changed

+436
-103
lines changed

src/main/java/de/ids_mannheim/korap/KrillIndex.java

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -997,8 +997,9 @@ public Match getMatchInfo (String idString, String field, boolean info,
997997
if (DEBUG)
998998
log.trace("Get info on {}", idString);
999999

1000-
int maxTokenMatchSize = KrillProperties.maxTokenMatchSize;
1001-
Match match = new Match(maxTokenMatchSize, idString, includeHighlights);
1000+
// Use total KWIC cap to limit match size at most to the total snippet cap
1001+
int kwicMax = de.ids_mannheim.korap.util.KrillProperties.getMaxTokenKwicSize();
1002+
Match match = new Match(kwicMax, idString, includeHighlights);
10021003

10031004
if (this.getVersion() != null)
10041005
match.setVersion(this.getVersion());
@@ -1569,11 +1570,8 @@ public Result search (Krill ks) {
15691570
? lreader.document(localDocID, fieldsSet)
15701571
: lreader.document(localDocID);
15711572

1572-
int maxMatchSize = ks.getMaxTokenMatchSize();
1573-
if (maxMatchSize <= 0
1574-
|| maxMatchSize > KrillProperties.maxTokenMatchSize) {
1575-
maxMatchSize = KrillProperties.maxTokenMatchSize;
1576-
};
1573+
// Use total KWIC cap for match capping, ignore per-query match limits
1574+
int maxMatchSize = KrillProperties.maxTokenKwicSize;
15771575

15781576
// Create new Match
15791577
final Match match = new Match(maxMatchSize, pto, localDocID,

src/main/java/de/ids_mannheim/korap/response/Match.java

Lines changed: 245 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1672,6 +1672,60 @@ public ObjectNode getSnippetTokens () {
16721672
log.debug("Set endContext {}", endContext);
16731673
};
16741674

1675+
// Enforce total KWIC token cap (left + match + right)
1676+
int kwicMax = KrillProperties.getMaxTokenKwicSize();
1677+
if (kwicMax > 0) {
1678+
// Convert endContext to exclusive bound for iteration ease
1679+
int leftLen = (startContext < this.startPos) ? (this.startPos - startContext) : 0;
1680+
int matchLen = (this.endPos > this.startPos) ? (this.endPos - this.startPos) : 0;
1681+
int rightLen = (endContext > this.endPos) ? (endContext - this.endPos) : 0;
1682+
int total = leftLen + matchLen + rightLen;
1683+
1684+
if (matchLen >= kwicMax) {
1685+
// Cut match to kwicMax, drop all context
1686+
this.endPos = this.startPos + kwicMax;
1687+
this.endCutted = true;
1688+
startContext = this.startPos;
1689+
endContext = this.endPos; // exclusive bound
1690+
}
1691+
else if (total > kwicMax) {
1692+
int toReduce = total - kwicMax;
1693+
int reduceLeft = Math.min((toReduce + 1) / 2, leftLen);
1694+
int reduceRight = Math.min(toReduce - reduceLeft, rightLen);
1695+
1696+
int rest = toReduce - (reduceLeft + reduceRight);
1697+
if (rest > 0) {
1698+
int extraRight = Math.min(rest, rightLen - reduceRight);
1699+
reduceRight += extraRight;
1700+
rest -= extraRight;
1701+
}
1702+
if (rest > 0) {
1703+
int extraLeft = Math.min(rest, leftLen - reduceLeft);
1704+
reduceLeft += extraLeft;
1705+
rest -= extraLeft;
1706+
}
1707+
1708+
startContext += reduceLeft;
1709+
endContext -= reduceRight;
1710+
1711+
if (rest > 0) {
1712+
// Trim remaining from match end
1713+
int newMatchLen = matchLen - rest;
1714+
if (newMatchLen < 0) newMatchLen = 0;
1715+
this.endPos = this.startPos + newMatchLen;
1716+
this.endCutted = true;
1717+
if (endContext < this.endPos)
1718+
endContext = this.endPos;
1719+
}
1720+
}
1721+
else {
1722+
// No trimming necessary
1723+
if (DEBUG)
1724+
log.debug("KWIC cap not reached: total={} ≤ cap={}",
1725+
leftLen + matchLen + rightLen, kwicMax);
1726+
}
1727+
}
1728+
16751729
// Retrieve the character offsets for all tokens
16761730
for (int i = startContext; i < endContext; i++) {
16771731
pto.add(ldid, i);
@@ -1706,6 +1760,9 @@ public ObjectNode getSnippetTokens () {
17061760
tokens = json.putArray("left");
17071761
for (i = startContext; i < this.startPos; i++) {
17081762
offsets = pto.span(ldid,i);
1763+
if (offsets == null) {
1764+
continue;
1765+
}
17091766
tokens.add(
17101767
codePointSubstring(this.tempSnippet,
17111768
offsets[0]- startContextChar, offsets[1] - startContextChar)
@@ -1774,6 +1831,38 @@ public ObjectNode getSnippetTokens () {
17741831

17751832
@JsonIgnore
17761833
public String getSnippetHTML () {
1834+
// Failsafe: enforce total KWIC cap by rebuilding context if necessary
1835+
int kwicMaxFS = KrillProperties.getMaxTokenKwicSize();
1836+
if (kwicMaxFS > 0) {
1837+
// Build tokens once to measure current KWIC size
1838+
ObjectNode tok = this.getSnippetTokens();
1839+
if (tok != null) {
1840+
int left = tok.has("left") ? tok.get("left").size() : 0;
1841+
int match = tok.has("match") ? tok.get("match").size() : 0;
1842+
int right = tok.has("right") ? tok.get("right").size() : 0;
1843+
int total = left + match + right;
1844+
if (total > kwicMaxFS) {
1845+
int allowedCtx = Math.max(kwicMaxFS - match, 0);
1846+
int leftAllowed = Math.min(left, (allowedCtx + 1) / 2);
1847+
int rightAllowed = Math.min(right, allowedCtx - leftAllowed);
1848+
int rest = allowedCtx - (leftAllowed + rightAllowed);
1849+
if (rest > 0) {
1850+
int extraRight = Math.min(rest, right - rightAllowed);
1851+
rightAllowed += extraRight;
1852+
rest -= extraRight;
1853+
}
1854+
if (rest > 0) {
1855+
int extraLeft = Math.min(rest, left - leftAllowed);
1856+
leftAllowed += extraLeft;
1857+
}
1858+
// Force token-based context (disable span) and rebuild
1859+
this.context = new SearchContext();
1860+
this.context.left.setToken(true).setLength(leftAllowed);
1861+
this.context.right.setToken(true).setLength(rightAllowed);
1862+
this._reset();
1863+
}
1864+
}
1865+
}
17771866

17781867
if (!this._processHighlight())
17791868
return null;
@@ -1908,6 +1997,36 @@ public String getSnippetHTML () {
19081997

19091998
@JsonIgnore
19101999
public String getSnippetBrackets () {
2000+
// Failsafe: enforce total KWIC cap also for bracket snippets
2001+
int kwicMaxFS = KrillProperties.getMaxTokenKwicSize();
2002+
if (kwicMaxFS > 0) {
2003+
ObjectNode tok = this.getSnippetTokens();
2004+
if (tok != null) {
2005+
int left = tok.has("left") ? tok.get("left").size() : 0;
2006+
int match = tok.has("match") ? tok.get("match").size() : 0;
2007+
int right = tok.has("right") ? tok.get("right").size() : 0;
2008+
int total = left + match + right;
2009+
if (total > kwicMaxFS) {
2010+
int allowedCtx = Math.max(kwicMaxFS - match, 0);
2011+
int leftAllowed = Math.min(left, (allowedCtx + 1) / 2);
2012+
int rightAllowed = Math.min(right, allowedCtx - leftAllowed);
2013+
int rest = allowedCtx - (leftAllowed + rightAllowed);
2014+
if (rest > 0) {
2015+
int extraRight = Math.min(rest, right - rightAllowed);
2016+
rightAllowed += extraRight;
2017+
rest -= extraRight;
2018+
}
2019+
if (rest > 0) {
2020+
int extraLeft = Math.min(rest, left - leftAllowed);
2021+
leftAllowed += extraLeft;
2022+
}
2023+
this.context = new SearchContext();
2024+
this.context.left.setToken(true).setLength(leftAllowed);
2025+
this.context.right.setToken(true).setLength(rightAllowed);
2026+
this._reset();
2027+
}
2028+
}
2029+
}
19112030

19122031
if (!this._processHighlight())
19132032
return null;
@@ -2347,6 +2466,7 @@ private int[] _processOffsetChars (int ldid, int startPosChar,
23472466

23482467
int startOffsetChar = -1, endOffsetChar = -1;
23492468
int startOffset = -1, endOffset = -1;
2469+
PositionsToOffset pto = this.positionsToOffset;
23502470

23512471
// The offset is defined by a span
23522472
if (this.getContext().isSpanDefined()) {
@@ -2368,20 +2488,23 @@ private int[] _processOffsetChars (int ldid, int startPosChar,
23682488
if (DEBUG)
23692489
log.trace("Got context based on span {}-{}/{}-{}",
23702490
startOffset, endOffset, startOffsetChar, endOffsetChar);
2491+
// Make sure we can (re)compute character offsets after adjustments
2492+
this.positionsToOffset.add(ldid, startOffset);
2493+
this.positionsToOffset.add(ldid, endOffset);
23712494
};
23722495

23732496
// The offset is defined by tokens or characters
23742497
if (endOffset == -1) {
23752498

2376-
PositionsToOffset pto = this.positionsToOffset;
2499+
PositionsToOffset ptoTok = pto;
23772500

23782501
// The left offset is defined by tokens
23792502
if (this.context.left.isToken()) {
23802503
startOffset = this.startPos - this.context.left.getLength();
23812504
if (DEBUG)
23822505
log.trace("PTO will retrieve {} (Left context)",
23832506
startOffset);
2384-
pto.add(ldid, startOffset);
2507+
ptoTok.add(ldid, startOffset);
23852508
}
23862509

23872510
// The left offset is defined by characters
@@ -2395,7 +2518,7 @@ private int[] _processOffsetChars (int ldid, int startPosChar,
23952518
if (DEBUG)
23962519
log.trace("PTO will retrieve {} (Right context)",
23972520
endOffset);
2398-
pto.add(ldid, endOffset);
2521+
ptoTok.add(ldid, endOffset);
23992522
}
24002523

24012524
// The right context is defined by characters
@@ -2404,24 +2527,138 @@ private int[] _processOffsetChars (int ldid, int startPosChar,
24042527
: endPosChar + this.context.right.getLength();
24052528
};
24062529

2407-
if (startOffset != -1)
2408-
startOffsetChar = pto.start(ldid, startOffset);
2530+
// Enforce total KWIC token cap (left + match + right) on token offsets
2531+
int kwicMax = KrillProperties.getMaxTokenKwicSize();
2532+
if (kwicMax > 0) {
2533+
int leftLen = (startOffset != -1) ? (this.startPos - startOffset) : 0;
2534+
if (leftLen < 0) leftLen = 0;
2535+
int matchLen = (this.endPos > this.startPos) ? (this.endPos - this.startPos) : 0;
2536+
int rightLen = (endOffset != -1) ? (endOffset - (this.endPos - 1)) : 0;
2537+
if (rightLen < 0) rightLen = 0;
2538+
int total = leftLen + matchLen + rightLen;
2539+
2540+
if (matchLen >= kwicMax) {
2541+
// Cut match to kwicMax and drop context
2542+
this.endPos = this.startPos + kwicMax;
2543+
this.endCutted = true;
2544+
startOffset = this.startPos;
2545+
endOffset = this.endPos - 1;
2546+
}
2547+
else if (total > kwicMax) {
2548+
int toReduce = total - kwicMax;
2549+
int reduceLeft = Math.min((toReduce + 1) / 2, leftLen);
2550+
int reduceRight = Math.min(toReduce - reduceLeft, rightLen);
2551+
2552+
int rest = toReduce - (reduceLeft + reduceRight);
2553+
if (rest > 0) {
2554+
int extraRight = Math.min(rest, rightLen - reduceRight);
2555+
reduceRight += extraRight;
2556+
rest -= extraRight;
2557+
}
2558+
if (rest > 0) {
2559+
int extraLeft = Math.min(rest, leftLen - reduceLeft);
2560+
reduceLeft += extraLeft;
2561+
rest -= extraLeft;
2562+
}
2563+
2564+
if (startOffset != -1)
2565+
startOffset += reduceLeft;
2566+
if (endOffset != -1)
2567+
endOffset -= reduceRight;
2568+
2569+
if (rest > 0) {
2570+
int newMatchLen = matchLen - rest;
2571+
if (newMatchLen < 0) newMatchLen = 0;
2572+
this.endPos = this.startPos + newMatchLen;
2573+
this.endCutted = true;
2574+
if (endOffset != -1)
2575+
endOffset = Math.max(endOffset, this.endPos - 1);
2576+
}
2577+
else {
2578+
if (DEBUG)
2579+
log.debug("KWIC cap not reached (offset path): total={} ≤ cap={}",
2580+
leftLen + matchLen + rightLen, kwicMax);
2581+
}
2582+
}
2583+
}
24092584

2410-
if (endOffset != -1)
2411-
endOffsetChar = pto.end(ldid, endOffset);
24122585
};
24132586

2587+
// Enforce total KWIC token cap (left + match + right), regardless of span or token context
2588+
int kwicMax = KrillProperties.getMaxTokenKwicSize();
2589+
if (kwicMax > 0) {
2590+
int leftLen = (startOffset != -1) ? (this.startPos - startOffset) : 0;
2591+
if (leftLen < 0) leftLen = 0;
2592+
int matchLen = (this.endPos > this.startPos) ? (this.endPos - this.startPos) : 0;
2593+
int rightLen = (endOffset != -1) ? (endOffset - (this.endPos - 1)) : 0;
2594+
if (rightLen < 0) rightLen = 0;
2595+
int total = leftLen + matchLen + rightLen;
2596+
2597+
if (matchLen >= kwicMax) {
2598+
this.endPos = this.startPos + kwicMax;
2599+
this.endCutted = true;
2600+
startOffset = this.startPos;
2601+
endOffset = this.endPos - 1;
2602+
}
2603+
else if (total > kwicMax) {
2604+
int toReduce = total - kwicMax;
2605+
int reduceLeft = Math.min((toReduce + 1) / 2, leftLen);
2606+
int reduceRight = Math.min(toReduce - reduceLeft, rightLen);
2607+
int rest = toReduce - (reduceLeft + reduceRight);
2608+
if (rest > 0) {
2609+
int extraRight = Math.min(rest, rightLen - reduceRight);
2610+
reduceRight += extraRight;
2611+
rest -= extraRight;
2612+
}
2613+
if (rest > 0) {
2614+
int extraLeft = Math.min(rest, leftLen - reduceLeft);
2615+
reduceLeft += extraLeft;
2616+
rest -= extraLeft;
2617+
}
2618+
2619+
if (startOffset != -1)
2620+
startOffset += reduceLeft;
2621+
if (endOffset != -1)
2622+
endOffset -= reduceRight;
2623+
2624+
if (rest > 0) {
2625+
int newMatchLen = matchLen - rest;
2626+
if (newMatchLen < 0) newMatchLen = 0;
2627+
this.endPos = this.startPos + newMatchLen;
2628+
this.endCutted = true;
2629+
if (endOffset != -1)
2630+
endOffset = Math.max(endOffset, this.endPos - 1);
2631+
}
2632+
}
2633+
else {
2634+
if (DEBUG)
2635+
log.debug("KWIC cap not reached (unified path): total={} ≤ cap={}", total, kwicMax);
2636+
}
2637+
}
2638+
2639+
// Compute character offsets according to potentially adjusted token offsets
2640+
if (startOffset != -1)
2641+
startOffsetChar = pto.start(ldid, startOffset);
2642+
if (endOffset != -1)
2643+
endOffsetChar = pto.end(ldid, endOffset);
2644+
24142645
if (DEBUG)
24152646
log.trace("Premature found offsets at {}-{}", startOffsetChar,
24162647
endOffsetChar);
24172648

2649+
// Ensure zero-context means match-only and not full document
2650+
if (startOffset == -1 && (startOffsetChar < 0 || this.context.left.getLength() == 0))
2651+
startOffsetChar = startPosChar;
2652+
if (endOffset == -1 && (endOffsetChar < 0 || this.context.right.getLength() == 0))
2653+
endOffsetChar = endPosChar;
2654+
24182655

24192656
// This can happen in case of non-token characters
24202657
// in the match and null offsets
24212658
if (startOffsetChar > startPosChar)
24222659
startOffsetChar = startPosChar;
24232660
else if (startOffsetChar < 0)
2424-
startOffsetChar = 0;
2661+
startOffsetChar = startPosChar;
24252662

24262663
// No "..." at the beginning
24272664
if (startOffsetChar == 0)

0 commit comments

Comments
 (0)