Skip to content

Commit a969800

Browse files
committed
Fix KWIC capping
The properties krill.match.max.token and krill.context.max.token, and the corresponding variables and parameters such as maxTokenMatchSize, were introduced to configure the maximum visible token length of search hits with context ("KWICs") and exports, in order to comply with copyright and license restrictions, which are very important. However, the implementation was flawed and apparently based on a misunderstanding between linguists, lawyers and programmers. The only point that matters legally is the total number of tokens shown in a KWIC snippet (left context + match + right context). If an actual match is larger than krill.kwic.max.token, it must be cut down to krill.kwic.max.token; if not, the remaining token budget should be distributed between left and right context, either equally or in such a way that the total number of capped words is minimized. Change-Id: Ib0afd476fcd84144d4d9db18839ed8b9952f92e3
1 parent cd3fb7e commit a969800

File tree

11 files changed

+371
-103
lines changed

11 files changed

+371
-103
lines changed

src/main/java/de/ids_mannheim/korap/KrillIndex.java

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -997,8 +997,9 @@ public Match getMatchInfo (String idString, String field, boolean info,
997997
if (DEBUG)
998998
log.trace("Get info on {}", idString);
999999

1000-
int maxTokenMatchSize = KrillProperties.maxTokenMatchSize;
1001-
Match match = new Match(maxTokenMatchSize, idString, includeHighlights);
1000+
// Use total KWIC cap to limit match size at most to the total snippet cap
1001+
int kwicMax = de.ids_mannheim.korap.util.KrillProperties.getMaxTokenKwicSize();
1002+
Match match = new Match(kwicMax, idString, includeHighlights);
10021003

10031004
if (this.getVersion() != null)
10041005
match.setVersion(this.getVersion());
@@ -1569,11 +1570,8 @@ public Result search (Krill ks) {
15691570
? lreader.document(localDocID, fieldsSet)
15701571
: lreader.document(localDocID);
15711572

1572-
int maxMatchSize = ks.getMaxTokenMatchSize();
1573-
if (maxMatchSize <= 0
1574-
|| maxMatchSize > KrillProperties.maxTokenMatchSize) {
1575-
maxMatchSize = KrillProperties.maxTokenMatchSize;
1576-
};
1573+
// Use total KWIC cap for match capping, ignore per-query match limits
1574+
int maxMatchSize = KrillProperties.maxTokenKwicSize;
15771575

15781576
// Create new Match
15791577
final Match match = new Match(maxMatchSize, pto, localDocID,

src/main/java/de/ids_mannheim/korap/response/Match.java

Lines changed: 180 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1672,6 +1672,60 @@ public ObjectNode getSnippetTokens () {
16721672
log.debug("Set endContext {}", endContext);
16731673
};
16741674

1675+
// Enforce total KWIC token cap (left + match + right)
1676+
int kwicMax = KrillProperties.getMaxTokenKwicSize();
1677+
if (kwicMax > 0) {
1678+
// Convert endContext to exclusive bound for iteration ease
1679+
int leftLen = (startContext < this.startPos) ? (this.startPos - startContext) : 0;
1680+
int matchLen = (this.endPos > this.startPos) ? (this.endPos - this.startPos) : 0;
1681+
int rightLen = (endContext > this.endPos) ? (endContext - this.endPos) : 0;
1682+
int total = leftLen + matchLen + rightLen;
1683+
1684+
if (matchLen >= kwicMax) {
1685+
// Cut match to kwicMax, drop all context
1686+
this.endPos = this.startPos + kwicMax;
1687+
this.endCutted = true;
1688+
startContext = this.startPos;
1689+
endContext = this.endPos; // exclusive bound
1690+
}
1691+
else if (total > kwicMax) {
1692+
int toReduce = total - kwicMax;
1693+
int reduceLeft = Math.min((toReduce + 1) / 2, leftLen);
1694+
int reduceRight = Math.min(toReduce - reduceLeft, rightLen);
1695+
1696+
int rest = toReduce - (reduceLeft + reduceRight);
1697+
if (rest > 0) {
1698+
int extraRight = Math.min(rest, rightLen - reduceRight);
1699+
reduceRight += extraRight;
1700+
rest -= extraRight;
1701+
}
1702+
if (rest > 0) {
1703+
int extraLeft = Math.min(rest, leftLen - reduceLeft);
1704+
reduceLeft += extraLeft;
1705+
rest -= extraLeft;
1706+
}
1707+
1708+
startContext += reduceLeft;
1709+
endContext -= reduceRight;
1710+
1711+
if (rest > 0) {
1712+
// Trim remaining from match end
1713+
int newMatchLen = matchLen - rest;
1714+
if (newMatchLen < 0) newMatchLen = 0;
1715+
this.endPos = this.startPos + newMatchLen;
1716+
this.endCutted = true;
1717+
if (endContext < this.endPos)
1718+
endContext = this.endPos;
1719+
}
1720+
}
1721+
else {
1722+
// No trimming necessary
1723+
if (DEBUG)
1724+
log.debug("KWIC cap not reached: total={} ≤ cap={}",
1725+
leftLen + matchLen + rightLen, kwicMax);
1726+
}
1727+
}
1728+
16751729
// Retrieve the character offsets for all tokens
16761730
for (int i = startContext; i < endContext; i++) {
16771731
pto.add(ldid, i);
@@ -2347,6 +2401,7 @@ private int[] _processOffsetChars (int ldid, int startPosChar,
23472401

23482402
int startOffsetChar = -1, endOffsetChar = -1;
23492403
int startOffset = -1, endOffset = -1;
2404+
PositionsToOffset pto = this.positionsToOffset;
23502405

23512406
// The offset is defined by a span
23522407
if (this.getContext().isSpanDefined()) {
@@ -2368,20 +2423,23 @@ private int[] _processOffsetChars (int ldid, int startPosChar,
23682423
if (DEBUG)
23692424
log.trace("Got context based on span {}-{}/{}-{}",
23702425
startOffset, endOffset, startOffsetChar, endOffsetChar);
2426+
// Make sure we can (re)compute character offsets after adjustments
2427+
this.positionsToOffset.add(ldid, startOffset);
2428+
this.positionsToOffset.add(ldid, endOffset);
23712429
};
23722430

23732431
// The offset is defined by tokens or characters
23742432
if (endOffset == -1) {
23752433

2376-
PositionsToOffset pto = this.positionsToOffset;
2434+
PositionsToOffset ptoTok = pto;
23772435

23782436
// The left offset is defined by tokens
23792437
if (this.context.left.isToken()) {
23802438
startOffset = this.startPos - this.context.left.getLength();
23812439
if (DEBUG)
23822440
log.trace("PTO will retrieve {} (Left context)",
23832441
startOffset);
2384-
pto.add(ldid, startOffset);
2442+
ptoTok.add(ldid, startOffset);
23852443
}
23862444

23872445
// The left offset is defined by characters
@@ -2395,7 +2453,7 @@ private int[] _processOffsetChars (int ldid, int startPosChar,
23952453
if (DEBUG)
23962454
log.trace("PTO will retrieve {} (Right context)",
23972455
endOffset);
2398-
pto.add(ldid, endOffset);
2456+
ptoTok.add(ldid, endOffset);
23992457
}
24002458

24012459
// The right context is defined by characters
@@ -2404,24 +2462,138 @@ private int[] _processOffsetChars (int ldid, int startPosChar,
24042462
: endPosChar + this.context.right.getLength();
24052463
};
24062464

2407-
if (startOffset != -1)
2408-
startOffsetChar = pto.start(ldid, startOffset);
2465+
// Enforce total KWIC token cap (left + match + right) on token offsets
2466+
int kwicMax = KrillProperties.getMaxTokenKwicSize();
2467+
if (kwicMax > 0) {
2468+
int leftLen = (startOffset != -1) ? (this.startPos - startOffset) : 0;
2469+
if (leftLen < 0) leftLen = 0;
2470+
int matchLen = (this.endPos > this.startPos) ? (this.endPos - this.startPos) : 0;
2471+
int rightLen = (endOffset != -1) ? (endOffset - (this.endPos - 1)) : 0;
2472+
if (rightLen < 0) rightLen = 0;
2473+
int total = leftLen + matchLen + rightLen;
2474+
2475+
if (matchLen >= kwicMax) {
2476+
// Cut match to kwicMax and drop context
2477+
this.endPos = this.startPos + kwicMax;
2478+
this.endCutted = true;
2479+
startOffset = this.startPos;
2480+
endOffset = this.endPos - 1;
2481+
}
2482+
else if (total > kwicMax) {
2483+
int toReduce = total - kwicMax;
2484+
int reduceLeft = Math.min((toReduce + 1) / 2, leftLen);
2485+
int reduceRight = Math.min(toReduce - reduceLeft, rightLen);
2486+
2487+
int rest = toReduce - (reduceLeft + reduceRight);
2488+
if (rest > 0) {
2489+
int extraRight = Math.min(rest, rightLen - reduceRight);
2490+
reduceRight += extraRight;
2491+
rest -= extraRight;
2492+
}
2493+
if (rest > 0) {
2494+
int extraLeft = Math.min(rest, leftLen - reduceLeft);
2495+
reduceLeft += extraLeft;
2496+
rest -= extraLeft;
2497+
}
2498+
2499+
if (startOffset != -1)
2500+
startOffset += reduceLeft;
2501+
if (endOffset != -1)
2502+
endOffset -= reduceRight;
2503+
2504+
if (rest > 0) {
2505+
int newMatchLen = matchLen - rest;
2506+
if (newMatchLen < 0) newMatchLen = 0;
2507+
this.endPos = this.startPos + newMatchLen;
2508+
this.endCutted = true;
2509+
if (endOffset != -1)
2510+
endOffset = Math.max(endOffset, this.endPos - 1);
2511+
}
2512+
else {
2513+
if (DEBUG)
2514+
log.debug("KWIC cap not reached (offset path): total={} ≤ cap={}",
2515+
leftLen + matchLen + rightLen, kwicMax);
2516+
}
2517+
}
2518+
}
24092519

2410-
if (endOffset != -1)
2411-
endOffsetChar = pto.end(ldid, endOffset);
24122520
};
24132521

2522+
// Enforce total KWIC token cap (left + match + right), regardless of span or token context
2523+
int kwicMax = KrillProperties.getMaxTokenKwicSize();
2524+
if (kwicMax > 0) {
2525+
int leftLen = (startOffset != -1) ? (this.startPos - startOffset) : 0;
2526+
if (leftLen < 0) leftLen = 0;
2527+
int matchLen = (this.endPos > this.startPos) ? (this.endPos - this.startPos) : 0;
2528+
int rightLen = (endOffset != -1) ? (endOffset - (this.endPos - 1)) : 0;
2529+
if (rightLen < 0) rightLen = 0;
2530+
int total = leftLen + matchLen + rightLen;
2531+
2532+
if (matchLen >= kwicMax) {
2533+
this.endPos = this.startPos + kwicMax;
2534+
this.endCutted = true;
2535+
startOffset = this.startPos;
2536+
endOffset = this.endPos - 1;
2537+
}
2538+
else if (total > kwicMax) {
2539+
int toReduce = total - kwicMax;
2540+
int reduceLeft = Math.min((toReduce + 1) / 2, leftLen);
2541+
int reduceRight = Math.min(toReduce - reduceLeft, rightLen);
2542+
int rest = toReduce - (reduceLeft + reduceRight);
2543+
if (rest > 0) {
2544+
int extraRight = Math.min(rest, rightLen - reduceRight);
2545+
reduceRight += extraRight;
2546+
rest -= extraRight;
2547+
}
2548+
if (rest > 0) {
2549+
int extraLeft = Math.min(rest, leftLen - reduceLeft);
2550+
reduceLeft += extraLeft;
2551+
rest -= extraLeft;
2552+
}
2553+
2554+
if (startOffset != -1)
2555+
startOffset += reduceLeft;
2556+
if (endOffset != -1)
2557+
endOffset -= reduceRight;
2558+
2559+
if (rest > 0) {
2560+
int newMatchLen = matchLen - rest;
2561+
if (newMatchLen < 0) newMatchLen = 0;
2562+
this.endPos = this.startPos + newMatchLen;
2563+
this.endCutted = true;
2564+
if (endOffset != -1)
2565+
endOffset = Math.max(endOffset, this.endPos - 1);
2566+
}
2567+
}
2568+
else {
2569+
if (DEBUG)
2570+
log.debug("KWIC cap not reached (unified path): total={} ≤ cap={}", total, kwicMax);
2571+
}
2572+
}
2573+
2574+
// Compute character offsets according to potentially adjusted token offsets
2575+
if (startOffset != -1)
2576+
startOffsetChar = pto.start(ldid, startOffset);
2577+
if (endOffset != -1)
2578+
endOffsetChar = pto.end(ldid, endOffset);
2579+
24142580
if (DEBUG)
24152581
log.trace("Premature found offsets at {}-{}", startOffsetChar,
24162582
endOffsetChar);
24172583

2584+
// Ensure zero-context means match-only and not full document
2585+
if (startOffset == -1 && (startOffsetChar < 0 || this.context.left.getLength() == 0))
2586+
startOffsetChar = startPosChar;
2587+
if (endOffset == -1 && (endOffsetChar < 0 || this.context.right.getLength() == 0))
2588+
endOffsetChar = endPosChar;
2589+
24182590

24192591
// This can happen in case of non-token characters
24202592
// in the match and null offsets
24212593
if (startOffsetChar > startPosChar)
24222594
startOffsetChar = startPosChar;
24232595
else if (startOffsetChar < 0)
2424-
startOffsetChar = 0;
2596+
startOffsetChar = startPosChar;
24252597

24262598
// No "..." at the beginning
24272599
if (startOffsetChar == 0)

src/main/java/de/ids_mannheim/korap/util/KrillProperties.java

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ public class KrillProperties {
2323

2424
public static int maxTokenMatchSize = 50;
2525
public static int maxTokenContextSize = 60;
26+
// New: Total KWIC size cap (match + left + right)
27+
// Default to derived value even if properties are never loaded
28+
public static int maxTokenKwicSize = (2 * maxTokenContextSize) + maxTokenMatchSize;
2629
public static int maxCharContextSize = 500;
2730
public static int defaultSearchContextLength = 6;
2831
public static int maxTextSize = DEFAULT_MAX_STRING_LEN; // Default max text size
@@ -89,19 +92,21 @@ public static Properties loadProperties (String propFile) {
8992
public static void updateConfigurations (Properties prop) {
9093
String maxTokenMatchSize = prop.getProperty("krill.match.max.token");
9194
String maxTokenContextSize = prop.getProperty("krill.context.max.token");
95+
String maxTokenKwicSize = prop.getProperty("krill.kwic.max.token");
9296
// EM: not implemented yet
9397
// String maxCharContextSize = prop.getProperty("krill.context.max.char");
9498
String defaultSearchContextLength = prop.getProperty("krill.search.context.default");
9599
String maxTextSizeValue = prop.getProperty("krill.index.textSize.max");
96100

97101
try {
98102
if (maxTokenMatchSize != null) {
99-
KrillProperties.maxTokenMatchSize = Integer
100-
.parseInt(maxTokenMatchSize);
103+
KrillProperties.maxTokenMatchSize = Integer.parseInt(maxTokenMatchSize);
101104
}
102105
if (maxTokenContextSize != null) {
103-
KrillProperties.maxTokenContextSize = Integer
104-
.parseInt(maxTokenContextSize);
106+
KrillProperties.maxTokenContextSize = Integer.parseInt(maxTokenContextSize);
107+
}
108+
if (maxTokenKwicSize != null) {
109+
KrillProperties.maxTokenKwicSize = Integer.parseInt(maxTokenKwicSize);
105110
}
106111
// if (maxCharContextSize != null) {
107112
// KrillProperties.maxCharContextSize = Integer
@@ -128,6 +133,34 @@ public static void updateConfigurations (Properties prop) {
128133
log.error("A Krill property expects numerical values: "
129134
+ e.getMessage());
130135
};
136+
137+
// Always ensure kwic cap has a sensible value, even if not configured
138+
if (KrillProperties.maxTokenKwicSize <= 0) {
139+
KrillProperties.maxTokenKwicSize = (2 * KrillProperties.maxTokenContextSize)
140+
+ KrillProperties.maxTokenMatchSize;
141+
}
142+
143+
// Handle deprecation and fallback for KWIC size
144+
if (KrillProperties.maxTokenKwicSize <= 0) {
145+
boolean legacyMatchSet = (maxTokenMatchSize != null);
146+
boolean legacyContextSet = (maxTokenContextSize != null);
147+
if (legacyMatchSet || legacyContextSet) {
148+
if (legacyMatchSet)
149+
log.warn("Property 'krill.match.max.token' is deprecated. Use 'krill.kwic.max.token'.");
150+
if (legacyContextSet)
151+
log.warn("Property 'krill.context.max.token' is deprecated. Use 'krill.kwic.max.token'.");
152+
// Compute sensible default from deprecated settings
153+
KrillProperties.maxTokenKwicSize = (2 * KrillProperties.maxTokenContextSize)
154+
+ KrillProperties.maxTokenMatchSize;
155+
log.warn("Computed 'krill.kwic.max.token' as {} from deprecated settings.",
156+
KrillProperties.maxTokenKwicSize);
157+
}
158+
else {
159+
// Neither new nor legacy; derive from current defaults
160+
KrillProperties.maxTokenKwicSize = (2 * KrillProperties.maxTokenContextSize)
161+
+ KrillProperties.maxTokenMatchSize;
162+
}
163+
}
131164

132165
String p = prop.getProperty("krill.test", "false");
133166
isTest = Boolean.parseBoolean(p);
@@ -139,6 +172,15 @@ public static void updateConfigurations (Properties prop) {
139172
matchExpansionIncludeContextSize = Boolean.parseBoolean(matchExpansion);
140173

141174
secret = prop.getProperty("krill.secretB64", "");
175+
176+
log.info("Effective krill.kwic.max.token = {}", KrillProperties.maxTokenKwicSize);
177+
}
178+
179+
public static int getMaxTokenKwicSize() {
180+
// In case properties were never loaded, return a derived sensible default
181+
if (maxTokenKwicSize <= 0)
182+
maxTokenKwicSize = (2 * maxTokenContextSize) + maxTokenMatchSize;
183+
return maxTokenKwicSize;
142184
}
143185

144186

src/main/resources/krill.properties.info

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,15 @@ krill.index.commit.log = log/krill.commit.log
1515
krill.index.commit.auto = 500
1616
krill.index.relations.max = 100
1717
krill.index.textSize.max = 20000000
18+
19+
# Snippet (KWIC) settings
20+
# New: Maximum total number of tokens per KWIC snippet (left + match + right)
21+
# If unset, and deprecated properties below are set, Krill will compute this as
22+
# 2*krill.context.max.token + krill.match.max.token and log a deprecation warning.
23+
# krill.kwic.max.token = 100
24+
25+
# Deprecated: These are ignored when 'krill.kwic.max.token' is set and will be removed.
26+
# They were previously used to cap match length and per-side context lengths, but
27+
# licensing limits apply to the total snippet size, not to the match alone.
28+
#krill.match.max.token = [DEPRECATED]
29+
#krill.context.max.token = [DEPRECATED]

0 commit comments

Comments
 (0)