Skip to content

Commit ac10c2e

Browse files
committed
Fix KWIC capping
The properties krill.match.max.token and krill.context.max.token, and the corresponding variables and parameters such as maxTokenMatchSize, were introduced to configure the maximum visible token length of search hits with context ("KWICs") and exports, in order to comply with copyright and license restrictions, which are very important. However, the implementation was flawed and apparently based on a misunderstanding between linguists, lawyers and programmers. The only point that matters legally is the total number of tokens shown in a KWIC snippet (left context + match + right context). If an actual match is larger than krill.kwic.max.token, it must be cut down to krill.kwic.max.token; if not, the remaining token budget should be distributed between left and right context, either equally or in such a way that the total number of capped words is minimized. Change-Id: Ib0afd476fcd84144d4d9db18839ed8b9952f92e3
1 parent cd3fb7e commit ac10c2e

File tree

7 files changed

+235
-67
lines changed

7 files changed

+235
-67
lines changed

src/main/java/de/ids_mannheim/korap/KrillIndex.java

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -997,8 +997,9 @@ public Match getMatchInfo (String idString, String field, boolean info,
997997
if (DEBUG)
998998
log.trace("Get info on {}", idString);
999999

1000-
int maxTokenMatchSize = KrillProperties.maxTokenMatchSize;
1001-
Match match = new Match(maxTokenMatchSize, idString, includeHighlights);
1000+
// Limit the match size to at most the total KWIC snippet cap
1001+
int kwicMax = de.ids_mannheim.korap.util.KrillProperties.getMaxTokenKwicSize();
1002+
Match match = new Match(kwicMax, idString, includeHighlights);
10021003

10031004
if (this.getVersion() != null)
10041005
match.setVersion(this.getVersion());
@@ -1569,11 +1570,8 @@ public Result search (Krill ks) {
15691570
? lreader.document(localDocID, fieldsSet)
15701571
: lreader.document(localDocID);
15711572

1572-
int maxMatchSize = ks.getMaxTokenMatchSize();
1573-
if (maxMatchSize <= 0
1574-
|| maxMatchSize > KrillProperties.maxTokenMatchSize) {
1575-
maxMatchSize = KrillProperties.maxTokenMatchSize;
1576-
};
1573+
// Use total KWIC cap for match capping, ignore per-query match limits
1574+
int maxMatchSize = KrillProperties.maxTokenKwicSize;
15771575

15781576
// Create new Match
15791577
final Match match = new Match(maxMatchSize, pto, localDocID,

src/main/java/de/ids_mannheim/korap/response/Match.java

Lines changed: 116 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1672,6 +1672,60 @@ public ObjectNode getSnippetTokens () {
16721672
log.debug("Set endContext {}", endContext);
16731673
};
16741674

1675+
// Enforce total KWIC token cap (left + match + right)
1676+
int kwicMax = KrillProperties.getMaxTokenKwicSize();
1677+
if (kwicMax > 0) {
1678+
// Compute segment lengths; endContext is treated as an exclusive bound here
1679+
int leftLen = (startContext < this.startPos) ? (this.startPos - startContext) : 0;
1680+
int matchLen = (this.endPos > this.startPos) ? (this.endPos - this.startPos) : 0;
1681+
int rightLen = (endContext > this.endPos) ? (endContext - this.endPos) : 0;
1682+
int total = leftLen + matchLen + rightLen;
1683+
1684+
if (matchLen >= kwicMax) {
1685+
// Cut match to kwicMax, drop all context
1686+
this.endPos = this.startPos + kwicMax;
1687+
this.endCutted = true;
1688+
startContext = this.startPos;
1689+
endContext = this.endPos; // exclusive bound
1690+
}
1691+
else if (total > kwicMax) {
1692+
int toReduce = total - kwicMax;
1693+
int reduceLeft = Math.min((toReduce + 1) / 2, leftLen);
1694+
int reduceRight = Math.min(toReduce - reduceLeft, rightLen);
1695+
1696+
int rest = toReduce - (reduceLeft + reduceRight);
1697+
if (rest > 0) {
1698+
int extraRight = Math.min(rest, rightLen - reduceRight);
1699+
reduceRight += extraRight;
1700+
rest -= extraRight;
1701+
}
1702+
if (rest > 0) {
1703+
int extraLeft = Math.min(rest, leftLen - reduceLeft);
1704+
reduceLeft += extraLeft;
1705+
rest -= extraLeft;
1706+
}
1707+
1708+
startContext += reduceLeft;
1709+
endContext -= reduceRight;
1710+
1711+
if (rest > 0) {
1712+
// Trim remaining from match end
1713+
int newMatchLen = matchLen - rest;
1714+
if (newMatchLen < 0) newMatchLen = 0;
1715+
this.endPos = this.startPos + newMatchLen;
1716+
this.endCutted = true;
1717+
if (endContext < this.endPos)
1718+
endContext = this.endPos;
1719+
}
1720+
}
1721+
else {
1722+
// No trimming necessary
1723+
if (DEBUG)
1724+
log.debug("KWIC cap not reached: total={} ≤ cap={}",
1725+
leftLen + matchLen + rightLen, kwicMax);
1726+
}
1727+
}
1728+
16751729
// Retrieve the character offsets for all tokens
16761730
for (int i = startContext; i < endContext; i++) {
16771731
pto.add(ldid, i);
@@ -2404,6 +2458,61 @@ private int[] _processOffsetChars (int ldid, int startPosChar,
24042458
: endPosChar + this.context.right.getLength();
24052459
};
24062460

2461+
// Enforce total KWIC token cap (left + match + right) on token offsets
2462+
int kwicMax = KrillProperties.getMaxTokenKwicSize();
2463+
if (kwicMax > 0) {
2464+
int leftLen = (startOffset != -1) ? (this.startPos - startOffset) : 0;
2465+
if (leftLen < 0) leftLen = 0;
2466+
int matchLen = (this.endPos > this.startPos) ? (this.endPos - this.startPos) : 0;
2467+
int rightLen = (endOffset != -1) ? (endOffset - (this.endPos - 1)) : 0;
2468+
if (rightLen < 0) rightLen = 0;
2469+
int total = leftLen + matchLen + rightLen;
2470+
2471+
if (matchLen >= kwicMax) {
2472+
// Cut match to kwicMax and drop context
2473+
this.endPos = this.startPos + kwicMax;
2474+
this.endCutted = true;
2475+
startOffset = this.startPos;
2476+
endOffset = this.endPos - 1;
2477+
}
2478+
else if (total > kwicMax) {
2479+
int toReduce = total - kwicMax;
2480+
int reduceLeft = Math.min((toReduce + 1) / 2, leftLen);
2481+
int reduceRight = Math.min(toReduce - reduceLeft, rightLen);
2482+
2483+
int rest = toReduce - (reduceLeft + reduceRight);
2484+
if (rest > 0) {
2485+
int extraRight = Math.min(rest, rightLen - reduceRight);
2486+
reduceRight += extraRight;
2487+
rest -= extraRight;
2488+
}
2489+
if (rest > 0) {
2490+
int extraLeft = Math.min(rest, leftLen - reduceLeft);
2491+
reduceLeft += extraLeft;
2492+
rest -= extraLeft;
2493+
}
2494+
2495+
if (startOffset != -1)
2496+
startOffset += reduceLeft;
2497+
if (endOffset != -1)
2498+
endOffset -= reduceRight;
2499+
2500+
if (rest > 0) {
2501+
int newMatchLen = matchLen - rest;
2502+
if (newMatchLen < 0) newMatchLen = 0;
2503+
this.endPos = this.startPos + newMatchLen;
2504+
this.endCutted = true;
2505+
if (endOffset != -1)
2506+
endOffset = Math.max(endOffset, this.endPos - 1);
2507+
}
2508+
else {
2509+
if (DEBUG)
2510+
log.debug("KWIC cap not reached (offset path): total={} ≤ cap={}",
2511+
leftLen + matchLen + rightLen, kwicMax);
2512+
}
2513+
}
2514+
}
2515+
24072516
if (startOffset != -1)
24082517
startOffsetChar = pto.start(ldid, startOffset);
24092518

@@ -2415,13 +2524,19 @@ private int[] _processOffsetChars (int ldid, int startPosChar,
24152524
log.trace("Premature found offsets at {}-{}", startOffsetChar,
24162525
endOffsetChar);
24172526

2527+
// Ensure zero-context means match-only and not full document
2528+
if (startOffset == -1 && (startOffsetChar < 0 || this.context.left.getLength() == 0))
2529+
startOffsetChar = startPosChar;
2530+
if (endOffset == -1 && (endOffsetChar < 0 || this.context.right.getLength() == 0))
2531+
endOffsetChar = endPosChar;
2532+
24182533

24192534
// This can happen in case of non-token characters
24202535
// in the match and null offsets
24212536
if (startOffsetChar > startPosChar)
24222537
startOffsetChar = startPosChar;
24232538
else if (startOffsetChar < 0)
2424-
startOffsetChar = 0;
2539+
startOffsetChar = startPosChar;
24252540

24262541
// No "..." at the beginning
24272542
if (startOffsetChar == 0)

src/main/java/de/ids_mannheim/korap/util/KrillProperties.java

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ public class KrillProperties {
2323

2424
public static int maxTokenMatchSize = 50;
2525
public static int maxTokenContextSize = 60;
26+
// New: Total KWIC size cap (match + left + right)
27+
// Default to derived value even if properties are never loaded
28+
public static int maxTokenKwicSize = (2 * maxTokenContextSize) + maxTokenMatchSize;
2629
public static int maxCharContextSize = 500;
2730
public static int defaultSearchContextLength = 6;
2831
public static int maxTextSize = DEFAULT_MAX_STRING_LEN; // Default max text size
@@ -89,19 +92,21 @@ public static Properties loadProperties (String propFile) {
8992
public static void updateConfigurations (Properties prop) {
9093
String maxTokenMatchSize = prop.getProperty("krill.match.max.token");
9194
String maxTokenContextSize = prop.getProperty("krill.context.max.token");
95+
String maxTokenKwicSize = prop.getProperty("krill.kwic.max.token");
9296
// EM: not implemented yet
9397
// String maxCharContextSize = prop.getProperty("krill.context.max.char");
9498
String defaultSearchContextLength = prop.getProperty("krill.search.context.default");
9599
String maxTextSizeValue = prop.getProperty("krill.index.textSize.max");
96100

97101
try {
98102
if (maxTokenMatchSize != null) {
99-
KrillProperties.maxTokenMatchSize = Integer
100-
.parseInt(maxTokenMatchSize);
103+
KrillProperties.maxTokenMatchSize = Integer.parseInt(maxTokenMatchSize);
101104
}
102105
if (maxTokenContextSize != null) {
103-
KrillProperties.maxTokenContextSize = Integer
104-
.parseInt(maxTokenContextSize);
106+
KrillProperties.maxTokenContextSize = Integer.parseInt(maxTokenContextSize);
107+
}
108+
if (maxTokenKwicSize != null) {
109+
KrillProperties.maxTokenKwicSize = Integer.parseInt(maxTokenKwicSize);
105110
}
106111
// if (maxCharContextSize != null) {
107112
// KrillProperties.maxCharContextSize = Integer
@@ -128,6 +133,34 @@ public static void updateConfigurations (Properties prop) {
128133
log.error("A Krill property expects numerical values: "
129134
+ e.getMessage());
130135
};
136+
137+
// Always ensure kwic cap has a sensible value, even if not configured
138+
if (KrillProperties.maxTokenKwicSize <= 0) {
139+
KrillProperties.maxTokenKwicSize = (2 * KrillProperties.maxTokenContextSize)
140+
+ KrillProperties.maxTokenMatchSize;
141+
}
142+
143+
// Handle deprecation and fallback for KWIC size
144+
if (KrillProperties.maxTokenKwicSize <= 0) {
145+
boolean legacyMatchSet = (maxTokenMatchSize != null);
146+
boolean legacyContextSet = (maxTokenContextSize != null);
147+
if (legacyMatchSet || legacyContextSet) {
148+
if (legacyMatchSet)
149+
log.warn("Property 'krill.match.max.token' is deprecated. Use 'krill.kwic.max.token'.");
150+
if (legacyContextSet)
151+
log.warn("Property 'krill.context.max.token' is deprecated. Use 'krill.kwic.max.token'.");
152+
// Compute sensible default from deprecated settings
153+
KrillProperties.maxTokenKwicSize = (2 * KrillProperties.maxTokenContextSize)
154+
+ KrillProperties.maxTokenMatchSize;
155+
log.warn("Computed 'krill.kwic.max.token' as {} from deprecated settings.",
156+
KrillProperties.maxTokenKwicSize);
157+
}
158+
else {
159+
// Neither new nor legacy; derive from current defaults
160+
KrillProperties.maxTokenKwicSize = (2 * KrillProperties.maxTokenContextSize)
161+
+ KrillProperties.maxTokenMatchSize;
162+
}
163+
}
131164

132165
String p = prop.getProperty("krill.test", "false");
133166
isTest = Boolean.parseBoolean(p);
@@ -139,6 +172,15 @@ public static void updateConfigurations (Properties prop) {
139172
matchExpansionIncludeContextSize = Boolean.parseBoolean(matchExpansion);
140173

141174
secret = prop.getProperty("krill.secretB64", "");
175+
176+
log.info("Effective krill.kwic.max.token = {}", KrillProperties.maxTokenKwicSize);
177+
}
178+
179+
public static int getMaxTokenKwicSize() {
180+
// In case properties were never loaded, return a derived sensible default
181+
if (maxTokenKwicSize <= 0)
182+
maxTokenKwicSize = (2 * maxTokenContextSize) + maxTokenMatchSize;
183+
return maxTokenKwicSize;
142184
}
143185

144186

src/main/resources/krill.properties.info

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,15 @@ krill.index.commit.log = log/krill.commit.log
1515
krill.index.commit.auto = 500
1616
krill.index.relations.max = 100
1717
krill.index.textSize.max = 20000000
18+
19+
# Snippet (KWIC) settings
20+
# New: Maximum total number of tokens per KWIC snippet (left + match + right)
21+
# If unset, and deprecated properties below are set, Krill will compute this as
22+
# 2*krill.context.max.token + krill.match.max.token and log a deprecation warning.
23+
# krill.kwic.max.token = 100
24+
25+
# Deprecated: These are ignored when 'krill.kwic.max.token' is set and will be removed.
26+
# They were previously used to cap match length and per-side context lengths, but
27+
# licensing limits apply to the total snippet size, not to the match alone.
28+
#krill.match.max.token = [DEPRECATED]
29+
#krill.context.max.token = [DEPRECATED]

0 commit comments

Comments
 (0)