Skip to content

Commit ea62048

Browse files
committed
Fix KWIC capping
New property: krill.kwic.max.token. It is added as a single cap on the total number of tokens in a KWIC (left + match + right). Its default is computed from the existing defaults: 2 * krill.context.max.token + krill.match.max.token. Deprecation and migration logic: if krill.context.max.token and/or krill.match.max.token are set, a deprecation warning is logged and, if krill.kwic.max.token is not set, krill.kwic.max.token is derived as 2 * context + match. Change-Id: I135121afeaa26a8e9681d19de390029394fabf1f
1 parent cd3fb7e commit ea62048

File tree

5 files changed

+242
-0
lines changed

5 files changed

+242
-0
lines changed

src/main/java/de/ids_mannheim/korap/response/Match.java

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1136,6 +1136,7 @@ public void expandContextToSpan (String element) {
11361136
if (spanContext[0] >= 0
11371137
&& spanContext[0] < spanContext[1]) {
11381138

1139+
// Respect max match size and optionally context in expansions
11391140
int maxExpansionSize = KrillProperties.maxTokenMatchSize;
11401141
if (KrillProperties.matchExpansionIncludeContextSize) {
11411142
maxExpansionSize += KrillProperties.maxTokenContextSize;
@@ -1672,6 +1673,39 @@ public ObjectNode getSnippetTokens () {
16721673
log.debug("Set endContext {}", endContext);
16731674
};
16741675

1676+
// Enforce KWIC total token budget (left + match + right)
1677+
try {
1678+
int kwicMax = KrillProperties.maxTokenKwicSize;
1679+
int matchLen = Math.max(0, this.getEndPos() - this.getStartPos());
1680+
int total = endContext - startContext;
1681+
if (total > kwicMax) {
1682+
int allowedContext = Math.max(0, kwicMax - matchLen);
1683+
int leftAvail = Math.max(0, this.startPos - startContext);
1684+
int rightAvail = Math.max(0, endContext - this.endPos);
1685+
1686+
int left = Math.min(leftAvail, allowedContext / 2);
1687+
int right = Math.min(rightAvail, allowedContext - left);
1688+
1689+
// Distribute remaining allowance to the side that still has room
1690+
int remainder = allowedContext - (left + right);
1691+
if (remainder > 0) {
1692+
int addLeft = Math.min(remainder, Math.max(0, leftAvail - left));
1693+
left += addLeft;
1694+
remainder -= addLeft;
1695+
if (remainder > 0) {
1696+
int addRight = Math.min(remainder, Math.max(0, rightAvail - right));
1697+
right += addRight;
1698+
}
1699+
}
1700+
1701+
startContext = Math.max(0, this.startPos - left);
1702+
endContext = this.endPos + right;
1703+
}
1704+
}
1705+
catch (Exception e) {
1706+
// Be safe in case of unexpected values; fall back to original
1707+
}
1708+
16751709
// Retrieve the character offsets for all tokens
16761710
for (int i = startContext; i < endContext; i++) {
16771711
pto.add(ldid, i);

src/main/java/de/ids_mannheim/korap/util/KrillProperties.java

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ public class KrillProperties {
2323

2424
public static int maxTokenMatchSize = 50;
2525
public static int maxTokenContextSize = 60;
26+
// Maximum number of tokens for KWIC (left + match + right)
27+
// Defaults to match + 2 * context
28+
public static int maxTokenKwicSize = 2 * maxTokenContextSize + maxTokenMatchSize;
2629
public static int maxCharContextSize = 500;
2730
public static int defaultSearchContextLength = 6;
2831
public static int maxTextSize = DEFAULT_MAX_STRING_LEN; // Default max text size
@@ -89,19 +92,42 @@ public static Properties loadProperties (String propFile) {
8992
public static void updateConfigurations (Properties prop) {
9093
String maxTokenMatchSize = prop.getProperty("krill.match.max.token");
9194
String maxTokenContextSize = prop.getProperty("krill.context.max.token");
95+
String maxTokenKwicSize = prop.getProperty("krill.kwic.max.token");
9296
// EM: not implemented yet
9397
// String maxCharContextSize = prop.getProperty("krill.context.max.char");
9498
String defaultSearchContextLength = prop.getProperty("krill.search.context.default");
9599
String maxTextSizeValue = prop.getProperty("krill.index.textSize.max");
96100

97101
try {
102+
boolean legacyMatchSet = false;
103+
boolean legacyContextSet = false;
98104
if (maxTokenMatchSize != null) {
99105
KrillProperties.maxTokenMatchSize = Integer
100106
.parseInt(maxTokenMatchSize);
107+
legacyMatchSet = true;
101108
}
102109
if (maxTokenContextSize != null) {
103110
KrillProperties.maxTokenContextSize = Integer
104111
.parseInt(maxTokenContextSize);
112+
legacyContextSet = true;
113+
}
114+
// New unified KWIC limit
115+
if (maxTokenKwicSize != null) {
116+
KrillProperties.maxTokenKwicSize = Integer.parseInt(maxTokenKwicSize);
117+
} else {
118+
// If legacy properties are provided, compute KWIC and warn
119+
if (legacyMatchSet || legacyContextSet) {
120+
log.warn("Properties 'krill.match.max.token' and 'krill.context.max.token' are deprecated. Use 'krill.kwic.max.token' instead.");
121+
// Compute kwic from available values (fallback to defaults for missing)
122+
KrillProperties.maxTokenKwicSize =
123+
(2 * KrillProperties.maxTokenContextSize)
124+
+ KrillProperties.maxTokenMatchSize;
125+
} else {
126+
// No override given, ensure default consistency
127+
KrillProperties.maxTokenKwicSize =
128+
(2 * KrillProperties.maxTokenContextSize)
129+
+ KrillProperties.maxTokenMatchSize;
130+
}
105131
}
106132
// if (maxCharContextSize != null) {
107133
// KrillProperties.maxCharContextSize = Integer

src/main/resources/krill.properties.info

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,12 @@ krill.index.commit.log = log/krill.commit.log
1515
krill.index.commit.auto = 500
1616
krill.index.relations.max = 100
1717
krill.index.textSize.max = 20000000
18+
19+
# KWIC/snippet token limit
20+
# Maximum total number of tokens shown in a KWIC snippet
21+
# (left context + match + right context). This replaces the
22+
# deprecated properties 'krill.match.max.token' and
23+
# 'krill.context.max.token'. If you only set the deprecated
24+
# properties, Krill derives this value as:
25+
# 2 * krill.context.max.token + krill.match.max.token
26+
krill.kwic.max.token = [INTEGER TOTAL TOKENS]
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
package de.ids_mannheim.korap.index;
2+
3+
import java.util.HashMap;
4+
5+
import org.apache.lucene.index.LeafReaderContext;
6+
7+
/**
8+
* Lightweight test double for PositionsToOffset that avoids Lucene access.
9+
* Provides direct control over token-to-character offset mappings.
10+
*/
11+
public class FakePositionsToOffset extends PositionsToOffset {
12+
private final HashMap<Integer, Integer[]> offsets = new HashMap<>();
13+
14+
public FakePositionsToOffset() {
15+
super((LeafReaderContext) null, "tokens");
16+
}
17+
18+
public void putOffset(int tokenPosition, int startChar, int endChar) {
19+
offsets.put(tokenPosition, new Integer[] { startChar, endChar });
20+
}
21+
22+
@Override
23+
public void add(int docID, int pos) {
24+
// no-op; test manages offsets explicitly
25+
}
26+
27+
@Override
28+
public int start(int docID, int pos) {
29+
Integer[] pair = offsets.get(pos);
30+
return pair == null ? 0 : pair[0];
31+
}
32+
33+
@Override
34+
public int end(int docID, int pos) {
35+
Integer[] pair = offsets.get(pos);
36+
return pair == null ? -1 : pair[1];
37+
}
38+
39+
@Override
40+
public Integer[] span(int docID, int pos) {
41+
return offsets.get(pos);
42+
}
43+
}
44+
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
package de.ids_mannheim.korap.kwic;
2+
3+
import static org.junit.Assert.assertEquals;
4+
import static org.junit.Assert.assertTrue;
5+
6+
import java.util.Properties;
7+
8+
import org.junit.After;
9+
import org.junit.Before;
10+
import org.junit.Test;
11+
12+
import com.fasterxml.jackson.databind.JsonNode;
13+
14+
import de.ids_mannheim.korap.index.AbstractDocument;
15+
import de.ids_mannheim.korap.index.PositionsToOffset;
16+
import de.ids_mannheim.korap.index.FakePositionsToOffset;
17+
import de.ids_mannheim.korap.response.Match;
18+
import de.ids_mannheim.korap.response.SearchContext;
19+
import de.ids_mannheim.korap.util.KrillProperties;
20+
21+
public class TestKwicCap {
22+
23+
private int savedMatch;
24+
private int savedCtx;
25+
private int savedKwic;
26+
27+
@Before
28+
public void rememberDefaults() {
29+
savedMatch = KrillProperties.maxTokenMatchSize;
30+
savedCtx = KrillProperties.maxTokenContextSize;
31+
savedKwic = KrillProperties.maxTokenKwicSize;
32+
}
33+
34+
@After
35+
public void restoreDefaults() {
36+
KrillProperties.maxTokenMatchSize = savedMatch;
37+
KrillProperties.maxTokenContextSize = savedCtx;
38+
KrillProperties.maxTokenKwicSize = savedKwic;
39+
}
40+
41+
@Test
42+
public void testKwicDerivedFromLegacy() {
43+
Properties p = new Properties();
44+
p.setProperty("krill.match.max.token", "50");
45+
p.setProperty("krill.context.max.token", "25");
46+
47+
KrillProperties.updateConfigurations(p);
48+
49+
assertEquals(50, KrillProperties.maxTokenMatchSize);
50+
assertEquals(25, KrillProperties.maxTokenContextSize);
51+
assertEquals(2 * 25 + 50, KrillProperties.maxTokenKwicSize);
52+
}
53+
54+
@Test
55+
public void testExplicitKwicOverridesLegacy() {
56+
Properties p = new Properties();
57+
p.setProperty("krill.match.max.token", "5");
58+
p.setProperty("krill.context.max.token", "3");
59+
p.setProperty("krill.kwic.max.token", "42");
60+
61+
KrillProperties.updateConfigurations(p);
62+
63+
assertEquals(5, KrillProperties.maxTokenMatchSize);
64+
assertEquals(3, KrillProperties.maxTokenContextSize);
65+
assertEquals(42, KrillProperties.maxTokenKwicSize);
66+
}
67+
68+
@Test
69+
public void testMatchTrimUsesKwicCap() {
70+
int cap = 10;
71+
PositionsToOffset pto = new FakePositionsToOffset();
72+
Match m = new Match(cap, pto, /*localDocID*/ 0, /*start*/ 100, /*end*/ 150);
73+
74+
assertEquals(100, m.getStartPos());
75+
assertEquals(110, m.getEndPos());
76+
}
77+
78+
@Test
79+
public void testSnippetTokensRespectsKwicBudget() {
80+
// Build primary text and offsets for 50 tokens: w0 w1 ... w49
81+
StringBuilder sb = new StringBuilder();
82+
int tokenCount = 50;
83+
int[] startChar = new int[tokenCount];
84+
int[] endChar = new int[tokenCount];
85+
for (int i = 0; i < tokenCount; i++) {
86+
startChar[i] = sb.length();
87+
String w = "w" + i;
88+
sb.append(w);
89+
endChar[i] = sb.length();
90+
if (i < tokenCount - 1) sb.append(' ');
91+
}
92+
String primary = sb.toString();
93+
94+
// Fake PTO mapping token positions to char offsets
95+
FakePositionsToOffset pto = new FakePositionsToOffset();
96+
for (int i = 0; i < tokenCount; i++) {
97+
pto.putOffset(i, startChar[i], endChar[i]);
98+
}
99+
100+
// Match covers tokens [10,20): length 10
101+
int matchStart = 10;
102+
int matchEnd = 20;
103+
int kwicCap = 16; // -> allowed context = 6 => 3 left + 3 right
104+
KrillProperties.maxTokenKwicSize = kwicCap;
105+
106+
Match m = new Match(kwicCap, pto, 0, matchStart, matchEnd);
107+
m.setPrimaryData(primary);
108+
109+
// Request very large context to force trimming by KWIC
110+
SearchContext sc = new SearchContext();
111+
sc.left.setToken(true);
112+
sc.left.setLength(50);
113+
sc.right.setToken(true);
114+
sc.right.setLength(50);
115+
m.setContext(sc);
116+
m.hasTokens = true;
117+
118+
JsonNode tokens = m.getSnippetTokens();
119+
int leftSize = tokens.has("left") ? tokens.get("left").size() : 0;
120+
int matchSize = tokens.get("match").size();
121+
int rightSize = tokens.has("right") ? tokens.get("right").size() : 0;
122+
123+
assertEquals(10, matchSize);
124+
assertEquals(3, leftSize);
125+
assertEquals(3, rightSize);
126+
assertTrue(leftSize + matchSize + rightSize <= kwicCap);
127+
}
128+
}
129+

0 commit comments

Comments
 (0)