@@ -1672,6 +1672,60 @@ public ObjectNode getSnippetTokens () {
16721672 log .debug ("Set endContext {}" , endContext );
16731673 };
16741674
1675+ // Enforce total KWIC token cap (left + match + right)
1676+ int kwicMax = KrillProperties .getMaxTokenKwicSize ();
1677+ if (kwicMax > 0 ) {
1678+ // Count the tokens left of, inside and right of the match (endContext is treated as an exclusive bound)
1679+ int leftLen = (startContext < this .startPos ) ? (this .startPos - startContext ) : 0 ;
1680+ int matchLen = (this .endPos > this .startPos ) ? (this .endPos - this .startPos ) : 0 ;
1681+ int rightLen = (endContext > this .endPos ) ? (endContext - this .endPos ) : 0 ;
1682+ int total = leftLen + matchLen + rightLen ;
1683+
1684+ if (matchLen >= kwicMax ) {
1685+ // Cut match to kwicMax, drop all context
1686+ this .endPos = this .startPos + kwicMax ;
1687+ this .endCutted = true ;
1688+ startContext = this .startPos ;
1689+ endContext = this .endPos ; // exclusive bound
1690+ }
1691+ else if (total > kwicMax ) {
1692+ int toReduce = total - kwicMax ;
1693+ int reduceLeft = Math .min ((toReduce + 1 ) / 2 , leftLen );
1694+ int reduceRight = Math .min (toReduce - reduceLeft , rightLen );
1695+
1696+ int rest = toReduce - (reduceLeft + reduceRight );
1697+ if (rest > 0 ) {
1698+ int extraRight = Math .min (rest , rightLen - reduceRight );
1699+ reduceRight += extraRight ;
1700+ rest -= extraRight ;
1701+ }
1702+ if (rest > 0 ) {
1703+ int extraLeft = Math .min (rest , leftLen - reduceLeft );
1704+ reduceLeft += extraLeft ;
1705+ rest -= extraLeft ;
1706+ }
1707+
1708+ startContext += reduceLeft ;
1709+ endContext -= reduceRight ;
1710+
1711+ if (rest > 0 ) {
1712+ // Trim remaining from match end
1713+ int newMatchLen = matchLen - rest ;
1714+ if (newMatchLen < 0 ) newMatchLen = 0 ;
1715+ this .endPos = this .startPos + newMatchLen ;
1716+ this .endCutted = true ;
1717+ if (endContext < this .endPos )
1718+ endContext = this .endPos ;
1719+ }
1720+ }
1721+ else {
1722+ // No trimming necessary
1723+ if (DEBUG )
1724+ log .debug ("KWIC cap not reached: total={} ≤ cap={}" ,
1725+ leftLen + matchLen + rightLen , kwicMax );
1726+ }
1727+ }
1728+
16751729 // Retrieve the character offsets for all tokens
16761730 for (int i = startContext ; i < endContext ; i ++) {
16771731 pto .add (ldid , i );
@@ -2347,6 +2401,7 @@ private int[] _processOffsetChars (int ldid, int startPosChar,
23472401
23482402 int startOffsetChar = -1 , endOffsetChar = -1 ;
23492403 int startOffset = -1 , endOffset = -1 ;
2404+ PositionsToOffset pto = this .positionsToOffset ;
23502405
23512406 // The offset is defined by a span
23522407 if (this .getContext ().isSpanDefined ()) {
@@ -2368,20 +2423,23 @@ private int[] _processOffsetChars (int ldid, int startPosChar,
23682423 if (DEBUG )
23692424 log .trace ("Got context based on span {}-{}/{}-{}" ,
23702425 startOffset , endOffset , startOffsetChar , endOffsetChar );
2426+ // Make sure we can (re)compute character offsets after adjustments
2427+ this .positionsToOffset .add (ldid , startOffset );
2428+ this .positionsToOffset .add (ldid , endOffset );
23712429 };
23722430
23732431 // The offset is defined by tokens or characters
23742432 if (endOffset == -1 ) {
23752433
2376- PositionsToOffset pto = this . positionsToOffset ;
2434+ PositionsToOffset ptoTok = pto ;
23772435
23782436 // The left offset is defined by tokens
23792437 if (this .context .left .isToken ()) {
23802438 startOffset = this .startPos - this .context .left .getLength ();
23812439 if (DEBUG )
23822440 log .trace ("PTO will retrieve {} (Left context)" ,
23832441 startOffset );
2384- pto .add (ldid , startOffset );
2442+ ptoTok .add (ldid , startOffset );
23852443 }
23862444
23872445 // The left offset is defined by characters
@@ -2395,7 +2453,7 @@ private int[] _processOffsetChars (int ldid, int startPosChar,
23952453 if (DEBUG )
23962454 log .trace ("PTO will retrieve {} (Right context)" ,
23972455 endOffset );
2398- pto .add (ldid , endOffset );
2456+ ptoTok .add (ldid , endOffset );
23992457 }
24002458
24012459 // The right context is defined by characters
@@ -2404,24 +2462,138 @@ private int[] _processOffsetChars (int ldid, int startPosChar,
24042462 : endPosChar + this .context .right .getLength ();
24052463 };
24062464
2407- if (startOffset != -1 )
2408- startOffsetChar = pto .start (ldid , startOffset );
2465+ // Enforce total KWIC token cap (left + match + right) on token offsets
2466+ int kwicMax = KrillProperties .getMaxTokenKwicSize ();
2467+ if (kwicMax > 0 ) {
2468+ int leftLen = (startOffset != -1 ) ? (this .startPos - startOffset ) : 0 ;
2469+ if (leftLen < 0 ) leftLen = 0 ;
2470+ int matchLen = (this .endPos > this .startPos ) ? (this .endPos - this .startPos ) : 0 ;
2471+ int rightLen = (endOffset != -1 ) ? (endOffset - (this .endPos - 1 )) : 0 ;
2472+ if (rightLen < 0 ) rightLen = 0 ;
2473+ int total = leftLen + matchLen + rightLen ;
2474+
2475+ if (matchLen >= kwicMax ) {
2476+ // Cut match to kwicMax and drop context
2477+ this .endPos = this .startPos + kwicMax ;
2478+ this .endCutted = true ;
2479+ startOffset = this .startPos ;
2480+ endOffset = this .endPos - 1 ;
2481+ }
2482+ else if (total > kwicMax ) {
2483+ int toReduce = total - kwicMax ;
2484+ int reduceLeft = Math .min ((toReduce + 1 ) / 2 , leftLen );
2485+ int reduceRight = Math .min (toReduce - reduceLeft , rightLen );
2486+
2487+ int rest = toReduce - (reduceLeft + reduceRight );
2488+ if (rest > 0 ) {
2489+ int extraRight = Math .min (rest , rightLen - reduceRight );
2490+ reduceRight += extraRight ;
2491+ rest -= extraRight ;
2492+ }
2493+ if (rest > 0 ) {
2494+ int extraLeft = Math .min (rest , leftLen - reduceLeft );
2495+ reduceLeft += extraLeft ;
2496+ rest -= extraLeft ;
2497+ }
2498+
2499+ if (startOffset != -1 )
2500+ startOffset += reduceLeft ;
2501+ if (endOffset != -1 )
2502+ endOffset -= reduceRight ;
2503+
2504+ if (rest > 0 ) {
2505+ int newMatchLen = matchLen - rest ;
2506+ if (newMatchLen < 0 ) newMatchLen = 0 ;
2507+ this .endPos = this .startPos + newMatchLen ;
2508+ this .endCutted = true ;
2509+ if (endOffset != -1 )
2510+ endOffset = Math .max (endOffset , this .endPos - 1 );
2511+ }
2512+ else {
2513+ if (DEBUG )
2514+ log .debug ("KWIC cap not reached (offset path): total={} ≤ cap={}" ,
2515+ leftLen + matchLen + rightLen , kwicMax );
2516+ }
2517+ }
2518+ }
24092519
2410- if (endOffset != -1 )
2411- endOffsetChar = pto .end (ldid , endOffset );
24122520 };
24132521
2522+ // Enforce total KWIC token cap (left + match + right), regardless of span or token context
2523+ int kwicMax = KrillProperties .getMaxTokenKwicSize ();
2524+ if (kwicMax > 0 ) {
2525+ int leftLen = (startOffset != -1 ) ? (this .startPos - startOffset ) : 0 ;
2526+ if (leftLen < 0 ) leftLen = 0 ;
2527+ int matchLen = (this .endPos > this .startPos ) ? (this .endPos - this .startPos ) : 0 ;
2528+ int rightLen = (endOffset != -1 ) ? (endOffset - (this .endPos - 1 )) : 0 ;
2529+ if (rightLen < 0 ) rightLen = 0 ;
2530+ int total = leftLen + matchLen + rightLen ;
2531+
2532+ if (matchLen >= kwicMax ) {
2533+ this .endPos = this .startPos + kwicMax ;
2534+ this .endCutted = true ;
2535+ startOffset = this .startPos ;
2536+ endOffset = this .endPos - 1 ;
2537+ }
2538+ else if (total > kwicMax ) {
2539+ int toReduce = total - kwicMax ;
2540+ int reduceLeft = Math .min ((toReduce + 1 ) / 2 , leftLen );
2541+ int reduceRight = Math .min (toReduce - reduceLeft , rightLen );
2542+ int rest = toReduce - (reduceLeft + reduceRight );
2543+ if (rest > 0 ) {
2544+ int extraRight = Math .min (rest , rightLen - reduceRight );
2545+ reduceRight += extraRight ;
2546+ rest -= extraRight ;
2547+ }
2548+ if (rest > 0 ) {
2549+ int extraLeft = Math .min (rest , leftLen - reduceLeft );
2550+ reduceLeft += extraLeft ;
2551+ rest -= extraLeft ;
2552+ }
2553+
2554+ if (startOffset != -1 )
2555+ startOffset += reduceLeft ;
2556+ if (endOffset != -1 )
2557+ endOffset -= reduceRight ;
2558+
2559+ if (rest > 0 ) {
2560+ int newMatchLen = matchLen - rest ;
2561+ if (newMatchLen < 0 ) newMatchLen = 0 ;
2562+ this .endPos = this .startPos + newMatchLen ;
2563+ this .endCutted = true ;
2564+ if (endOffset != -1 )
2565+ endOffset = Math .max (endOffset , this .endPos - 1 );
2566+ }
2567+ }
2568+ else {
2569+ if (DEBUG )
2570+ log .debug ("KWIC cap not reached (unified path): total={} ≤ cap={}" , total , kwicMax );
2571+ }
2572+ }
2573+
2574+ // Compute character offsets according to potentially adjusted token offsets
2575+ if (startOffset != -1 )
2576+ startOffsetChar = pto .start (ldid , startOffset );
2577+ if (endOffset != -1 )
2578+ endOffsetChar = pto .end (ldid , endOffset );
2579+
24142580 if (DEBUG )
24152581 log .trace ("Premature found offsets at {}-{}" , startOffsetChar ,
24162582 endOffsetChar );
24172583
2584+ // Ensure zero-context means match-only and not full document
2585+ if (startOffset == -1 && (startOffsetChar < 0 || this .context .left .getLength () == 0 ))
2586+ startOffsetChar = startPosChar ;
2587+ if (endOffset == -1 && (endOffsetChar < 0 || this .context .right .getLength () == 0 ))
2588+ endOffsetChar = endPosChar ;
2589+
24182590
24192591 // This can happen in case of non-token characters
24202592 // in the match and null offsets
24212593 if (startOffsetChar > startPosChar )
24222594 startOffsetChar = startPosChar ;
24232595 else if (startOffsetChar < 0 )
2424- startOffsetChar = 0 ;
2596+ startOffsetChar = startPosChar ;
24252597
24262598 // No "..." at the beginning
24272599 if (startOffsetChar == 0 )
0 commit comments