@@ -1672,6 +1672,60 @@ public ObjectNode getSnippetTokens () {
16721672 log .debug ("Set endContext {}" , endContext );
16731673 };
16741674
1675+ // Enforce total KWIC token cap (left + match + right)
1676+ int kwicMax = KrillProperties .getMaxTokenKwicSize ();
1677+ if (kwicMax > 0 ) {
1678+ // Measure left context, match and right context in tokens (endContext is treated as an exclusive bound)
1679+ int leftLen = (startContext < this .startPos ) ? (this .startPos - startContext ) : 0 ;
1680+ int matchLen = (this .endPos > this .startPos ) ? (this .endPos - this .startPos ) : 0 ;
1681+ int rightLen = (endContext > this .endPos ) ? (endContext - this .endPos ) : 0 ;
1682+ int total = leftLen + matchLen + rightLen ;
1683+
1684+ if (matchLen >= kwicMax ) {
1685+ // Cut match to kwicMax, drop all context
1686+ this .endPos = this .startPos + kwicMax ;
1687+ this .endCutted = true ;
1688+ startContext = this .startPos ;
1689+ endContext = this .endPos ; // exclusive bound
1690+ }
1691+ else if (total > kwicMax ) {
1692+ int toReduce = total - kwicMax ;
1693+ int reduceLeft = Math .min ((toReduce + 1 ) / 2 , leftLen );
1694+ int reduceRight = Math .min (toReduce - reduceLeft , rightLen );
1695+
1696+ int rest = toReduce - (reduceLeft + reduceRight );
1697+ if (rest > 0 ) {
1698+ int extraRight = Math .min (rest , rightLen - reduceRight );
1699+ reduceRight += extraRight ;
1700+ rest -= extraRight ;
1701+ }
1702+ if (rest > 0 ) {
1703+ int extraLeft = Math .min (rest , leftLen - reduceLeft );
1704+ reduceLeft += extraLeft ;
1705+ rest -= extraLeft ;
1706+ }
1707+
1708+ startContext += reduceLeft ;
1709+ endContext -= reduceRight ;
1710+
1711+ if (rest > 0 ) {
1712+ // Trim remaining from match end
1713+ int newMatchLen = matchLen - rest ;
1714+ if (newMatchLen < 0 ) newMatchLen = 0 ;
1715+ this .endPos = this .startPos + newMatchLen ;
1716+ this .endCutted = true ;
1717+ if (endContext < this .endPos )
1718+ endContext = this .endPos ;
1719+ }
1720+ }
1721+ else {
1722+ // No trimming necessary
1723+ if (DEBUG )
1724+ log .debug ("KWIC cap not reached: total={} ≤ cap={}" ,
1725+ leftLen + matchLen + rightLen , kwicMax );
1726+ }
1727+ }
1728+
16751729 // Retrieve the character offsets for all tokens
16761730 for (int i = startContext ; i < endContext ; i ++) {
16771731 pto .add (ldid , i );
@@ -1706,6 +1760,9 @@ public ObjectNode getSnippetTokens () {
17061760 tokens = json .putArray ("left" );
17071761 for (i = startContext ; i < this .startPos ; i ++) {
17081762 offsets = pto .span (ldid ,i );
1763+ if (offsets == null ) {
1764+ continue ;
1765+ }
17091766 tokens .add (
17101767 codePointSubstring (this .tempSnippet ,
17111768 offsets [0 ]- startContextChar , offsets [1 ] - startContextChar )
@@ -1774,6 +1831,38 @@ public ObjectNode getSnippetTokens () {
17741831
17751832 @ JsonIgnore
17761833 public String getSnippetHTML () {
1834+ // Failsafe: enforce total KWIC cap by rebuilding context if necessary
1835+ int kwicMaxFS = KrillProperties .getMaxTokenKwicSize ();
1836+ if (kwicMaxFS > 0 ) {
1837+ // Build tokens once to measure current KWIC size
1838+ ObjectNode tok = this .getSnippetTokens ();
1839+ if (tok != null ) {
1840+ int left = tok .has ("left" ) ? tok .get ("left" ).size () : 0 ;
1841+ int match = tok .has ("match" ) ? tok .get ("match" ).size () : 0 ;
1842+ int right = tok .has ("right" ) ? tok .get ("right" ).size () : 0 ;
1843+ int total = left + match + right ;
1844+ if (total > kwicMaxFS ) {
1845+ int allowedCtx = Math .max (kwicMaxFS - match , 0 );
1846+ int leftAllowed = Math .min (left , (allowedCtx + 1 ) / 2 );
1847+ int rightAllowed = Math .min (right , allowedCtx - leftAllowed );
1848+ int rest = allowedCtx - (leftAllowed + rightAllowed );
1849+ if (rest > 0 ) {
1850+ int extraRight = Math .min (rest , right - rightAllowed );
1851+ rightAllowed += extraRight ;
1852+ rest -= extraRight ;
1853+ }
1854+ if (rest > 0 ) {
1855+ int extraLeft = Math .min (rest , left - leftAllowed );
1856+ leftAllowed += extraLeft ;
1857+ }
1858+ // Force token-based context (disable span) and rebuild
1859+ this .context = new SearchContext ();
1860+ this .context .left .setToken (true ).setLength (leftAllowed );
1861+ this .context .right .setToken (true ).setLength (rightAllowed );
1862+ this ._reset ();
1863+ }
1864+ }
1865+ }
17771866
17781867 if (!this ._processHighlight ())
17791868 return null ;
@@ -1908,6 +1997,36 @@ public String getSnippetHTML () {
19081997
19091998 @ JsonIgnore
19101999 public String getSnippetBrackets () {
2000+ // Failsafe: enforce total KWIC cap also for bracket snippets
2001+ int kwicMaxFS = KrillProperties .getMaxTokenKwicSize ();
2002+ if (kwicMaxFS > 0 ) {
2003+ ObjectNode tok = this .getSnippetTokens ();
2004+ if (tok != null ) {
2005+ int left = tok .has ("left" ) ? tok .get ("left" ).size () : 0 ;
2006+ int match = tok .has ("match" ) ? tok .get ("match" ).size () : 0 ;
2007+ int right = tok .has ("right" ) ? tok .get ("right" ).size () : 0 ;
2008+ int total = left + match + right ;
2009+ if (total > kwicMaxFS ) {
2010+ int allowedCtx = Math .max (kwicMaxFS - match , 0 );
2011+ int leftAllowed = Math .min (left , (allowedCtx + 1 ) / 2 );
2012+ int rightAllowed = Math .min (right , allowedCtx - leftAllowed );
2013+ int rest = allowedCtx - (leftAllowed + rightAllowed );
2014+ if (rest > 0 ) {
2015+ int extraRight = Math .min (rest , right - rightAllowed );
2016+ rightAllowed += extraRight ;
2017+ rest -= extraRight ;
2018+ }
2019+ if (rest > 0 ) {
2020+ int extraLeft = Math .min (rest , left - leftAllowed );
2021+ leftAllowed += extraLeft ;
2022+ }
2023+ this .context = new SearchContext ();
2024+ this .context .left .setToken (true ).setLength (leftAllowed );
2025+ this .context .right .setToken (true ).setLength (rightAllowed );
2026+ this ._reset ();
2027+ }
2028+ }
2029+ }
19112030
19122031 if (!this ._processHighlight ())
19132032 return null ;
@@ -2347,6 +2466,7 @@ private int[] _processOffsetChars (int ldid, int startPosChar,
23472466
23482467 int startOffsetChar = -1 , endOffsetChar = -1 ;
23492468 int startOffset = -1 , endOffset = -1 ;
2469+ PositionsToOffset pto = this .positionsToOffset ;
23502470
23512471 // The offset is defined by a span
23522472 if (this .getContext ().isSpanDefined ()) {
@@ -2368,20 +2488,23 @@ private int[] _processOffsetChars (int ldid, int startPosChar,
23682488 if (DEBUG )
23692489 log .trace ("Got context based on span {}-{}/{}-{}" ,
23702490 startOffset , endOffset , startOffsetChar , endOffsetChar );
2491+ // Make sure we can (re)compute character offsets after adjustments
2492+ this .positionsToOffset .add (ldid , startOffset );
2493+ this .positionsToOffset .add (ldid , endOffset );
23712494 };
23722495
23732496 // The offset is defined by tokens or characters
23742497 if (endOffset == -1 ) {
23752498
2376- PositionsToOffset pto = this . positionsToOffset ;
2499+ PositionsToOffset ptoTok = pto ;
23772500
23782501 // The left offset is defined by tokens
23792502 if (this .context .left .isToken ()) {
23802503 startOffset = this .startPos - this .context .left .getLength ();
23812504 if (DEBUG )
23822505 log .trace ("PTO will retrieve {} (Left context)" ,
23832506 startOffset );
2384- pto .add (ldid , startOffset );
2507+ ptoTok .add (ldid , startOffset );
23852508 }
23862509
23872510 // The left offset is defined by characters
@@ -2395,7 +2518,7 @@ private int[] _processOffsetChars (int ldid, int startPosChar,
23952518 if (DEBUG )
23962519 log .trace ("PTO will retrieve {} (Right context)" ,
23972520 endOffset );
2398- pto .add (ldid , endOffset );
2521+ ptoTok .add (ldid , endOffset );
23992522 }
24002523
24012524 // The right context is defined by characters
@@ -2404,24 +2527,138 @@ private int[] _processOffsetChars (int ldid, int startPosChar,
24042527 : endPosChar + this .context .right .getLength ();
24052528 };
24062529
2407- if (startOffset != -1 )
2408- startOffsetChar = pto .start (ldid , startOffset );
2530+ // Enforce total KWIC token cap (left + match + right) on token offsets
2531+ int kwicMax = KrillProperties .getMaxTokenKwicSize ();
2532+ if (kwicMax > 0 ) {
2533+ int leftLen = (startOffset != -1 ) ? (this .startPos - startOffset ) : 0 ;
2534+ if (leftLen < 0 ) leftLen = 0 ;
2535+ int matchLen = (this .endPos > this .startPos ) ? (this .endPos - this .startPos ) : 0 ;
2536+ int rightLen = (endOffset != -1 ) ? (endOffset - (this .endPos - 1 )) : 0 ;
2537+ if (rightLen < 0 ) rightLen = 0 ;
2538+ int total = leftLen + matchLen + rightLen ;
2539+
2540+ if (matchLen >= kwicMax ) {
2541+ // Cut match to kwicMax and drop context
2542+ this .endPos = this .startPos + kwicMax ;
2543+ this .endCutted = true ;
2544+ startOffset = this .startPos ;
2545+ endOffset = this .endPos - 1 ;
2546+ }
2547+ else if (total > kwicMax ) {
2548+ int toReduce = total - kwicMax ;
2549+ int reduceLeft = Math .min ((toReduce + 1 ) / 2 , leftLen );
2550+ int reduceRight = Math .min (toReduce - reduceLeft , rightLen );
2551+
2552+ int rest = toReduce - (reduceLeft + reduceRight );
2553+ if (rest > 0 ) {
2554+ int extraRight = Math .min (rest , rightLen - reduceRight );
2555+ reduceRight += extraRight ;
2556+ rest -= extraRight ;
2557+ }
2558+ if (rest > 0 ) {
2559+ int extraLeft = Math .min (rest , leftLen - reduceLeft );
2560+ reduceLeft += extraLeft ;
2561+ rest -= extraLeft ;
2562+ }
2563+
2564+ if (startOffset != -1 )
2565+ startOffset += reduceLeft ;
2566+ if (endOffset != -1 )
2567+ endOffset -= reduceRight ;
2568+
2569+ if (rest > 0 ) {
2570+ int newMatchLen = matchLen - rest ;
2571+ if (newMatchLen < 0 ) newMatchLen = 0 ;
2572+ this .endPos = this .startPos + newMatchLen ;
2573+ this .endCutted = true ;
2574+ if (endOffset != -1 )
2575+ endOffset = Math .max (endOffset , this .endPos - 1 );
2576+ }
2577+ else {
2578+ if (DEBUG )
2579+ log .debug ("KWIC cap not reached (offset path): total={} ≤ cap={}" ,
2580+ leftLen + matchLen + rightLen , kwicMax );
2581+ }
2582+ }
2583+ }
24092584
2410- if (endOffset != -1 )
2411- endOffsetChar = pto .end (ldid , endOffset );
24122585 };
24132586
2587+ // Enforce total KWIC token cap (left + match + right), regardless of span or token context
2588+ int kwicMax = KrillProperties .getMaxTokenKwicSize ();
2589+ if (kwicMax > 0 ) {
2590+ int leftLen = (startOffset != -1 ) ? (this .startPos - startOffset ) : 0 ;
2591+ if (leftLen < 0 ) leftLen = 0 ;
2592+ int matchLen = (this .endPos > this .startPos ) ? (this .endPos - this .startPos ) : 0 ;
2593+ int rightLen = (endOffset != -1 ) ? (endOffset - (this .endPos - 1 )) : 0 ;
2594+ if (rightLen < 0 ) rightLen = 0 ;
2595+ int total = leftLen + matchLen + rightLen ;
2596+
2597+ if (matchLen >= kwicMax ) {
2598+ this .endPos = this .startPos + kwicMax ;
2599+ this .endCutted = true ;
2600+ startOffset = this .startPos ;
2601+ endOffset = this .endPos - 1 ;
2602+ }
2603+ else if (total > kwicMax ) {
2604+ int toReduce = total - kwicMax ;
2605+ int reduceLeft = Math .min ((toReduce + 1 ) / 2 , leftLen );
2606+ int reduceRight = Math .min (toReduce - reduceLeft , rightLen );
2607+ int rest = toReduce - (reduceLeft + reduceRight );
2608+ if (rest > 0 ) {
2609+ int extraRight = Math .min (rest , rightLen - reduceRight );
2610+ reduceRight += extraRight ;
2611+ rest -= extraRight ;
2612+ }
2613+ if (rest > 0 ) {
2614+ int extraLeft = Math .min (rest , leftLen - reduceLeft );
2615+ reduceLeft += extraLeft ;
2616+ rest -= extraLeft ;
2617+ }
2618+
2619+ if (startOffset != -1 )
2620+ startOffset += reduceLeft ;
2621+ if (endOffset != -1 )
2622+ endOffset -= reduceRight ;
2623+
2624+ if (rest > 0 ) {
2625+ int newMatchLen = matchLen - rest ;
2626+ if (newMatchLen < 0 ) newMatchLen = 0 ;
2627+ this .endPos = this .startPos + newMatchLen ;
2628+ this .endCutted = true ;
2629+ if (endOffset != -1 )
2630+ endOffset = Math .max (endOffset , this .endPos - 1 );
2631+ }
2632+ }
2633+ else {
2634+ if (DEBUG )
2635+ log .debug ("KWIC cap not reached (unified path): total={} ≤ cap={}" , total , kwicMax );
2636+ }
2637+ }
2638+
2639+ // Compute character offsets according to potentially adjusted token offsets
2640+ if (startOffset != -1 )
2641+ startOffsetChar = pto .start (ldid , startOffset );
2642+ if (endOffset != -1 )
2643+ endOffsetChar = pto .end (ldid , endOffset );
2644+
24142645 if (DEBUG )
24152646 log .trace ("Premature found offsets at {}-{}" , startOffsetChar ,
24162647 endOffsetChar );
24172648
2649+ // Ensure zero-context means match-only and not full document
2650+ if (startOffset == -1 && (startOffsetChar < 0 || this .context .left .getLength () == 0 ))
2651+ startOffsetChar = startPosChar ;
2652+ if (endOffset == -1 && (endOffsetChar < 0 || this .context .right .getLength () == 0 ))
2653+ endOffsetChar = endPosChar ;
2654+
24182655
24192656 // This can happen in case of non-token characters
24202657 // in the match and null offsets
24212658 if (startOffsetChar > startPosChar )
24222659 startOffsetChar = startPosChar ;
24232660 else if (startOffsetChar < 0 )
2424- startOffsetChar = 0 ;
2661+ startOffsetChar = startPosChar ;
24252662
24262663 // No "..." at the beginning
24272664 if (startOffsetChar == 0 )
0 commit comments