Skip to content

Commit 7846689

Browse files
authored
feat(library): add context bloat detection rail (#1941)
1 parent 48f5eeb commit 7846689

7 files changed

Lines changed: 714 additions & 0 deletions

File tree

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# Context Bloat Detection
2+
3+
Detects context-manipulation attacks where attacker-controlled content (retrieved chunks or user
4+
input) is padded, oversized, or repetitively structured to cause system prompt forgetting or
5+
exhaust the token budget.
6+
7+
## Wiring
8+
9+
Add the flows you need to your `config.yml`:
10+
11+
```yaml
12+
rails:
13+
retrieval:
14+
flows:
15+
- context bloat detection on retrieval
16+
17+
input:
18+
flows:
19+
- context bloat detection on input
20+
```
21+
22+
## Configuration
23+
24+
All fields are optional; defaults are shown below.
25+
26+
```yaml
27+
rails:
28+
config:
29+
context_bloat_detection:
30+
# Size cap in characters. Inputs exceeding this are flagged.
31+
# Typically <5k for well-scoped agents.
32+
max_chars: 5000
33+
34+
# Minimum characters before entropy/run/repetition checks apply.
35+
# Shorter texts are only checked against the size cap.
36+
min_chars: 50
37+
38+
# Shannon entropy floor (bits per char).
39+
# English prose is roughly 4.0-4.5; padded/repetitive text drops below ~3.5.
40+
min_entropy: 3.5
41+
42+
# Maximum fraction of repeated n-grams (0.0-1.0).
43+
# Values above 0.4 indicate padding-style repetition.
44+
max_repetition_ratio: 0.4
45+
46+
# N-gram size used for repetition detection.
47+
ngram_size: 3
48+
49+
# Maximum fraction of text that is the longest single-character run.
50+
# Catches "AAAAAAA..." or whitespace padding.
51+
max_run_ratio: 0.1
52+
53+
# Action on detection:
54+
# reject -- abort the flow and return a user-facing message (recommended)
55+
# truncate -- cut the text to max_chars when the size cap is exceeded; any other detection still rejects
56+
# warn -- log only, do not modify or block
57+
action: reject
58+
```
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Context bloat detection action
17+
18+
Detects context-manipulation attacks where attacker-controlled content
19+
(retrieved chunks or user input) is padded, oversized, or repetitively
20+
structured to cause system prompt forgetting, bury instructions mid-context
21+
(harder to detect), or exhaust token budget.
22+
23+
Checks:
24+
* Size cap
25+
* Entropy (sampling for very large inputs)
26+
* Longest single-character run
27+
* Repeated n-grams
28+
* Check order: size > entropy > run > repetition
29+
30+
Wire as retrieval rail (RAG chunks) or input rail.
31+
"""
32+
33+
import logging
34+
import math
35+
from collections import Counter
36+
from typing import List, Optional, TypedDict
37+
38+
from nemoguardrails import RailsConfig
39+
from nemoguardrails.actions import action
40+
41+
# Module-level logger named after this module.
log = logging.getLogger(__name__)

# Entropy is statistically stable well below this threshold; sampling avoids O(n) work.
# Texts longer than ENTROPY_SAMPLE_THRESHOLD characters are reduced to a sample
# requested with a budget of ENTROPY_SAMPLE_CHARS characters before entropy is computed.
ENTROPY_SAMPLE_THRESHOLD = 10000
ENTROPY_SAMPLE_CHARS = 8000
46+
47+
48+
class ContextBloatResult(TypedDict):
    """Result dictionary returned by the context bloat detection action."""

    # True when at least one check flagged the text.
    is_bloat: bool
    # Action the calling flow should apply: the configured action, or "reject"
    # when an entropy/run/repetition check blocks the text.
    action: str
    # The inspected text; cut to max_chars when the size cap triggered truncation.
    text: str
    # Name of the triggering detection, or the comma-joined detections; None when nothing was flagged.
    reason: Optional[str]
    # Names of every detection raised so far, in check order.
    detections: List[str]
    # Measurements collected along the way (e.g. "chars", "entropy",
    # "longest_run_ratio", "repetition_ratio").
    metrics: dict
55+
56+
57+
def _stratified_sample(text: str, sample_chars: int) -> str:
58+
third = sample_chars // 3
59+
mid = len(text) // 2
60+
return text[:third] + text[mid - third // 2 : mid + third // 2] + text[-third:]
61+
62+
63+
def _shannon_entropy(text: str) -> float:
64+
"""Samples large inputs to bound runtime."""
65+
if not text:
66+
return 0.0
67+
sample = _stratified_sample(text, ENTROPY_SAMPLE_CHARS) if len(text) > ENTROPY_SAMPLE_THRESHOLD else text
68+
counts = Counter(sample)
69+
total = len(sample)
70+
return -sum((c / total) * math.log2(c / total) for c in counts.values())
71+
72+
73+
def _repetition_ratio(text: str, n: int = 3) -> float:
74+
"""High values are a padding-attack signature."""
75+
if not text:
76+
return 0.0
77+
tokens = text.split()
78+
if len(tokens) < n:
79+
return 0.0
80+
ngrams = [tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1)]
81+
counter = Counter(ngrams)
82+
repeated = sum(c - 1 for c in counter.values() if c > 1)
83+
return repeated / len(ngrams) if ngrams else 0.0
84+
85+
86+
def _longest_run_ratio(text: str) -> float:
87+
"""Fraction of text that is the longest run of a single character."""
88+
if not text:
89+
return 0.0
90+
n = len(text)
91+
longest = 1
92+
i = 0
93+
while i < n:
94+
j = i + 1
95+
while j < n and text[j] == text[i]:
96+
j += 1
97+
if j - i > longest:
98+
longest = j - i
99+
i = j
100+
return longest / n
101+
102+
103+
def _validate_config(config: RailsConfig) -> None:
    """Check that the rail is configured and that its ``action`` is supported.

    Args:
        config: Rails configuration expected to carry
            ``rails.config.context_bloat_detection``.

    Raises:
        ValueError: If the ``context_bloat_detection`` section is missing, or its
            ``action`` is not one of ``reject``, ``truncate`` or ``warn``.
    """
    settings = getattr(config.rails.config, "context_bloat_detection", None)
    if settings is None:
        raise ValueError("context_bloat_detection configuration is missing in RailsConfig.")
    supported_actions = {"reject", "truncate", "warn"}
    if settings.action in supported_actions:
        return
    raise ValueError(f"Expected 'reject', 'truncate', or 'warn' but got {settings.action!r}.")
109+
110+
111+
def _check_entropy(text: str, cfg, detections: List[str], metrics: dict) -> Optional[ContextBloatResult]:
    """Measure the entropy of ``text`` and flag it when below ``cfg.min_entropy``.

    Writes the rounded value to ``metrics["entropy"]`` and, on detection, appends
    ``"low_entropy"`` to ``detections``.

    Returns:
        A result with ``action="reject"`` when the text is flagged and
        ``cfg.action`` is ``reject`` or ``truncate``; ``None`` otherwise.
    """
    value = _shannon_entropy(text)
    metrics["entropy"] = round(value, 3)
    if not value < cfg.min_entropy:
        return None
    detections.append("low_entropy")
    if cfg.action not in ("reject", "truncate"):
        # Any other action (e.g. "warn") only records the detection.
        return None
    log.info(f"context bloat detected: low_entropy | entropy={value:.3f}")
    verdict = ContextBloatResult(
        is_bloat=True,
        action="reject",
        text=text,
        reason="low_entropy",
        detections=detections,
        metrics=metrics,
    )
    return verdict
127+
128+
129+
def _check_longest_run(text: str, cfg, detections: List[str], metrics: dict) -> Optional[ContextBloatResult]:
    """Measure the longest single-character run and flag it when above ``cfg.max_run_ratio``.

    Writes the rounded ratio to ``metrics["longest_run_ratio"]`` and, on
    detection, appends ``"long_run"`` to ``detections``.

    Returns:
        A result with ``action="reject"`` when the text is flagged and
        ``cfg.action`` is ``reject`` or ``truncate``; ``None`` otherwise.
    """
    ratio = _longest_run_ratio(text)
    metrics["longest_run_ratio"] = round(ratio, 3)
    if not ratio > cfg.max_run_ratio:
        return None
    detections.append("long_run")
    if cfg.action not in ("reject", "truncate"):
        # Any other action (e.g. "warn") only records the detection.
        return None
    log.info(f"context bloat detected: long_run | run_ratio={ratio:.3f}")
    verdict = ContextBloatResult(
        is_bloat=True,
        action="reject",
        text=text,
        reason="long_run",
        detections=detections,
        metrics=metrics,
    )
    return verdict
145+
146+
147+
def _check_repetition(text: str, cfg, detections: List[str], metrics: dict) -> Optional[ContextBloatResult]:
    """Measure n-gram repetition and flag it when above ``cfg.max_repetition_ratio``.

    Uses ``cfg.ngram_size`` as the n-gram size. Writes the rounded ratio to
    ``metrics["repetition_ratio"]`` and, on detection, appends
    ``"high_repetition"`` to ``detections``.

    Returns:
        A result with ``action="reject"`` when the text is flagged and
        ``cfg.action`` is ``reject`` or ``truncate``; ``None`` otherwise.
    """
    ratio = _repetition_ratio(text, n=cfg.ngram_size)
    metrics["repetition_ratio"] = round(ratio, 3)
    if not ratio > cfg.max_repetition_ratio:
        return None
    detections.append("high_repetition")
    if cfg.action not in ("reject", "truncate"):
        # Any other action (e.g. "warn") only records the detection.
        return None
    log.info(f"context bloat detected: high_repetition | rep_ratio={ratio:.3f}")
    verdict = ContextBloatResult(
        is_bloat=True,
        action="reject",
        text=text,
        reason="high_repetition",
        detections=detections,
        metrics=metrics,
    )
    return verdict
163+
164+
165+
@action()
async def context_bloat_detection(text: str, config: RailsConfig) -> ContextBloatResult:
    """Detect context-bloat / context-manipulation attacks.

    Check order is cheapest first to enable early-exit: size cap, then entropy,
    longest single-character run and n-gram repetition.

    Args:
        text: The text to inspect (joined chunks or user message).
        config: RailsConfig with rails.config.context_bloat_detection settings.

    Returns:
        ContextBloatResult with is_bloat flag, processed text, reason, metrics.
        With action ``truncate`` the returned text is cut to ``max_chars`` when the
        size cap is exceeded; any later detection is returned with action ``reject``.

    Raises:
        ValueError: If the configuration section is missing or its action is
            not supported.
    """
    _validate_config(config)
    cfg = config.rails.config.context_bloat_detection

    # ``text`` may be empty or None; treat both as zero characters.
    char_count = len(text) if text else 0
    detections: List[str] = []
    metrics: dict = {"chars": char_count}

    if char_count > cfg.max_chars:
        detections.append("size_cap_exceeded")
        log.info(f"context bloat detected: size_cap_exceeded | chars={char_count}")
        if cfg.action == "reject":
            return ContextBloatResult(
                is_bloat=True,
                action=cfg.action,
                text=text,
                reason="size_cap_exceeded",
                detections=detections,
                metrics=metrics,
            )
        if cfg.action == "truncate":
            text = text[: cfg.max_chars]

    # Empty text has nothing to analyse. Without this guard a configuration with
    # min_chars == 0 would score it at entropy 0.0 and reject it as low_entropy.
    if text and char_count >= cfg.min_chars:
        for check in (_check_entropy, _check_longest_run, _check_repetition):
            result = check(text, cfg, detections, metrics)
            if result is not None:
                return result

    # Reached when nothing blocked: either the text is clean, the size cap was
    # handled by truncation, or detections were only recorded (e.g. "warn").
    is_bloat = bool(detections)
    reason = ", ".join(detections) if detections else None
    if is_bloat:
        log.info(f"context bloat detected: {reason} | metrics={metrics}")
    return ContextBloatResult(
        is_bloat=is_bloat,
        action=cfg.action,
        text=text,
        reason=reason,
        detections=detections,
        metrics=metrics,
    )
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
2+
# RETRIEVAL RAILS
3+
4+
flow context bloat detection on retrieval
  """Detect bloated or padded content in retrieved RAG chunks."""
  global $relevant_chunks
  # Run the bloat checks over the retrieved chunks.
  $bloat_result = await ContextBloatDetectionAction(text=$relevant_chunks)
  # Reject: send the retrieval refusal message, then abort this flow.
  if $bloat_result.is_bloat and $bloat_result.action == "reject"
    bot inform retrieval bloated
    abort
  # Truncate: replace the chunks with the text returned by the action.
  # Any other result leaves $relevant_chunks untouched.
  if $bloat_result.is_bloat and $bloat_result.action == "truncate"
    $relevant_chunks = $bloat_result.text
13+
14+
# Message sent when retrieved chunks are rejected by the bloat detection rail.
flow bot inform retrieval bloated
  bot say "The retrieved sources for this question appeared oversized or padded. I'm not using them to avoid context manipulation."
16+
17+
# INPUT RAILS
18+
19+
flow context bloat detection on input
  """Detect bloated or padded user-supplied content."""
  global $user_message
  # Run the bloat checks over the user message.
  $bloat_result = await ContextBloatDetectionAction(text=$user_message)
  # Reject: send the refusal message, then abort this flow.
  if $bloat_result.is_bloat and $bloat_result.action == "reject"
    bot refuse bloated input
    abort
  # Truncate: replace the message with the text returned by the action.
  # Any other result leaves $user_message untouched.
  if $bloat_result.is_bloat and $bloat_result.action == "truncate"
    $user_message = $bloat_result.text
28+
29+
# Message sent when the user message is rejected by the bloat detection rail.
flow bot refuse bloated input
  bot say "Your message appears oversized or padded with repetitive content. Please send a shorter message focused on your question."
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
2+
# RETRIEVAL RAILS
3+
4+
define subflow context bloat detection on retrieval
  """Detect bloated or padded content in retrieved RAG chunks."""
  # Run the bloat checks over the retrieved chunks.
  $bloat_result = execute context_bloat_detection(text=$relevant_chunks)
  # Reject: send the retrieval refusal message, then stop.
  if $bloat_result.is_bloat and $bloat_result.action == "reject"
    bot inform retrieval bloated
    stop
  # Truncate: replace the chunks with the text returned by the action.
  # Any other result leaves $relevant_chunks untouched.
  if $bloat_result.is_bloat and $bloat_result.action == "truncate"
    $relevant_chunks = $bloat_result.text
12+
13+
# Message sent when retrieved chunks are rejected by the bloat detection rail.
define bot inform retrieval bloated
  "The retrieved sources for this question appeared oversized or padded. I'm not using them to avoid context manipulation."
15+
16+
# INPUT RAILS
17+
18+
define subflow context bloat detection on input
  """Detect bloated or padded user-supplied content."""
  # Run the bloat checks over the user message.
  $bloat_result = execute context_bloat_detection(text=$user_message)
  # Reject: send the refusal message, then stop.
  if $bloat_result.is_bloat and $bloat_result.action == "reject"
    bot refuse bloated input
    stop
  # Truncate: replace the message with the text returned by the action.
  # Any other result leaves $user_message untouched.
  if $bloat_result.is_bloat and $bloat_result.action == "truncate"
    $user_message = $bloat_result.text
26+
27+
# Message sent when the user message is rejected by the bloat detection rail.
define bot refuse bloated input
  "Your message appears oversized or padded with repetitive content. Please send a shorter message focused on your question."

0 commit comments

Comments
 (0)