Skip to content

Commit 2f6d71f

Browse files
committed
fix: address review feedback on canary token system
1 parent 2e5ffd7 commit 2f6d71f

7 files changed

Lines changed: 270 additions & 45 deletions

File tree

README.md

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,12 +57,16 @@ package main
5757

5858
import (
5959
"fmt"
60+
"log"
6061

6162
idpi "github.com/pinchtab/idpishield"
6263
)
6364

6465
func main() {
65-
shield := idpi.New(idpi.Config{Mode: idpi.ModeBalanced})
66+
shield, err := idpi.New(idpi.Config{Mode: idpi.ModeBalanced})
67+
if err != nil {
68+
log.Fatal(err)
69+
}
6670

6771
result := shield.Assess("Ignore all previous instructions", "https://example.com")
6872
fmt.Printf("score=%d level=%s blocked=%v\n", result.Score, result.Level, result.Blocked)
@@ -159,13 +163,54 @@ const (
159163
ModeDeep Mode = "deep"
160164
)
161165

162-
func New(cfg Config) *Shield
166+
func New(cfg Config) (*Shield, error)
163167
func (s *Shield) Assess(text, url string) RiskResult
164168
func (s *Shield) Wrap(text, url string) string
169+
func (s *Shield) InjectCanary(prompt string) (injectedPrompt, token string, err error)
170+
func (s *Shield) CheckCanary(response, token string) CanaryResult
165171
```
166172

167173
`Wrap` is useful when you want to preserve data while adding trust-boundary markers before sending content into prompts.
168174

175+
`InjectCanary` and `CheckCanary` implement canary token detection for prompt leakage (see below).
176+
177+
## Canary Tokens
178+
179+
Canary tokens help detect when an LLM may have leaked or echoed hidden prompt content — a potential signal of goal hijacking or prompt extraction, though not definitive proof.
180+
181+
### Usage
182+
183+
```go
184+
// Before calling the LLM, inject a canary token
185+
augmented, token, err := shield.InjectCanary(myPrompt)
186+
if err != nil {
187+
log.Fatal(err)
188+
}
189+
190+
// Send augmented prompt to LLM
191+
response := callLLM(augmented)
192+
193+
// Check if the canary appeared in the response
194+
result := shield.CheckCanary(response, token)
195+
if result.Found {
196+
log.Println("canary detected: investigate possible leakage")
197+
}
198+
```
199+
200+
### How It Works
201+
202+
`InjectCanary` appends a unique marker (`<!--CANARY-<16 hex chars>-->`) to your prompt. After the LLM responds, `CheckCanary` scans for that marker. If found, the LLM may have echoed hidden content — worth investigating, though other explanations exist (middleware reflection, model artifacts, etc.).
203+
204+
### Limitations
205+
206+
Canary tokens are a **best-effort** leak detection signal, not a guarantee:
207+
208+
- **Absence does NOT prove safety** — an attacker could instruct the LLM to omit or transform the canary
209+
- **Some pipelines strip HTML comments** — if your stack sanitizes HTML, the token may be removed before reaching the LLM or before you check the response
210+
- **Only detects verbatim leakage** — paraphrased or partial leaks won't trigger detection
211+
212+
For defense-in-depth, combine canary checks with `Assess()` scoring on untrusted inputs.
213+
169214
## CLI (Secondary Interface)
170215

171216
Install CLI:

canary.go

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,30 @@
1-
// Package idpishield provides defence against Indirect Prompt Injection (IDPI)
2-
// attacks. This file implements the canary token subsystem.
1+
// Canary token subsystem for detecting prompt leakage and goal hijacking.
2+
//
3+
// A canary token is a unique marker injected into prompts before sending them
4+
// to an LLM. If the token appears in the LLM's response, it suggests that
5+
// the model may have echoed hidden content — a potential signal of goal
6+
// hijacking or prompt leakage, though not proof (the model might echo
7+
// artifacts for other reasons, or middleware could reflect content).
8+
//
9+
// # Token Format
10+
//
11+
// Tokens use an HTML-comment-like format: <!--CANARY-<16 hex chars>-->
12+
//
13+
// This format was chosen because:
14+
// - Invisible to end users in most rendered contexts (HTML, Markdown)
15+
// - Preserved verbatim in raw text pipelines
16+
// - Unlikely to collide with legitimate content
17+
// - Easy to grep/search in logs
18+
//
19+
// # Limitations
20+
//
21+
// Canary tokens are a best-effort leak detection signal, not a guarantee:
22+
// - Absence of the canary in output does NOT prove the prompt is safe
23+
// - Some pipelines may strip HTML comments or transform the token
24+
// - An attacker-controlled LLM could be instructed to omit the canary
25+
// - The token only detects verbatim leakage, not paraphrased content
26+
//
27+
// For defense-in-depth, combine canary checks with Assess() scoring.
328
package idpishield
429

530
import (
@@ -9,8 +34,6 @@ import (
934
)
1035

1136
// canaryPrefix and canarySuffix wrap the random token.
12-
// The format intentionally resembles an HTML comment so it is invisible
13-
// to most renderers but remains present verbatim in raw LLM input/output.
1437
const (
1538
canaryPrefix = "<!--CANARY-"
1639
canarySuffix = "-->"
@@ -22,8 +45,8 @@ type CanaryResult struct {
2245
// Token is the canary value that was originally injected into the prompt.
2346
Token string
2447

25-
// Found is true when the canary token appears in the LLM response,
26-
// indicating possible prompt leakage or goal hijacking.
48+
// Found is true when the canary token appears in the LLM response.
49+
// This suggests possible prompt leakage, but is not definitive proof.
2750
Found bool
2851
}
2952

examples/canary/go.mod

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
module canary
2+
3+
go 1.23.0
4+
5+
require github.com/pinchtab/idpishield v0.0.0
6+
7+
require (
8+
golang.org/x/net v0.39.0 // indirect
9+
golang.org/x/text v0.24.0 // indirect
10+
)
11+
12+
// Replace with local path since the library is not yet published
13+
replace github.com/pinchtab/idpishield => ../..

examples/canary/go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY=
2+
golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E=
3+
golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0=
4+
golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU=

examples/canary/main.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
//
55
// Before sending a prompt to an LLM, call InjectCanary to embed a unique
66
// hidden token. After the LLM responds, call CheckCanary with the token.
7-
// If the token appears in the response, the LLM was instructed to leak or
8-
// echo hidden content - a clear indicator of goal hijacking or leakage.
7+
// If the token appears in the response, it suggests the LLM may have echoed
8+
// hidden content a potential signal of leakage worth investigating.
99
package main
1010

1111
import (

idpishield.go

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,33 @@ func (s *Shield) ScanContext(ctx context.Context, text string) RiskResult {
243243
return s.AssessContext(ctx, text, "")
244244
}
245245

246+
// InjectCanary appends a unique hidden canary token to the prompt.
247+
// The caller must store the returned token and pass it to CheckCanary
248+
// after receiving the LLM response.
249+
//
250+
// Returns the augmented prompt and the injected token.
251+
// Returns an error only if the system's random source fails.
252+
//
253+
// Example usage:
254+
//
255+
// augmented, token, err := shield.InjectCanary(myPrompt)
256+
// if err != nil { ... }
257+
// response := callLLM(augmented)
258+
// result := shield.CheckCanary(response, token)
259+
// if result.Found {
260+
// log.Println("canary detected in response: possible goal hijacking")
261+
// }
262+
func (s *Shield) InjectCanary(prompt string) (injectedPrompt string, token string, err error) {
263+
return injectCanary(prompt)
264+
}
265+
266+
// CheckCanary scans the LLM response for the canary token returned by InjectCanary.
267+
// Returns a CanaryResult with Found=true if the token appears in the response,
268+
// which may suggest prompt leakage (though not definitive proof).
269+
func (s *Shield) CheckCanary(response, token string) CanaryResult {
270+
return checkCanary(response, token)
271+
}
272+
246273
// --- Functions ---
247274

248275
// ParseMode converts a string to a Mode value.
@@ -294,30 +321,3 @@ func toEngineCfg(cfg Config) engine.Config {
294321
ConfigFile: cfg.ConfigFile,
295322
}
296323
}
297-
298-
// InjectCanary appends a unique hidden canary token to the prompt.
299-
// The caller must store the returned token and pass it to CheckCanary
300-
// after receiving the LLM response.
301-
//
302-
// Returns the augmented prompt and the injected token.
303-
// Returns an error only if the system's random source fails.
304-
//
305-
// Example usage:
306-
//
307-
// augmented, token, err := shield.InjectCanary(myPrompt)
308-
// if err != nil { ... }
309-
// response := callLLM(augmented)
310-
// result := shield.CheckCanary(response, token)
311-
// if result.Found {
312-
// log.Println("canary detected in response: possible goal hijacking")
313-
// }
314-
func (s *Shield) InjectCanary(prompt string) (injectedPrompt string, token string, err error) {
315-
return injectCanary(prompt)
316-
}
317-
318-
// CheckCanary scans the LLM response for the canary token returned by InjectCanary.
319-
// Returns a CanaryResult with Found=true if the token appears in the response,
320-
// which indicates prompt leakage or goal hijacking.
321-
func (s *Shield) CheckCanary(response, token string) CanaryResult {
322-
return checkCanary(response, token)
323-
}

shield_test.go

Lines changed: 148 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -190,22 +190,38 @@ func TestInjectCanaryAddsToken(t *testing.T) {
190190
}
191191
}
192192

193-
func TestInjectCanaryTokenIsUnique(t *testing.T) {
193+
func TestInjectCanaryTokenFormat(t *testing.T) {
194194
s, err := New(Config{})
195195
if err != nil {
196196
t.Fatalf("New returned unexpected error: %v", err)
197197
}
198198

199-
_, token1, err := s.InjectCanary("prompt")
199+
_, token, err := s.InjectCanary("prompt")
200200
if err != nil {
201-
t.Fatalf("first InjectCanary error: %v", err)
201+
t.Fatalf("InjectCanary error: %v", err)
202202
}
203-
_, token2, err := s.InjectCanary("prompt")
204-
if err != nil {
205-
t.Fatalf("second InjectCanary error: %v", err)
203+
204+
// Verify token structure: <!--CANARY-<16 hex chars>-->
205+
if !strings.HasPrefix(token, canaryPrefix) {
206+
t.Fatalf("token missing expected prefix %q: got %q", canaryPrefix, token)
206207
}
207-
if token1 == token2 {
208-
t.Fatalf("expected unique tokens across calls, both were %q", token1)
208+
if !strings.HasSuffix(token, canarySuffix) {
209+
t.Fatalf("token missing expected suffix %q: got %q", canarySuffix, token)
210+
}
211+
212+
// Extract hex portion and verify length (8 bytes = 16 hex chars)
213+
hexPart := strings.TrimSuffix(strings.TrimPrefix(token, canaryPrefix), canarySuffix)
214+
if len(hexPart) != 16 {
215+
t.Fatalf("expected 16 hex characters, got %d: %q", len(hexPart), hexPart)
216+
}
217+
218+
// Verify it's valid lowercase hex
219+
for _, c := range hexPart {
220+
isDigit := c >= '0' && c <= '9'
221+
isLowerHex := c >= 'a' && c <= 'f'
222+
if !isDigit && !isLowerHex {
223+
t.Fatalf("token contains non-hex character %q in %q", c, hexPart)
224+
}
209225
}
210226
}
211227

@@ -280,3 +296,127 @@ func TestCheckCanary_PartialMatchShouldFail(t *testing.T) {
280296
t.Fatalf("expected partial token match to fail, token=%q partial=%q", token, partial)
281297
}
282298
}
299+
300+
func TestInjectCanary_EmptyPrompt(t *testing.T) {
301+
s, err := New(Config{})
302+
if err != nil {
303+
t.Fatalf("New returned unexpected error: %v", err)
304+
}
305+
306+
// Empty prompt should still work - canary is appended with newline
307+
augmented, token, err := s.InjectCanary("")
308+
if err != nil {
309+
t.Fatalf("InjectCanary error: %v", err)
310+
}
311+
if token == "" {
312+
t.Fatal("expected non-empty token even for empty prompt")
313+
}
314+
// Result should be "\n" + token
315+
expected := "\n" + token
316+
if augmented != expected {
317+
t.Fatalf("expected %q, got %q", expected, augmented)
318+
}
319+
}
320+
321+
func TestInjectCanary_WhitespacePrompt(t *testing.T) {
322+
s, err := New(Config{})
323+
if err != nil {
324+
t.Fatalf("New returned unexpected error: %v", err)
325+
}
326+
327+
// Whitespace-only prompt should preserve the whitespace
328+
augmented, token, err := s.InjectCanary(" ")
329+
if err != nil {
330+
t.Fatalf("InjectCanary error: %v", err)
331+
}
332+
expected := " \n" + token
333+
if augmented != expected {
334+
t.Fatalf("expected %q, got %q", expected, augmented)
335+
}
336+
}
337+
338+
func TestCheckCanary_ResponseWithFormatting(t *testing.T) {
339+
s, err := New(Config{})
340+
if err != nil {
341+
t.Fatalf("New returned unexpected error: %v", err)
342+
}
343+
344+
_, token, err := s.InjectCanary("prompt")
345+
if err != nil {
346+
t.Fatalf("InjectCanary error: %v", err)
347+
}
348+
349+
// Test various realistic LLM response formats where canary might appear
350+
cases := []struct {
351+
name string
352+
response string
353+
want bool
354+
}{
355+
{"in markdown code fence", "```\n" + token + "\n```", true},
356+
{"surrounded by quotes", `The hidden text was "` + token + `"`, true},
357+
{"with extra whitespace", " " + token + " ", true},
358+
{"in bullet list", "- Item 1\n- " + token + "\n- Item 3", true},
359+
{"with punctuation after", token + "!!!", true},
360+
{"token broken across lines", strings.Replace(token, "-", "-\n", 1), false},
361+
}
362+
363+
for _, tc := range cases {
364+
t.Run(tc.name, func(t *testing.T) {
365+
result := s.CheckCanary(tc.response, token)
366+
if result.Found != tc.want {
367+
t.Errorf("CheckCanary(%q) = Found:%v, want:%v", tc.name, result.Found, tc.want)
368+
}
369+
})
370+
}
371+
}
372+
373+
// TestCheckCanary_TokenStrippedByPipeline documents the limitation that
374+
// canary detection only works if the token survives the transport pipeline.
375+
// If HTML comments are stripped (by sanitizers, markdown processors, etc.),
376+
// the canary will not be detected, and that's expected behavior.
377+
func TestCheckCanary_TokenStrippedByPipeline(t *testing.T) {
378+
s, err := New(Config{})
379+
if err != nil {
380+
t.Fatalf("New returned unexpected error: %v", err)
381+
}
382+
383+
_, token, err := s.InjectCanary("Summarize this document.")
384+
if err != nil {
385+
t.Fatalf("InjectCanary error: %v", err)
386+
}
387+
388+
// Simulate a pipeline that strips HTML comments before reaching the LLM
389+
// or before returning the response to us.
390+
stripHTMLComments := func(s string) string {
391+
// Naive strip: remove anything matching <!--...-->
392+
result := s
393+
for {
394+
start := strings.Index(result, "<!--")
395+
if start == -1 {
396+
break
397+
}
398+
end := strings.Index(result[start:], "-->")
399+
if end == -1 {
400+
break
401+
}
402+
result = result[:start] + result[start+end+3:]
403+
}
404+
return result
405+
}
406+
407+
// The token was in the response, but got stripped
408+
originalResponse := "Here is your summary. " + token + " Hope this helps!"
409+
strippedResponse := stripHTMLComments(originalResponse)
410+
411+
// After stripping, the canary should NOT be found
412+
// This is expected behavior, not a bug - the limitation is documented
413+
result := s.CheckCanary(strippedResponse, token)
414+
if result.Found {
415+
t.Fatal("canary should not be found after HTML comment stripping")
416+
}
417+
418+
// Verify the stripping actually removed the token
419+
if strings.Contains(strippedResponse, token) {
420+
t.Fatal("test setup error: token was not actually stripped")
421+
}
422+
}

0 commit comments

Comments
 (0)