|
| 1 | +// Canary token subsystem for detecting prompt leakage and goal hijacking. |
| 2 | +// |
| 3 | +// A canary token is a unique marker injected into prompts before sending them |
| 4 | +// to an LLM. If the token appears in the LLM's response, it suggests that |
| 5 | +// the model may have echoed hidden content — a potential signal of goal |
| 6 | +// hijacking or prompt leakage, though not proof (the model might echo |
| 7 | +// artifacts for other reasons, or middleware could reflect content). |
| 8 | +// |
| 9 | +// # Token Format |
| 10 | +// |
| 11 | +// Tokens use an HTML-comment-like format: <!--CANARY-<16 hex chars>--> |
| 12 | +// |
| 13 | +// This format was chosen because: |
| 14 | +// - Invisible to end users in most rendered contexts (HTML, Markdown) |
| 15 | +// - Preserved verbatim in raw text pipelines |
| 16 | +// - Unlikely to collide with legitimate content |
| 17 | +// - Easy to grep/search in logs |
| 18 | +// |
| 19 | +// # Limitations |
| 20 | +// |
| 21 | +// Canary tokens are a best-effort leak detection signal, not a guarantee: |
| 22 | +// - Absence of the canary in output does NOT prove the prompt is safe |
| 23 | +// - Some pipelines may strip HTML comments or transform the token |
| 24 | +// - An attacker-controlled LLM could be instructed to omit the canary |
| 25 | +// - The token only detects verbatim leakage, not paraphrased content |
| 26 | +// |
| 27 | +// For defense-in-depth, combine canary checks with Assess() scoring. |
| 28 | +package idpishield |
| 29 | + |
| 30 | +import ( |
| 31 | + "crypto/rand" |
| 32 | + "encoding/hex" |
| 33 | + "strings" |
| 34 | +) |
| 35 | + |
// Delimiters placed around the random hex payload of every canary token.
// Together they give the token its HTML-comment-like shape.
const (
	// canaryPrefix opens the token and carries the fixed "CANARY-" tag.
	canaryPrefix = "<!--CANARY-"

	// canarySuffix closes the token.
	canarySuffix = "-->"
)
| 41 | + |
// CanaryResult describes the outcome of looking for a canary token in an
// LLM response. It is the value produced by CheckCanary.
type CanaryResult struct {
	// Token is the canary string that was searched for, i.e. the value that
	// was injected into the prompt beforehand.
	Token string

	// Found reports whether Token occurred in the LLM response. A true value
	// is a hint of possible prompt leakage, not conclusive evidence of it.
	Found bool
}
| 52 | + |
| 53 | +// generateCanaryToken returns a cryptographically random canary token with |
| 54 | +// the format: <!--CANARY-<16 lowercase hex chars>--> |
| 55 | +// 8 random bytes produce 16 hex characters, giving 2^64 unique values. |
| 56 | +// Returns a non-nil error only if the system entropy source fails. |
| 57 | +func generateCanaryToken() (string, error) { |
| 58 | + b := make([]byte, 8) |
| 59 | + if _, err := rand.Read(b); err != nil { |
| 60 | + return "", err |
| 61 | + } |
| 62 | + return canaryPrefix + hex.EncodeToString(b) + canarySuffix, nil |
| 63 | +} |
| 64 | + |
| 65 | +// injectCanary appends the canary token on a new line at the end of prompt. |
| 66 | +// Returns: |
| 67 | +// - injectedPrompt : original prompt with the token appended |
| 68 | +// - token : the canary string the caller must hold for later checking |
| 69 | +// - err : non-nil only if entropy generation fails |
| 70 | +func injectCanary(prompt string) (injectedPrompt string, token string, err error) { |
| 71 | + token, err = generateCanaryToken() |
| 72 | + if err != nil { |
| 73 | + return prompt, "", err |
| 74 | + } |
| 75 | + return prompt + "\n" + token, token, nil |
| 76 | +} |
| 77 | + |
| 78 | +// checkCanary reports whether token is present in response. |
| 79 | +// An empty token always returns Found=false to prevent false positives. |
| 80 | +func checkCanary(response, token string) CanaryResult { |
| 81 | + if token == "" { |
| 82 | + return CanaryResult{Token: token, Found: false} |
| 83 | + } |
| 84 | + trimmedResponse := strings.TrimSpace(response) |
| 85 | + if trimmedResponse == "" { |
| 86 | + return CanaryResult{Token: token, Found: false} |
| 87 | + } |
| 88 | + return CanaryResult{ |
| 89 | + Token: token, |
| 90 | + Found: strings.Contains(trimmedResponse, token), |
| 91 | + } |
| 92 | +} |
0 commit comments