|
4 | 4 |
|
5 | 5 | Version: 1.0.0 (Community) |
6 | 6 |
|
7 | | -This is the open-source community edition, covering 5 OWASP LLM Top-10 |
8 | | -aligned threat categories. The API tier adds 50+ additional categories, |
| 7 | +This is the open-source community edition, covering 6 OWASP LLM Top-10 |
| 8 | +aligned threat categories plus an absolute-block child safety category. |
| 9 | +The API tier adds 100+ additional categories, |
9 | 10 | complete ONNX semantic embeddings, and advanced agentic/multi-turn threat |
10 | 11 | detection. |
11 | 12 |
|
@@ -51,7 +52,7 @@ class FalsePositiveRisk(Enum): |
51 | 52 |
|
52 | 53 |
|
53 | 54 | # --------------------------------------------------------------------------- |
54 | | -# Community threat pattern registry — 5 categories |
| 55 | +# Community threat pattern registry — 7 categories |
55 | 56 | # |
56 | 57 | # Patterns sourced from publicly documented research: |
57 | 58 | # OWASP LLM01 (Prompt Injection), LLM02 (Insecure Output Handling), |
@@ -225,6 +226,73 @@ class FalsePositiveRisk(Enum): |
225 | 226 | ), |
226 | 227 | }, |
227 | 228 |
|
| 229 | + # ------------------------------------------------------------------ |
| 230 | + # childSafetyViolation — ABSOLUTE BLOCK (v2.6.2) |
| 231 | + # Guiding Principle 15: Sacred Protection of the Innocent (Mishmar HaYeledim) |
| 232 | + # Matthew 18:6 — "It would be better for them to have a large millstone |
| 233 | + # hung around their neck and to be drowned in the depths of the sea." |
| 234 | + # |
| 235 | + # Absolute prohibition: CSAM/CSEM under ANY framing (fictional, artistic, |
| 236 | + # educational, aged-up characters), grooming communication strategies, |
| 237 | + # child exploitation facilitation, child luring scripts, and content |
| 238 | + # that sexualizes or exploits minors. No mitigating context. No exceptions. |
| 239 | + # Weight 100 — maximum. False positives vastly preferable to false negatives. |
| 240 | + # ------------------------------------------------------------------ |
| 241 | + "childSafetyViolation": { |
| 242 | + "patterns": [ |
| 243 | + # Sexual content involving minors under ANY framing |
| 244 | + r"(?:sexual|explicit|erotic|nude|intimate)\s+(?:content|scene|story|description|image|fiction)\s+(?:involving|featuring|with|about)\s+(?:a\s+)?(?:minor|child|teen(?:ager)?|underage|young\s+(?:person|girl|boy)|kid|juvenile)", |
| 245 | + # Grooming strategies — building inappropriate trust with a child |
| 246 | + r"(?:groom|grooming)\s+(?:a\s+)?(?:child|minor|kid|teen(?:ager)?|underage)\s+(?:by|through|using|via)\s+(?:building|establishing|gaining)\s+(?:trust|rapport|a\s+relationship|access)", |
| 247 | + # Requests for scripts/messages to send to a child for grooming |
| 248 | + r"(?:write|generate|give\s+me|create|draft)\s+(?:a\s+)?(?:message|script|text|conversation)\s+(?:to\s+(?:send\s+to|use\s+with|approach))\s+(?:a\s+)?(?:child|minor|kid|teen(?:ager)?|young\s+(?:person|girl|boy))\s+(?:to\s+(?:build|gain)\s+(?:trust|rapport))", |
| 249 | + # Instructions for gaining unsupervised access to children |
| 250 | + r"(?:gain|get)\s+(?:(?:physical|digital|online|unsupervised|private)\s+)?access\s+to\s+(?:a\s+)?(?:child|minor|kid|children|minors)\s+(?:(?:without|bypassing?|evading?)\s+(?:parental|guardian|adult|supervisory)\s+(?:oversight|supervision|monitoring|control))", |
| 251 | + # CSAM/CSEM under any label: written, fictional, illustrated, simulated |
| 252 | + r"(?:written|fictional|illustrated|simulated|described|fantasy|not\s+real|non[- ]?existent)\s+(?:CSAM|CSEM|child\s+(?:sexual\s+abuse\s+material|pornography|porn)|child\s+(?:exploitation|sexual\s+content))", |
| 253 | + # Requesting manipulation strategies to convince a child to keep secrets |
| 254 | + r"(?:convince|persuade|get|teach)\s+(?:a\s+)?(?:child|minor|kid|teen(?:ager)?)\s+to\s+keep\s+(?:(?:a\s+)?secret|(?:this|our)\s+(?:secret|between\s+us)|quiet)\s+from\s+(?:(?:their\s+)?(?:parents?|guardians?|adults?|family)|anyone)", |
| 255 | + # Child luring via false affection, gifts, promises |
| 256 | + r"(?:use|offer|give|promise)\s+(?:gifts?|presents?|money|candy|compliments?|affection|friendship)\s+to\s+(?:lure|attract|manipulate|gain\s+the\s+trust\s+of)\s+(?:a\s+)?(?:child|minor|kid|teen(?:ager)?)", |
| 257 | + # Content normalizing adult-child sexual relationships under any framing |
| 258 | + r"(?:normalize|portray|depict|describe|present)\s+(?:adult[- ]child|child[- ]adult)\s+(?:sexual|romantic|intimate)\s+(?:relationship|interaction|dynamic)\s+(?:as\s+)?(?:normal|acceptable|natural|cultural|historical|fictional)", |
| 259 | + ], |
| 260 | + "severity": ThreatSeverity.CRITICAL, |
| 261 | + "weight": 100, |
| 262 | + "description": ( |
| 263 | + "ABSOLUTE BLOCK — child safety violation. Covers CSAM/CSEM generation under " |
| 264 | + "any framing, grooming communication strategies, child exploitation facilitation, " |
| 265 | + "child luring scripts, and content sexualizing or exploiting minors. " |
| 266 | + "No mitigating context. No creative/fictional/educational exceptions. " |
| 267 | + "Matthew 18:6. False positives vastly preferable to false negatives." |
| 268 | + ), |
| 269 | + "semanticFingerprint": [ |
| 270 | + "CSAM generation request disguised as fictional creative writing involving minor characters", |
| 271 | + "grooming communication strategy to build inappropriate trust with child victim", |
| 272 | + "sexual content involving minor characters despite fictional or aged-up framing", |
| 273 | + "child luring script using false affection and gift promises to manipulate minor", |
| 274 | + "instruction to isolate child from protective adults as grooming preparation", |
| 275 | + "secret-keeping manipulation strategy to prevent child from disclosing abuse", |
| 276 | + "content normalizing adult-child relationship despite fictional cultural framing", |
| 277 | + "strategy to circumvent parental oversight to gain unsupervised access to children", |
| 278 | + ], |
| 279 | + "contextHints": { |
| 280 | + "escalators": [ |
| 281 | + "child", "minor", "underage", "young", "teenager", "kid", |
| 282 | + "fictional", "not real", "aged up", "just characters", "18 in the story", |
| 283 | + "grooming", "secret", "trust", "alone with", |
| 284 | + ], |
| 285 | + "mitigators": [], # No mitigating context is accepted for this category |
| 286 | + }, |
| 287 | + "falsePositiveRisk": FalsePositiveRisk.LOW, |
| 288 | + "mitigationStrategy": ( |
| 289 | + "ABSOLUTE BLOCK — no mitigating context reduces this verdict. " |
| 290 | + "Fictional or artistic framing, aged-up character claims, historical/cultural " |
| 291 | + "context, and educational justifications are all explicitly rejected. " |
| 292 | + "Escalate immediately to human review with maximum severity flag." |
| 293 | + ), |
| 294 | + }, |
| 295 | + |
228 | 296 | # ------------------------------------------------------------------ |
229 | 297 | # systemPromptLeaks — OWASP LLM07 (System Prompt Leakage) |
230 | 298 | # Attempts to extract the model's system prompt or initial instructions. |
|
0 commit comments