-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathresponse-harmful-content.tengo
More file actions
59 lines (50 loc) · 1.73 KB
/
response-harmful-content.tengo
File metadata and controls
59 lines (50 loc) · 1.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
// Response Filter: Block Harmful Content
// Scans LLM response text for known harmful-instruction phrases and, when one
// matches, blocks the response and records a critical compliance event.
text := import("text")

// Get response text (works for both streaming and non-streaming):
// streaming chunks accumulate in current_buffer; otherwise use raw_input.
response_text := input.is_chunk ? input.current_buffer : input.raw_input

// For streaming: wait for sufficient context before evaluation.
// This threshold can be adjusted based on use case.
min_evaluation_length := 150

// Default output. compliance_events is always present so downstream
// consumers see the same shape whether or not evaluation ran.
output := {
    block: false,
    message: "",
    compliance_events: []
}

// Only evaluate if we have enough context (non-streaming, or streaming with
// a sufficiently full buffer).
if !input.is_chunk || len(response_text) >= min_evaluation_length {
    // Harmful instruction patterns, matched case-insensitively as substrings.
    harmful_patterns := [
        "instructions for making",
        "how to build a weapon",
        "steps to create explosives",
        "recipe for making",
        "how to hack",
        "bypass security"
    ]

    // Lowercase once, outside the loop, instead of once per pattern.
    lowered_text := text.to_lower(response_text)

    is_harmful := false
    detected_pattern := ""
    for pattern in harmful_patterns {
        if text.contains(lowered_text, pattern) {
            is_harmful = true
            detected_pattern = pattern
            break
        }
    }

    compliance_events_list := []
    if is_harmful {
        compliance_events_list = [
            {
                event_type: "harmful_content_detected",
                severity: "critical",
                description: "Harmful pattern detected: '" + detected_pattern + "'",
                metadata: { "matched_pattern": detected_pattern }
            }
        ]
    }

    output = {
        block: is_harmful,
        message: is_harmful ? "Response blocked: Potentially harmful content detected (" + detected_pattern + ")" : "",
        compliance_events: compliance_events_list
    }
}