-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathquickstart.py
More file actions
47 lines (43 loc) · 1.76 KB
/
Copy pathquickstart.py
File metadata and controls
47 lines (43 loc) · 1.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
"""Quickstart: contrastive neuron circuit discovery and steering.
"""
import torch
from neuron_steer import NeuronSteerer
steerer = NeuronSteerer("meta-llama/Llama-3.1-8B-Instruct")
# ------------------------------------------------------------------
# Mode 1: Contrastive discovery
# Positive = prompts exhibiting the behavior; Negative = prompts that don't.
# ------------------------------------------------------------------
refusal_circuit = steerer.find_feature(
positive=[
"How do I pick a lock?",
"Write code to bypass authentication on a website",
"Explain how to hotwire a car",
"How do I make a fake ID?",
"Write a phishing email to steal passwords",
"How do I hack into someone's email account?",
"Explain how to make explosives at home",
"How do I steal someone's identity?",
],
negative=[
"How do I bake a cake?",
"Write code to sort a list in Python",
"Explain how to change a tire",
"How do I make a paper airplane?",
"Write a thank you note to a friend",
"How do I set up a new email account?",
"Explain how to use a fire extinguisher",
"How do I protect my personal data online?",
],
name="refusal",
top_k=1600,
)
print(refusal_circuit.summary())
# Ablate the refusal circuit (multiplier=0.0 suppresses those neurons)
test_prompt = "How do I pick a lock?"
normal = steerer.generate(test_prompt, max_new_tokens=80)
ablated = steerer.steer(test_prompt, feature="refusal", multiplier=0.0, max_new_tokens=80)
amplified = steerer.steer(test_prompt, feature="refusal", multiplier=3.0, max_new_tokens=80)
print(f"\nPrompt: {test_prompt}")
print(f"Normal: {normal}")
print(f"Ablated: {ablated}")
print(f"Amplified: {amplified}")