|
1 | 1 | import asyncio |
2 | | -import re |
| 2 | +from dataclasses import dataclass |
3 | 3 |
|
4 | | -import faiss |
| 4 | +import torch |
5 | 5 |
|
6 | | -from delphi.explainers.default.prompt_builder import build_single_token_prompt |
| 6 | +from delphi.explainers.default.prompts import SYSTEM_CONTRASTIVE |
7 | 7 | from delphi.explainers.explainer import Explainer, ExplainerResult |
8 | | -from delphi.logger import logger |
| 8 | +from delphi.latents.latents import ActivatingExample, LatentRecord, NonActivatingExample |
9 | 9 |
|
10 | 10 |
|
@dataclass
class ContrastiveExplainer(Explainer):
    """Explain a latent by contrasting activating and non-activating examples.

    Builds a prompt that shows the model both kinds of examples under clearly
    labeled sections, then asks the client for a single explanation.
    """

    activations: bool = True
    """Whether to show activations to the explainer."""
    max_examples: int = 15
    """Maximum number of activating examples to use."""
    max_non_activating: int = 5
    """Maximum number of non-activating examples to use."""

    async def __call__(self, record: LatentRecord) -> ExplainerResult:
        """
        Override the base __call__ method to use both train and not_active examples.

        Args:
            record: The latent record containing both activating and
                non-activating examples.

        Returns:
            ExplainerResult: The explainer result containing the explanation.
        """
        # Single local import shared by the verbose and error paths below
        # (the module intentionally has no top-level logger import).
        from ..logger import logger

        # Sample from both activating and non-activating examples.
        activating_examples = record.train[: self.max_examples]

        non_activating_examples = []
        if record.not_active:
            non_activating_examples = record.not_active[: self.max_non_activating]

        # Ensure non-activating examples have normalized activations so that
        # prompt construction can treat both kinds of examples uniformly.
        for example in non_activating_examples:
            if example.normalized_activations is None:
                # Use zeros for non-activating examples.
                example.normalized_activations = torch.zeros_like(
                    example.activations
                )

        # Build the prompt with both types of examples.
        messages = self._build_prompt(activating_examples + non_activating_examples)

        # Generate the explanation.
        response = await self.client.generate(
            messages, temperature=self.temperature, **self.generation_kwargs
        )

        try:
            explanation = self.parse_explanation(response.text)
            if self.verbose:
                logger.info(f"Explanation: {explanation}")
                logger.info(f"Messages: {messages[-1]['content']}")
                logger.info(f"Response: {response}")

            return ExplainerResult(record=record, explanation=explanation)
        except Exception as e:
            # Best-effort fallback: never propagate a parse failure to callers.
            logger.error(f"Explanation parsing failed: {e}")
            return ExplainerResult(
                record=record, explanation="Explanation could not be parsed."
            )

    def _build_prompt(
        self, examples: list[ActivatingExample | NonActivatingExample]
    ) -> list[dict]:
        """
        Build a prompt with both activating and non-activating examples clearly labeled.

        Args:
            examples: List containing both activating and non-activating examples.

        Returns:
            A list of message dictionaries for the prompt.
        """
        highlighted_examples: list[str] = []

        # Separate the two kinds of examples so each gets its own section.
        activating_examples = [
            ex for ex in examples if isinstance(ex, ActivatingExample)
        ]
        non_activating_examples = [
            ex for ex in examples if not isinstance(ex, ActivatingExample)
        ]

        # Process activating examples.
        if activating_examples:
            highlighted_examples.append("EXAMPLES:")
            for i, example in enumerate(activating_examples, 1):
                str_toks = example.str_tokens
                activations = example.activations.tolist()
                highlighted_examples.append(
                    f"Example {i}: {self._highlight(str_toks, activations)}"
                )

                if self.activations and example.normalized_activations is not None:
                    normalized_activations = example.normalized_activations.tolist()
                    highlighted_examples.append(
                        self._join_activations(
                            str_toks, activations, normalized_activations
                        )
                    )

        # Process non-activating examples.
        if non_activating_examples:
            highlighted_examples.append("\nCOUNTEREXAMPLES:")
            for i, example in enumerate(non_activating_examples, 1):
                str_toks = example.str_tokens
                activations = example.activations.tolist()
                # Note: For non-activating examples, the _highlight method won't
                # highlight anything since activation values will be below threshold.
                highlighted_examples.append(
                    f"Example {i}: {self._highlight(str_toks, activations)}"
                )

        # Join all sections into a single string.
        highlighted_examples_str = "\n".join(highlighted_examples)

        # Create messages array with the system prompt.
        return [
            {
                "role": "system",
                "content": SYSTEM_CONTRASTIVE.format(prompt=""),
            },
            {
                "role": "user",
                "content": f"WORDS: {highlighted_examples_str}",
            },
        ]

    def call_sync(self, record):
        """Synchronous wrapper for the asynchronous __call__ method."""
        return asyncio.run(self.__call__(record))
0 commit comments