|
1 | 1 | import asyncio |
2 | | -import re |
| 2 | +from dataclasses import dataclass |
3 | 3 |
|
4 | | -import faiss |
| 4 | +import torch |
5 | 5 |
|
6 | | -from delphi.explainers.default.prompt_builder import build_single_token_prompt |
| 6 | +from delphi.explainers.default.prompts import SYSTEM_CONTRASTIVE |
7 | 7 | from delphi.explainers.explainer import Explainer, ExplainerResult |
8 | | -from delphi.logger import logger |
| 8 | +from delphi.latents.latents import ActivatingExample, LatentRecord, NonActivatingExample |
9 | 9 |
|
10 | 10 |
|
@dataclass
class ContrastiveExplainer(Explainer):
    """Explain a latent by contrasting activating and non-activating examples.

    Builds a prompt that shows the model both kinds of examples under clearly
    labeled sections, then asks the client for a single explanation.
    """

    activations: bool = True
    """Whether to show activations to the explainer."""
    max_examples: int = 15
    """Maximum number of activating examples to use."""
    max_non_activating: int = 5
    """Maximum number of non-activating examples to use."""

    async def __call__(self, record: LatentRecord) -> ExplainerResult:
        """
        Override the base __call__ method to use both train and not_active examples.

        Args:
            record: The latent record containing both activating and
                non-activating examples.

        Returns:
            ExplainerResult: The explainer result containing the explanation.
        """
        # Single local import shared by the verbose and error paths below
        # (the module intentionally has no top-level logger import).
        from ..logger import logger

        # Sample from both activating and non-activating examples.
        activating_examples = record.train[: self.max_examples]

        non_activating_examples = []
        if record.not_active:
            non_activating_examples = record.not_active[: self.max_non_activating]

        # Ensure non-activating examples have normalized activations so that
        # prompt construction can treat both kinds of examples uniformly.
        for example in non_activating_examples:
            if example.normalized_activations is None:
                # Use zeros for non-activating examples.
                example.normalized_activations = torch.zeros_like(
                    example.activations
                )

        # Build the prompt with both types of examples.
        messages = self._build_prompt(activating_examples + non_activating_examples)

        # Generate the explanation.
        response = await self.client.generate(
            messages, temperature=self.temperature, **self.generation_kwargs
        )

        try:
            explanation = self.parse_explanation(response.text)
            if self.verbose:
                logger.info(f"Explanation: {explanation}")
                logger.info(f"Messages: {messages[-1]['content']}")
                logger.info(f"Response: {response}")

            return ExplainerResult(record=record, explanation=explanation)
        except Exception as e:
            # Best-effort fallback: never propagate a parse failure to callers.
            logger.error(f"Explanation parsing failed: {e}")
            return ExplainerResult(
                record=record, explanation="Explanation could not be parsed."
            )

    def _build_prompt(
        self, examples: list[ActivatingExample | NonActivatingExample]
    ) -> list[dict]:
        """
        Build a prompt with both activating and non-activating examples clearly labeled.

        Args:
            examples: List containing both activating and non-activating examples.

        Returns:
            A list of message dictionaries for the prompt.
        """
        highlighted_examples: list[str] = []

        # Separate the two kinds of examples so each gets its own section.
        activating_examples = [
            ex for ex in examples if isinstance(ex, ActivatingExample)
        ]
        non_activating_examples = [
            ex for ex in examples if not isinstance(ex, ActivatingExample)
        ]

        # Process activating examples.
        if activating_examples:
            highlighted_examples.append("EXAMPLES:")
            for i, example in enumerate(activating_examples, 1):
                str_toks = example.str_tokens
                activations = example.activations.tolist()
                highlighted_examples.append(
                    f"Example {i}: {self._highlight(str_toks, activations)}"
                )

                if self.activations and example.normalized_activations is not None:
                    normalized_activations = example.normalized_activations.tolist()
                    highlighted_examples.append(
                        self._join_activations(
                            str_toks, activations, normalized_activations
                        )
                    )

        # Process non-activating examples.
        if non_activating_examples:
            highlighted_examples.append("\nCOUNTEREXAMPLES:")
            for i, example in enumerate(non_activating_examples, 1):
                str_toks = example.str_tokens
                activations = example.activations.tolist()
                # Note: For non-activating examples, the _highlight method won't
                # highlight anything since activation values will be below threshold.
                highlighted_examples.append(
                    f"Example {i}: {self._highlight(str_toks, activations)}"
                )

        # Join all sections into a single string.
        highlighted_examples_str = "\n".join(highlighted_examples)

        # Create messages array with the system prompt.
        return [
            {
                "role": "system",
                "content": SYSTEM_CONTRASTIVE.format(prompt=""),
            },
            {
                "role": "user",
                "content": f"WORDS: {highlighted_examples_str}",
            },
        ]

    def call_sync(self, record):
        """Synchronous wrapper for the asynchronous __call__ method."""
        return asyncio.run(self.__call__(record))
0 commit comments