EleutherAI
diff --git a/‎README.md‎
Lines changed: 2 additions & 2 deletions b/‎README.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎delphi/__main__.py‎
Lines changed: 2 additions & 2 deletions b/‎delphi/__main__.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎delphi/explainers/default/prompt_builder.py‎
Lines changed: 2 additions & 1 deletion b/‎delphi/explainers/default/prompt_builder.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎delphi/sparse_coders/__init__.py‎
Lines changed: 2 additions & 65 deletions b/‎delphi/sparse_coders/__init__.py‎
Lines changed: 2 additions & 65 deletions
diff --git a/‎delphi/sparse_coders/custom/gemmascope.py‎
Lines changed: 33 additions & 7 deletions b/‎delphi/sparse_coders/custom/gemmascope.py‎
Lines changed: 33 additions & 7 deletions
diff --git a/‎delphi/sparse_coders/load_sparsify.py‎
Lines changed: 44 additions & 14 deletions b/‎delphi/sparse_coders/load_sparsify.py‎
Lines changed: 44 additions & 14 deletions
@@ -16,13 +16,13 @@ Install this library as a local editable installation. Run the following command
 
 To run the default pipeline from the command line, use the following command:
 
-`python -m delphi meta-llama/Meta-Llama-3-8B EleutherAI/sae-llama-3-8b-32x --explainer_model 'hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4' --dataset_repo 'EleutherAI/rpj-v2-sample' --dataset_split 'train[:1%]' --n_tokens 10_000_000 --max_features 100 --hookpoints layers.5 --filter_bos`
+`python -m delphi meta-llama/Meta-Llama-3-8B EleutherAI/sae-llama-3-8b-32x --explainer_model 'hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4' --dataset_repo 'EleutherAI/fineweb-edu-dedup-10b' --dataset_split 'train[:1%]' --n_tokens 10_000_000 --max_latents 100 --hookpoints layers.5 --filter_bos --name llama-3-8B` 
 
 This command will:
 1. Cache activations for the first 10 million tokens of EleutherAI/fineweb-edu-dedup-10b.
 2. Generate explanations for the first 100 latents of layer 5 using the specified explainer model.
 3. Score the explanations using fuzzing and detection scorers.
-4. Log summary metrics including per-scorer F1 scores and confusion matrices.
+4. Log summary metrics including per-scorer F1 scores and confusion matrices, and produce histograms of the scorer classification accuracies.
 
 The pipeline is highly configurable and can also be called programmatically (see the [end-to-end test](https://github.com/EleutherAI/delphi/blob/main/delphi/tests/e2e.py) for an example).
 
 
@@ -31,7 +31,7 @@
 from delphi.log.result_analysis import log_results
 from delphi.pipeline import Pipe, Pipeline, process_wrapper
 from delphi.scorers import DetectionScorer, FuzzingScorer
-from delphi.sparse_coders import load_sparse_coders
+from delphi.sparse_coders import load_hooks_sparse_coders
 
 
 def load_artifacts(run_cfg: RunConfig):
@@ -54,7 +54,7 @@ def load_artifacts(run_cfg: RunConfig):
         token=run_cfg.hf_token,
     )
 
-    hookpoint_to_sparse_encode = load_sparse_coders(model, run_cfg, compile=True)
+    hookpoint_to_sparse_encode = load_hooks_sparse_coders(model, run_cfg, compile=True)
 
     return run_cfg.hookpoints, hookpoint_to_sparse_encode, model
 
 
@@ -41,14 +41,15 @@ def build_prompt(
 
     messages.extend(few_shot_examples)
 
-    user_start = f"WORDS: {examples}"
+    user_start = f"\n{examples}\n"
 
     messages.append(
         {
             "role": "user",
             "content": user_start,
         }
     )
+    print(messages)
 
     return messages
 
 
@@ -1,67 +1,4 @@
-from typing import Callable
+from .sparse_model import load_hooks_sparse_coders, load_sparse_coders
 
-from transformers import PreTrainedModel
+__all__ = ["load_hooks_sparse_coders", "load_sparse_coders"]
 
-from delphi.config import RunConfig
-
-from .custom.gemmascope import load_gemma_autoencoders
-from .load_sparsify import load_sparsify_sparse_coders
-
-__all__ = ["load_sparse_coders"]
-
-
-def load_sparse_coders(
-    model: PreTrainedModel,
-    run_cfg: RunConfig,
-    compile: bool = False,
-) -> dict[str, Callable]:
-    """
-    Load sparse coders for specified hookpoints.
-
-    Args:
-        model (PreTrainedModel): The model to load sparse coders for.
-        run_cfg (RunConfig): The run configuration.
-
-    Returns:
-        dict[str, Callable]: A dictionary mapping hookpoints to sparse coders.
-    """
-
-    # Add SAE hooks to the model
-    if "gemma" not in run_cfg.sparse_model:
-        hookpoint_to_sparse_encode = load_sparsify_sparse_coders(
-            model,
-            run_cfg.sparse_model,
-            run_cfg.hookpoints,
-            compile=compile,
-        )
-    else:
-        # model path will always be of the form google/gemma-scope-<size>-pt-<type>/
-        # where <size> is the size of the model and <type> is either res or mlp
-        model_path = "google/" + run_cfg.sparse_model.split("/")[1]
-        type = model_path.split("-")[-1]
-        # we can use the hookpoints to determine the layer, size and l0,
-        # because the module is determined by the model name
-        # the hookpoint should be in the format
-        # layer_<layer>/width_<sae_size>/average_l0_<l0>
-        layers = []
-        l0s = []
-        sae_sizes = []
-        for hookpoint in run_cfg.hookpoints:
-            layer = int(hookpoint.split("/")[0].split("_")[1])
-            sae_size = hookpoint.split("/")[1].split("_")[1]
-            l0 = int(hookpoint.split("/")[2].split("_")[2])
-            layers.append(layer)
-            sae_sizes.append(sae_size)
-            l0s.append(l0)
-
-        hookpoint_to_sparse_encode = load_gemma_autoencoders(
-            model_path=model_path,
-            ae_layers=layers,
-            average_l0s=l0s,
-            sizes=sae_sizes,
-            type=type,
-            dtype=model.dtype,
-            device=model.device,
-        )
-
-    return hookpoint_to_sparse_encode
@@ -15,18 +15,14 @@ def load_gemma_autoencoders(
     dtype: torch.dtype = torch.bfloat16,
     device: torch.device = torch.device("cuda"),
 ):
-    submodules = {}
+    saes = {}
 
     for layer, size, l0 in zip(ae_layers, sizes, average_l0s):
         path = f"layer_{layer}/width_{size}/average_l0_{l0}"
         sae = JumpReluSae.from_pretrained(model_path, path, device)
 
         sae.to(dtype)
 
-        def _forward(sae, x):
-            encoded = sae.encode(x)
-            return encoded
-
         assert type in [
             "res",
             "mlp",
@@ -37,9 +33,39 @@ def _forward(sae, x):
             else f"layers.{layer}.post_feedforward_layernorm"
         )
 
-        submodules[hookpoint] = partial(_forward, sae)
+        saes[hookpoint] = sae
+
+    return saes
+
+
+def load_gemma_hooks(
+    model_path: str,
+    ae_layers: list[int],
+    average_l0s: list[int],
+    sizes: list[str],
+    type: str,
+    dtype: torch.dtype = torch.bfloat16,
+    device: torch.device = torch.device("cuda"),
+):
+    saes = load_gemma_autoencoders(
+        model_path,
+        ae_layers,
+        average_l0s,
+        sizes,
+        type,
+        dtype,
+        device,
+    )
+    hookpoint_to_sparse_encode = {}
+    for hookpoint, sae in saes.items():
+
+        def _forward(sae, x):
+            encoded = sae.encode(x)
+            return encoded
+
+        hookpoint_to_sparse_encode[hookpoint] = partial(_forward, sae)
 
-    return submodules
+    return hookpoint_to_sparse_encode
 
 
 # This is from the GemmaScope tutorial
 
@@ -54,49 +54,79 @@ def load_sparsify_sparse_coders(
         model (Any): The model to load autoencoders for.
         name (str): The name of the sparse model to load. If the model is on-disk
             this is the path to the directory containing the sparse model weights.
-        hookpoints (list[str]): list of hookpoints to load autoencoders for.
+        hookpoints (list[str]): list of hookpoints to identify the sparse models.
         device (str | torch.device | None, optional): The device to load the
             sparse models on. If not specified the sparse models will be loaded
             on the same device as the base model.
 
     Returns:
-        tuple[dict[str, Any], Any]: A tuple containing the submodules dictionary
-            and the edited model.
+        dict[str, Any]: A dictionary mapping hookpoints to sparse models.
     """
     if device is None:
         device = model.device or "cpu"
 
     # Load the sparse models
-    hookpoint_to_sparse = {}
+    sparse_model_dict = {}
     name_path = Path(name)
     if name_path.exists():
         for hookpoint in hookpoints:
-            hookpoint_to_sparse[hookpoint] = Sae.load_from_disk(
+            sparse_model_dict[hookpoint] = Sae.load_from_disk(
                 name_path / hookpoint, device=device
             )
             if compile:
-                hookpoint_to_sparse[hookpoint] = torch.compile(
-                    hookpoint_to_sparse[hookpoint]
+                sparse_model_dict[hookpoint] = torch.compile(
+                    sparse_model_dict[hookpoint]
                 )
     else:
         sparse_models = Sae.load_many(name, device=device)
         for hookpoint in hookpoints:
-            hookpoint_to_sparse[hookpoint] = sparse_models[hookpoint]
+            sparse_model_dict[hookpoint] = sparse_models[hookpoint]
             if compile:
-                hookpoint_to_sparse[hookpoint] = torch.compile(
-                    hookpoint_to_sparse[hookpoint]
+                sparse_model_dict[hookpoint] = torch.compile(
+                    sparse_model_dict[hookpoint]
                 )
 
         del sparse_models
+    return sparse_model_dict
 
-    submodules = {}
-    for hookpoint, sparse_model in hookpoint_to_sparse.items():
+
+def load_sparsify_hooks(
+    model: PreTrainedModel,
+    name: str,
+    hookpoints: list[str],
+    device: str | torch.device | None = None,
+    compile: bool = False,
+) -> dict[str, Callable]:
+    """
+    Load the encode functions for sparsify sparse coders on specified hookpoints.
+
+    Args:
+        model (Any): The model to load autoencoders for.
+        name (str): The name of the sparse model to load. If the model is on-disk
+            this is the path to the directory containing the sparse model weights.
+        hookpoints (list[str]): list of hookpoints to identify the sparse models.
+        device (str | torch.device | None, optional): The device to load the
+            sparse models on. If not specified the sparse models will be loaded
+            on the same device as the base model.
+
+    Returns:
+        dict[str, Callable]: A dictionary mapping hookpoints to encode functions.
+    """
+    sparse_model_dict = load_sparsify_sparse_coders(
+        model,
+        name,
+        hookpoints,
+        device,
+        compile,
+    )
+    hookpoint_to_sparse_encode = {}
+    for hookpoint, sparse_model in sparse_model_dict.items():
         path_segments = resolve_path(model, hookpoint.split("."))
         if path_segments is None:
             raise ValueError(f"Could not find valid path for hookpoint: {hookpoint}")
 
-        submodules[".".join(path_segments)] = partial(
+        hookpoint_to_sparse_encode[".".join(path_segments)] = partial(
             sae_dense_latents, sae=sparse_model
         )
 
-    return submodules
+    return hookpoint_to_sparse_encode
Original file line number	Diff line number	Diff line change
`@@ -41,14 +41,15 @@ def build_prompt(`
`41`	`41`
`42`	`42`	`messages.extend(few_shot_examples)`
`43`	`43`
`44`		`- user_start = f"WORDS: {examples}"`
	`44`	`+ user_start = f"\n{examples}\n"`
`45`	`45`
`46`	`46`	`messages.append(`
`47`	`47`	`{`
`48`	`48`	`"role": "user",`
`49`	`49`	`"content": user_start,`
`50`	`50`	`}`
`51`	`51`	`)`
	`52`	`+ print(messages)`
`52`	`53`
`53`	`54`	`return messages`
`54`	`55`