Renaming hooks

SrGonao · SrGonao · commit 7810443d72a8 · 2025-02-18T09:23:40.000Z
diff --git a/delphi/__main__.py b/delphi/__main__.py
@@ -31,7 +31,7 @@
 from delphi.log.result_analysis import log_results
 from delphi.pipeline import Pipe, Pipeline, process_wrapper
 from delphi.scorers import DetectionScorer, FuzzingScorer
-from delphi.sparse_coders import load_sparse_coders
+from delphi.sparse_coders import load_hooks_sparse_coders
 
 
 def load_artifacts(run_cfg: RunConfig):
@@ -54,7 +54,7 @@ def load_artifacts(run_cfg: RunConfig):
         token=run_cfg.hf_token,
     )
 
-    hookpoint_to_sparse_encode = load_sparse_coders(model, run_cfg, compile=True)
+    hookpoint_to_sparse_encode = load_hooks_sparse_coders(model, run_cfg, compile=True)
 
     return run_cfg.hookpoints, hookpoint_to_sparse_encode, model
 
diff --git a/delphi/sparse_coders/sparse_model.py b/delphi/sparse_coders/sparse_model.py
@@ -0,0 +1,123 @@
+from typing import Callable
+
+import torch.nn as nn
+from transformers import PreTrainedModel
+
+from delphi.config import RunConfig
+
+from .custom.gemmascope import load_gemma_autoencoders
+from .load_sparsify import load_sparsify_hooks, load_sparsify_sparse_coders
+
+
+def load_hooks_sparse_coders(
+    model: PreTrainedModel,
+    run_cfg: RunConfig,
+    compile: bool = False,
+) -> dict[str, Callable]:
+    """
+    Load sparse coder encode callables (hooks) for specified hookpoints.
+
+    Args:
+        model (PreTrainedModel): The model to load sparse coders for.
+        run_cfg (RunConfig): The run configuration.
+
+    Returns:
+        dict[str, Callable]: A dictionary mapping hookpoints to encode callables.
+    """
+
+    # Add SAE hooks to the model
+    if "gemma" not in run_cfg.sparse_model:
+        hookpoint_to_sparse_encode = load_sparsify_hooks(
+            model,
+            run_cfg.sparse_model,
+            run_cfg.hookpoints,
+            compile=compile,
+        )
+    else:
+        # model path will always be of the form google/gemma-scope-<size>-pt-<type>/
+        # where <size> is the size of the model and <type> is either res or mlp
+        model_path = "google/" + run_cfg.sparse_model.split("/")[1]
+        type = model_path.split("-")[-1]
+        # we can use the hookpoints to determine the layer, size and l0,
+        # because the module is determined by the model name
+        # the hookpoint should be in the format
+        # layer_<layer>/width_<sae_size>/average_l0_<l0>
+        layers = []
+        l0s = []
+        sae_sizes = []
+        for hookpoint in run_cfg.hookpoints:
+            layer = int(hookpoint.split("/")[0].split("_")[1])
+            sae_size = hookpoint.split("/")[1].split("_")[1]
+            l0 = int(hookpoint.split("/")[2].split("_")[2])
+            layers.append(layer)
+            sae_sizes.append(sae_size)
+            l0s.append(l0)
+
+        hookpoint_to_sparse_encode = load_gemma_autoencoders(
+            model_path=model_path,
+            ae_layers=layers,
+            average_l0s=l0s,
+            sizes=sae_sizes,
+            type=type,
+            dtype=model.dtype,
+            device=model.device,
+        )
+
+    return hookpoint_to_sparse_encode
+
+
+def load_sparse_coders(
+    model: PreTrainedModel,
+    run_cfg: RunConfig,
+    compile: bool = False,
+) -> dict[str, nn.Module]:
+    """
+    Load sparse coders for specified hookpoints.
+
+    Args:
+        model (PreTrainedModel): The model to load sparse coders for.
+        run_cfg (RunConfig): The run configuration.
+
+    Returns:
+        dict[str, nn.Module]: A dictionary mapping hookpoints to sparse coders.
+    """
+
+    # Load the sparse coders for the requested hookpoints
+    if "gemma" not in run_cfg.sparse_model:
+        hookpoint_to_sparse_model = load_sparsify_sparse_coders(
+            model,
+            run_cfg.sparse_model,
+            run_cfg.hookpoints,
+            compile=compile,
+        )
+    else:
+        # model path will always be of the form google/gemma-scope-<size>-pt-<type>/
+        # where <size> is the size of the model and <type> is either res or mlp
+        model_path = "google/" + run_cfg.sparse_model.split("/")[1]
+        type = model_path.split("-")[-1]
+        # we can use the hookpoints to determine the layer, size and l0,
+        # because the module is determined by the model name
+        # the hookpoint should be in the format
+        # layer_<layer>/width_<sae_size>/average_l0_<l0>
+        layers = []
+        l0s = []
+        sae_sizes = []
+        for hookpoint in run_cfg.hookpoints:
+            layer = int(hookpoint.split("/")[0].split("_")[1])
+            sae_size = hookpoint.split("/")[1].split("_")[1]
+            l0 = int(hookpoint.split("/")[2].split("_")[2])
+            layers.append(layer)
+            sae_sizes.append(sae_size)
+            l0s.append(l0)
+
+        hookpoint_to_sparse_model = load_gemma_autoencoders(
+            model_path=model_path,
+            ae_layers=layers,
+            average_l0s=l0s,
+            sizes=sae_sizes,
+            type=type,
+            dtype=model.dtype,
+            device=model.device,
+        )
+
+    return hookpoint_to_sparse_model
diff --git a/delphi/tests/conftest.py b/delphi/tests/conftest.py
@@ -4,7 +4,7 @@
 
 from delphi.config import CacheConfig, RunConfig
 from delphi.latents import LatentCache
-from delphi.sparse_coders import load_sparse_coders
+from delphi.sparse_coders import load_hooks_sparse_coders
 
 random_text = [
     "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
@@ -62,7 +62,7 @@ def cache_setup(
         sparse_model="EleutherAI/sae-pythia-70m-32k",
         hookpoints=["layers.1"],
     )
-    hookpoint_to_sparse_encode = load_sparse_coders(model, run_cfg_gemma)
+    hookpoint_to_sparse_encode = load_hooks_sparse_coders(model, run_cfg_gemma)
 
     # Define cache config and initialize cache
     cache_cfg = CacheConfig(batch_size=1, ctx_len=16, n_tokens=100)
diff --git a/delphi/tests/test_autoencoders/test_sparse_coders.py b/delphi/tests/test_autoencoders/test_sparse_coders.py
@@ -3,7 +3,7 @@
 import torch.nn as nn
 
 # Import the function to be tested
-from delphi.sparse_coders import load_sparse_coders
+from delphi.sparse_coders import load_hooks_sparse_coders
 
 
 # A simple dummy run configuration for testing.
@@ -69,16 +69,17 @@ def run_cfg_gemma():
 
 def test_retrieve_autoencoders_from_sparsify(dummy_model, run_cfg_sparsify):
     """
-    Tests that load_sparse_coders retrieves autoencoders from Sparsify.
+    Tests that load_hooks_sparse_coders retrieves autoencoders from Sparsify.
     """
-    submodules = load_sparse_coders(dummy_model, run_cfg_sparsify)
+    hookpoint_to_sparse_encode = load_hooks_sparse_coders(dummy_model, run_cfg_sparsify)
     # Verify that we received a dictionary of autoencoders.
     assert (
-        isinstance(submodules, dict) and len(submodules) > 0
+        isinstance(hookpoint_to_sparse_encode, dict)
+        and len(hookpoint_to_sparse_encode) > 0
     ), "No autoencoders retrieved from the Sparsify branch."
 
     # Validate that at least one autoencoder is callable.
-    for key, autoencoder in submodules.items():
+    for key, autoencoder in hookpoint_to_sparse_encode.items():
         dummy_input = torch.randn(2, 512)
         try:
             _ = autoencoder(dummy_input)
@@ -91,16 +92,17 @@ def test_retrieve_autoencoders_from_sparsify(dummy_model, run_cfg_sparsify):
 
 def test_retrieve_autoencoders_from_gemma(dummy_model, run_cfg_gemma):
     """
-    Tests that load_sparse_coders retrieves autoencoders from Gemma.
+    Tests that load_hooks_sparse_coders retrieves autoencoders from Gemma.
     """
-    submodules = load_sparse_coders(dummy_model, run_cfg_gemma)
+    hookpoint_to_sparse_encode = load_hooks_sparse_coders(dummy_model, run_cfg_gemma)
     # Verify that we received a dictionary of autoencoders.
     assert (
-        isinstance(submodules, dict) and len(submodules) > 0
+        isinstance(hookpoint_to_sparse_encode, dict)
+        and len(hookpoint_to_sparse_encode) > 0
     ), "No autoencoders retrieved from the Gemma branch."
 
     # Validate that at least one autoencoder is callable.
-    for key, autoencoder in submodules.items():
+    for key, autoencoder in hookpoint_to_sparse_encode.items():
         dummy_input = torch.randn(2, 2304)
         try:
             _ = autoencoder(dummy_input)
diff --git a/examples/caching_activations.ipynb b/examples/caching_activations.ipynb
@@ -36,7 +36,7 @@
    "source": [
     "from transformers import AutoModel\n",
     "\n",
-    "from delphi.sparse_coders import load_sparse_coders\n",
+    "from delphi.sparse_coders import load_hooks_sparse_coders\n",
     "from delphi.config import RunConfig\n"
    ]
   },
@@ -87,7 +87,7 @@
     "    hookpoints=[\"layer_10/width_16k/average_l0_39\"],\n",
     ")\n",
     "\n",
-    "hookpoint_to_sparse_encode = load_sparse_coders(model, run_cfg)"
+    "hookpoint_to_sparse_encode = load_hooks_sparse_coders(model, run_cfg)"
    ]
   },
   {

Original file line number	Diff line number	Diff line change
`@@ -36,7 +36,7 @@`
`36`	`36`	`"source": [`
`37`	`37`	`"from transformers import AutoModel\n",`
`38`	`38`	`"\n",`
`39`		`- "from delphi.sparse_coders import load_sparse_coders\n",`
	`39`	`+ "from delphi.sparse_coders import load_hooks_sparse_coders\n",`
`40`	`40`	`"from delphi.config import RunConfig\n"`
`41`	`41`	`]`
`42`	`42`	`},`
`@@ -87,7 +87,7 @@`
`87`	`87`	`" hookpoints=[\"layer_10/width_16k/average_l0_39\"],\n",`
`88`	`88`	`")\n",`
`89`	`89`	`"\n",`
`90`		`- "hookpoint_to_sparse_encode = load_sparse_coders(model, run_cfg)"`
	`90`	`+ "hookpoint_to_sparse_encode = load_hooks_sparse_coders(model, run_cfg)"`
`91`	`91`	`]`
`92`	`92`	`},`
`93`	`93`	`{`