
Commit bcae2b8

Author: SrGonao
Add load sparse_coders and rename load_hooks
1 parent 09e888e commit bcae2b8

File tree

2 files changed: +77 -21 lines changed

delphi/sparse_coders/custom/gemmascope.py

Lines changed: 33 additions & 7 deletions
@@ -15,18 +15,14 @@ def load_gemma_autoencoders(
     dtype: torch.dtype = torch.bfloat16,
     device: torch.device = torch.device("cuda"),
 ):
-    submodules = {}
+    saes = {}
 
     for layer, size, l0 in zip(ae_layers, sizes, average_l0s):
         path = f"layer_{layer}/width_{size}/average_l0_{l0}"
         sae = JumpReluSae.from_pretrained(model_path, path, device)
 
         sae.to(dtype)
 
-        def _forward(sae, x):
-            encoded = sae.encode(x)
-            return encoded
-
         assert type in [
             "res",
             "mlp",
@@ -37,9 +33,39 @@ def _forward(sae, x):
             else f"layers.{layer}.post_feedforward_layernorm"
         )
 
-        submodules[hookpoint] = partial(_forward, sae)
+        saes[hookpoint] = sae
+
+    return saes
+
+
+def load_gemma_hooks(
+    model_path: str,
+    ae_layers: list[int],
+    average_l0s: list[int],
+    sizes: list[str],
+    type: str,
+    dtype: torch.dtype = torch.bfloat16,
+    device: torch.device = torch.device("cuda"),
+):
+    saes = load_gemma_autoencoders(
+        model_path,
+        ae_layers,
+        average_l0s,
+        sizes,
+        type,
+        dtype,
+        device,
+    )
+    hookpoint_to_sparse_encode = {}
+    for hookpoint, sae in saes.items():
+
+        def _forward(sae, x):
+            encoded = sae.encode(x)
+            return encoded
+
+        hookpoint_to_sparse_encode[hookpoint] = partial(_forward, sae)
 
-    return submodules
+    return hookpoint_to_sparse_encode
 
 
 # This is from the GemmaScope tutorial
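The net effect in gemmascope.py: load_gemma_autoencoders now returns the raw JumpReluSae modules keyed by hookpoint, and the new load_gemma_hooks wraps each module's encode method in a callable. Binding the current sae through partial(_forward, sae) freezes the loop value per iteration, which sidesteps Python's late-binding closure pitfall. A minimal usage sketch; the repo id, layer/width/L0 values, and the hidden size of the fake activation are illustrative assumptions, not values taken from this diff:

import torch

from delphi.sparse_coders.custom.gemmascope import (
    load_gemma_autoencoders,
    load_gemma_hooks,
)

# Hypothetical GemmaScope release and settings, for illustration only.
# Defaults load onto CUDA in bfloat16, per the function signatures above.
kwargs = dict(
    model_path="google/gemma-scope-2b-pt-res",
    ae_layers=[5],
    average_l0s=[47],
    sizes=["16k"],
    type="res",
)

saes = load_gemma_autoencoders(**kwargs)  # hookpoint -> JumpReluSae module
hooks = load_gemma_hooks(**kwargs)        # hookpoint -> encode callable

hookpoint = next(iter(hooks))  # avoids guessing the exact hookpoint string
x = torch.randn(1, 2304, dtype=torch.bfloat16, device="cuda")  # assumed width
latents = hooks[hookpoint](x)  # equivalent to saes[hookpoint].encode(x)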

delphi/sparse_coders/load_sparsify.py

Lines changed: 44 additions & 14 deletions
@@ -54,49 +54,79 @@ def load_sparsify_sparse_coders(
         model (Any): The model to load autoencoders for.
         name (str): The name of the sparse model to load. If the model is on-disk
             this is the path to the directory containing the sparse model weights.
-        hookpoints (list[str]): list of hookpoints to load autoencoders for.
+        hookpoints (list[str]): list of hookpoints to identify the sparse models.
         device (str | torch.device | None, optional): The device to load the
             sparse models on. If not specified the sparse models will be loaded
             on the same device as the base model.
 
     Returns:
-        tuple[dict[str, Any], Any]: A tuple containing the submodules dictionary
-            and the edited model.
+        dict[str, Any]: A dictionary mapping hookpoints to sparse models.
     """
     if device is None:
         device = model.device or "cpu"
 
     # Load the sparse models
-    hookpoint_to_sparse = {}
+    sparse_model_dict = {}
     name_path = Path(name)
     if name_path.exists():
         for hookpoint in hookpoints:
-            hookpoint_to_sparse[hookpoint] = Sae.load_from_disk(
+            sparse_model_dict[hookpoint] = Sae.load_from_disk(
                 name_path / hookpoint, device=device
             )
             if compile:
-                hookpoint_to_sparse[hookpoint] = torch.compile(
-                    hookpoint_to_sparse[hookpoint]
+                sparse_model_dict[hookpoint] = torch.compile(
+                    sparse_model_dict[hookpoint]
                 )
     else:
         sparse_models = Sae.load_many(name, device=device)
         for hookpoint in hookpoints:
-            hookpoint_to_sparse[hookpoint] = sparse_models[hookpoint]
+            sparse_model_dict[hookpoint] = sparse_models[hookpoint]
             if compile:
-                hookpoint_to_sparse[hookpoint] = torch.compile(
-                    hookpoint_to_sparse[hookpoint]
+                sparse_model_dict[hookpoint] = torch.compile(
+                    sparse_model_dict[hookpoint]
                 )
 
         del sparse_models
+    return sparse_model_dict
 
-    submodules = {}
-    for hookpoint, sparse_model in hookpoint_to_sparse.items():
+
+def load_sparsify_hooks(
+    model: PreTrainedModel,
+    name: str,
+    hookpoints: list[str],
+    device: str | torch.device | None = None,
+    compile: bool = False,
+) -> dict[str, Callable]:
+    """
+    Load the encode functions for sparsify sparse coders on specified hookpoints.
+
+    Args:
+        model (Any): The model to load autoencoders for.
+        name (str): The name of the sparse model to load. If the model is on-disk
+            this is the path to the directory containing the sparse model weights.
+        hookpoints (list[str]): list of hookpoints to identify the sparse models.
+        device (str | torch.device | None, optional): The device to load the
+            sparse models on. If not specified the sparse models will be loaded
+            on the same device as the base model.
+
+    Returns:
+        dict[str, Callable]: A dictionary mapping hookpoints to encode functions.
+    """
+    sparse_model_dict = load_sparsify_sparse_coders(
+        model,
+        name,
+        hookpoints,
+        device,
+        compile,
+    )
+    hookpoint_to_sparse_encode = {}
+    for hookpoint, sparse_model in sparse_model_dict.items():
         path_segments = resolve_path(model, hookpoint.split("."))
         if path_segments is None:
             raise ValueError(f"Could not find valid path for hookpoint: {hookpoint}")
 
-        submodules[".".join(path_segments)] = partial(
+        hookpoint_to_sparse_encode[".".join(path_segments)] = partial(
             sae_dense_latents, sae=sparse_model
         )
 
-    return submodules
+    return hookpoint_to_sparse_encode
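load_sparsify.py gets the same split: load_sparsify_sparse_coders now returns a plain hookpoint-to-model dictionary (no longer a tuple with an edited model), while the new load_sparsify_hooks resolves each hookpoint to its module path on the base model and wraps the coder with sae_dense_latents. Note the keys differ: the first function keys by the hookpoints you pass in, the second by the resolved path segments. A hedged sketch of the calling convention; the base model and sparse-coder names below are placeholders, not checkpoints referenced by this commit:

from transformers import AutoModelForCausalLM

from delphi.sparse_coders.load_sparsify import (
    load_sparsify_hooks,
    load_sparsify_sparse_coders,
)

# Placeholder identifiers; substitute your own base model and sparse-coder
# checkpoint (a hub name, or a local directory of weights).
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-160m")
name = "path/or/hub-id/of-sparse-coders"
hookpoints = ["layers.5"]

# Raw sparse models, keyed by the hookpoints passed in:
coders = load_sparsify_sparse_coders(model, name, hookpoints)

# Encode callables, keyed by the module path resolved on `model`:
encode_fns = load_sparsify_hooks(model, name, hookpoints)
for path, encode in encode_fns.items():
    print(path)  # e.g. the resolved form of "layers.5" on this architecture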
