Commit f5beb32
feat: adding support for loading goodfire llama 3 SAEs (#579)
1 parent 137e3c7 commit f5beb32

File tree

3 files changed: +326 −1 lines changed

sae_lens/loading/pretrained_sae_loaders.py

Lines changed: 78 additions & 0 deletions
@@ -523,6 +523,82 @@ def gemma_2_sae_huggingface_loader(
     return cfg_dict, state_dict, log_sparsity


+def get_goodfire_config_from_hf(
+    repo_id: str,
+    folder_name: str,
+    device: str,
+    force_download: bool = False,  # noqa: ARG001
+    cfg_overrides: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    cfg_dict = None
+    if repo_id == "Goodfire/Llama-3.3-70B-Instruct-SAE-l50":
+        if folder_name != "Llama-3.3-70B-Instruct-SAE-l50.pt":
+            raise ValueError(f"Unsupported Goodfire SAE: {repo_id}/{folder_name}")
+        cfg_dict = {
+            "architecture": "standard",
+            "d_in": 8192,
+            "d_sae": 65536,
+            "model_name": "meta-llama/Llama-3.3-70B-Instruct",
+            "hook_name": "blocks.50.hook_resid_post",
+            "hook_head_index": None,
+            "dataset_path": "lmsys/lmsys-chat-1m",
+            "apply_b_dec_to_input": False,
+        }
+    elif repo_id == "Goodfire/Llama-3.1-8B-Instruct-SAE-l19":
+        if folder_name != "Llama-3.1-8B-Instruct-SAE-l19.pth":
+            raise ValueError(f"Unsupported Goodfire SAE: {repo_id}/{folder_name}")
+        cfg_dict = {
+            "architecture": "standard",
+            "d_in": 4096,
+            "d_sae": 65536,
+            "model_name": "meta-llama/Llama-3.1-8B-Instruct",
+            "hook_name": "blocks.19.hook_resid_post",
+            "hook_head_index": None,
+            "dataset_path": "lmsys/lmsys-chat-1m",
+            "apply_b_dec_to_input": False,
+        }
+    if cfg_dict is None:
+        raise ValueError(f"Unsupported Goodfire SAE: {repo_id}/{folder_name}")
+    if device is not None:
+        cfg_dict["device"] = device
+    if cfg_overrides is not None:
+        cfg_dict.update(cfg_overrides)
+    return cfg_dict
+
+
+def get_goodfire_huggingface_loader(
+    repo_id: str,
+    folder_name: str,
+    device: str = "cpu",
+    force_download: bool = False,
+    cfg_overrides: dict[str, Any] | None = None,
+) -> tuple[dict[str, Any], dict[str, torch.Tensor], torch.Tensor | None]:
+    cfg_dict = get_goodfire_config_from_hf(
+        repo_id,
+        folder_name,
+        device,
+        force_download,
+        cfg_overrides,
+    )
+
+    # Download the SAE weights
+    sae_path = hf_hub_download(
+        repo_id=repo_id,
+        filename=folder_name,
+        force_download=force_download,
+    )
+    raw_state_dict = torch.load(sae_path, map_location=device)
+
+    state_dict = {
+        "W_enc": raw_state_dict["encoder_linear.weight"].T,
+        "W_dec": raw_state_dict["decoder_linear.weight"].T,
+        "b_enc": raw_state_dict["encoder_linear.bias"],
+        "b_dec": raw_state_dict["decoder_linear.bias"],
+    }
+
+    return cfg_dict, state_dict, None
+
+
 def get_llama_scope_config_from_hf(
     repo_id: str,
     folder_name: str,
@@ -1487,6 +1563,7 @@ def get_mntss_clt_layer_config_from_hf(
     "gemma_2_transcoder": gemma_2_transcoder_huggingface_loader,
     "mwhanna_transcoder": mwhanna_transcoder_huggingface_loader,
     "mntss_clt_layer_transcoder": mntss_clt_layer_huggingface_loader,
+    "goodfire": get_goodfire_huggingface_loader,
 }

@@ -1502,4 +1579,5 @@ def get_mntss_clt_layer_config_from_hf(
     "gemma_2_transcoder": get_gemma_2_transcoder_config_from_hf,
     "mwhanna_transcoder": get_mwhanna_transcoder_config_from_hf,
     "mntss_clt_layer_transcoder": get_mntss_clt_layer_config_from_hf,
+    "goodfire": get_goodfire_config_from_hf,
 }
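
A note on the weight layout: torch.nn.Linear stores its weight as (out_features, in_features), while sae_lens expects W_enc with shape (d_in, d_sae) and W_dec with shape (d_sae, d_in), hence the .T transposes in the loader above. A minimal self-contained sketch of that bookkeeping, using toy sizes rather than the real 8192 x 65536 checkpoint:

import torch

d_in, d_sae = 4, 8
raw_state_dict = {
    "encoder_linear.weight": torch.randn(d_sae, d_in),  # nn.Linear: (out, in)
    "encoder_linear.bias": torch.randn(d_sae),
    "decoder_linear.weight": torch.randn(d_in, d_sae),
    "decoder_linear.bias": torch.randn(d_in),
}
state_dict = {
    "W_enc": raw_state_dict["encoder_linear.weight"].T,  # -> (d_in, d_sae)
    "W_dec": raw_state_dict["decoder_linear.weight"].T,  # -> (d_sae, d_in)
    "b_enc": raw_state_dict["encoder_linear.bias"],
    "b_dec": raw_state_dict["decoder_linear.bias"],
}
assert state_dict["W_enc"].shape == (d_in, d_sae)
assert state_dict["W_dec"].shape == (d_sae, d_in)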

sae_lens/pretrained_saes.yaml

Lines changed: 19 additions & 1 deletion
@@ -14882,4 +14882,22 @@ qwen2.5-7b-instruct-andyrdt:
     neuronpedia: qwen2.5-7b-it/23-resid-post-aa
   - id: resid_post_layer_27_trainer_1
     path: resid_post_layer_27/trainer_1
-    neuronpedia: qwen2.5-7b-it/27-resid-post-aa
+    neuronpedia: qwen2.5-7b-it/27-resid-post-aa
+
+goodfire-llama-3.3-70b-instruct:
+  conversion_func: goodfire
+  model: meta-llama/Llama-3.3-70B-Instruct
+  repo_id: Goodfire/Llama-3.3-70B-Instruct-SAE-l50
+  saes:
+  - id: layer_50
+    path: Llama-3.3-70B-Instruct-SAE-l50.pt
+    l0: 121
+
+goodfire-llama-3.1-8b-instruct:
+  conversion_func: goodfire
+  model: meta-llama/Llama-3.1-8B-Instruct
+  repo_id: Goodfire/Llama-3.1-8B-Instruct-SAE-l19
+  saes:
+  - id: layer_19
+    path: Llama-3.1-8B-Instruct-SAE-l19.pth
+    l0: 91
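
With these registry entries in place, the SAEs should be loadable by release name. A minimal usage sketch (illustrative only: the exact return type of SAE.from_pretrained has varied across sae_lens versions, returning either the SAE alone or a (sae, cfg_dict, sparsity) tuple):

from sae_lens import SAE

sae = SAE.from_pretrained(
    release="goodfire-llama-3.1-8b-instruct",
    sae_id="layer_19",
    device="cpu",
)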

tests/loading/test_pretrained_sae_loaders.py

Lines changed: 229 additions & 0 deletions
@@ -1,15 +1,19 @@
 from pathlib import Path
+from typing import Any

 import pytest
 import torch
 import yaml
 from safetensors.torch import save_file
 from sparsify import SparseCoder, SparseCoderConfig

+from sae_lens import StandardSAE, StandardSAEConfig
 from sae_lens.loading.pretrained_sae_loaders import (
     dictionary_learning_sae_huggingface_loader_1,
     get_deepseek_r1_config_from_hf,
     get_gemma_2_transcoder_config_from_hf,
+    get_goodfire_config_from_hf,
+    get_goodfire_huggingface_loader,
     get_llama_scope_config_from_hf,
     get_llama_scope_r1_distill_config_from_hf,
     get_mntss_clt_layer_config_from_hf,
@@ -21,6 +25,7 @@
     sparsify_huggingface_loader,
 )
 from sae_lens.saes.sae import SAE
+from tests.helpers import assert_close, random_params


 def test_load_sae_config_from_huggingface():
@@ -500,6 +505,230 @@ def test_get_llama_scope_config_from_hf():
     assert cfg == expected_cfg


+def test_get_goodfire_config_from_hf():
+    cfg = get_goodfire_config_from_hf(
+        repo_id="Goodfire/Llama-3.3-70B-Instruct-SAE-l50",
+        folder_name="Llama-3.3-70B-Instruct-SAE-l50.pt",
+        device="cpu",
+    )
+    expected_cfg = {
+        "architecture": "standard",
+        "d_in": 8192,
+        "d_sae": 65536,
+        "model_name": "meta-llama/Llama-3.3-70B-Instruct",
+        "hook_name": "blocks.50.hook_resid_post",
+        "hook_head_index": None,
+        "dataset_path": "lmsys/lmsys-chat-1m",
+        "apply_b_dec_to_input": False,
+        "device": "cpu",
+    }
+    assert cfg == expected_cfg
+
+
+def test_get_goodfire_llama_8b_config_from_hf():
+    cfg = get_goodfire_config_from_hf(
+        repo_id="Goodfire/Llama-3.1-8B-Instruct-SAE-l19",
+        folder_name="Llama-3.1-8B-Instruct-SAE-l19.pth",
+        device="cpu",
+    )
+    expected_cfg = {
+        "architecture": "standard",
+        "d_in": 4096,
+        "d_sae": 65536,
+        "model_name": "meta-llama/Llama-3.1-8B-Instruct",
+        "hook_name": "blocks.19.hook_resid_post",
+        "hook_head_index": None,
+        "dataset_path": "lmsys/lmsys-chat-1m",
+        "apply_b_dec_to_input": False,
+        "device": "cpu",
+    }
+    assert cfg == expected_cfg
+
+
+def test_get_goodfire_config_from_hf_errors_on_unsupported_sae():
+    with pytest.raises(
+        ValueError,
+        match="Unsupported Goodfire SAE: wrong/repo",
+    ):
+        get_goodfire_config_from_hf(
+            repo_id="wrong/repo",
+            folder_name="Llama-3.3-70B-Instruct-SAE-l50.pt",
+            device="cpu",
+        )
+    with pytest.raises(
+        ValueError,
+        match="Unsupported Goodfire SAE: Goodfire/Llama-3.3-70B-Instruct-SAE-l50/wrong_filename.pt",
+    ):
+        get_goodfire_config_from_hf(
+            repo_id="Goodfire/Llama-3.3-70B-Instruct-SAE-l50",
+            folder_name="wrong_filename.pt",
+            device="cpu",
+        )
+
+
+def test_our_sae_matches_goodfires_implementation():
+    # from https://colab.research.google.com/drive/1IBMQtJqy8JiRk1Q48jDEgTISmtxhlCRL
+    class GoodfireSAE(torch.nn.Module):
+        def __init__(
+            self,
+            d_in: int,
+            d_hidden: int,
+            device: torch.device,
+            dtype: torch.dtype = torch.float32,
+        ):
+            super().__init__()
+            self.d_in = d_in
+            self.d_hidden = d_hidden
+            self.device = device
+            self.encoder_linear = torch.nn.Linear(d_in, d_hidden)
+            self.decoder_linear = torch.nn.Linear(d_hidden, d_in)
+            self.dtype = dtype
+            self.to(self.device, self.dtype)
+
+        def encode(self, x: torch.Tensor) -> torch.Tensor:
+            """Encode a batch of data using a linear, followed by a ReLU."""
+            return torch.nn.functional.relu(self.encoder_linear(x))
+
+        def decode(self, x: torch.Tensor) -> torch.Tensor:
+            """Decode a batch of data using a linear."""
+            return self.decoder_linear(x)
+
+        def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+            """SAE forward pass. Returns the reconstruction and the encoded features."""
+            f = self.encode(x)
+            return self.decode(f), f
+
+    cfg_dict = load_sae_config_from_huggingface(
+        release="goodfire-llama-3.3-70b-instruct",
+        sae_id="layer_50",
+        device="cpu",
+    )
+    cfg_dict["d_in"] = 128
+    cfg_dict["d_sae"] = 256
+    cfg_dict["dtype"] = "float32"
+
+    assert cfg_dict["architecture"] == "standard"
+    cfg = StandardSAEConfig.from_dict(cfg_dict)
+
+    # make an SAE based on the Goodfire config, but smaller, since the real SAE class is huge
+    sae = StandardSAE(cfg)
+    random_params(sae)
+
+    sae_state_dict = sae.state_dict()
+    goodfire_state_dict = {
+        "encoder_linear.weight": sae_state_dict["W_enc"].T,
+        "encoder_linear.bias": sae_state_dict["b_enc"],
+        "decoder_linear.weight": sae_state_dict["W_dec"].T,
+        "decoder_linear.bias": sae_state_dict["b_dec"],
+    }
+
+    goodfire_sae = GoodfireSAE(d_in=128, d_hidden=256, device=torch.device("cpu"))
+    goodfire_sae.load_state_dict(goodfire_state_dict)
+
+    test_input = torch.randn(10, 128)
+
+    output = sae(test_input)
+    features = sae.encode(test_input)
+    goodfire_output, goodfire_features = goodfire_sae(test_input)
+
+    assert_close(output, goodfire_output, rtol=1e-4, atol=1e-4)
+    assert_close(features, goodfire_features, rtol=1e-4, atol=1e-4)
+
+
+def test_get_goodfire_huggingface_loader_with_mocked_download(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+):
+    repo_id = "Goodfire/Llama-3.3-70B-Instruct-SAE-l50"
+    folder_name = "Llama-3.3-70B-Instruct-SAE-l50.pt"
+    device = "cpu"
+
+    d_in = 128
+    d_sae = 256
+
+    encoder_weight = torch.randn(d_sae, d_in)
+    decoder_weight = torch.randn(d_in, d_sae)
+    encoder_bias = torch.randn(d_sae)
+    decoder_bias = torch.randn(d_in)
+
+    raw_state_dict = {
+        "encoder_linear.weight": encoder_weight,
+        "decoder_linear.weight": decoder_weight,
+        "encoder_linear.bias": encoder_bias,
+        "decoder_linear.bias": decoder_bias,
+    }
+
+    sae_file_path = tmp_path / folder_name
+    torch.save(raw_state_dict, sae_file_path)
+
+    def mock_get_goodfire_config_from_hf(
+        repo_id: str,  # noqa: ARG001
+        folder_name: str,  # noqa: ARG001
+        device: str,
+        force_download: bool = False,  # noqa: ARG001
+        cfg_overrides: dict[str, Any] | None = None,  # noqa: ARG001
+    ) -> dict[str, Any]:
+        return {
+            "architecture": "standard",
+            "d_in": d_in,
+            "d_sae": d_sae,
+            "model_name": "meta-llama/Llama-3.3-70B-Instruct",
+            "hook_name": "blocks.50.hook_resid_post",
+            "hook_head_index": None,
+            "dataset_path": "lmsys/lmsys-chat-1m",
+            "apply_b_dec_to_input": False,
+            "device": device,
+        }
+
+    def mock_hf_hub_download(
+        repo_id: str,  # noqa: ARG001
+        filename: str,  # noqa: ARG001
+        force_download: bool = False,  # noqa: ARG001
+    ) -> str:
+        return str(sae_file_path)
+
+    monkeypatch.setattr(
+        "sae_lens.loading.pretrained_sae_loaders.get_goodfire_config_from_hf",
+        mock_get_goodfire_config_from_hf,
+    )
+    monkeypatch.setattr(
+        "sae_lens.loading.pretrained_sae_loaders.hf_hub_download", mock_hf_hub_download
+    )
+
+    cfg_dict, state_dict, log_sparsity = get_goodfire_huggingface_loader(
+        repo_id=repo_id,
+        folder_name=folder_name,
+        device=device,
+        force_download=False,
+        cfg_overrides=None,
+    )
+
+    expected_cfg = {
+        "architecture": "standard",
+        "d_in": d_in,
+        "d_sae": d_sae,
+        "model_name": "meta-llama/Llama-3.3-70B-Instruct",
+        "hook_name": "blocks.50.hook_resid_post",
+        "hook_head_index": None,
+        "dataset_path": "lmsys/lmsys-chat-1m",
+        "apply_b_dec_to_input": False,
+        "device": device,
+    }
+
+    assert cfg_dict == expected_cfg
+    assert log_sparsity is None
+
+    assert set(state_dict.keys()) == {"W_enc", "W_dec", "b_enc", "b_dec"}
+    torch.testing.assert_close(state_dict["W_enc"], encoder_weight.T)
+    torch.testing.assert_close(state_dict["W_dec"], decoder_weight.T)
+    torch.testing.assert_close(state_dict["b_enc"], encoder_bias)
+    torch.testing.assert_close(state_dict["b_dec"], decoder_bias)
+
+    assert state_dict["W_enc"].shape == (d_in, d_sae)
+    assert state_dict["W_dec"].shape == (d_sae, d_in)
+    assert state_dict["b_enc"].shape == (d_sae,)
+    assert state_dict["b_dec"].shape == (d_in,)
+
+
 def test_get_llama_scope_r1_distill_config_from_hf():
     """Test that the Llama Scope R1 Distill config is generated correctly."""
     cfg = get_llama_scope_r1_distill_config_from_hf(