bergson/tests/test_build.py at 77755202408e361095f6e79fde8c52b13e161e9a · zywilliamli/bergson · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import pytest
from bergson.data import load_gradients

# Probe for a usable CUDA device. Start from "no GPU" and only flip the flag
# when torch imports cleanly and reports that CUDA is available.
HAS_CUDA = False
try:
    import torch

    HAS_CUDA = torch.cuda.is_available()
except Exception:
    # Deliberately broad: any failure while importing torch or querying CUDA
    # is treated as "no GPU", so the default assigned above stays in place.
    pass

# The tests in this module are GPU-only; skip the whole module otherwise.
if not HAS_CUDA:
    pytest.skip(
        "Skipping GPU-only tests: no CUDA/NVIDIA driver available.",
        allow_module_level=True,
    )

from pathlib import Path

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from bergson import (
    DataConfig,
    GradientProcessor,
    HeadConfig,
    IndexConfig,
    collect_gradients,
)
from bergson.data import tokenize


def test_disk_build_linear(tmp_path: Path):
    """Collect gradients to disk for TinyStories-1M with a head config.

    Tokenizes eight rows of ``NeelNanda/pile-10k``, runs ``collect_gradients``
    with a ``HeadConfig`` registered for ``h.0.attn.attention.out_proj``, and
    checks that the run directory ends up non-empty.
    """
    out_dir = tmp_path / "example_with_heads"
    out_dir.mkdir(parents=True, exist_ok=True)

    # Build the config pieces up front so the IndexConfig call stays compact.
    data_cfg = DataConfig(dataset="NeelNanda/pile-10k", truncation=True)
    per_head = {
        "h.0.attn.attention.out_proj": HeadConfig(
            num_heads=16, head_size=4, head_dim=2
        ),
    }
    cfg = IndexConfig(
        run_path=str(out_dir),
        model="RonenEldan/TinyStories-1M",
        data=data_cfg,
        head_cfgs=per_head,
    )

    lm = AutoModelForCausalLM.from_pretrained(
        cfg.model, trust_remote_code=True, use_safetensors=True
    )
    tok = AutoTokenizer.from_pretrained(cfg.model)

    # Only the first eight rows are used.
    raw = load_dataset(cfg.data.dataset, split="train[:1%]")
    raw = raw.select(range(8))  # type: ignore

    grad_processor = GradientProcessor(projection_dim=cfg.projection_dim)

    # Replace the raw columns with the tokenizer's output.
    tokenized = raw.map(
        tokenize,
        batched=True,
        fn_kwargs={"args": cfg.data, "tokenizer": tok},
        remove_columns=raw.column_names,
    )

    collect_gradients(
        model=lm,
        data=tokenized,
        processor=grad_processor,
        path=cfg.run_path,
        head_cfgs=cfg.head_cfgs,
    )

    # Path objects are always truthy, so this holds iff at least one entry exists.
    has_artifacts = any(out_dir.iterdir())
    assert has_artifacts, "Expected artifacts in the temp run_path"


def test_disk_build_conv1d(tmp_path: Path):
    """Collect gradients to disk for GPT-2 and read the index back.

    Tokenizes eight rows of ``NeelNanda/pile-10k``, runs ``collect_gradients``
    with ``openai-community/gpt2`` (no explicit head configs are passed to
    ``IndexConfig``), then checks that:

    * the run directory is non-empty, and
    * ``load_gradients`` returns an array whose dtype has at least one named
      field, and whose first field contains at least one entry.
    """
    run_path = tmp_path / "example_with_heads"
    run_path.mkdir(parents=True, exist_ok=True)

    config = IndexConfig(
        run_path=str(run_path),
        model="openai-community/gpt2",
        data=DataConfig(dataset="NeelNanda/pile-10k", truncation=True),
    )

    model = AutoModelForCausalLM.from_pretrained(
        config.model, trust_remote_code=True, use_safetensors=True
    )
    tokenizer = AutoTokenizer.from_pretrained(config.model)
    # Same slice as test_disk_build_linear; only the first eight rows are used.
    data = load_dataset(config.data.dataset, split="train[:1%]")
    data = data.select(range(8))  # type: ignore

    processor = GradientProcessor(projection_dim=config.projection_dim)

    data = data.map(
        tokenize,
        batched=True,
        fn_kwargs=dict(args=config.data, tokenizer=tokenizer),
        remove_columns=data.column_names,
    )

    collect_gradients(
        model=model,
        data=data,
        processor=processor,
        path=config.run_path,
        head_cfgs=config.head_cfgs,
    )

    assert any(run_path.iterdir()), "Expected artifacts in the temp run_path"

    index = load_gradients(str(run_path))

    # Bind outside the assert so the names do not depend on assert side effects.
    # ``dtype.names`` is None for an unstructured dtype; the truthiness check
    # covers both that case and an empty tuple of fields.
    modules = index.dtype.names
    assert modules, "Expected the loaded index to have at least one named field"

    # Keep len() here: truthiness of a multi-element array is ambiguous.
    assert len(index[modules[0]]) != 0, "Expected the first field to be non-empty"