
Commit 2ed4e91
Merge remote-tracking branch 'upstream/main' into insop/kld
2 parents: 6a7ae9e + 27fd3a1

17 files changed: +762 -8 lines

recipes/configs/qwen2_5/evaluation.yaml (new file, +41 lines)
@@ -0,0 +1,41 @@
# Config for EleutherEvalRecipe in eleuther_eval.py
#
# To launch, run the following command from root torchtune directory:
#    tune run eleuther_eval --config qwen2_5/evaluation tasks=["truthfulqa_mc2","hellaswag"]

output_dir: ./ # Not needed

# Model Arguments
model:
  _component_: torchtune.models.qwen2_5.qwen2_5_0_5b

checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct
  checkpoint_files: [
    model.safetensors,
  ]
  output_dir: ${output_dir}
  model_type: QWEN2

# Tokenizer
tokenizer:
  _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
  path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json
  merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt
  max_seq_len: null

# Environment
device: cuda
dtype: bf16
seed: 1234 # It is not recommended to change this seed, b/c it matches EleutherAI's default seed

# EleutherAI specific eval args
tasks: ["truthfulqa_mc2"]
limit: null
max_seq_length: 4096
batch_size: 8
enable_kv_cache: True

# Quantization specific args
quantizer: null
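
For context, each `_component_:` entry in this config is a dotted path that the recipe resolves and instantiates, passing the sibling keys as keyword arguments. A minimal sketch of that mechanism, assuming torchtune's `config.instantiate` helper and that the file lives at the usual `recipes/configs/qwen2_5/evaluation.yaml` location:

from omegaconf import OmegaConf
from torchtune import config

# Load the YAML into a DictConfig, then resolve the _component_ entries.
cfg = OmegaConf.load("recipes/configs/qwen2_5/evaluation.yaml")
model = config.instantiate(cfg.model)          # builds qwen2_5_0_5b()
tokenizer = config.instantiate(cfg.tokenizer)  # builds qwen2_5_tokenizer(path=..., merges_file=..., max_seq_len=None)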

tests/assets/sentencepiece.model (binary, 773 KB; not shown)

tests/torchtune/models/t5/__init__.py (new file, +5 lines)
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
tests/torchtune/models/t5/test_t5_encoder.py (new file, +81 lines)
@@ -0,0 +1,81 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import pytest
import torch

from torchtune.models.t5._component_builders import t5_encoder
from torchtune.training.seed import set_seed

VOCAB_SIZE = 512
MAX_SEQ_LEN = 8
BSZ = 2
EMBED_DIM = 2


@pytest.fixture(autouse=True)
def random():
    set_seed(0)


class TestT5Encoder:
    @pytest.fixture
    def model(self):
        model = t5_encoder(
            embed_dim=EMBED_DIM,
            mlp_dim=4,
            num_heads=2,
            head_dim=EMBED_DIM // 2,
            num_layers=2,
            rel_pos_num_buckets=4,
            rel_pos_max_dist=4,
            vocab_size=VOCAB_SIZE,
            norm_eps=1e-6,
            max_seq_len=MAX_SEQ_LEN,
        )

        for param in model.parameters():
            param.data.uniform_(0, 1)

        return model

    @pytest.fixture
    def inputs(self):
        return torch.randint(0, VOCAB_SIZE, (BSZ, MAX_SEQ_LEN))

    def test_forward(self, model, inputs):
        actual = model(inputs)
        expected = torch.tensor(
            [
                [
                    [0.3670, 0.2938],
                    [0.3692, 0.2921],
                    [0.3611, 0.2984],
                    [0.4207, 0.2437],
                    [0.3447, 0.3106],
                    [0.3383, 0.3150],
                    [0.3727, 0.2892],
                    [0.3996, 0.2653],
                ],
                [
                    [0.3855, 0.2783],
                    [0.2627, 0.3581],
                    [0.3601, 0.2992],
                    [0.3473, 0.3087],
                    [0.3549, 0.3032],
                    [0.2871, 0.3459],
                    [0.2753, 0.3520],
                    [0.2285, 0.3728],
                ],
            ]
        )
        assert actual.shape == (BSZ, MAX_SEQ_LEN, EMBED_DIM)
        torch.testing.assert_close(actual, expected, atol=1e-4, rtol=1e-4)

    def test_backward(self, model, inputs):
        y = model(inputs)
        loss = y.mean()
        loss.backward()
tests/torchtune/models/t5/test_t5_tokenizer.py (new file, +39 lines)
@@ -0,0 +1,39 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import pytest

from tests.common import ASSETS
from torchtune.models.t5._model_builders import t5_tokenizer


class TestT5Tokenizer:
    @pytest.fixture
    def tokenizer(self):
        return t5_tokenizer(str(ASSETS / "sentencepiece.model"))

    def test_encoding(self, tokenizer):
        texts = [
            "a cow jumping over the moon",
            "a helpful AI assistant",
        ]
        correct_tokens = [
            [3, 9, 9321, 15539, 147, 8, 8114, 1],
            [3, 9, 2690, 7833, 6165, 1],
        ]
        for text, correct in zip(texts, correct_tokens):
            tokens = tokenizer.encode(text)
            print(tokens)
            assert tokens == correct

    def test_decoding(self, tokenizer):
        text = "this is torchtune"
        assert text == tokenizer.decode(tokenizer.encode(text))

    def test_call(self, tokenizer):
        sample = {"text": "hello world"}
        sample = tokenizer(sample)
        assert "text" not in sample
        assert "tokens" in sample

torchtune/_recipe_registry.py (+4 lines)
@@ -457,6 +457,10 @@ class Recipe:
             name="qwen2/evaluation",
             file_path="qwen2/evaluation.yaml",
         ),
+        Config(
+            name="qwen2_5/evaluation",
+            file_path="qwen2_5/evaluation.yaml",
+        ),
         Config(
             name="gemma/evaluation",
             file_path="gemma/evaluation.yaml",

torchtune/datasets/_alpaca.py (+1 -1)
@@ -51,7 +51,7 @@ def alpaca_dataset(
             :class:`~torchtune.data.AlpacaToMessages` to the new column names in the dataset. Keys should be
             "instruction", "input", and "output" and values should be the actual column names. If None, uses
             the default column names ``"instruction"``, ``"input"``, and ``"output"`` in ``tatsu-lab/alpaca``.
-        train_on_input (bool): Whether the model is trained on the prompt or not. Default is False.
+        train_on_input (bool): Whether the model is trained on the prompt or not. Default is True.
         packed (bool): Whether or not to pack the dataset to ``max_seq_len`` prior to training. Default is False.
         filter_fn (Optional[Callable]): callable used to filter the dataset prior to any pre-processing. See
             the Hugging Face `docs <https://huggingface.co/docs/datasets/v2.20.0/process#select-and-filter>`_ for more
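
This is a docs-only fix: the builder's signature defaults to ``train_on_input=True``, and the docstring now agrees with it. A quick sketch of what the flag controls (assuming a ``tokenizer`` object is already built):

from torchtune.datasets import alpaca_dataset

ds = alpaca_dataset(tokenizer)                        # default: prompt tokens contribute to the loss
ds = alpaca_dataset(tokenizer, train_on_input=False)  # prompt tokens are masked out of the labels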

torchtune/datasets/multimodal/_the_cauldron.py (+5 -5)
@@ -124,7 +124,7 @@ def __call__(self, sample: Mapping[str, Any]) -> Mapping[str, Any]:
 def the_cauldron_dataset(
     model_transform: Transform,
     *,
-    subset: str,
+    subset: str = "orcvqa",
     source: str = "HuggingFaceM4/the_cauldron",
     column_map: Optional[Dict[str, str]] = None,
     new_system_prompt: Optional[str] = None,
@@ -138,8 +138,8 @@ def the_cauldron_dataset(
     `The Cauldron <https://huggingface.co/datasets/HuggingFaceM4/the_cauldron>`_
     from Hugging Face Datasets.

-    The Cauldron consists of numerous datasets. You must specify one of the datasets
-    using the ``subset`` argument.
+    The Cauldron consists of numerous datasets. You can specify one of the datasets
+    using the ``subset`` argument. The default value is the ``orcvqa`` dataset.

     The model transform is expected to be a callable that applies pre-processing steps specific
     to a model. For multimodal datasets, this is expected to be at minimum a tokenizer and
@@ -181,8 +181,8 @@ def __call__(self, sample: Mapping[str, Any]) -> Mapping[str, Any]:
         transforms on the keys. It should consist of at minimum two components: text tokenization (called
         on the "messages" field) and image transform (called on the "images" field). The keys returned by
         the model transform should be aligned with the expected inputs into the model.
-        subset (str): name of the subset of the dataset to load. See the `dataset card
-            <https://huggingface.co/datasets/HuggingFaceM4/the_cauldron>`_ for options.
+        subset (str): name of the subset of the dataset to load. Default is ``orcvqa``; see the `dataset card
+            <https://huggingface.co/datasets/HuggingFaceM4/the_cauldron>`_ for other options.
         source (str): path to dataset repository on Hugging Face. For local datasets,
             define source as the data file type (e.g. "json", "csv", "text") and pass
             in the filepath in ``data_files``. See `Hugging Face's
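
A hedged usage sketch of the new default; ``model_transform`` here is a placeholder for whatever transform your model uses (at minimum a tokenizer plus an image transform, per the docstring above):

from torchtune.datasets.multimodal import the_cauldron_dataset

# With the new default, subset can be omitted and "orcvqa" is loaded:
ds = the_cauldron_dataset(model_transform=model_transform)
# Any other subset from the dataset card still works explicitly:
ds = the_cauldron_dataset(model_transform=model_transform, subset="ai2d")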

torchtune/models/t5/__init__.py (new file, +14 lines)
@@ -0,0 +1,14 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from ._component_builders import t5_encoder
from ._model_builders import t5_tokenizer, t5_v1_1_xxl_encoder

__all__ = [
    "t5_encoder",
    "t5_tokenizer",
    "t5_v1_1_xxl_encoder",
]
torchtune/models/t5/_component_builders.py (new file, +89 lines)
@@ -0,0 +1,89 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
from torch import nn

from torchtune.models.t5._encoder import (
    T5Encoder,
    T5EncoderLayer,
    T5EncoderSelfAttention,
)
from torchtune.modules.feed_forward import FeedForward
from torchtune.modules.rms_norm import RMSNorm


def t5_encoder(
    embed_dim: int,
    mlp_dim: int,
    num_heads: int,
    head_dim: int,
    num_layers: int,
    rel_pos_num_buckets: int,
    rel_pos_max_dist: int,
    vocab_size: int,
    norm_eps: float,
    max_seq_len: int,
):
    """
    Builder for the T5 encoder.

    T5 paper: https://arxiv.org/abs/1910.10683

    Args:
        embed_dim (int): The model dimension.
        mlp_dim (int): The inner dimension of the feed forward layers.
        num_heads (int): The number of attention heads.
        head_dim (int): The dimension of the attention heads (should equal `embed_dim // num_heads`)
        num_layers (int): Number of encoder layers.
        rel_pos_num_buckets (int): Number of discrete buckets to divide the relative positions into.
            See: :class:`~torchtune.models.t5._encoder.T5EncoderRelativePositionBias`
        rel_pos_max_dist (int): Maximum distance for relative positions.
            Distances beyond this are grouped into the last bucket.
            See: :class:`~torchtune.models.t5._encoder.T5EncoderRelativePositionBias`
        vocab_size (int): Vocab size of the model's tokenizer.
        norm_eps (float): Small value added to denominator for numerical stability.
        max_seq_len (int): The maximum sequence length (context length) of the model.

    Returns:
        T5Encoder
    """
    token_embedding = nn.Embedding(vocab_size, embed_dim)

    attn = T5EncoderSelfAttention(
        embed_dim=embed_dim,
        num_heads=num_heads,
        head_dim=head_dim,
        q_proj=nn.Linear(embed_dim, embed_dim, bias=False),
        k_proj=nn.Linear(embed_dim, embed_dim, bias=False),
        v_proj=nn.Linear(embed_dim, embed_dim, bias=False),
        output_proj=nn.Linear(embed_dim, embed_dim, bias=False),
    )

    mlp = FeedForward(
        gate_proj=nn.Linear(embed_dim, mlp_dim, bias=False),
        down_proj=nn.Linear(mlp_dim, embed_dim, bias=False),
        up_proj=nn.Linear(embed_dim, mlp_dim, bias=False),
        activation=nn.GELU(),
    )

    layer = T5EncoderLayer(
        attn=attn,
        mlp=mlp,
        sa_norm=RMSNorm(embed_dim, eps=norm_eps),
        mlp_norm=RMSNorm(embed_dim, eps=norm_eps),
    )

    final_norm = RMSNorm(embed_dim, eps=norm_eps)

    return T5Encoder(
        token_embedding=token_embedding,
        layer=layer,
        final_norm=final_norm,
        num_layers=num_layers,
        num_heads=num_heads,
        rel_pos_num_buckets=rel_pos_num_buckets,
        rel_pos_max_dist=rel_pos_max_dist,
        max_seq_len=max_seq_len,
    )
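
A hedged usage sketch of this builder, mirroring the tiny configuration used in the unit test above (real checkpoints, e.g. via `t5_v1_1_xxl_encoder`, use much larger dimensions):

import torch
from torchtune.models.t5 import t5_encoder

encoder = t5_encoder(
    embed_dim=2,
    mlp_dim=4,
    num_heads=2,
    head_dim=1,  # embed_dim // num_heads
    num_layers=2,
    rel_pos_num_buckets=4,
    rel_pos_max_dist=4,
    vocab_size=512,
    norm_eps=1e-6,
    max_seq_len=8,
)
tokens = torch.randint(0, 512, (2, 8))  # (batch, seq_len) of token ids
out = encoder(tokens)                   # shape (2, 8, 2): one embedding per input token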
torchtune/models/t5/_convert_weights.py (new file, +49 lines)
@@ -0,0 +1,49 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from torchtune.models.convert_weights import get_mapped_key

# state dict key mappings from HF's format to torchtune's format
_FROM_HF = {
    # emb
    "encoder.embed_tokens.weight": "token_embedding.weight",
    "encoder.block.{}.layer._0.SelfAttention.relative_attention_bias.weight": "relative_position_bias.embedding.weight",
    # attn
    "encoder.block.{}.layer._0.SelfAttention.q.weight": "layers.{}.attn.q_proj.weight",
    "encoder.block.{}.layer._0.SelfAttention.k.weight": "layers.{}.attn.k_proj.weight",
    "encoder.block.{}.layer._0.SelfAttention.v.weight": "layers.{}.attn.v_proj.weight",
    "encoder.block.{}.layer._0.SelfAttention.o.weight": "layers.{}.attn.output_proj.weight",
    # ff
    "encoder.block.{}.layer._1.DenseReluDense.wi_0.weight": "layers.{}.mlp.w1.weight",
    "encoder.block.{}.layer._1.DenseReluDense.wo.weight": "layers.{}.mlp.w2.weight",
    "encoder.block.{}.layer._1.DenseReluDense.wi_1.weight": "layers.{}.mlp.w3.weight",
    # norm
    "encoder.block.{}.layer._0.layer_norm.weight": "layers.{}.sa_norm.scale",
    "encoder.block.{}.layer._1.layer_norm.weight": "layers.{}.mlp_norm.scale",
    "encoder.final_layer_norm.weight": "final_norm.scale",
}

_IGNORE = {
    "shared.weight",
    "lm_head.weight",
}


def t5_encoder_hf_to_tune(state_dict):
    converted_state_dict = {}
    for key, value in state_dict.items():
        if key.startswith("decoder.") or key in _IGNORE:
            continue

        # NOTE: HF's T5 has ".<integer>." parts that we do NOT want to be dynamically mapped
        # to corresponding ".<integer>." parts in our converted state dict.
        # This breaks the `get_mapped_key` implementation, so as a temporary hack,
        # we add leading underscores to these parts here and in the `_FROM_HF` map above.
        key = key.replace("layer.0.", "layer._0.").replace("layer.1.", "layer._1.")

        new_key = get_mapped_key(key, _FROM_HF)
        converted_state_dict[new_key] = value
    return converted_state_dict
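
The underscore hack exists because `get_mapped_key` treats each ".<integer>." segment in a key as a dynamic index to substitute into the "{}" placeholders, while T5's "layer.0."/"layer.1." are fixed sublayer positions (self-attention vs. feed-forward), not layer numbers. A minimal worked example with a hypothetical key:

# HF key for block 3's query projection (sublayer 0 is self-attention):
hf_key = "encoder.block.3.layer.0.SelfAttention.q.weight"

# Escape the fixed sublayer index so only the block index stays dynamic:
patched = hf_key.replace("layer.0.", "layer._0.").replace("layer.1.", "layer._1.")
# patched == "encoder.block.3.layer._0.SelfAttention.q.weight"

# get_mapped_key now matches the "encoder.block.{}.layer._0.SelfAttention.q.weight"
# template in _FROM_HF and re-inserts the block index into the torchtune key:
# -> "layers.3.attn.q_proj.weight"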

0 commit comments

Comments
 (0)